-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSearcherThread.cpp
63 lines (55 loc) · 2.17 KB
/
SearcherThread.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include "SearcherThread.h"
#include "Config.h"
#include "CurlInteractionStructs.h"
#include "TermMatcher.h"
#include "ThreadSafeQueue.h"
#include "ThreadSafeSet.h"
#include <atomic>
#include <cctype>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <unordered_set>
// Wires the searcher up to the state it shares with the rest of the program
// and opens the results file.
//
// cIO      - its `output` and `urls` members are stored as this searcher's
//            curl output queue and curl URL queue.
// killS    - flag polled by search(); the search loop runs while it reads 0.
// dQueue   - queue that pushToCurlQueue() pops candidate domains from.
// cDomains - set used to make sure each domain is forwarded at most once.
// config   - accepted for interface compatibility; not read here.
SearcherThread::SearcherThread(curlIO cIO, std::atomic<int>* killS, ThreadSafeQueue<std::string>* dQueue, ThreadSafeSet<std::string>* cDomains, Config* config) {
    // Explicitly mark the parameter as unused so the intent is clear and
    // -Wunused-parameter stays quiet.
    (void)config;

    curlOutputQueue = cIO.output;
    curlUrls = cIO.urls;
    killSwitch = killS;
    domainQueue = dQueue;
    checkedDomains = cDomains;

    // Back-off applied by search() when neither queue had any work.
    sleepLockMilliseconds = std::chrono::milliseconds(20);

    output = std::ofstream("output.txt", std::ofstream::out);
    // Without this check a failed open is invisible: every later write to
    // `output` would silently do nothing and all matches would be lost.
    if(!output.is_open())
        std::cerr << "SearcherThread: could not open output.txt for writing; matches will not be saved\n";
}
// Main loop of the searcher. Keeps running for as long as the kill switch
// reads 0. Each pass first tries to forward a domain to the curl URL queue;
// only when that reports no work does it try to consume a curl result, and
// only when both report no work does the thread sleep for the configured
// back-off. Once the kill switch is set, logs the exit and closes the
// results file.
void SearcherThread::search(TermMatcher* validator) {
    while(killSwitch->load() == 0) {
        // Feeding the curl queue takes priority over draining its output.
        if(pushToCurlQueue())
            continue;
        if(consumeCurlQueue(validator))
            continue;
        // Nothing to do on either side: yield instead of spinning.
        std::this_thread::sleep_for(sleepLockMilliseconds);
    }

    std::cout << "Searcher Exiting\n";
    output.close();
}
// Moves at most one domain from the domain queue to the curl URL queue.
//
// Returns false when the domain queue was empty on entry, true otherwise.
// Note that true is returned even if the pop itself fails or the domain had
// already been seen; the return value only tells the caller whether the
// queue looked non-empty, not whether a URL was actually forwarded.
bool SearcherThread::pushToCurlQueue() {
    if(domainQueue->empty()) {
        return false;
    }

    std::string candidate;
    // safeInsert() is only reached when the pop succeeded; a domain is
    // forwarded to curl only when its insertion into the checked set
    // reports success (i.e. it has not been visited yet).
    if(domainQueue->safePop(&candidate) && checkedDomains->safeInsert(candidate)) {
        curlUrls->push(candidate);
    }
    return true;
}
namespace {

// Returns true when `text` begins with `upperPrefix`, comparing the letters
// of `text` case-insensitively. `upperPrefix` must already be upper-case.
bool startsWithIgnoringCase(const std::string& text, const std::string& upperPrefix) {
    if(text.size() < upperPrefix.size())
        return false;
    for(std::string::size_type i = 0; i < upperPrefix.size(); ++i) {
        // Go through unsigned char: std::toupper is undefined for negative
        // char values, which non-ASCII bytes would otherwise produce.
        const int folded = std::toupper(static_cast<unsigned char>(text[i]));
        if(folded != static_cast<unsigned char>(upperPrefix[i]))
            return false;
    }
    return true;
}

} // namespace

// Handles at most one fetched page from the curl output queue.
//
// A page is written to the results file (its URL, one per line) when all of
// the following hold: it has content, its first content entry starts with an
// HTML doctype declaration, a validator was supplied, and the validator's
// matchTerms() accepts the content.
//
// Returns false when the curl output queue was empty on entry, true
// otherwise (including when the pop fails or the page is rejected).
bool SearcherThread::consumeCurlQueue(TermMatcher* validator) {
    const std::string htmlDoctypeTag = "<!DOCTYPE";
    siteData curlOutput;

    if(curlOutputQueue->empty())
        return false;

    if(curlOutputQueue->safePop(&curlOutput)) {
        // The doctype is matched case-insensitively: "<!doctype html>" is
        // just as valid as "<!DOCTYPE html>" and is what many sites emit.
        const bool looksLikeHtml = !curlOutput.siteContents.empty()
            && startsWithIgnoringCase(curlOutput.siteContents.front(), htmlDoctypeTag);

        // NOTE(review): the meaning of matchTerms' second argument is not
        // visible from this file; it is passed through unchanged.
        if(looksLikeHtml && validator && validator->matchTerms(curlOutput.siteContents, false)) {
            // std::endl flushes after every hit; kept as-is so that results
            // already found reach the file promptly.
            output << curlOutput.siteUrl << std::endl;
        }
    }
    return true;
}
// TODO: consider throttling based on the curl output queue size; the limit would be exposed as a curlIO setting.