// Crawler.cpp
#include "Crawler.h"
#include "Config.h"
#include "CurlInteractionStructs.h"
#include "TermMatcher.h"
#include "ThreadSafeSet.h"
#include "ThreadSafeQueue.h"
#include <atomic>
#include <boost/regex.hpp>
#include <chrono>
#include <iostream>
#include <queue>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
Crawler::Crawler(curlIO cIO, ThreadSafeQueue<std::string>* initialQueue, std::atomic<int>* killS, ThreadSafeQueue<std::string>* extractedDomains, std::unordered_set<std::string> eDomains, Config* config) {
    const int defaultMaxRequestsPerDomain = 150;
    const int defaultMaxExtractedLinksPerPage = 500;
    maxDomainSize = 253; // https://www.freesoft.org/CIE/RFC/1035/9.htm
    sleepLockMilliseconds = std::chrono::milliseconds(20);
    curlOutputQueue = cIO.output;
    urlQueue = cIO.urls;
    killSwitch = killS;
    extractedDomainQueue = extractedDomains;
    excludedDomains = eDomains;
    maxRequestsPerDomain = config->getIntConfig("Crawler_MaxRequestsPerDomain", defaultMaxRequestsPerDomain);
    maxExtractedLinksPerPage = config->getIntConfig("Crawler_MaxExtractedLinksPerPage", defaultMaxExtractedLinksPerPage);
    std::string initialQueueData;
    if(initialQueue->empty())
        std::cout << "ERROR: sources.txt Has Not Been Populated\n";
    else
        // Validate and queue every initial source; oversized entries are reported without ending the loop
        while(initialQueue->safePop(&initialQueueData)) {
            std::queue<std::string> domain;
            if(initialQueueData.length() < maxDomainSize) {
                extractDomains(initialQueueData, &domain);
                if(!domain.empty())
                    queuedUrls.push(std::make_pair(initialQueueData, domain.front()));
                else
                    std::cout << "ERROR: Invalid Initial Domain: " << initialQueueData << "\n";
            } else
                std::cout << "ERROR: Initial Domain Exceeds Max Size: " << initialQueueData << "\n";
        }
    pushUrls();
}
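/**
 * Usage sketch (illustrative only; the surrounding wiring such as cIO, extractedDomains,
 * excludedDomains, config, and matcher is assumed, not defined in this file):
 *   ThreadSafeQueue<std::string> seeds;   // assumed to be filled from sources.txt
 *   std::atomic<int> killSwitch{0};
 *   Crawler crawler(cIO, &seeds, &killSwitch, &extractedDomains, excludedDomains, &config);
 *   std::thread worker(&Crawler::crawl, &crawler, &matcher);
 *   // ... later, request shutdown and wait for the loop in crawl() to exit
 *   killSwitch.store(1);
 *   worker.join();
 */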
void Crawler::crawl(TermMatcher* validator) {
    // Thread will operate until the killswitch is thrown
    while(killSwitch->load() == 0) {
        siteData data;
        if(curlOutputQueue->safePop(&data)) {
            domainScraper(data, validator);
            pushUrls();
        // If there is no work, the crawler thread sleeps
        } else
            std::this_thread::sleep_for(sleepLockMilliseconds);
    }
    std::cout << "Crawler Exiting\n";
}
void Crawler::domainScraper(siteData inputData, TermMatcher* validator) {
    /**
     * Checks to see if there is any data, then checks whether the returned data is an HTML document.
     * If there is no data, or the document does not start with the DOCTYPE declaration, the site is ignored.
     */
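    // For example, a response beginning with "<!DOCTYPE html>" is processed, while a JSON body or binary payload is skipped.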
    const std::string htmlDoctypeTag = "<!DOCTYPE";
    if(inputData.siteContents.empty() || inputData.siteContents.front().compare(0, htmlDoctypeTag.size(), htmlDoctypeTag) != 0)
        return;
    // If the site has the required number of terms, use regex to search for links and call extractDomains. Otherwise, the site is ignored
    if(validator->matchTerms(inputData.siteContents, false)) {
        const int urlIndex = 1;
        const int domainIndex = 3;
        const char selfReferencingLink = '/';
        const std::string defaultProtocol = "https://";
        // processSiteContents is called on the site contents to parse out potential domains
        processSiteContents(inputData);
        // This regex identifies A HREF links for the crawler to follow
        // This regex ignores URL fragments by truncating the fragment part of the URL
        boost::regex expression(R"(<a[^h>]*href=\"(([^:\/]*:\/\/([^\/":?]+)|\/)[^"&=>#?]*))");
        std::queue<std::string> siteDomain;
        // Find current URL's domain
        extractDomains(inputData.siteUrl, &siteDomain);
        int linksFound = 0;
        // Limit total links processed per page
        for(size_t i = 0; i < inputData.siteContents.size() && linksFound < maxExtractedLinksPerPage; i++) {
            boost::sregex_iterator j = boost::sregex_iterator(inputData.siteContents[i].begin(), inputData.siteContents[i].end(), expression);
            boost::sregex_iterator end;
            // Iterate through every found link
            for(; j != end && linksFound < maxExtractedLinksPerPage; j++, linksFound++) {
                boost::smatch match = *j;
                /**
                 * Handle self-referencing links by concatenating the current URL's domain with the rest of the match.
                 * If the root domain could not be extracted, ignore the self-referencing URL.
                 */
                if(match.str(urlIndex)[0] == selfReferencingLink && !siteDomain.empty())
                    // Send URL for validation
                    queuedUrls.push(std::make_pair(defaultProtocol + siteDomain.front() + match.str(urlIndex), siteDomain.front()));
                // Skip domains on the exclusion list
                else if(excludedDomains.empty() || excludedDomains.find(match.str(domainIndex)) == excludedDomains.end())
                    queuedUrls.push(std::make_pair(match.str(urlIndex), match.str(domainIndex)));
            }
        }
    }
}
void Crawler::extractDomains(std::string data, std::queue<std::string>* extractedDomains) {
    // This expression is used to filter for domains
    boost::regex expression(R"([^\w\.\-]([\w-]+?\.(([\w-]+?\.)+)?([a-zA-Z]+|XN--[A-Za-z0-9]+)))");
    boost::sregex_iterator i = boost::sregex_iterator(data.begin(), data.end(), expression);
    boost::sregex_iterator end;
    /**
     * For every domain extracted, check for a valid top level domain, then exclude any matches
     * that are followed by a `(` character to reduce false positives
     */
    for(; i != end; i++) {
        boost::smatch match = *i;
        // Copy the suffix so the '(' check cannot index past the end of a short remainder
        std::string suffix = match.suffix().str();
        if(TOP_LEVEL_DOMAINS.find(match.str(4)) != TOP_LEVEL_DOMAINS.end() && (suffix.size() < 2 || suffix[1] != '('))
            extractedDomains->push(match.str(1));
    }
}
void Crawler::extractDomains(std::vector<std::string> data, std::queue<std::string>* extractedDomains) {
    // For every string in the data
    for(const std::string& it : data) {
        // Perform domain extraction on the string
        extractDomains(it, extractedDomains);
    }
}
void Crawler::processSiteContents(siteData inputData) {
    std::queue<std::string> extractedDomains;
    extractDomains(inputData.siteContents, &extractedDomains);
    while(!extractedDomains.empty()) {
        // Excludes domains on the exclusion list, and domains that are too large
        if(extractedDomains.front().size() < maxDomainSize && excludedDomains.find(extractedDomains.front()) == excludedDomains.end())
            extractedDomainQueue->push(extractedDomains.front());
        extractedDomains.pop();
    }
}
void Crawler::pushUrls() {
    while(!queuedUrls.empty()) {
        std::string url = queuedUrls.front().first;
        std::string domain = queuedUrls.front().second;
        if(!traversedDomains.contains(domain) && url.length() > 0) {
            // If this URL's domain has never been seen before, create a new domain entry
            if(visitedUrlsPerDomain.find(domain) == visitedUrlsPerDomain.end()) {
                std::unordered_set<std::string> newSet = {url};
                visitedUrlsPerDomain.insert(std::make_pair(domain, newSet));
                urlQueue->push(url);
            // If the domain has been seen, and the URL has not been visited before
            } else if(visitedUrlsPerDomain[domain].find(url) == visitedUrlsPerDomain[domain].end()) {
                // If the number of URLs visited meets maxRequestsPerDomain after this addition
                if(visitedUrlsPerDomain[domain].size() >= maxRequestsPerDomain) {
                    // Exclude URLs associated with this domain in the future
                    traversedDomains.safeInsert(domain);
                    // Save memory by deleting the URLs stored in the map
                    visitedUrlsPerDomain.erase(visitedUrlsPerDomain.find(domain));
                } else
                    visitedUrlsPerDomain[domain].insert(url);
                urlQueue->push(url);
            }
        }
        queuedUrls.pop();
    }
}