diff --git a/constants/common.js b/constants/common.js
index 782d4eac..584f5e19 100644
--- a/constants/common.js
+++ b/constants/common.js
@@ -721,7 +721,9 @@ export const getLinksFromSitemap = async (
   browser,
   userDataDirectory,
   userUrlInput,
-  isIntelligent
+  isIntelligent,
+  username,
+  password
 ) => {
   const urls = {}; // dictionary of requests to urls to be scanned
 
@@ -731,11 +733,23 @@ export const getLinksFromSitemap = async (
   const addToUrlList = url => {
     if (!url) return;
     if (isDisallowedInRobotsTxt(url)) return;
+
+    // embed basic auth credentials in the URL so the crawler can authenticate on fetch
+    if (username && password) url = addBasicAuthCredentials(url, username, password);
+
     const request = new Request({ url: encodeURI(url) });
     if (isUrlPdf(url)) {
       request.skipNavigation = true;
     }
     urls[url] = request;
+  };
+
+  // returns `url` with username/password set in its authority component (user:pass@host)
+  const addBasicAuthCredentials = (url, username, password) => {
+    const urlObject = new URL(url);
+    urlObject.username = username;
+    urlObject.password = password;
+    return urlObject.toString();
   };
 
   const calculateCloseness = (sitemapUrl) => {
diff --git a/crawlers/crawlSitemap.js b/crawlers/crawlSitemap.js
index eadd0bc7..41d0bd22 100644
--- a/crawlers/crawlSitemap.js
+++ b/crawlers/crawlSitemap.js
@@ -63,7 +63,12 @@ const crawlSitemap = async (
     }
   }
 
-  linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap)
+  // parse credentials out of a http://username:password@hostname URL; test the regex once
+  // NOTE(review): basicAuthRegex appears to be declared below this point — confirm it is in scope here (TDZ)
+  const hasBasicAuthUrl = basicAuthRegex.test(sitemapUrl);
+  const username = hasBasicAuthUrl ? sitemapUrl.split('://')[1].split(':')[0] : null;
+  const password = hasBasicAuthUrl ? sitemapUrl.split(':')[2].split('@')[0] : null;
+
+  linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, username, password)
 
   /**
    * Regex to match http://username:password@hostname.com
@@ -131,6 +136,13 @@
     requestList,
     preNavigationHooks: preNavigationHooks(extraHTTPHeaders),
     requestHandler: async ({ page, request, response, sendRequest }) => {
+
+      // strip basic auth credentials so they are not displayed in the report
+      if (isBasicAuth) {
+        if (request.url) request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
+        if (request.loadedUrl) request.loadedUrl = `${request.loadedUrl.split('://')[0]}://${request.loadedUrl.split('@')[1]}`;
+      }
+
       const actualUrl = request.loadedUrl || request.url;
 
       if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
@@ -239,6 +251,11 @@
     },
     failedRequestHandler: async ({ request }) => {
+      // strip basic auth credentials so the failed URL is logged without secrets
+      if (isBasicAuth) {
+        if (request.url) request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
+      }
+
       // check if scanned pages have reached limit due to multi-instances of handler running
       if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
         return;