Skip to content

Commit

Permalink
Handle basic auth credentials for sitemap (#339)
Browse files Browse the repository at this point in the history
* handle basic auth credentials for sitemap

* remove auth credentials in failedRequestHandler
  • Loading branch information
joshualai9922 authored May 24, 2024
1 parent f172fe7 commit 699b3e1
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
15 changes: 14 additions & 1 deletion constants/common.js
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,9 @@ export const getLinksFromSitemap = async (
browser,
userDataDirectory,
userUrlInput,
isIntelligent
isIntelligent,
username,
password
) => {

const urls = {}; // dictionary of requests to urls to be scanned
Expand All @@ -731,11 +733,22 @@ export const getLinksFromSitemap = async (
// Queue a sitemap URL for scanning by creating a Crawlee Request for it.
// Skips empty/falsy URLs and URLs disallowed by robots.txt.
const addToUrlList = url => {
  if (!url) return;
  if (isDisallowedInRobotsTxt(url)) return;

  // Embed basic auth credentials (user:pass@) into the URL when both are
  // provided, so navigation to auth-protected sitemap links succeeds.
  // (Original used a ternary purely for its side effect; an if is clearer
  // and avoids reassigning the parameter.)
  const targetUrl = (username && password)
    ? addBasicAuthCredentials(url, username, password)
    : url;

  const request = new Request({ url: encodeURI(targetUrl) });
  if (isUrlPdf(targetUrl)) {
    // PDFs are fetched directly rather than rendered in the browser.
    request.skipNavigation = true;
  }
  urls[targetUrl] = request;
};

// Return a copy of `url` with basic auth credentials embedded
// (e.g. "http://host/x" -> "http://user:pass@host/x").
// The URL setters percent-encode reserved characters automatically.
const addBasicAuthCredentials = (url, username, password) => {
  const authUrl = new URL(url);
  authUrl.username = username;
  authUrl.password = password;
  return String(authUrl);
};

const calculateCloseness = (sitemapUrl) => {
Expand Down
16 changes: 15 additions & 1 deletion crawlers/crawlSitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ const crawlSitemap = async (
}
}

linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap)
const username = basicAuthRegex.test(sitemapUrl) ? sitemapUrl.split('://')[1].split(':')[0] : null;
const password = basicAuthRegex.test(sitemapUrl) ? sitemapUrl.split(':')[2].split('@')[0] : null;

linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, username, password)

/**
* Regex to match http://username:password@hostname.com
Expand Down Expand Up @@ -131,6 +134,13 @@ const crawlSitemap = async (
requestList,
preNavigationHooks: preNavigationHooks(extraHTTPHeaders),
requestHandler: async ({ page, request, response, sendRequest }) => {

// strip basic auth credentials from the URL so they won't appear in the report
if (isBasicAuth){
request.url ? request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}` : null;
request.loadedUrl ? request.loadedUrl = `${request.loadedUrl.split('://')[0]}://${request.loadedUrl.split('@')[1]}` : null;
}

const actualUrl = request.loadedUrl || request.url;

if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
Expand Down Expand Up @@ -239,6 +249,10 @@ const crawlSitemap = async (
},
failedRequestHandler: async ({ request }) => {

if (isBasicAuth){
request.url ? request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}` : null;
}

// check if scanned pages have reached limit due to multi-instances of handler running
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
return;
Expand Down

0 comments on commit 699b3e1

Please sign in to comment.