diff --git a/crawlers/crawlDomain.js b/crawlers/crawlDomain.js
index f474dc6d..72b5534b 100644
--- a/crawlers/crawlDomain.js
+++ b/crawlers/crawlDomain.js
@@ -19,6 +19,7 @@ import {
   isDisallowedInRobotsTxt,
   getUrlsFromRobotsTxt,
   getBlackListedPatterns,
+  urlWithoutAuth
 } from '../constants/common.js';
 import { areLinksEqual, isFollowStrategy } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
@@ -337,7 +338,11 @@ const crawlDomain = async (
       }

       await waitForPageLoaded(page, 10000);
-      const actualUrl = page.url(); // Initialize with the actual URL
+      let actualUrl = request.url;
+
+      if (page.url() !== 'about:blank') {
+        actualUrl = page.url();
+      }

       if (!isScanPdfs) {
         if (isExcluded(actualUrl) || isUrlPdf(actualUrl)) {
@@ -478,13 +483,13 @@ const crawlDomain = async (
         });

         urlsCrawled.scanned.push({
-          url: request.url,
+          url: urlWithoutAuth(request.url),
           pageTitle: results.pageTitle,
           actualUrl: request.loadedUrl, // i.e. actualUrl
         });

         urlsCrawled.scannedRedirects.push({
-          fromUrl: request.url,
+          fromUrl: urlWithoutAuth(request.url),
           toUrl: request.loadedUrl, // i.e. actualUrl
         });

@@ -498,9 +503,9 @@
         if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
           guiInfoLog(guiInfoStatusTypes.SCANNED, {
             numScanned: urlsCrawled.scanned.length,
-            urlScanned: request.url,
+            urlScanned: urlWithoutAuth(request.url),
           });
-          urlsCrawled.scanned.push({ url: request.url, pageTitle: results.pageTitle });
+          urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle });
           await dataset.pushData(results);
         }
       }
diff --git a/crawlers/crawlSitemap.js b/crawlers/crawlSitemap.js
index d6b485e1..2679fdfa 100644
--- a/crawlers/crawlSitemap.js
+++ b/crawlers/crawlSitemap.js
@@ -14,6 +14,7 @@ import {
   getPlaywrightLaunchOptions,
   messageOptions,
   isSkippedUrl,
+  urlWithoutAuth,
 } from '../constants/common.js';
 import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
@@ -249,20 +250,20 @@ const crawlSitemap = async (
       }

       urlsCrawled.scanned.push({
-        url: request.url,
+        url: urlWithoutAuth(request.url),
         pageTitle: results.pageTitle,
         actualUrl: request.loadedUrl, // i.e. actualUrl
       });

       urlsCrawled.scannedRedirects.push({
-        fromUrl: request.url,
+        fromUrl: urlWithoutAuth(request.url),
         toUrl: request.loadedUrl, // i.e. actualUrl
       });

       results.url = request.url;
       results.actualUrl = request.loadedUrl;
     } else {
-      urlsCrawled.scanned.push({ url: request.url, pageTitle: results.pageTitle });
+      urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle });
     }
     await dataset.pushData(results);
   } else {
diff --git a/npmIndex.js b/npmIndex.js
index dc248760..e53f807e 100644
--- a/npmIndex.js
+++ b/npmIndex.js
@@ -7,7 +7,8 @@ import {
   deleteClonedProfiles,
   getBrowserToRun,
   getPlaywrightLaunchOptions,
-  submitForm
+  submitForm,
+  urlWithoutAuth
 } from './constants/common.js'
 import { createCrawleeSubFolders, filterAxeResults } from './crawlers/commonCrawlerFunc.js';
 import {
@@ -126,7 +127,7 @@ export const init = async (
       }
       const pageIndex = urlsCrawled.scanned.length + 1;
       const filteredResults = filterAxeResults(res.axeScanResults, res.pageTitle, { pageIndex , metadata });
-      urlsCrawled.scanned.push({ url: res.pageUrl, pageTitle: `${pageIndex}: ${res.pageTitle}` });
+      urlsCrawled.scanned.push({ url: urlWithoutAuth(res.pageUrl), pageTitle: `${pageIndex}: ${res.pageTitle}` });

       mustFixIssues += filteredResults.mustFix ? filteredResults.mustFix.totalItems : 0;
       goodToFixIssues += filteredResults.goodToFix ? filteredResults.goodToFix.totalItems : 0;
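
Note: all three files import urlWithoutAuth from constants/common.js, but its implementation is not part of this diff. A minimal sketch of what such a helper might look like, assuming it uses the WHATWG URL API to strip embedded basic-auth credentials; the actual helper in constants/common.js may differ:

// Hypothetical sketch of urlWithoutAuth (not shown in this diff).
// Drops embedded basic-auth credentials so they are not persisted
// in scan results or surfaced in GUI logs, e.g.
//   https://user:secret@example.com/page -> https://example.com/page
export const urlWithoutAuth = (url) => {
  const parsedUrl = new URL(url); // throws TypeError on invalid URLs
  parsedUrl.username = '';
  parsedUrl.password = '';
  return parsedUrl.toString();
};

One observation on scope: the diff sanitizes request.url (and res.pageUrl in npmIndex.js), while request.loadedUrl is still stored as-is in the toUrl and actualUrl fields, so credential stripping here covers only the originally requested URL.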