Skip to content

Commit

Permalink
Clean url, fix issue crawling with PDF downloads where url returns about:blank
Browse files Browse the repository at this point in the history
  • Loading branch information
younglim committed May 25, 2024
1 parent f3e1481 commit 104cfe2
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 10 deletions.
15 changes: 10 additions & 5 deletions crawlers/crawlDomain.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
isDisallowedInRobotsTxt,
getUrlsFromRobotsTxt,
getBlackListedPatterns,
urlWithoutAuth
} from '../constants/common.js';
import { areLinksEqual, isFollowStrategy } from '../utils.js';
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
Expand Down Expand Up @@ -337,7 +338,11 @@ const crawlDomain = async (
}

await waitForPageLoaded(page, 10000);
const actualUrl = page.url(); // Initialize with the actual URL
let actualUrl = request.url;

if (page.url() !== 'about:blank') {
actualUrl = page.url();
}

if (!isScanPdfs) {
if (isExcluded(actualUrl) || isUrlPdf(actualUrl)) {
Expand Down Expand Up @@ -478,13 +483,13 @@ const crawlDomain = async (
});

urlsCrawled.scanned.push({
url: request.url,
url: urlWithoutAuth(request.url),
pageTitle: results.pageTitle,
actualUrl: request.loadedUrl, // i.e. actualUrl
});

urlsCrawled.scannedRedirects.push({
fromUrl: request.url,
fromUrl: urlWithoutAuth(request.url),
toUrl: request.loadedUrl, // i.e. actualUrl
});

Expand All @@ -498,9 +503,9 @@ const crawlDomain = async (
if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
guiInfoLog(guiInfoStatusTypes.SCANNED, {
numScanned: urlsCrawled.scanned.length,
urlScanned: request.url,
urlScanned: urlWithoutAuth(request.url),
});
urlsCrawled.scanned.push({ url: request.url, pageTitle: results.pageTitle });
urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle });
await dataset.pushData(results);
}
}
Expand Down
7 changes: 4 additions & 3 deletions crawlers/crawlSitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
getPlaywrightLaunchOptions,
messageOptions,
isSkippedUrl,
urlWithoutAuth,
} from '../constants/common.js';
import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
Expand Down Expand Up @@ -249,20 +250,20 @@ const crawlSitemap = async (
}

urlsCrawled.scanned.push({
url: request.url,
url: urlWithoutAuth(request.url),
pageTitle: results.pageTitle,
actualUrl: request.loadedUrl, // i.e. actualUrl
});

urlsCrawled.scannedRedirects.push({
fromUrl: request.url,
fromUrl: urlWithoutAuth(request.url),
toUrl: request.loadedUrl, // i.e. actualUrl
});

results.url = request.url;
results.actualUrl = request.loadedUrl;
} else {
urlsCrawled.scanned.push({ url: request.url, pageTitle: results.pageTitle });
urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle });
}
await dataset.pushData(results);
} else {
Expand Down
5 changes: 3 additions & 2 deletions npmIndex.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ import {
deleteClonedProfiles,
getBrowserToRun,
getPlaywrightLaunchOptions,
submitForm
submitForm,
urlWithoutAuth
} from './constants/common.js'
import { createCrawleeSubFolders, filterAxeResults } from './crawlers/commonCrawlerFunc.js';
import {
Expand Down Expand Up @@ -126,7 +127,7 @@ export const init = async (
}
const pageIndex = urlsCrawled.scanned.length + 1;
const filteredResults = filterAxeResults(res.axeScanResults, res.pageTitle, { pageIndex , metadata });
urlsCrawled.scanned.push({ url: res.pageUrl, pageTitle: `${pageIndex}: ${res.pageTitle}` });
urlsCrawled.scanned.push({ url: urlWithoutAuth(res.pageUrl), pageTitle: `${pageIndex}: ${res.pageTitle}` });

mustFixIssues += filteredResults.mustFix ? filteredResults.mustFix.totalItems : 0;
goodToFixIssues += filteredResults.goodToFix ? filteredResults.goodToFix.totalItems : 0;
Expand Down

0 comments on commit 104cfe2

Please sign in to comment.