From 3c782c4e170c38ec1dc738df7ca2b48f5b39e33a Mon Sep 17 00:00:00 2001 From: Seth Battis Date: Fri, 3 Jan 2025 14:13:48 -0500 Subject: [PATCH] fix(msar)!: download is back to status quo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AuthenticatedFetch now extends PuppeteerSession.Authenticated, and each download is a baseFork() - Reorganized parameters to clean up redundancies and pass CLI arguments more consistently throughout - Updated AuthenticatedFetch, HTTPFetch debugging output for greater consistency - Followed `defaults` model in Download.args modules, as in other *.args modules BREAKING: --haltOnError replaced by standard --ignoreErrors (or --no-ignoreErrors) --- packages/msar/src/bin/commands/download.ts | 47 +++++++++--------- .../msar/src/workflows/Download/Downloader.ts | 33 ++++++++----- .../Download/Downloader/AuthenticatedFetch.ts | 49 +++++++------------ .../Download/Downloader/HTTPFetch.ts | 24 +++++++-- .../msar/src/workflows/Download/Spider.ts | 48 ++++++++---------- packages/msar/src/workflows/Download/args.ts | 9 +++- .../msar/src/workflows/Download/args/flags.ts | 10 ++-- .../src/workflows/Download/args/options.ts | 17 ++++--- .../msar/src/workflows/Download/args/parse.ts | 14 ++---- 9 files changed, 130 insertions(+), 121 deletions(-) diff --git a/packages/msar/src/bin/commands/download.ts b/packages/msar/src/bin/commands/download.ts index d6ef842..c6e6a7b 100644 --- a/packages/msar/src/bin/commands/download.ts +++ b/packages/msar/src/bin/commands/download.ts @@ -13,8 +13,7 @@ import * as Snapshot from '../../workflows/Snapshot.js'; } = cli.init({ args: { requirePositionals: 1, - options: Download.args.options, - flags: Download.args.flags, + ...Download.args, man: [ { text: 'Download the supporting files for an existing snapshot JSON file.. 
This command expects either 1 or 2 arguments: at least a path to an existing snapshot file, and optionally also the desired path to the output folder of supporting files.' @@ -23,32 +22,26 @@ import * as Snapshot from '../../workflows/Snapshot.js'; } }); - const { - downloadOptions, - puppeteerOptions, - credentials, - outputOptions: { pretty, outputPath: _outputPath }, - quit - } = Download.args.parse(values); + const { outputOptions, ...options } = Download.args.parse(values); + const { quit } = options; + const { pretty } = outputOptions; + let { outputPath } = outputOptions; const spinner = cli.spinner(); spinner.start('Reading snaphot file'); const snapshotPath = path.resolve(process.cwd(), snapshotPathArg!); - let outputPath: string; - if (!_outputPath) { + if (!outputPath) { outputPath = path.join( path.dirname(snapshotPath!), path.basename(snapshotPath!, '.json') ); } else { - if (fs.existsSync(_outputPath)) { + if (fs.existsSync(outputPath)) { outputPath = await common.output.avoidOverwrite( - path.join(_outputPath, path.basename(snapshotPath!, '.json')) + path.join(outputPath, path.basename(snapshotPath!, '.json')) ); - } else { - outputPath = _outputPath; } } @@ -63,10 +56,22 @@ import * as Snapshot from '../../workflows/Snapshot.js'; `Read ${snapshots.length} snapshots from ${cli.colors.url(snapshotPath)}` ); + const host = snapshots + .map((snapshot) => snapshot.Metadata.Host) + .reduce((host: string | undefined, other: string) => { + if (!host) { + return other; + } else if (host !== other) { + throw new Error('Multiple hosts present in snapshot file.'); + } + }, undefined); + if (!host) { + throw new Error('No host present in snapshot file.'); + } const spider = new Download.Spider({ - outputPath, - credentials, - host: snapshots[0].Metadata.Host + host, + outputOptions: { ...outputOptions, outputPath }, + ...options }); const indices: (string | undefined)[] = []; @@ -78,10 +83,8 @@ import * as Snapshot from '../../workflows/Snapshot.js'; ); 
indices.push( await spider.download(snapshot, { - ...downloadOptions, - outputPath, - ...puppeteerOptions, - pretty + ...options, + outputOptions: { ...outputOptions, outputPath } }) ); bar.increment(); diff --git a/packages/msar/src/workflows/Download/Downloader.ts b/packages/msar/src/workflows/Download/Downloader.ts index b17eb45..b4e0078 100644 --- a/packages/msar/src/workflows/Download/Downloader.ts +++ b/packages/msar/src/workflows/Download/Downloader.ts @@ -1,22 +1,33 @@ +import * as common from '../../common.js'; import * as Cache from './Cache.js'; -import { - AuthenticatedFetch, - Options as AuthOptions -} from './Downloader/AuthenticatedFetch.js'; -import { HTTPFetch, Options as HTTPOptions } from './Downloader/HTTPFetch.js'; +import * as AuthenticatedFetch from './Downloader/AuthenticatedFetch.js'; +import * as HTTPFetch from './Downloader/HTTPFetch.js'; import { Strategy } from './Downloader/Strategy.js'; -export type Options = AuthOptions & HTTPOptions; +export type Options = { + host: string; +} & common.output.args.Parsed & + common.PuppeteerSession.args.Parsed & + common.workflow.args.Parsed; +// TODO Downloader needs to honor --concurrentThreads export class Downloader implements Strategy { - private auth: AuthenticatedFetch; - private http: HTTPFetch; + private auth: AuthenticatedFetch.Downloader; + private http: HTTPFetch.Downloader; private host: string; - public constructor({ outputPath, host, ...options }: Options) { + public constructor({ host, outputOptions, ...options }: Options) { + const { outputPath } = outputOptions; + if (!outputPath) { + throw new common.output.OutputError('Downloader requires outputPath'); + } this.host = host; - this.auth = new AuthenticatedFetch({ outputPath, host, ...options }); - this.http = new HTTPFetch({ outputPath }); + this.auth = new AuthenticatedFetch.Downloader({ + host, + outputOptions, + ...options + }); + this.http = new HTTPFetch.Downloader({ outputPath, ...options }); } public async 
download(original: string, filename?: string) { diff --git a/packages/msar/src/workflows/Download/Downloader/AuthenticatedFetch.ts b/packages/msar/src/workflows/Download/Downloader/AuthenticatedFetch.ts index 623159f..f9cfb6c 100644 --- a/packages/msar/src/workflows/Download/Downloader/AuthenticatedFetch.ts +++ b/packages/msar/src/workflows/Download/Downloader/AuthenticatedFetch.ts @@ -1,5 +1,4 @@ import cli from '@battis/qui-cli'; -import { Mutex } from 'async-mutex'; import { PuppeteerSession } from 'datadirect-puppeteer'; import { EventEmitter } from 'node:events'; import fs from 'node:fs'; @@ -21,48 +20,38 @@ type FilepathVariantsOptions = { }; export type Options = { - outputPath: string; - host: string; -} & PuppeteerSession.Options; + host: URL | string; +} & common.output.args.Parsed & + PuppeteerSession.Options; const TEMP = path.join('/tmp/msar/download', crypto.randomUUID()); const DOWNLOADS = path.join(os.homedir(), 'Downloads'); -export class AuthenticatedFetch +export class Downloader extends PuppeteerSession.Authenticated implements Strategy { private outputPath: string; - private preparing = new Mutex(); private emitter = new EventEmitter(); - public constructor({ outputPath, host, ...options }: Options) { - super(host, options); + public constructor({ + host, + outputOptions: { outputPath }, + ...options + }: Options) { + super(`https://${host}`, options); + if (!outputPath) { + throw new common.output.OutputError( + 'AuthenticatedFetch requires outputPath' + ); + } this.outputPath = outputPath; } public async download(url: string, filename?: string) { - /* - * FIXME refactoring broke `msar download` - * ```sh - * - Connecting to /path/to/myschoolapp-reporting/var/download.log - * ✔ Logging level all to /path/to/myschoolapp-reporting/var/download.log - * - Reading snaphot file - * ✔ Read 1 snapshots from /path/to/myschoolapp-reporting/var/2024 - 2025 - Horace Bixby - Sandbox (Y) - 97551579.json - * Group 97551579: Downloading supporting files - * 
Task Terminated with exit code 1 - * node:internal/url:806 - * const href = bindingUrl.parse(input, base, raiseException); - * ^ - * TypeError: Invalid URL - * at new URL (node:internal/url:806:29) - * at AuthenticatedFetch.openURL (file:///path/to/myschoolapp-reporting/packages/datadirect-puppeteer/dist/PuppeteerSession/Base.js:45:25) { - * code: 'ERR_INVALID_URL', - * input: 'example.myschoolapp.com' - * } - * ``` - */ - const session = await this.fork('about:blank'); + cli.log.debug(`AuthenticatedFetch: ${url}`); + await this.ready(); + const session = await this.baseFork('about:blank'); const client = await session.page.createCDPSession(); await client.send('Fetch.enable', { @@ -125,7 +114,7 @@ export class AuthenticatedFetch if (fs.existsSync(possiblePaths[key])) { fs.renameSync(possiblePaths[key], destFilepath); cli.log.debug( - `Moved ${key} file to ${cli.colors.url(localPath)}` + `Moved ${key} file ${cli.colors.url(possiblePaths[key])} to ${cli.colors.url(localPath)}` ); this.emitter.emit(url, { localPath, filename }); return; diff --git a/packages/msar/src/workflows/Download/Downloader/HTTPFetch.ts b/packages/msar/src/workflows/Download/Downloader/HTTPFetch.ts index abf33ba..3cd8cbe 100644 --- a/packages/msar/src/workflows/Download/Downloader/HTTPFetch.ts +++ b/packages/msar/src/workflows/Download/Downloader/HTTPFetch.ts @@ -7,18 +7,34 @@ import { } from '../filenameFromDisposition.js'; import { Strategy } from './Strategy.js'; -export type Options = { outputPath: string }; +export type Options = { outputPath: string } & common.workflow.args.Parsed; -export class HTTPFetch implements Strategy { +export class Downloader implements Strategy { private outputPath: string; + private logRequests: boolean; - public constructor({ outputPath }: Options) { + public constructor({ outputPath, logRequests }: Options) { this.outputPath = outputPath; + this.logRequests = logRequests; } public async download(url: string, filename?: string) { - cli.log.debug(`Directly 
fetching ${cli.colors.url(url)}`); + cli.log.debug(`HTTPFetch: ${cli.colors.url(url)}`); const response = await fetch(url); + if (this.logRequests) { + cli.log.debug({ + url, + response: { + url: response.url, + redirected: response.redirected, + type: response.type, + ok: response.ok, + status: response.status, + statusText: response.statusText, + headers: response.headers + } + }); + } if (response.ok && response.body) { return { localPath: await common.output.writeFetchedFile({ diff --git a/packages/msar/src/workflows/Download/Spider.ts b/packages/msar/src/workflows/Download/Spider.ts index c8c6ef9..33c293c 100644 --- a/packages/msar/src/workflows/Download/Spider.ts +++ b/packages/msar/src/workflows/Download/Spider.ts @@ -3,43 +3,40 @@ import path from 'node:path'; import * as common from '../../common.js'; import * as Snapshot from '../Snapshot.js'; import * as Cache from './Cache.js'; -import { Downloader, Options as DownloaderOptions } from './Downloader.js'; +import * as Downloader from './Downloader.js'; -export type BaseOptions = { +export type Options = { include?: RegExp[]; exclude?: RegExp[]; - haltOnError?: boolean; -}; +} & common.args.Parsed; -type TraverseOptions = BaseOptions & { +type TraverseOptions = Options & { host: string; pathToComponent: string; }; -type DownloadOptions = BaseOptions & { - pretty?: boolean; - outputPath: string; -}; - -export type Options = DownloaderOptions; - export class Spider { - private downloader: Downloader; + private downloader: Downloader.Downloader; - public constructor(options: Options) { - this.downloader = new Downloader(options); + public constructor(options: Downloader.Options) { + this.downloader = new Downloader.Downloader(options); } public async download( snapshot: Snapshot.Data, - { pretty = false, outputPath, ...options }: DownloadOptions + { outputOptions, ...options }: Options ) { + const { outputPath, pretty } = outputOptions; + if (!outputPath) { + throw new common.output.OutputError('Spider 
requires outputPath'); + } if (snapshot) { cli.log.debug( `Group ${snapshot.SectionInfo?.Id || cli.colors.error('unknown')}: Downloading supporting files` ); await this.traverse(snapshot, { host: snapshot.Metadata.Host, + outputOptions, ...options, pathToComponent: path.basename(outputPath) }); @@ -63,17 +60,15 @@ export class Spider { private async traverse( snapshotComponent: object, - { host, pathToComponent, include, exclude, haltOnError }: TraverseOptions + { pathToComponent, ...options }: TraverseOptions ) { + const { include, exclude, ignoreErrors } = options; if (Array.isArray(snapshotComponent)) { await Promise.allSettled( snapshotComponent.map(async (elt, i) => { await this.traverse(elt, { - host, pathToComponent: `${pathToComponent}[${i}]`, - include, - exclude, - haltOnError + ...options }); }) ); @@ -87,11 +82,8 @@ export class Spider { return; } else if (typeof snapshotComponent[key] === 'object') { await this.traverse(snapshotComponent[key], { - host, pathToComponent: `${pathToComponent}.${key}`, - include, - exclude, - haltOnError + ...options }); /* * FIXME FileName files in topics are at /ftpimages/:SchoolId/topics/:FileName @@ -130,9 +122,7 @@ export class Spider { `${pathToComponent}[${key}]: ${item.localPath || item.error}` ); } catch (error) { - if (haltOnError) { - throw error; - } else { + if (ignoreErrors) { const message = `Download ${cli.colors.value(key)} ${cli.colors.url( snapshotComponent[key] )} failed: ${error}`; @@ -142,6 +132,8 @@ export class Spider { accessed: new Date(), error: message }; + } else { + throw error; } } } diff --git a/packages/msar/src/workflows/Download/args.ts b/packages/msar/src/workflows/Download/args.ts index 930de2b..72411de 100644 --- a/packages/msar/src/workflows/Download/args.ts +++ b/packages/msar/src/workflows/Download/args.ts @@ -1,3 +1,8 @@ -export * from './args/flags.js'; -export * from './args/options.js'; +import { defaults as flagDefaults } from './args/flags.js'; +import { defaults as 
optionsDefaults } from './args/options.js'; + +export { flags } from './args/flags.js'; +export { options } from './args/options.js'; export * from './args/parse.js'; + +export const defaults = { ...flagDefaults, ...optionsDefaults }; diff --git a/packages/msar/src/workflows/Download/args/flags.ts b/packages/msar/src/workflows/Download/args/flags.ts index f42792e..64d604e 100644 --- a/packages/msar/src/workflows/Download/args/flags.ts +++ b/packages/msar/src/workflows/Download/args/flags.ts @@ -1,9 +1,5 @@ -import cli from '@battis/qui-cli'; import * as common from '../../../common.js'; -export const flags = { - ...common.args.flags, - haltOnError: { - description: `Halt on an error downloading a supporting file (default: ${cli.colors.value('false')}` - } -}; +export const defaults = {}; + +export const flags = common.args.flags; diff --git a/packages/msar/src/workflows/Download/args/options.ts b/packages/msar/src/workflows/Download/args/options.ts index 400d8eb..003e36a 100644 --- a/packages/msar/src/workflows/Download/args/options.ts +++ b/packages/msar/src/workflows/Download/args/options.ts @@ -2,23 +2,26 @@ import cli from '@battis/qui-cli'; import path from 'node:path'; import * as common from '../../../common.js'; -const defaultOutputPath = path.join( - process.cwd(), - `${new Date().toISOString().replace(/[:/.]/g, '-')}-export` -); +export const defaults = { + outputPath: path.join( + process.cwd(), + `${new Date().toISOString().replace(/[:/.]/g, '-')}-export` + ), + include: '^\\/' +}; export const options = { ...common.args.options, outputPath: { ...common.args.options.outputPath, description: `${common.args.options.outputPath?.description} (defaults to the name of the snapshot file)`, - default: defaultOutputPath + default: defaults.outputPath }, include: { description: `Comma-separated list of regular expressions to match URLs to be included in download (e.g. 
${cli.colors.quotedValue('"^\\/,example\\.com"')}, default ${cli.colors.quotedValue('"^\\/"')} to include only URLs on Blackbaud's servers)`, - default: '^\\/' + default: defaults.include }, exclude: { - description: `Comma-separated list of regular expressions to match URLs to exclude from download (e.g. ${cli.colors.quotedValue('"example\\.com,foo\\..+\\.com"')}, default: ${cli.colors.value('undefined')})` + description: `Comma-separated list of regular expressions to match URLs to exclude from download (e.g. ${cli.colors.quotedValue('"example\\.com,foo\\..+\\.com"')}` } }; diff --git a/packages/msar/src/workflows/Download/args/parse.ts b/packages/msar/src/workflows/Download/args/parse.ts index 1f3991b..d9ead63 100644 --- a/packages/msar/src/workflows/Download/args/parse.ts +++ b/packages/msar/src/workflows/Download/args/parse.ts @@ -1,11 +1,8 @@ import * as common from '../../../common.js'; export type Parsed = common.args.Parsed & { - downloadOptions: { - include?: RegExp[]; - exclude?: RegExp[]; - haltOnError?: boolean; - }; + include?: RegExp[]; + exclude?: RegExp[]; }; function stringToRegExpArray(arg: string): RegExp[] | undefined { @@ -17,10 +14,7 @@ function stringToRegExpArray(arg: string): RegExp[] | undefined { export function parse(values: Record): Parsed { return { ...common.args.parse(values), - downloadOptions: { - include: stringToRegExpArray(values.include), - exclude: stringToRegExpArray(values.exclude), - haltOnError: !!values.haltOnError - } + include: stringToRegExpArray(values.include), + exclude: stringToRegExpArray(values.exclude) }; }