From fdc3c80d33d55c6e37ab4ee278b019872dd94fae Mon Sep 17 00:00:00 2001 From: an-lee Date: Sun, 18 Feb 2024 10:56:52 +0800 Subject: [PATCH] Fix download script (#317) * bundle tiny.en as whisper default model * improve download-whisper-model script * improve download-ffmpeg-wasm script --- enjoy/scripts/download-ffmpeg-wasm.mjs | 32 +++++++++++++------ enjoy/scripts/download-whisper-model.mjs | 37 ++++++++++++++++------ enjoy/src/constants.ts | 15 ++++++--- enjoy/src/main/whisper.ts | 39 +++++++++++++----------- 4 files changed, 81 insertions(+), 42 deletions(-) diff --git a/enjoy/scripts/download-ffmpeg-wasm.mjs b/enjoy/scripts/download-ffmpeg-wasm.mjs index 40e03eb7e..8eb381957 100755 --- a/enjoy/scripts/download-ffmpeg-wasm.mjs +++ b/enjoy/scripts/download-ffmpeg-wasm.mjs @@ -35,9 +35,7 @@ await Promise.all( console.info(chalk.green(`✅ File ${file.name} valid`)); } else { console.warn( - chalk.yellow( - `❌ File ${file.name} not valid, start to redownload` - ) + chalk.yellow(`❌ File ${file.name} not valid, start to redownload`) ); fs.removeSync(path.join(dir, file.name)); pendingFiles.push(file); @@ -81,6 +79,8 @@ if (proxyUrl) { } const download = async (url, dest, md5) => { + console.info(chalk.blue(`=> Start to download ${url} to ${dest}`)); + return spinner(async () => { console.info(chalk.blue(`=> Start to download file ${url}`)); await axios @@ -89,22 +89,27 @@ const download = async (url, dest, md5) => { }) .then(async (response) => { const data = Buffer.from(response.data, "binary"); + console.info(chalk.green(`✅ ${dest} downloaded successfully`)); fs.writeFileSync(dest, data); const hash = await hashFile(dest, { algo: "md5" }); if (hash === md5) { - console.info(chalk.green(`✅ ${dest} downloaded successfully`)); + console.info(chalk.green(`✅ ${dest} valid`)); } else { console.error( chalk.red( - `❌ Error: ${dest} MD5 not match, ${hash} should be ${md5}` + `❌ Error: ${dest} not valid. \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"` ) ); process.exit(1); } }) .catch((err) => { - console.error(chalk.red(`❌ Error: ${err}`)); + console.error( + chalk.red( + `❌ Failed to download(${err}). \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"` + ) + ); process.exit(1); }); }); @@ -126,12 +131,17 @@ const cleanup = () => { try { fs.removeSync(path.join(dir, file.name)); } catch (err) { - console.error(chalk.red(`❌ Error: ${err}`)); + console.error( + chalk.red( + `❌ Failed to download(${err}). \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"` + ) + ); } }); }; -const baseURL = "https://unpkg.com/@ffmpeg/core-mt@0.12.6/dist/esm"; +// const baseURL = "https://unpkg.com/@ffmpeg/core-mt@0.12.6/dist/esm"; +const baseURL = "https://enjoy-storage.baizhiheizi.com"; try { await Promise.all( pendingFiles.map((file) => @@ -139,7 +149,11 @@ try { ) ); } catch (err) { - console.error(chalk.red(`❌ Error: ${err}`)); + console.error( + chalk.red( + `❌ Failed to download(${err}). \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"` + ) + ); cleanup(); process.exit(1); } diff --git a/enjoy/scripts/download-whisper-model.mjs b/enjoy/scripts/download-whisper-model.mjs index b932c58e1..ece00a19b 100755 --- a/enjoy/scripts/download-whisper-model.mjs +++ b/enjoy/scripts/download-whisper-model.mjs @@ -4,8 +4,8 @@ import axios from "axios"; import progress from "progress"; import { createHash } from "crypto"; -const model = "ggml-base.en-q5_1.bin"; -const md5 = "55309cc6613788f07ac7988985210734"; +const model = "ggml-tiny.en.bin"; +const sha = "c78c86eb1a8faa21b369bcd33207cc90d64ae9df"; const dir = path.join(process.cwd(), "lib/whisper.cpp/models"); @@ -15,8 +15,8 @@ fs.ensureDirSync(dir); try { if (fs.statSync(path.join(dir, model)).isFile()) { console.info(chalk.green(`✅ Model ${model} already exists`)); - const hash = await hashFile(path.join(dir, model), { algo: "md5" }); - if (hash === md5) { + const hash = await hashFile(path.join(dir, model), { algo: "sha1" }); + if (hash === sha) { console.info(chalk.green(`✅ Model ${model} valid`)); process.exit(0); } else { @@ -50,11 +50,12 @@ if (proxyUrl) { }; } -const modelUrlPrefix = - "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"; +// const modelUrlPrefix = +// "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"; +const modelUrlPrefix = "https://enjoy-storage.baizhiheizi.com"; function hashFile(path, options) { - const algo = options.algo || "md5"; + const algo = options.algo || "sha1"; return new Promise((resolve, reject) => { const hash = createHash(algo); const stream = fs.createReadStream(path); @@ -65,6 +66,7 @@ function hashFile(path, options) { } const download = async (url, dest) => { + console.info(chalk.blue(`=> Start to download from ${url} to ${dest}`)); return axios .get(url, { responseType: "stream" }) .then((response) => { @@ -82,13 +84,28 @@ const download = async (url, dest) => { progressBar.tick(chunk.length); }); - response.data.pipe(fs.createWriteStream(dest)).on("close", () => { + response.data.pipe(fs.createWriteStream(dest)).on("close", async () => { console.info(chalk.green(`✅ Model ${model} downloaded successfully`)); - process.exit(0); + const hash = await hashFile(path.join(dir, model), { algo: "sha1" }); + if (hash === sha) { + console.info(chalk.green(`✅ Model ${model} valid`)); + process.exit(0); + } else { + console.error( + chalk.red( + `❌ Model ${model} not valid, please try again using command \`yarn workspace enjoy download-whisper-model\`` + ) + ); + process.exit(1); + } }); }) .catch((err) => { - console.error(chalk.red(`❌ Error: ${err}`)); + console.error( + chalk.red( + `❌ Failed to download ${url}: ${err}.\nPlease try again using command \`yarn workspace enjoy download-whisper-model\`` + ) + ); process.exit(1); }); }; diff --git a/enjoy/src/constants.ts b/enjoy/src/constants.ts index 0f92cc59c..0355afbcb 100644 --- a/enjoy/src/constants.ts +++ b/enjoy/src/constants.ts @@ -12,31 +12,36 @@ export const WHISPER_MODELS_OPTIONS = [ { type: "tiny", name: "ggml-tiny.en.bin", - size: "77.7 MB", + size: "75 MB", + sha: "c78c86eb1a8faa21b369bcd33207cc90d64ae9df", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin", }, { type: "base", name: "ggml-base.en.bin", - size: "148 MB", + size: "142 MB", + sha: "137c40403d78fd54d454da0f9bd998f78703390c", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin", }, { type: "small", name: "ggml-small.en.bin", - size: "488 MB", + size: "466 MB", + sha: "db8a495a91d927739e50b3fc1cc4c6b8f6c2d022", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin", }, { type: "medium", name: "ggml-medium.en.bin", - size: "1.53 GB", + size: "1.5 GB", + sha: "8c30f0e44ce9560643ebd10bbe50cd20eafd3723", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin", }, { type: "large", name: "ggml-large-v3.bin", - size: "3.09 GB", + size: "2.9 GB", + sha: "ad82bf6a9043ceed055076d0fd39f5f186ff8062", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin", }, ]; diff --git a/enjoy/src/main/whisper.ts b/enjoy/src/main/whisper.ts index 76a3b0fc1..c3e5acfcc 100644 --- a/enjoy/src/main/whisper.ts +++ b/enjoy/src/main/whisper.ts @@ -10,7 +10,7 @@ const logger = log.scope("whisper"); class Whipser { private binMain: string; - private defaultModel: string; + private bundledModelsDir: string; public config: WhisperConfigType; constructor(config?: WhisperConfigType) { @@ -20,13 +20,7 @@ class Whipser { "whisper", "main" ); - this.defaultModel = path.join( - __dirname, - "lib", - "whisper", - "models", - "ggml-base.en-q5_1.bin" - ); + this.bundledModelsDir = path.join(__dirname, "lib", "whisper", "models"); if (fs.existsSync(customWhisperPath)) { this.binMain = customWhisperPath; } else { @@ -36,23 +30,32 @@ class Whipser { currentModel() { if (!this.config.availableModels) return; - if (!this.config.model) { - const model = this.config.availableModels[0]; - settings.setSync("whisper.model", this.config.availableModels[0].name); - return model.savePath; + + let model: WhisperConfigType["availableModels"][0]; + if (this.config.model) { + model = (this.config.availableModels || []).find( + (m) => m.name === this.config.model + ); + } + if (!model) { + model = this.config.availableModels[0]; } - return (this.config.availableModels || []).find( - (m) => m.name === this.config.model - )?.savePath; + settings.setSync("whisper.model", model.name); + return model.savePath; } async initialize() { + const bundleModels = fs.readdirSync(this.bundledModelsDir); + const dir = path.join(settings.libraryPath(), "whisper", "models"); fs.ensureDirSync(dir); const files = fs.readdirSync(dir); + + const availableModelFiles = bundleModels.concat(files); + const models = []; - for (const file of files) { + for (const file of availableModelFiles) { const model = WHISPER_MODELS_OPTIONS.find((m) => m.name == file); if (!model) continue; @@ -102,7 +105,7 @@ class Whipser { async check() { await this.initialize(); - const model = this.currentModel() || this.defaultModel; + const model = this.currentModel(); const sampleFile = path.join(__dirname, "samples", "jfk.wav"); const tmpDir = settings.cachePath(); @@ -169,7 +172,7 @@ class Whipser { throw new Error("No file or blob provided"); } - const model = this.currentModel() || this.defaultModel; + const model = this.currentModel(); if (blob) { const format = blob.type.split("/")[1];