From 91066e070fc4673b10b91056d76f2b2014ee43ec Mon Sep 17 00:00:00 2001 From: ztimson Date: Sat, 21 Feb 2026 00:51:01 -0500 Subject: [PATCH] WIP ASR --- package.json | 2 +- src/audio.ts | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/package.json b/package.json index 565cfa4..e689b1b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@ztimson/ai-utils", - "version": "0.7.9", + "version": "0.7.10", "description": "AI Utility library", "author": "Zak Timson", "license": "MIT", diff --git a/src/audio.ts b/src/audio.ts index ec3b5bf..2647036 100644 --- a/src/audio.ts +++ b/src/audio.ts @@ -1,6 +1,6 @@ import {execSync, spawn} from 'node:child_process'; -import {mkdtempSync, rmSync} from 'node:fs'; -import fs from 'node:fs/promises'; +import {mkdtempSync} from 'node:fs'; +import fs, {rm} from 'node:fs/promises'; import {tmpdir} from 'node:os'; import Path, {join} from 'node:path'; import {AbortablePromise, Ai} from './ai.ts'; @@ -40,6 +40,7 @@ print(json.dumps(segments)) this.downloadAsrModel(opts.model).then(m => { let output = ''; const args = [opts.diarization ? '-owts' : '-nt', '-m', m, '-f', file]; + console.log(this.ai.options.whisper + ' ' + args.join(' ')) proc = spawn(this.ai.options.whisper, args, {stdio: ['ignore', 'pipe', 'ignore']}); proc.on('error', (err: Error) => reject(err)); proc.stdout.on('data', (data: Buffer) => output += data.toString()); @@ -76,13 +77,10 @@ print(json.dumps(segments)) if(aborted) return; if(!p && !p3) throw new Error('Pyannote is not installed: pip install pyannote.audio'); const binary = p3 ? 'python3' : 'python'; - let tmp: string | null = null; return new Promise((resolve, reject) => { - tmp = join(mkdtempSync(join(tmpdir(), 'audio-')), 'converted.wav'); - execSync(`ffmpeg -i "${file}" -ar 16000 -ac 1 -f wav "${tmp}"`, { stdio: 'ignore' }); if(aborted) return; let output = ''; - const proc = spawn(binary, ['-c', this.pyannote, tmp]); + const proc = spawn(binary, ['-c', this.pyannote, file]); proc.stdout.on('data', (data: Buffer) => output += data.toString()); proc.stderr.on('data', (data: Buffer) => console.error(data.toString())); proc.on('close', (code: number) => { @@ -95,7 +93,7 @@ print(json.dumps(segments)) }); proc.on('error', reject); abort = () => proc.kill('SIGTERM'); - }).finally(() => { if(tmp) rmSync(Path.dirname(tmp), { recursive: true, force: true }); }); + }); })); return Object.assign(p, {abort}); } @@ -129,17 +127,22 @@ print(json.dumps(segments)) asr(file: string, options: { model?: string; diarization?: boolean | 'id' } = {}): AbortablePromise { if(!this.ai.options.whisper) throw new Error('Whisper not configured'); - const transcript = this.runAsr(file, {model: options.model, diarization: !!options.diarization}); - const diarization: any = options.diarization ? this.runDiarization(file) : Promise.resolve(null); - const abort = () => { + const tmp = join(mkdtempSync(join(tmpdir(), 'audio-')), 'converted.wav'); + execSync(`ffmpeg -i "${file}" -ar 16000 -ac 1 -f wav "${tmp}"`, { stdio: 'ignore' }); + const clean = () => rm(Path.dirname(tmp), { recursive: true, force: true }).catch(() => {}); + const transcript = this.runAsr(tmp, {model: options.model, diarization: !!options.diarization}); + const diarization: any = options.diarization ? this.runDiarization(tmp) : Promise.resolve(null); + let aborted = false, abort = () => { + aborted = true; transcript.abort(); diarization?.abort?.(); + clean(); }; const response = Promise.all([transcript, diarization]).then(async ([t, d]) => { - if(!options.diarization) return t; + if(aborted || !options.diarization) return t; t = this.combineSpeakerTranscript(t, d); - if(options.diarization === 'id') { + if(!aborted && options.diarization === 'id') { if(!this.ai.language.defaultModel) throw new Error('Configure an LLM for advanced ASR speaker detection'); let chunks = this.ai.language.chunk(t, 500, 0); if(chunks.length > 4) chunks = [...chunks.slice(0, 3), chunks.at(-1)]; @@ -150,7 +153,7 @@ print(json.dumps(segments)) Object.entries(names).forEach(([speaker, name]) => t = t.replaceAll(`[Speaker ${speaker}]`, `[${name}]`)); } return t; - }); + }).finally(() => clean()); return Object.assign(response, {abort}); }