diff --git a/package.json b/package.json
index 178924e..72d8c15 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@ztimson/ai-utils",
-	"version": "0.6.4",
+	"version": "0.6.5",
 	"description": "AI Utility library",
 	"author": "Zak Timson",
 	"license": "MIT",
diff --git a/src/audio.ts b/src/audio.ts
index 67eeb71..50104e0 100644
--- a/src/audio.ts
+++ b/src/audio.ts
@@ -2,12 +2,12 @@ import {spawn} from 'node:child_process';
 import {pipeline} from '@xenova/transformers';
 import * as fs from 'node:fs';
 import {AbortablePromise, Ai} from './ai.ts';
-import * as wavefile from 'wavefile';
+import wavefile from 'wavefile';
 
 export class Audio {
 	private whisperPipeline: any;
 
-	constructor(private ai: Ai) {}
+	constructor(private ai: Ai) { }
 
 	private combineSpeakerTranscript(chunks: any[], speakers: any[]): string {
 		const speakerMap = new Map();
@@ -96,14 +96,25 @@ print(json.dumps(segments))
 			if(aborted) return resolve(null);
 			if(!this.whisperPipeline) this.whisperPipeline = await pipeline('automatic-speech-recognition', `Xenova/${model}`, { cache_dir: this.ai.options.path, quantized: true });
 
-			// Transcript
+			// Prepare audio file (convert to mono channel wave)
 			if(aborted) return resolve(null);
-			const audioData = fs.readFileSync(path);
-			const wav = new wavefile.WaveFile(audioData);
+			const wav = new wavefile.WaveFile(fs.readFileSync(path));
 			wav.toBitDepth('32f');
 			wav.toSampleRate(16000);
-			const buffer = wav.getSamples();
-			const transcriptResult = await this.whisperPipeline(buffer, {return_timestamps: speaker ? 'word' : false, chunk_length_s: 30,});
+			const samples = wav.getSamples();
+			let buffer;
+			if(Array.isArray(samples)) { // stereo to mono - average the channels
+				const left = samples[0];
+				const right = samples[1];
+				buffer = new Float32Array(left.length);
+				for (let i = 0; i < left.length; i++) buffer[i] = (left[i] + right[i]) / 2;
+			} else {
+				buffer = samples;
+			}
+
+			// Transcribe
+			if(aborted) return resolve(null);
+			const transcriptResult = await this.whisperPipeline(buffer, {return_timestamps: speaker ? 'word' : false});
 			if(!speaker) return resolve(transcriptResult.text?.trim() || null);
 
 			// Speaker Diarization
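
Note on the audio.ts hunk: the patch downmixes stereo input before transcription because the Whisper pipeline expects a single mono sample buffer; wavefile's getSamples() returns one typed array per channel for multi-channel files. A minimal standalone sketch of the same downmix, assuming the per-channel arrays are Float64Array (wavefile's default) and, like the patch, only the first two channels are considered:

	// Hypothetical helper illustrating the stereo-to-mono averaging above;
	// downmixToMono is not part of this change, just a sketch of the idea.
	function downmixToMono(samples: Float64Array | Float64Array[]): Float32Array {
		// Mono input: copy straight into a Float32Array for the pipeline.
		if(!Array.isArray(samples)) return new Float32Array(samples);
		const [left, right] = samples;
		const mono = new Float32Array(left.length);
		// Average the two channels sample-by-sample.
		for(let i = 0; i < left.length; i++) mono[i] = (left[i] + right[i]) / 2;
		return mono;
	}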