Fix ASR
package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@ztimson/ai-utils",
-	"version": "0.6.4",
+	"version": "0.6.5",
 	"description": "AI Utility library",
 	"author": "Zak Timson",
 	"license": "MIT",
src/audio.ts
@@ -2,12 +2,12 @@ import {spawn} from 'node:child_process';
 import {pipeline} from '@xenova/transformers';
 import * as fs from 'node:fs';
 import {AbortablePromise, Ai} from './ai.ts';
-import * as wavefile from 'wavefile';
+import wavefile from 'wavefile';
 
 export class Audio {
 	private whisperPipeline: any;
 
-	constructor(private ai: Ai) {}
+	constructor(private ai: Ai) { }
 
 	private combineSpeakerTranscript(chunks: any[], speakers: any[]): string {
 		const speakerMap = new Map();
@@ -96,14 +96,25 @@ print(json.dumps(segments))
 		if(aborted) return resolve(null);
 		if(!this.whisperPipeline) this.whisperPipeline = await pipeline('automatic-speech-recognition', `Xenova/${model}`, { cache_dir: this.ai.options.path, quantized: true });
 
-		// Transcript
+		// Prepare audio file (convert to mono channel wave)
 		if(aborted) return resolve(null);
-		const audioData = fs.readFileSync(path);
-		const wav = new wavefile.WaveFile(audioData);
+		const wav = new wavefile.WaveFile(fs.readFileSync(path));
 		wav.toBitDepth('32f');
 		wav.toSampleRate(16000);
-		const buffer = wav.getSamples();
-		const transcriptResult = await this.whisperPipeline(buffer, {return_timestamps: speaker ? 'word' : false, chunk_length_s: 30,});
+		const samples = wav.getSamples();
+		let buffer;
+		if(Array.isArray(samples)) { // stereo to mono - average the channels
+			const left = samples[0];
+			const right = samples[1];
+			buffer = new Float32Array(left.length);
+			for (let i = 0; i < left.length; i++) buffer[i] = (left[i] + right[i]) / 2;
+		} else {
+			buffer = samples;
+		}
+
+		// Transcribe
+		if(aborted) return resolve(null);
+		const transcriptResult = await this.whisperPipeline(buffer, {return_timestamps: speaker ? 'word' : false});
 		if(!speaker) return resolve(transcriptResult.text?.trim() || null);
 
 		// Speaker Diarization
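
The substance of the fix: Whisper-style ASR expects a single mono channel of float samples at 16 kHz, and wavefile's getSamples() returns one typed array per channel for multi-channel files, so the commit averages the stereo channels into one buffer before handing it to the pipeline. Below is a minimal standalone sketch of that same preprocessing, assuming the wavefile and @xenova/transformers packages this project already depends on; the helper name loadMonoAudio, the whisper-tiny.en model, and the file path are illustrative, not part of the commit.

import {pipeline} from '@xenova/transformers';
import wavefile from 'wavefile';
import * as fs from 'node:fs';

// Load a .wav file and normalize it to mono 32-bit float at 16 kHz.
function loadMonoAudio(path: string): Float32Array | Float64Array {
	const wav = new wavefile.WaveFile(fs.readFileSync(path));
	wav.toBitDepth('32f');   // Whisper expects float samples...
	wav.toSampleRate(16000); // ...at a 16 kHz sample rate
	const samples: any = wav.getSamples(); // one typed array per channel when multi-channel
	if(!Array.isArray(samples)) return samples; // already mono
	const [left, right] = samples;
	const mono = new Float32Array(left.length); // average the channels into one buffer
	for(let i = 0; i < left.length; i++) mono[i] = (left[i] + right[i]) / 2;
	return mono;
}

// Usage sketch (model name and path are examples):
const asr = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
const result: any = await asr(loadMonoAudio('./recording.wav'));
console.log(result.text);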