Speaker diarization
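Moves ASR in-process and adds optional speaker diarization. The whisper.cpp binary integration (spawned process plus manual GGML model downloads) is replaced with the automatic-speech-recognition pipeline from @xenova/transformers; when `speaker: true` is passed to `asr()`, word-level timestamps are cross-referenced against speaker segments produced by a pyannote.audio Python subprocess and merged into a `[speaker N]: ...` transcript. The `whisper` and `tesseract` option blocks collapse into flat `asr` and `ocr` model names, and the package version is bumped from 0.5.6 to 0.6.0.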
package.json
@@ -1,6 +1,6 @@
 {
   "name": "@ztimson/ai-utils",
-  "version": "0.5.6",
+  "version": "0.6.0",
   "description": "AI Utility library",
   "author": "Zak Timson",
   "license": "MIT",
src/ai.ts (20 lines changed)
@@ -10,24 +10,16 @@ export type AbortablePromise<T> = Promise<T> & {
 export type AiOptions = {
 	/** Path to models */
 	path?: string;
-	/** Embedding model */
-	embedder?: string; // all-MiniLM-L6-v2, bge-small-en-v1.5, bge-large-en-v1.5
+	/** ASR model: whisper-tiny, whisper-base */
+	asr?: string;
+	/** Embedding model: all-MiniLM-L6-v2, bge-small-en-v1.5, bge-large-en-v1.5 */
+	embedder?: string;
 	/** Large language models, first is default */
 	llm?: Omit<LLMRequest, 'model'> & {
 		models: {[model: string]: AnthropicConfig | OllamaConfig | OpenAiConfig};
 	}
-	/** Tesseract OCR configuration */
-	tesseract?: {
-		/** Model: eng, eng_best, eng_fast */
-		model?: string;
-	}
-	/** Whisper ASR configuration */
-	whisper?: {
-		/** Whisper binary location */
-		binary: string;
-		/** Model: `ggml-base.en.bin` */
-		model: string;
-	}
+	/** OCR model: eng, eng_best, eng_fast */
+	ocr?: string;
 }

 export class Ai {
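For reference, a minimal sketch of a configuration under the new flat option shape (the `new Ai(options)` constructor signature is assumed here; it is not shown in this diff):

import { Ai } from '@ztimson/ai-utils';

// Hypothetical config using the reshaped options.
const ai = new Ai({
	path: './models',             // cache directory for downloaded models
	asr: 'whisper-base',          // replaces whisper: {binary, model}
	embedder: 'all-MiniLM-L6-v2',
	ocr: 'eng',                   // replaces tesseract: {model}
});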
src/audio.ts (145 lines changed)
@@ -1,50 +1,115 @@
-import {spawn} from 'node:child_process';
-import fs from 'node:fs/promises';
-import Path from 'node:path';
-import {AbortablePromise, Ai} from './ai.ts';
+import { spawn } from 'node:child_process';
+import { pipeline } from '@xenova/transformers';
+import { AbortablePromise, Ai } from './ai.ts';

 export class Audio {
-	private downloads: {[key: string]: Promise<string>} = {};
-	private whisperModel!: string;
+	private whisperPipeline: any;

-	constructor(private ai: Ai) {
-		if(ai.options.whisper?.binary) {
-			this.whisperModel = ai.options.whisper?.model.endsWith('.bin') ? ai.options.whisper?.model : ai.options.whisper?.model + '.bin';
-			this.downloadAsrModel();
-		}
-	}
+	constructor(private ai: Ai) {}

-	asr(path: string, model: string = this.whisperModel): AbortablePromise<string | null> {
-		if(!this.ai.options.whisper?.binary) throw new Error('Whisper not configured');
-		let abort: any = () => {};
-		const p = new Promise<string | null>(async (resolve, reject) => {
-			const m = await this.downloadAsrModel(model);
-			let output = '';
-			const proc = spawn(<string>this.ai.options.whisper?.binary, ['-nt', '-np', '-m', m, '-f', path], {stdio: ['ignore', 'pipe', 'ignore']});
-			abort = () => proc.kill('SIGTERM');
-			proc.on('error', (err: Error) => reject(err));
-			proc.stdout.on('data', (data: Buffer) => output += data.toString());
-			proc.on('close', (code: number) => {
-				if(code === 0) resolve(output.trim() || null);
-				else reject(new Error(`Exit code ${code}`));
-			});
+	private combineSpeakerTranscript(chunks: any[], speakers: any[]): string {
+		const speakerMap = new Map();
+		let speakerCount = 0;
+		speakers.forEach((seg: any) => {
+			if(!speakerMap.has(seg.speaker)) speakerMap.set(seg.speaker, ++speakerCount);
 		});
-		return Object.assign(p, {abort});
+
+		const lines: string[] = [];
+		let currentSpeaker = -1;
+		let currentText = '';
+		chunks.forEach((chunk: any) => {
+			const time = chunk.timestamp[0];
+			const speaker = speakers.find((s: any) => time >= s.start && time <= s.end);
+			const speakerNum = speaker ? speakerMap.get(speaker.speaker) : 1;
+			if (speakerNum !== currentSpeaker) {
+				if(currentText) lines.push(`[speaker ${currentSpeaker}]: ${currentText.trim()}`);
+				currentSpeaker = speakerNum;
+				currentText = chunk.text;
+			} else {
+				currentText += chunk.text;
+			}
+		});
+		if(currentText) lines.push(`[speaker ${currentSpeaker}]: ${currentText.trim()}`);
+		return lines.join('\n');
 	}

-	async downloadAsrModel(model: string = this.whisperModel): Promise<string> {
-		if(!this.ai.options.whisper?.binary) throw new Error('Whisper not configured');
-		if(!model.endsWith('.bin')) model += '.bin';
-		const p = Path.join(<string>this.ai.options.path, model);
-		if(await fs.stat(p).then(() => true).catch(() => false)) return p;
-		if(!!this.downloads[model]) return this.downloads[model];
-		this.downloads[model] = fetch(`https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${model}`)
-			.then(resp => resp.arrayBuffer())
-			.then(arr => Buffer.from(arr)).then(async buffer => {
-				await fs.writeFile(p, buffer);
-				delete this.downloads[model];
-				return p;
+	private async isPyannoteInstalled(): Promise<boolean> {
+		return new Promise((resolve) => {
+			const proc = spawn('python3', ['-c', 'import pyannote.audio']);
+			proc.on('close', (code: number) => resolve(code === 0));
+			proc.on('error', () => resolve(false));
+		});
+	}
+
+	private async runDiarization(audioPath: string): Promise<any[]> {
+		if(!await this.isPyannoteInstalled()) throw new Error('Pyannote is not installed: pip install pyannote.audio');
+		const script = `
+import sys
+import json
+import os
+from pyannote.audio import Pipeline
+
+os.environ['TORCH_HOME'] = "${this.ai.options.path}"
+pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+diarization = pipeline(sys.argv[1])
+
+segments = []
+for turn, _, speaker in diarization.itertracks(yield_label=True):
+    segments.append({
+        "start": turn.start,
+        "end": turn.end,
+        "speaker": speaker
+    })
+
+print(json.dumps(segments))
+`;
+
+		return new Promise((resolve, reject) => {
+			let output = '';
+			const proc = spawn('python3', ['-c', script, audioPath]);
+			proc.stdout.on('data', (data: Buffer) => output += data.toString());
+			proc.stderr.on('data', (data: Buffer) => console.error(data.toString()));
+			proc.on('close', (code: number) => {
+				if(code === 0) {
+					try {
+						resolve(JSON.parse(output));
+					} catch (err) {
+						reject(new Error('Failed to parse diarization output'));
+					}
+				} else {
+					reject(new Error(`Python process exited with code ${code}`));
+				}
 			});
-		return this.downloads[model];
+			proc.on('error', reject);
+		});
+	}
+
+	asr(path: string, options: { model?: string; speaker?: boolean } = {}): AbortablePromise<string | null> {
+		const { model = this.ai.options.asr || 'whisper-base', speaker = false } = options;
+		let aborted = false;
+		const abort = () => { aborted = true; };
+
+		const p = new Promise<string | null>(async (resolve, reject) => {
+			try {
+				if(aborted) return resolve(null);
+				if(!this.whisperPipeline) this.whisperPipeline = await pipeline('automatic-speech-recognition', `Xenova/${model}`, { cache_dir: this.ai.options.path, quantized: true });
+
+				// Transcript
+				if(aborted) return resolve(null);
+				const transcriptResult = await this.whisperPipeline(path, {return_timestamps: speaker ? 'word' : false, chunk_length_s: 30});
+				if(!speaker) return resolve(transcriptResult.text?.trim() || null);
+
+				// Speaker Diarization
+				if(aborted) return resolve(null);
+				const speakers = await this.runDiarization(path);
+				if(aborted) return resolve(null);
+				const combined = this.combineSpeakerTranscript(transcriptResult.chunks || [], speakers);
+				resolve(combined);
+			} catch (err) {
+				reject(err);
+			}
+		});
+
+		return Object.assign(p, { abort });
 	}
 }
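A sketch of calling the new `asr()` signature (assuming the Audio instance is exposed as `ai.audio`; that wiring is outside this diff):

// Plain transcription, same behaviour as before:
const text = await ai.audio.asr('./meeting.wav');

// With diarization; requires python3 with pyannote.audio installed
// (pip install pyannote.audio). Output is one line per speaker turn:
//   [speaker 1]: Good morning everyone.
//   [speaker 2]: Thanks for joining.
const req = ai.audio.asr('./meeting.wav', {speaker: true});
// abort() is now cooperative: it flags the in-flight promise to resolve
// null at the next stage boundary instead of killing a child process.
const transcript = await req;

Note that pyannote/speaker-diarization-3.1 is a gated checkpoint on Hugging Face, so `Pipeline.from_pretrained` may additionally require an access token, which the embedded script does not currently pass.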
@@ -13,7 +13,7 @@ export class Vision {
 	ocr(path: string): AbortablePromise<string | null> {
 		let worker: any;
 		const p = new Promise<string | null>(async res => {
-			worker = await createWorker(this.ai.options.tesseract?.model || 'eng', 2, {cachePath: this.ai.options.path});
+			worker = await createWorker(this.ai.options.ocr || 'eng', 2, {cachePath: this.ai.options.path});
 			const {data} = await worker.recognize(path);
 			await worker.terminate();
 			res(data.text.trim() || null);
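OCR callers are unaffected beyond the config rename: a call such as `ai.vision.ocr('./scan.png')` (hypothetical wiring, as above) now reads `options.ocr` instead of `options.tesseract.model`, defaulting to 'eng' either way.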