From 54730a2b9acd04ea498d13bbb6947d8dcdee73a7 Mon Sep 17 00:00:00 2001
From: ztimson
Date: Thu, 12 Feb 2026 11:26:11 -0500
Subject: [PATCH] Speaker diarization

---
 package.json  |   2 +-
 src/ai.ts     |  20 +++---
 src/audio.ts  | 145 ++++++++++++++++++++++++++++++++++++--------------
 src/vision.ts |   2 +-
 4 files changed, 113 insertions(+), 56 deletions(-)

diff --git a/package.json b/package.json
index a0296fc..6182527 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@ztimson/ai-utils",
-	"version": "0.5.6",
+	"version": "0.6.0",
 	"description": "AI Utility library",
 	"author": "Zak Timson",
 	"license": "MIT",
diff --git a/src/ai.ts b/src/ai.ts
index c681dad..18b63a9 100644
--- a/src/ai.ts
+++ b/src/ai.ts
@@ -10,24 +10,16 @@ export type AbortablePromise<T> = Promise<T> & {
 export type AiOptions = {
 	/** Path to models */
 	path?: string;
-	/** Embedding model */
-	embedder?: string; // all-MiniLM-L6-v2, bge-small-en-v1.5, bge-large-en-v1.5
+	/** ASR model: whisper-tiny, whisper-base */
+	asr?: string;
+	/** Embedding model: all-MiniLM-L6-v2, bge-small-en-v1.5, bge-large-en-v1.5 */
+	embedder?: string;
 	/** Large language models, first is default */
 	llm?: Omit & {
 		models: {[model: string]: AnthropicConfig | OllamaConfig | OpenAiConfig};
 	}
-	/** Tesseract OCR configuration */
-	tesseract?: {
-		/** Model: eng, eng_best, eng_fast */
-		model?: string;
-	}
-	/** Whisper ASR configuration */
-	whisper?: {
-		/** Whisper binary location */
-		binary: string;
-		/** Model: `ggml-base.en.bin` */
-		model: string;
-	}
+	/** OCR model: eng, eng_best, eng_fast */
+	ocr?: string;
 }
 
 export class Ai {
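Note: the nested `tesseract`/`whisper` blocks collapse into flat `asr`/`ocr` strings. A minimal sketch of the migrated options, assuming the usual `new Ai(options)` construction (the constructor itself is not shown in this patch):

	const ai = new Ai({
		path: './models',            // cache directory for downloaded models
		asr: 'whisper-base',         // replaces whisper: {binary, model}
		ocr: 'eng',                  // replaces tesseract: {model}
		embedder: 'all-MiniLM-L6-v2'
	});
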
diff --git a/src/audio.ts b/src/audio.ts
index aec5670..e5a540b 100644
--- a/src/audio.ts
+++ b/src/audio.ts
@@ -1,50 +1,115 @@
-import {spawn} from 'node:child_process';
-import fs from 'node:fs/promises';
-import Path from 'node:path';
-import {AbortablePromise, Ai} from './ai.ts';
+import { spawn } from 'node:child_process';
+import { pipeline } from '@xenova/transformers';
+import { AbortablePromise, Ai } from './ai.ts';
 
 export class Audio {
-	private downloads: {[key: string]: Promise<string>} = {};
-	private whisperModel!: string;
+	private whisperPipeline: any;
 
-	constructor(private ai: Ai) {
-		if(ai.options.whisper?.binary) {
-			this.whisperModel = ai.options.whisper?.model.endsWith('.bin') ? ai.options.whisper?.model : ai.options.whisper?.model + '.bin';
-			this.downloadAsrModel();
-		}
-	}
+	constructor(private ai: Ai) {}
 
-	asr(path: string, model: string = this.whisperModel): AbortablePromise<string | null> {
-		if(!this.ai.options.whisper?.binary) throw new Error('Whisper not configured');
-		let abort: any = () => {};
-		const p = new Promise<string | null>(async (resolve, reject) => {
-			const m = await this.downloadAsrModel(model);
-			let output = '';
-			const proc = spawn(this.ai.options.whisper?.binary, ['-nt', '-np', '-m', m, '-f', path], {stdio: ['ignore', 'pipe', 'ignore']});
-			abort = () => proc.kill('SIGTERM');
-			proc.on('error', (err: Error) => reject(err));
-			proc.stdout.on('data', (data: Buffer) => output += data.toString());
-			proc.on('close', (code: number) => {
-				if(code === 0) resolve(output.trim() || null);
-				else reject(new Error(`Exit code ${code}`));
-			});
-		});
-		return Object.assign(p, {abort});
-	}
+	private combineSpeakerTranscript(chunks: any[], speakers: any[]): string {
+		// Number speakers in order of first appearance: SPEAKER_00 -> 1, etc.
+		const speakerMap = new Map<string, number>();
+		let speakerCount = 0;
+		speakers.forEach((seg: any) => {
+			if(!speakerMap.has(seg.speaker)) speakerMap.set(seg.speaker, ++speakerCount);
+		});
+
+		// Walk word chunks, attributing each to the diarization segment containing its start time
+		const lines: string[] = [];
+		let currentSpeaker = -1;
+		let currentText = '';
+		chunks.forEach((chunk: any) => {
+			const time = chunk.timestamp[0];
+			const speaker = speakers.find((s: any) => time >= s.start && time <= s.end);
+			const speakerNum = speaker ? speakerMap.get(speaker.speaker) : 1;
+			if(speakerNum !== currentSpeaker) {
+				if(currentText) lines.push(`[speaker ${currentSpeaker}]: ${currentText.trim()}`);
+				currentSpeaker = speakerNum;
+				currentText = chunk.text;
+			} else {
+				currentText += chunk.text;
+			}
+		});
+		if(currentText) lines.push(`[speaker ${currentSpeaker}]: ${currentText.trim()}`);
+		return lines.join('\n');
+	}
 
-	async downloadAsrModel(model: string = this.whisperModel): Promise<string> {
-		if(!this.ai.options.whisper?.binary) throw new Error('Whisper not configured');
-		if(!model.endsWith('.bin')) model += '.bin';
-		const p = Path.join(this.ai.options.path, model);
-		if(await fs.stat(p).then(() => true).catch(() => false)) return p;
-		if(!!this.downloads[model]) return this.downloads[model];
-		this.downloads[model] = fetch(`https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${model}`)
-			.then(resp => resp.arrayBuffer())
-			.then(arr => Buffer.from(arr)).then(async buffer => {
-				await fs.writeFile(p, buffer);
-				delete this.downloads[model];
-				return p;
-			});
-		return this.downloads[model];
+	private async isPyannoteInstalled(): Promise<boolean> {
+		return new Promise(resolve => {
+			const proc = spawn('python3', ['-c', 'import pyannote.audio']);
+			proc.on('close', (code: number) => resolve(code === 0));
+			proc.on('error', () => resolve(false));
+		});
+	}
+
+	private async runDiarization(audioPath: string): Promise<any[]> {
+		if(!await this.isPyannoteInstalled()) throw new Error('Pyannote is not installed: pip install pyannote.audio');
+		const script = `
+import os
+import sys
+import json
+from pyannote.audio import Pipeline
+
+os.environ['TORCH_HOME'] = "${this.ai.options.path}"
+pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+diarization = pipeline(sys.argv[1])
+
+segments = []
+for turn, _, speaker in diarization.itertracks(yield_label=True):
+    segments.append({
+        "start": turn.start,
+        "end": turn.end,
+        "speaker": speaker
+    })
+
+print(json.dumps(segments))
+`;
+
+		return new Promise<any[]>((resolve, reject) => {
+			let output = '';
+			const proc = spawn('python3', ['-c', script, audioPath]);
+			proc.stdout.on('data', (data: Buffer) => output += data.toString());
+			proc.stderr.on('data', (data: Buffer) => console.error(data.toString()));
+			proc.on('close', (code: number) => {
+				if(code === 0) {
+					try {
+						resolve(JSON.parse(output));
+					} catch(err) {
+						reject(new Error('Failed to parse diarization output'));
+					}
+				} else {
+					reject(new Error(`Python process exited with code ${code}`));
+				}
+			});
+			proc.on('error', reject);
+		});
+	}
+
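+	/**
+	 * Transcribe audio, optionally labeling speakers.
+	 * runDiarization() resolves pyannote segments shaped like:
+	 *   [{"start": 0.52, "end": 4.1, "speaker": "SPEAKER_00"}, ...]
+	 * which combineSpeakerTranscript() merges with whisper word timestamps into:
+	 *   [speaker 1]: Hello and welcome back.
+	 *   [speaker 2]: Thanks for having me.
+	 * (Example values are illustrative, not captured output.)
+	 */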
+	asr(path: string, options: { model?: string; speaker?: boolean } = {}): AbortablePromise<string | null> {
+		const { model = this.ai.options.asr || 'whisper-base', speaker = false } = options;
+		// Cooperative cancellation: checked between stages, cannot interrupt an in-flight stage
+		let aborted = false;
+		const abort = () => { aborted = true; };
+
+		const p = new Promise<string | null>(async (resolve, reject) => {
+			try {
+				if(aborted) return resolve(null);
+				if(!this.whisperPipeline) this.whisperPipeline = await pipeline('automatic-speech-recognition', `Xenova/${model}`, { cache_dir: this.ai.options.path, quantized: true });
+
+				// Transcript
+				if(aborted) return resolve(null);
+				const transcriptResult = await this.whisperPipeline(path, {return_timestamps: speaker ? 'word' : false, chunk_length_s: 30});
+				if(!speaker) return resolve(transcriptResult.text?.trim() || null);
+
+				// Speaker Diarization
+				if(aborted) return resolve(null);
+				const speakers = await this.runDiarization(path);
+				if(aborted) return resolve(null);
+				const combined = this.combineSpeakerTranscript(transcriptResult.chunks || [], speakers);
+				resolve(combined);
+			} catch(err) {
+				reject(err);
+			}
+		});
+
+		return Object.assign(p, { abort });
+	}
 }
diff --git a/src/vision.ts b/src/vision.ts
index 055c586..484ce43 100644
--- a/src/vision.ts
+++ b/src/vision.ts
@@ -13,7 +13,7 @@ export class Vision {
 	ocr(path: string): AbortablePromise<string | null> {
 		let worker: any;
 		const p = new Promise(async res => {
-			worker = await createWorker(this.ai.options.tesseract?.model || 'eng', 2, {cachePath: this.ai.options.path});
+			worker = await createWorker(this.ai.options.ocr || 'eng', 2, {cachePath: this.ai.options.path});
 			const {data} = await worker.recognize(path);
 			await worker.terminate();
 			res(data.text.trim() || null);
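
Usage sketch for the new ASR path, assuming this Audio instance is exposed as `ai.audio` and the `new Ai(options)` wiring from the note above; `speaker: true` additionally requires python3 with pyannote.audio installed:

	const transcript = await ai.audio.asr('./call.wav');                // plain text via whisper
	const labeled = await ai.audio.asr('./call.wav', {speaker: true});  // "[speaker 1]: ..." lines

	const job = ai.audio.asr('./long.wav', {speaker: true});
	job.abort();  // cooperative: resolves null at the next stage boundary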