From 54730a2b9acd04ea498d13bbb6947d8dcdee73a7 Mon Sep 17 00:00:00 2001
From: ztimson
Date: Thu, 12 Feb 2026 11:26:11 -0500
Subject: [PATCH] Speaker diarization

---
 package.json  |   2 +-
 src/ai.ts     |  20 +++---
 src/audio.ts  | 145 ++++++++++++++++++++++++++++++++++++--------------
 src/vision.ts |   2 +-
 4 files changed, 113 insertions(+), 56 deletions(-)

diff --git a/package.json b/package.json
index a0296fc..6182527 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@ztimson/ai-utils",
-	"version": "0.5.6",
+	"version": "0.6.0",
 	"description": "AI Utility library",
 	"author": "Zak Timson",
 	"license": "MIT",
diff --git a/src/ai.ts b/src/ai.ts
index c681dad..18b63a9 100644
--- a/src/ai.ts
+++ b/src/ai.ts
@@ -10,24 +10,16 @@ export type AbortablePromise<T> = Promise<T> & {
 export type AiOptions = {
 	/** Path to models */
 	path?: string;
-	/** Embedding model */
-	embedder?: string; // all-MiniLM-L6-v2, bge-small-en-v1.5, bge-large-en-v1.5
+	/** ASR model: whisper-tiny, whisper-base */
+	asr?: string;
+	/** Embedding model: all-MiniLM-L6-v2, bge-small-en-v1.5, bge-large-en-v1.5 */
+	embedder?: string;
 	/** Large language models, first is default */
 	llm?: Omit & {
 		models: {[model: string]: AnthropicConfig | OllamaConfig | OpenAiConfig};
 	}
-	/** Tesseract OCR configuration */
-	tesseract?: {
-		/** Model: eng, eng_best, eng_fast */
-		model?: string;
-	}
-	/** Whisper ASR configuration */
-	whisper?: {
-		/** Whisper binary location */
-		binary: string;
-		/** Model: `ggml-base.en.bin` */
-		model: string;
-	}
+	/** OCR model: eng, eng_best, eng_fast */
+	ocr?: string;
 }
 
 export class Ai {
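Note: the nested `tesseract`/`whisper` blocks collapse into flat `asr`/`ocr` strings. A minimal sketch of the migrated options, assuming the usual `new Ai(options)` construction (the constructor itself is not shown in this patch):

	const ai = new Ai({
		path: './models',            // cache directory for downloaded models
		asr: 'whisper-base',         // replaces whisper: {binary, model}
		ocr: 'eng',                  // replaces tesseract: {model}
		embedder: 'all-MiniLM-L6-v2'
	});
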
diff --git a/src/audio.ts b/src/audio.ts
index aec5670..e5a540b 100644
--- a/src/audio.ts
+++ b/src/audio.ts
@@ -1,50 +1,115 @@
-import {spawn} from 'node:child_process';
-import fs from 'node:fs/promises';
-import Path from 'node:path';
-import {AbortablePromise, Ai} from './ai.ts';
+import { spawn } from 'node:child_process';
+import { pipeline } from '@xenova/transformers';
+import { AbortablePromise, Ai } from './ai.ts';
 
 export class Audio {
-	private downloads: {[key: string]: Promise<string>} = {};
-	private whisperModel!: string;
+	private whisperPipeline: any;
 
-	constructor(private ai: Ai) {
-		if(ai.options.whisper?.binary) {
-			this.whisperModel = ai.options.whisper?.model.endsWith('.bin') ? ai.options.whisper?.model : ai.options.whisper?.model + '.bin';
-			this.downloadAsrModel();
-		}
-	}
+	constructor(private ai: Ai) {}
 
-	asr(path: string, model: string = this.whisperModel): AbortablePromise<string | null> {
-		if(!this.ai.options.whisper?.binary) throw new Error('Whisper not configured');
-		let abort: any = () => {};
-		const p = new Promise<string | null>(async (resolve, reject) => {
-			const m = await this.downloadAsrModel(model);
-			let output = '';
-			const proc = spawn(this.ai.options.whisper?.binary, ['-nt', '-np', '-m', m, '-f', path], {stdio: ['ignore', 'pipe', 'ignore']});
-			abort = () => proc.kill('SIGTERM');
-			proc.on('error', (err: Error) => reject(err));
-			proc.stdout.on('data', (data: Buffer) => output += data.toString());
-			proc.on('close', (code: number) => {
-				if(code === 0) resolve(output.trim() || null);
-				else reject(new Error(`Exit code ${code}`));
-			});
-		});
-		return Object.assign(p, {abort});
-	}
+	private combineSpeakerTranscript(chunks: any[], speakers: any[]): string {
+		// Number speakers in order of first appearance: SPEAKER_00 -> 1, etc.
+		const speakerMap = new Map<string, number>();
+		let speakerCount = 0;
+		speakers.forEach((seg: any) => {
+			if(!speakerMap.has(seg.speaker)) speakerMap.set(seg.speaker, ++speakerCount);
+		});
+
+		// Walk word chunks, attributing each to the diarization segment containing its start time
+		const lines: string[] = [];
+		let currentSpeaker = -1;
+		let currentText = '';
+		chunks.forEach((chunk: any) => {
+			const time = chunk.timestamp[0];
+			const speaker = speakers.find((s: any) => time >= s.start && time <= s.end);
+			const speakerNum = speaker ? speakerMap.get(speaker.speaker) : 1;
+			if(speakerNum !== currentSpeaker) {
+				if(currentText) lines.push(`[speaker ${currentSpeaker}]: ${currentText.trim()}`);
+				currentSpeaker = speakerNum;
+				currentText = chunk.text;
+			} else {
+				currentText += chunk.text;
+			}
+		});
+		if(currentText) lines.push(`[speaker ${currentSpeaker}]: ${currentText.trim()}`);
+		return lines.join('\n');
+	}
 
-	async downloadAsrModel(model: string = this.whisperModel): Promise<string> {
-		if(!this.ai.options.whisper?.binary) throw new Error('Whisper not configured');
-		if(!model.endsWith('.bin')) model += '.bin';
-		const p = Path.join(this.ai.options.path, model);
-		if(await fs.stat(p).then(() => true).catch(() => false)) return p;
-		if(!!this.downloads[model]) return this.downloads[model];
-		this.downloads[model] = fetch(`https://huggingface.co/ggerganov/whisper.cpp/resolve/main/${model}`)
-			.then(resp => resp.arrayBuffer())
-			.then(arr => Buffer.from(arr)).then(async buffer => {
-				await fs.writeFile(p, buffer);
-				delete this.downloads[model];
-				return p;
-			});
-		return this.downloads[model];
+	private async isPyannoteInstalled(): Promise<boolean> {
+		return new Promise(resolve => {
+			const proc = spawn('python3', ['-c', 'import pyannote.audio']);
+			proc.on('close', (code: number) => resolve(code === 0));
+			proc.on('error', () => resolve(false));
+		});
+	}
+
+	private async runDiarization(audioPath: string): Promise<any[]> {
+		if(!await this.isPyannoteInstalled()) throw new Error('Pyannote is not installed: pip install pyannote.audio');
+		const script = `
+import os
+import sys
+import json
+from pyannote.audio import Pipeline
+
+os.environ['TORCH_HOME'] = "${this.ai.options.path}"
+pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+diarization = pipeline(sys.argv[1])
+
+segments = []
+for turn, _, speaker in diarization.itertracks(yield_label=True):
+    segments.append({
+        "start": turn.start,
+        "end": turn.end,
+        "speaker": speaker
+    })
+
+print(json.dumps(segments))
+`;
+
+		return new Promise<any[]>((resolve, reject) => {
+			let output = '';
+			const proc = spawn('python3', ['-c', script, audioPath]);
+			proc.stdout.on('data', (data: Buffer) => output += data.toString());
+			proc.stderr.on('data', (data: Buffer) => console.error(data.toString()));
+			proc.on('close', (code: number) => {
+				if(code === 0) {
+					try {
+						resolve(JSON.parse(output));
+					} catch(err) {
+						reject(new Error('Failed to parse diarization output'));
+					}
+				} else {
+					reject(new Error(`Python process exited with code ${code}`));
+				}
+			});
+			proc.on('error', reject);
+		});
+	}
+
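+	/**
+	 * Transcribe audio, optionally labeling speakers.
+	 * runDiarization() resolves pyannote segments shaped like:
+	 *   [{"start": 0.52, "end": 4.1, "speaker": "SPEAKER_00"}, ...]
+	 * which combineSpeakerTranscript() merges with whisper word timestamps into:
+	 *   [speaker 1]: Hello and welcome back.
+	 *   [speaker 2]: Thanks for having me.
+	 * (Example values are illustrative, not captured output.)
+	 */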
+	asr(path: string, options: { model?: string; speaker?: boolean } = {}): AbortablePromise<string | null> {
+		const { model = this.ai.options.asr || 'whisper-base', speaker = false } = options;
+		// Cooperative cancellation: checked between stages, cannot interrupt an in-flight stage
+		let aborted = false;
+		const abort = () => { aborted = true; };
+
+		const p = new Promise<string | null>(async (resolve, reject) => {
+			try {
+				if(aborted) return resolve(null);
+				if(!this.whisperPipeline) this.whisperPipeline = await pipeline('automatic-speech-recognition', `Xenova/${model}`, { cache_dir: this.ai.options.path, quantized: true });
+
+				// Transcript
+				if(aborted) return resolve(null);
+				const transcriptResult = await this.whisperPipeline(path, {return_timestamps: speaker ? 'word' : false, chunk_length_s: 30});
+				if(!speaker) return resolve(transcriptResult.text?.trim() || null);
+
+				// Speaker Diarization
+				if(aborted) return resolve(null);
+				const speakers = await this.runDiarization(path);
+				if(aborted) return resolve(null);
+				const combined = this.combineSpeakerTranscript(transcriptResult.chunks || [], speakers);
+				resolve(combined);
+			} catch(err) {
+				reject(err);
+			}
+		});
+
+		return Object.assign(p, { abort });
+	}
 }
diff --git a/src/vision.ts b/src/vision.ts
index 055c586..484ce43 100644
--- a/src/vision.ts
+++ b/src/vision.ts
@@ -13,7 +13,7 @@ export class Vision {
 	ocr(path: string): AbortablePromise<string | null> {
 		let worker: any;
 		const p = new Promise(async res => {
-			worker = await createWorker(this.ai.options.tesseract?.model || 'eng', 2, {cachePath: this.ai.options.path});
+			worker = await createWorker(this.ai.options.ocr || 'eng', 2, {cachePath: this.ai.options.path});
 			const {data} = await worker.recognize(path);
 			await worker.terminate();
 			res(data.text.trim() || null);
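
Usage sketch for the new ASR path, assuming this Audio instance is exposed as `ai.audio` and the `new Ai(options)` wiring from the note above; `speaker: true` additionally requires python3 with pyannote.audio installed:

	const transcript = await ai.audio.asr('./call.wav');                // plain text via whisper
	const labeled = await ai.audio.asr('./call.wav', {speaker: true});  // "[speaker 1]: ..." lines

	const job = ai.audio.asr('./long.wav', {speaker: true});
	job.abort();  // cooperative: resolves null at the next stage boundary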