From abd290246ca714a4ec8d706afeeb3bad543c560d Mon Sep 17 00:00:00 2001 From: ztimson Date: Sun, 22 Feb 2026 09:29:31 -0500 Subject: [PATCH] LLM ASR --- package.json | 2 +- src/audio.ts | 209 ++++++++++++++++++++++++++++++-------------------- src/llm.ts | 24 +++--- tsconfig.json | 3 +- 4 files changed, 143 insertions(+), 95 deletions(-) diff --git a/package.json b/package.json index 58fd0be..bab4656 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@ztimson/ai-utils", - "version": "0.8.0", + "version": "0.8.1", "description": "AI Utility library", "author": "Zak Timson", "license": "MIT", diff --git a/src/audio.ts b/src/audio.ts index 01a7d0d..3301fca 100644 --- a/src/audio.ts +++ b/src/audio.ts @@ -35,6 +35,121 @@ print(json.dumps(segments)) `; } + private async addPunctuation(timestampData: any, llm?: boolean, cadence = 150): Promise { + const countSyllables = (word: string): number => { + word = word.toLowerCase().replace(/[^a-z]/g, ''); + if(word.length <= 3) return 1; + const matches = word.match(/[aeiouy]+/g); + let count = matches ? matches.length : 1; + if(word.endsWith('e')) count--; + return Math.max(1, count); + }; + + let result = ''; + timestampData.transcription.filter((word, i) => { + let skip = false; + const prevWord = timestampData.transcription[i - 1]; + const nextWord = timestampData.transcription[i + 1]; + if(!word.text && nextWord) { + nextWord.offsets.from = word.offsets.from; + nextWord.timestamps.from = word.offsets.from; + } else if(word.text && word.text[0] != ' ' && prevWord) { + prevWord.offsets.to = word.offsets.to; + prevWord.timestamps.to = word.timestamps.to; + prevWord.text += word.text; + skip = true; + } + return !!word.text && !skip; + }).forEach((word: any) => { + const capital = /^[A-Z]/.test(word.text.trim()); + const length = word.offsets.to - word.offsets.from; + const syllables = countSyllables(word.text.trim()); + const expected = syllables * cadence; + if(capital && length > expected * 2 && word.text[0] == ' ') result += '.'; + result += word.text; + }); + if(!llm) return result.trim(); + return this.ai.language.ask(result, { + system: 'Remove any misplaced punctuation from the following ASR transcript using the replace tool. Avoid modifying words unless there is an obvious typo', + temperature: 0.1, + tools: [{ + name: 'replace', + description: 'Use find and replace to fix errors', + args: { + find: {type: 'string', description: 'Text to find', required: true}, + replace: {type: 'string', description: 'Text to replace', required: true} + }, + fn: (args) => result = result.replace(args.find, args.replace) + }] + }).then(() => result); + } + + private async diarizeTranscript(timestampData: any, speakers: any[], llm: boolean): Promise { + const speakerMap = new Map(); + let speakerCount = 0; + speakers.forEach((seg: any) => { + if(!speakerMap.has(seg.speaker)) speakerMap.set(seg.speaker, ++speakerCount); + }); + + const punctuatedText = await this.addPunctuation(timestampData, llm); + const sentences = punctuatedText.match(/[^.!?]+[.!?]+/g) || [punctuatedText]; + const words = timestampData.transcription.filter((w: any) => w.text.trim()); + + // Assign speaker to each sentence + const sentencesWithSpeakers = sentences.map(sentence => { + sentence = sentence.trim(); + if(!sentence) return null; + + const sentenceWords = sentence.toLowerCase().replace(/[^\w\s]/g, '').split(/\s+/); + const speakerWordCount = new Map(); + + sentenceWords.forEach(sw => { + const word = words.find((w: any) => sw === w.text.trim().toLowerCase().replace(/[^\w]/g, '')); + if(!word) return; + + const wordTime = word.offsets.from / 1000; + const speaker = speakers.find((seg: any) => wordTime >= seg.start && wordTime <= seg.end); + if(speaker) { + const spkNum = speakerMap.get(speaker.speaker); + speakerWordCount.set(spkNum, (speakerWordCount.get(spkNum) || 0) + 1); + } + }); + + let bestSpeaker = 1; + let maxWords = 0; + speakerWordCount.forEach((count, speaker) => { + if(count > maxWords) { + maxWords = count; + bestSpeaker = speaker; + } + }); + + return {speaker: bestSpeaker, text: sentence}; + }).filter(s => s !== null); + + // Merge adjacent sentences from same speaker + const merged: Array<{speaker: number, text: string}> = []; + sentencesWithSpeakers.forEach(item => { + const last = merged[merged.length - 1]; + if(last && last.speaker === item.speaker) { + last.text += ' ' + item.text; + } else { + merged.push({...item}); + } + }); + + let transcript = merged.map(item => `[Speaker ${item.speaker}]: ${item.text}`).join('\n').trim(); + if(!llm) return transcript; + let chunks = this.ai.language.chunk(transcript, 500, 0); + if(chunks.length > 4) chunks = [...chunks.slice(0, 3), chunks.at(-1)]; + const names = await this.ai.language.json(chunks.join('\n'), '{1: "Detected Name", 2: "Second Name"}', { + system: 'Use the following transcript to identify speakers. Only identify speakers you are positive about, dont mention speakers you are unsure about in your response', + temperature: 0.1, + }); + Object.entries(names).forEach(([speaker, name]) => transcript = transcript.replaceAll(`[Speaker ${speaker}]`, `[${name}]`)); + return transcript; + } + private runAsr(file: string, opts: {model?: string, diarization?: boolean} = {}): AbortablePromise { let proc: any; const p = new Promise((resolve, reject) => { @@ -111,102 +226,28 @@ print(json.dumps(segments)) return Object.assign(p, {abort}); } - private async combineSpeakerTranscript(punctuatedText: string, timestampData: any, speakers: any[]): Promise { - const speakerMap = new Map(); - let speakerCount = 0; - speakers.forEach((seg: any) => { - if(!speakerMap.has(seg.speaker)) speakerMap.set(seg.speaker, ++speakerCount); - }); - - const sentences = punctuatedText.match(/[^.!?]+[.!?]+/g) || [punctuatedText]; - const lines: string[] = []; - - sentences.forEach(sentence => { - sentence = sentence.trim(); - if(!sentence) return; - - const words = sentence.toLowerCase().replace(/[^\w\s]/g, '').split(/\s+/); - let startTime = Infinity, endTime = 0; - const wordTimings: {start: number, end: number}[] = []; - - timestampData.transcription.forEach((word: any) => { - const wordText = word.text.trim().toLowerCase(); - if(words.some(w => wordText.includes(w))) { - const start = word.offsets.from / 1000; - const end = word.offsets.to / 1000; - wordTimings.push({start, end}); - if(start < startTime) startTime = start; - if(end > endTime) endTime = end; - } - }); - - if(startTime === Infinity) return; - - // Weight by word-level overlap instead of sentence span - const speakerScores = new Map(); - - wordTimings.forEach(wt => { - speakers.forEach((seg: any) => { - const overlap = Math.max(0, Math.min(wt.end, seg.end) - Math.max(wt.start, seg.start)); - const duration = wt.end - wt.start; - if(duration > 0) { - const score = overlap / duration; // % of word covered - const spkNum = speakerMap.get(seg.speaker); - speakerScores.set(spkNum, (speakerScores.get(spkNum) || 0) + score); - } - }); - }); - - let bestSpeaker = 1; - let maxScore = 0; - speakerScores.forEach((score, speaker) => { - if(score > maxScore) { - maxScore = score; - bestSpeaker = speaker; - } - }); - - lines.push(`[Speaker ${bestSpeaker}]: ${sentence}`); - }); - - return lines.join('\n').trim(); - } - - asr(file: string, options: { model?: string; diarization?: boolean | 'id' } = {}): AbortablePromise { + asr(file: string, options: { model?: string; diarization?: boolean | 'llm' } = {}): AbortablePromise { if(!this.ai.options.whisper) throw new Error('Whisper not configured'); const tmp = join(mkdtempSync(join(tmpdir(), 'audio-')), 'converted.wav'); execSync(`ffmpeg -i "${file}" -ar 16000 -ac 1 -f wav "${tmp}"`, { stdio: 'ignore' }); const clean = () => fs.rm(Path.dirname(tmp), {recursive: true, force: true}).catch(() => {}); - const transcript = this.runAsr(tmp, {model: options.model, diarization: false}); - const timestamps: any = !options.diarization ? Promise.resolve(null) : this.runAsr(tmp, {model: options.model, diarization: true}); - const diarization: any = !options.diarization ? Promise.resolve(null) : this.runDiarization(tmp); + + if(!options.diarization) return this.runAsr(tmp, {model: options.model}); + const timestamps = this.runAsr(tmp, {model: options.model, diarization: true}); + const diarization = this.runDiarization(tmp); let aborted = false, abort = () => { aborted = true; - transcript.abort(); - timestamps?.abort?.(); - diarization?.abort?.(); + timestamps.abort(); + diarization.abort(); clean(); }; - const response = Promise.allSettled([transcript, timestamps, diarization]).then(async ([t, ts, d]) => { - if(t.status == 'rejected') throw new Error('Whisper.cpp punctuated:\n' + t.reason); + const response = Promise.allSettled([timestamps, diarization]).then(async ([ts, d]) => { if(ts.status == 'rejected') throw new Error('Whisper.cpp timestamps:\n' + ts.reason); if(d.status == 'rejected') throw new Error('Pyannote:\n' + d.reason); - if(aborted || !options.diarization) return t.value; - - let transcript = await this.combineSpeakerTranscript(t.value, ts.value, d.value); - if(!aborted && options.diarization === 'id') { - if(!this.ai.language.defaultModel) throw new Error('Configure an LLM for advanced ASR speaker detection'); - let chunks = this.ai.language.chunk(transcript, 500, 0); - if(chunks.length > 4) chunks = [...chunks.slice(0, 3), chunks.at(-1)]; - const names = await this.ai.language.json(chunks.join('\n'), '{1: "Detected Name", 2: "Second Name"}', { - system: 'Use the following transcript to identify speakers. Only identify speakers you are positive about, dont mention speakers you are unsure about in your response', - temperature: 0.1, - }); - Object.entries(names).forEach(([speaker, name]) => transcript = transcript.replaceAll(`[Speaker ${speaker}]`, `[${name}]`)); - } - return transcript; + if(aborted || !options.diarization) return ts.value; + return this.diarizeTranscript(ts.value, d.value, options.diarization == 'llm'); }).finally(() => clean()); return Object.assign(response, {abort}); } diff --git a/src/llm.ts b/src/llm.ts index 4e4c650..296f714 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -145,7 +145,7 @@ class LLM { // Handle compression and memory extraction if(options.compress || options.memory) { - let compressed = null; + let compressed: any = null; if(options.compress) { compressed = await this.ai.language.compressHistory(options.history, options.compress.max, options.compress.min, options); options.history.splice(0, options.history.length, ...compressed.history); @@ -164,6 +164,15 @@ class LLM { }), {abort}); } + async code(message: string, options?: LLMRequest): Promise { + const resp = await this.ask(message, {...options, system: [ + options?.system, + 'Return your response in a code block' + ].filter(t => !!t).join(('\n'))}); + const codeBlock = /```(?:.+)?\s*([\s\S]*?)```/.exec(resp); + return codeBlock ? codeBlock[1].trim() : null; + } + /** * Compress chat history to reduce context size * @param {LLMMessage[]} history Chatlog that will be compressed @@ -343,14 +352,11 @@ class LLM { * @returns {Promise<{} | {} | RegExpExecArray | null>} */ async json(text: string, schema: string, options?: LLMRequest): Promise { - let resp = await this.ask(text, {...options, system: (options?.system ? `${options.system}\n` : '') + `Only respond using a JSON code block matching this schema: -\`\`\`json -${schema} -\`\`\``}); - if(!resp) return {}; - const codeBlock = /```(?:.+)?\s*([\s\S]*?)```/.exec(resp); - const jsonStr = codeBlock ? codeBlock[1].trim() : resp; - return JSONAttemptParse(jsonStr, {}); + const code = await this.code(text, {...options, system: [ + options?.system, + `Only respond using JSON matching this schema:\n\`\`\`json\n${schema}\n\`\`\`` + ].filter(t => !!t).join('\n')}); + return code ? JSONAttemptParse(code, {}) : null; } /** diff --git a/tsconfig.json b/tsconfig.json index bce5016..b34c07a 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -15,6 +15,7 @@ "noEmit": true, /* Linting */ - "strict": true + "strict": true, + "noImplicitAny": false } }