Fix ASR
package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@ztimson/ai-utils",
-	"version": "0.6.4",
+	"version": "0.6.5",
 	"description": "AI Utility library",
 	"author": "Zak Timson",
 	"license": "MIT",
src/audio.ts
@@ -2,12 +2,12 @@ import {spawn} from 'node:child_process';
 import {pipeline} from '@xenova/transformers';
 import * as fs from 'node:fs';
 import {AbortablePromise, Ai} from './ai.ts';
-import * as wavefile from 'wavefile';
+import wavefile from 'wavefile';
 
 export class Audio {
 	private whisperPipeline: any;
 
-	constructor(private ai: Ai) {}
+	constructor(private ai: Ai) { }
 
 	private combineSpeakerTranscript(chunks: any[], speakers: any[]): string {
 		const speakerMap = new Map();
@@ -96,14 +96,25 @@ print(json.dumps(segments))
 		if(aborted) return resolve(null);
 		if(!this.whisperPipeline) this.whisperPipeline = await pipeline('automatic-speech-recognition', `Xenova/${model}`, { cache_dir: this.ai.options.path, quantized: true });
 
-		// Transcript
+		// Prepare audio file (convert to mono channel wave)
 		if(aborted) return resolve(null);
-		const audioData = fs.readFileSync(path);
-		const wav = new wavefile.WaveFile(audioData);
+		const wav = new wavefile.WaveFile(fs.readFileSync(path));
 		wav.toBitDepth('32f');
 		wav.toSampleRate(16000);
-		const buffer = wav.getSamples();
-		const transcriptResult = await this.whisperPipeline(buffer, {return_timestamps: speaker ? 'word' : false, chunk_length_s: 30,});
+		const samples = wav.getSamples();
+		let buffer;
+		if(Array.isArray(samples)) { // stereo to mono - average the channels
+			const left = samples[0];
+			const right = samples[1];
+			buffer = new Float32Array(left.length);
+			for (let i = 0; i < left.length; i++) buffer[i] = (left[i] + right[i]) / 2;
+		} else {
+			buffer = samples;
+		}
+
+		// Transcribe
+		if(aborted) return resolve(null);
+		const transcriptResult = await this.whisperPipeline(buffer, {return_timestamps: speaker ? 'word' : false});
 		if(!speaker) return resolve(transcriptResult.text?.trim() || null);
 
 		// Speaker Diarization
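
The substance of the fix: Whisper-style ASR expects a single mono channel of float samples at 16 kHz, and wavefile's getSamples() returns one typed array per channel for multi-channel files, so the commit averages the stereo channels into one buffer before handing it to the pipeline. Below is a minimal standalone sketch of that same preprocessing, assuming the wavefile and @xenova/transformers packages this project already depends on; the helper name loadMonoAudio, the whisper-tiny.en model, and the file path are illustrative, not part of the commit.

import {pipeline} from '@xenova/transformers';
import wavefile from 'wavefile';
import * as fs from 'node:fs';

// Load a .wav file and normalize it to mono 32-bit float at 16 kHz.
function loadMonoAudio(path: string): Float32Array | Float64Array {
	const wav = new wavefile.WaveFile(fs.readFileSync(path));
	wav.toBitDepth('32f');   // Whisper expects float samples...
	wav.toSampleRate(16000); // ...at a 16 kHz sample rate
	const samples: any = wav.getSamples(); // one typed array per channel when multi-channel
	if(!Array.isArray(samples)) return samples; // already mono
	const [left, right] = samples;
	const mono = new Float32Array(left.length); // average the channels into one buffer
	for(let i = 0; i < left.length; i++) mono[i] = (left[i] + right[i]) / 2;
	return mono;
}

// Usage sketch (model name and path are examples):
const asr = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
const result: any = await asr(loadMonoAudio('./recording.wav'));
console.log(result.text);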