From d5bf1ec47eeb36873c73bd55b585dce92e91c503 Mon Sep 17 00:00:00 2001 From: ztimson Date: Fri, 30 Jan 2026 10:38:51 -0500 Subject: [PATCH] Pulled chunking out into its own exported function for easy access --- package.json | 2 +- src/llm.ts | 55 +++++++++++++++++++++++----------------------------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/package.json b/package.json index a3435c3..c816ea9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@ztimson/ai-utils", - "version": "0.2.6", + "version": "0.2.7", "description": "AI Utility library", "author": "Zak Timson", "license": "MIT", diff --git a/src/llm.ts b/src/llm.ts index 8c9df29..10e20cd 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -160,17 +160,35 @@ export class LLM { return denominator === 0 ? 0 : dotProduct / denominator; } - embedding(target: object | string, maxTokens = 500, overlapTokens = 50) { + chunk(target: object | string, maxTokens = 500, overlapTokens = 50): string[] { const objString = (obj: any, path = ''): string[] => { - if(obj === null || obj === undefined) return []; + if(!obj) return []; return Object.entries(obj).flatMap(([key, value]) => { const p = path ? `${path}${isNaN(+key) ? `.${key}` : `[${key}]`}` : key; - if(typeof value === 'object' && value !== null && !Array.isArray(value)) return objString(value, p); - const valueStr = Array.isArray(value) ? value.join(', ') : String(value); - return `${p}: ${valueStr}`; + if(typeof value === 'object' && !Array.isArray(value)) return objString(value, p); + return `${p}: ${Array.isArray(value) ? value.join(', ') : value}`; }); }; + const lines = typeof target === 'object' ? objString(target) : target.split('\n'); + const tokens = lines.flatMap(l => [...l.split(/\s+/).filter(Boolean), '\n']); + const chunks: string[] = []; + for(let i = 0; i < tokens.length;) { + let text = '', j = i; + while(j < tokens.length) { + const next = text + (text ? ' ' : '') + tokens[j]; + if(this.estimateTokens(next.replace(/\s*\n\s*/g, '\n')) > maxTokens && text) break; + text = next; + j++; + } + const clean = text.replace(/\s*\n\s*/g, '\n').trim(); + if(clean) chunks.push(clean); + i = Math.max(j - overlapTokens, j === i ? i + 1 : j); + } + return chunks; + } + + embedding(target: object | string, maxTokens = 500, overlapTokens = 50) { const embed = (text: string): Promise => { return new Promise((resolve, reject) => { const id = this.embedId++; @@ -179,32 +197,7 @@ export class LLM { }); }; - // Tokenize - const lines = typeof target === 'object' ? objString(target) : target.split('\n'); - const tokens = lines.flatMap(line => [...line.split(/\s+/).filter(w => w.trim()), '\n']); - - // Chunk - const chunks: string[] = []; - let start = 0; - while (start < tokens.length) { - let end = start; - let text = ''; - // Build chunk - while (end < tokens.length) { - const nextToken = tokens[end]; - const testText = text + (text ? ' ' : '') + nextToken; - const testTokens = this.estimateTokens(testText.replace(/\s*\n\s*/g, '\n')); - if (testTokens > maxTokens && text) break; - text = testText; - end++; - } - // Save chunk - const cleanText = text.replace(/\s*\n\s*/g, '\n').trim(); - if(cleanText) chunks.push(cleanText); - start = end - overlapTokens; - if(start <= end - tokens.length + end) start = end; - } - + const chunks = this.chunk(target, maxTokens, overlapTokens); return Promise.all(chunks.map(async (text, index) => ({ index, embedding: await embed(text),