* Fixed LLM response object (double encoding)
All checks were successful
Publish Library / Build NPM Project (push) Successful in 25s
Publish Library / Tag Version (push) Successful in 12s

+ Added wikitools
+ Improved webpage reading tool
This commit is contained in:
2026-03-29 23:00:40 -04:00
parent d2e711fbf2
commit ee7b85301b
5 changed files with 1064 additions and 355 deletions

1256
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -29,7 +29,7 @@
"@tensorflow/tfjs": "^4.22.0",
"@xenova/transformers": "^2.17.2",
"@ztimson/node-utils": "^1.0.7",
"@ztimson/utils": "^0.28.13",
"@ztimson/utils": "^0.28.16",
"cheerio": "^1.2.0",
"openai": "^6.22.0",
"tesseract.js": "^7.0.0"

View File

@@ -119,7 +119,7 @@ export class Anthropic extends LLMProvider {
if(!tool) return {tool_use_id: toolCall.id, is_error: true, content: 'Tool not found'};
try {
const result = await tool.fn(toolCall.input, options?.stream, this.ai);
return {type: 'tool_result', tool_use_id: toolCall.id, content: JSONSanitize(result)};
return {type: 'tool_result', tool_use_id: toolCall.id, content: typeof result == 'object' ? JSONSanitize(result) : result};
} catch (err: any) {
return {type: 'tool_result', tool_use_id: toolCall.id, is_error: true, content: err?.message || err?.toString() || 'Unknown'};
}

View File

@@ -148,7 +148,7 @@ export class OpenAi extends LLMProvider {
try {
const args = JSONAttemptParse(toolCall.function.arguments, {});
const result = await tool.fn(args, options.stream, this.ai);
return {role: 'tool', tool_call_id: toolCall.id, content: JSONSanitize(result)};
return {role: 'tool', tool_call_id: toolCall.id, content: typeof result == 'object' ? JSONSanitize(result) : result};
} catch (err: any) {
return {role: 'tool', tool_call_id: toolCall.id, content: JSONSanitize({error: err?.message || err?.toString() || 'Unknown'})};
}

View File

@@ -1,10 +1,12 @@
import * as cheerio from 'cheerio';
import {$Sync} from '@ztimson/node-utils';
import {ASet, consoleInterceptor, Http, fn as Fn} from '@ztimson/utils';
import {ASet, consoleInterceptor, Http, fn as Fn, decodeHtml} from '@ztimson/utils';
import * as os from 'node:os';
import {Ai} from './ai.ts';
import {LLMRequest} from './llm.ts';
const UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)';
const getShell = () => {
if(os.platform() == 'win32') return 'cmd';
return $Sync`echo $SHELL`?.split('/').pop() || 'bash';
@@ -129,123 +131,107 @@ export const PythonTool: AiTool = {
export const ReadWebpageTool: AiTool = {
name: 'read_webpage',
description: 'Extract clean, structured content from a webpage or convert media/documents to accessible formats',
description: 'Extract clean content from webpages, or convert media/documents to accessible formats',
args: {
url: {type: 'string', description: 'URL to extract content from', required: true},
mimeRegex: {type: 'string', description: 'Optional: Regex pattern to filter MIME types (e.g., "^image/", "text/", "application/pdf")'},
maxSize: {type: 'number', description: 'Optional: Max file size in bytes for binary content (default: 10MB)'}
url: {type: 'string', description: 'URL to read', required: true},
mimeRegex: {type: 'string', description: 'Optional regex to filter MIME types (e.g., "^image/", "text/")'}
},
fn: async (args: {url: string; mimeRegex?: string;}) => {
const maxSize = 10 * 1024 * 1024; // 10 MB
fn: async (args: {url: string; mimeRegex?: string}) => {
const ua = 'AiTools-Webpage/1.0';
const maxSize = 10 * 1024 * 1024;
const response = await fetch(args.url, {
headers: {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5"
'User-Agent': ua,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5'
},
redirect: 'follow'
}).catch(err => {throw new Error(`Failed to fetch: ${err.message}`)});
const contentType = response.headers.get('content-type') || '';
const mimeType = contentType.split(';')[0].trim().toLowerCase();
const charset = contentType.match(/charset=([^;]+)/)?.[1] || 'utf-8';
// Filter by MIME type if specified
if (args.mimeRegex) {
const regex = new RegExp(args.mimeRegex, 'i');
if (!regex.test(mimeType)) {
return {url: args.url, error: 'MIME type rejected', mimeType, filter: args.mimeRegex};
}
if(args.mimeRegex && !new RegExp(args.mimeRegex, 'i').test(mimeType)) {
return `❌ MIME type rejected: ${mimeType} (filter: ${args.mimeRegex})`;
}
// Handle images, audio, video -> data URL
if (mimeType.startsWith('image/') || mimeType.startsWith('audio/') || mimeType.startsWith('video/')) {
if(mimeType.match(/^(image|audio|video)\//)) {
const buffer = await response.arrayBuffer();
if (buffer.byteLength > maxSize) {
return {url: args.url, type: 'media', mimeType, error: 'File too large', size: buffer.byteLength, maxSize};
if(buffer.byteLength > maxSize) {
return `❌ File too large: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (max 10MB)\nType: ${mimeType}`;
}
const base64 = Buffer.from(buffer).toString('base64');
return {url: args.url, type: 'media', mimeType, dataUrl: `data:${mimeType};base64,${base64}`, size: buffer.byteLength};
return `## Media File\n**Type:** ${mimeType}\n**Size:** ${(buffer.byteLength / 1024).toFixed(1)}KB\n**Data URL:** \`data:${mimeType};base64,${base64.slice(0, 100)}...\``;
}
// Handle plain text, json, xml, csv
if (mimeType.match(/^(text\/(plain|csv|xml)|application\/(json|xml|csv|x-yaml))/) ||
args.url.match(/\.(txt|json|xml|csv|yaml|yml|md)$/i)) {
if(mimeType.match(/^text\/(plain|csv|xml)/) || args.url.match(/\.(txt|csv|xml|md|yaml|yml)$/i)) {
const text = await response.text();
return {url: args.url, type: 'text', mimeType, content: text.slice(0, 100000)};
const truncated = text.length > 50000 ? text.slice(0, 50000) : text;
return `## Text File\n**Type:** ${mimeType}\n**URL:** ${args.url}\n\n${truncated}`;
}
// Handle PDFs and other binaries -> data URL
if (mimeType === 'application/pdf' || mimeType.startsWith('application/') && !mimeType.includes('html')) {
if(mimeType.match(/application\/(json|xml|csv)/)) {
const text = await response.text();
const truncated = text.length > 50000 ? text.slice(0, 50000) : text;
return `## Structured Data\n**Type:** ${mimeType}\n**URL:** ${args.url}\n\n\`\`\`\n${truncated}\n\`\`\``;
}
if(mimeType === 'application/pdf' || (mimeType.startsWith('application/') && !mimeType.includes('html'))) {
const buffer = await response.arrayBuffer();
if (buffer.byteLength > maxSize) {
return {url: args.url, type: 'binary', mimeType, error: 'File too large', size: buffer.byteLength, maxSize};
if(buffer.byteLength > maxSize) {
return `❌ File too large: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (max 10MB)\nType: ${mimeType}`;
}
const base64 = Buffer.from(buffer).toString('base64');
return {url: args.url, type: 'binary', mimeType, dataUrl: `data:${mimeType};base64,${base64}`, size: buffer.byteLength};
return `## Binary File\n**Type:** ${mimeType}\n**Size:** ${(buffer.byteLength / 1024).toFixed(1)}KB\n**Data URL:** \`data:${mimeType};base64,${base64.slice(0, 100)}...\``;
}
// Default HTML handling
// HTML
const html = await response.text();
const $ = cheerio.load(html);
// Remove noise
$('script, style, nav, footer, header, aside, iframe, noscript, svg, [role="navigation"], [role="banner"], [role="complementary"], .ad, .ads, .advertisement, .cookie, .popup, .modal, .sidebar, .related, .comments, .social-share').remove();
// Extract metadata
const metadata = {
title: $('meta[property="og:title"]').attr('content') || $('title').text() || '',
description: $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || '',
author: $('meta[name="author"]').attr('content') || '',
published: $('meta[property="article:published_time"]').attr('content') || $('time').attr('datetime') || '',
image: $('meta[property="og:image"]').attr('content') || ''
};
// Extract structured content
$('script, style, nav, footer, header, aside, iframe, noscript, svg').remove();
$('[role="navigation"], [role="banner"], [role="complementary"]').remove();
$('[aria-hidden="true"], [hidden], .visually-hidden, .sr-only, .screen-reader-text').remove();
$('.ad, .ads, .advertisement, .cookie, .popup, .modal, .sidebar, .related, .comments, .social-share').remove();
$('button, [class*="share"], [class*="follow"], [class*="social"]').remove();
const title = $('meta[property="og:title"]').attr('content') || $('title').text().trim() || '';
const description = $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || '';
const author = $('meta[name="author"]').attr('content') || '';
let content = '';
const contentSelectors = ['article', 'main', '[role="main"]', '.content', '.post-content', '.entry-content', '.article-content', 'body'];
for (const selector of contentSelectors) {
const el = $(selector).first();
if (el.length && el.text().trim().length > 200) {
content = el.text();
break;
const selectors = ['article', 'main', '[role="main"]', '.content', '.post-content', '.entry-content', '.article-content'];
for(const sel of selectors) {
const el = $(sel).first();
if(el.length && el.text().trim().length > 200) {
const paragraphs: string[] = [];
el.find('p').each((_, p) => {
const text = $(p).text().trim();
if(text.length > 80) paragraphs.push(text);
});
if(paragraphs.length > 2) {
content = paragraphs.join('\n\n');
break;
}
}
}
if (!content) content = $('body').text();
// Clean whitespace but preserve structure
content = content
.replace(/\n\s*\n\s*\n/g, '\n\n')
.replace(/[ \t]+/g, ' ')
.trim()
.slice(0, 50000);
// Extract links if minimal content
let links: any[] = [];
if (content.length < 500) {
$('a[href]').each((_, el) => {
const href = $(el).attr('href');
const text = $(el).text().trim();
if (href && text && !href.startsWith('#')) {
links.push({text, href});
}
if(!content) {
const paragraphs: string[] = [];
$('body p').each((_, p) => {
const text = $(p).text().trim();
if(text.length > 80) paragraphs.push(text);
});
links = links.slice(0, 50);
content = paragraphs.slice(0, 30).join('\n\n');
}
return {
url: args.url,
type: 'html',
title: metadata.title.trim(),
description: metadata.description.trim(),
author: metadata.author.trim(),
published: metadata.published,
content,
links: links.length ? links : undefined,
};
// Decode escaped newlines and clean
const parts = [`## ${title || 'Webpage'}`];
if(description) parts.push(`_${description}_`);
if(author) parts.push(`👤 ${author}`);
parts.push(`🔗 ${args.url}\n`);
parts.push(content);
return decodeHtml(parts.join('\n\n').replaceAll(/\n{3,}/g, '\n\n'));
}
}
};
export const WebSearchTool: AiTool = {
name: 'web_search',
@@ -259,7 +245,7 @@ export const WebSearchTool: AiTool = {
length: number;
}) => {
const html = await fetch(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(args.query)}`, {
headers: {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "Accept-Language": "en-US,en;q=0.9"}
headers: {"User-Agent": UA, "Accept-Language": "en-US,en;q=0.9"}
}).then(resp => resp.text());
let match, regex = /<a .*?href="(.+?)".+?<\/a>/g;
const results = new ASet<string>();
@@ -274,10 +260,8 @@ export const WebSearchTool: AiTool = {
}
class WikipediaClient {
private ua = 'AiTools-Wikipedia/1.0';
private async get(url: string): Promise<any> {
const resp = await fetch(url, {headers: {'User-Agent': this.ua}});
const resp = await fetch(url, {headers: {'User-Agent': UA}});
return resp.json();
}
@@ -321,11 +305,9 @@ class WikipediaClient {
async lookup(query: string, detail: 'intro' | 'full' = 'intro'): Promise<string> {
const results = await this.searchTitles(query, 6);
if(!results.length) return `❌ No Wikipedia articles found for "${query}"`;
const title = results[0].title;
const url = this.pageUrl(title);
const content = await this.fetchExtract(title, detail === 'intro');
const text = this.truncate(content, detail === 'intro' ? 2000 : 8000);
return `## ${title}\n🔗 ${url}\n\n${text}`;
}
@@ -333,7 +315,6 @@ class WikipediaClient {
async search(query: string): Promise<string> {
const results = await this.searchTitles(query, 8);
if(!results.length) return `❌ No results for "${query}"`;
const lines = [`### Search results for "${query}"\n`];
for(let i = 0; i < results.length; i++) {
const r = results[i];