352 lines
13 KiB
TypeScript
352 lines
13 KiB
TypeScript
import * as cheerio from 'cheerio';
|
|
import {$Sync} from '@ztimson/node-utils';
|
|
import {ASet, consoleInterceptor, Http, fn as Fn, decodeHtml} from '@ztimson/utils';
|
|
import * as os from 'node:os';
|
|
import {Ai} from './ai.ts';
|
|
import {LLMRequest} from './llm.ts';
|
|
|
|
/** Browser-style `User-Agent` header value sent with the DuckDuckGo and Wikipedia requests in this file */
const UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)';
|
|
|
|
/**
 * Name of the command-line shell commands will run in.
 *
 * @returns `'cmd'` on Windows, otherwise the last path segment of the `SHELL`
 * environment variable (e.g. `/usr/bin/zsh` -> `zsh`), falling back to `'bash'`
 */
const getShell = (): string => {
	if(os.platform() === 'win32') return 'cmd';
	// Read the variable directly instead of spawning `echo $SHELL`; trim guards against stray whitespace
	const shellPath = process.env.SHELL?.trim();
	return shellPath?.split('/').pop() || 'bash';
}
|
|
|
|
/** Argument schema for a tool: maps each argument name to its definition */
export type AiToolArg = {
	[key: string]: {
		/** Data type of the argument */
		type: 'array' | 'boolean' | 'number' | 'object' | 'string';
		/** Explanation of the argument */
		description: string;
		/** Whether the argument is required */
		required?: boolean;
		/** Default value */
		default?: any;
		/** Allowed options */
		enum?: string[];
		/** Minimum value or length */
		min?: number;
		/** Maximum value or length */
		max?: number;
		/** Pattern the value should match */
		pattern?: string;
		/** Child arguments */
		items?: {[key: string]: AiToolArg};
	};
};
|
|
|
|
/** Definition of a tool that can be offered to the AI */
export type AiTool = {
	/** Tool ID / name - must be snake_case */
	name: string;
	/** Description of the tool, doubles as its prompt */
	description: string;
	/** Arguments the tool accepts */
	args?: AiToolArg;
	/** Callback run with the supplied arguments, the request's stream & the AI instance */
	fn: (args: any, stream: LLMRequest['stream'], ai: Ai) => any | Promise<any>;
};
|
|
|
|
/** Tool that passes a command to `$Sync` and returns its result */
export const CliTool: AiTool = {
	name: 'cli',
	description: 'Use the command line interface, returns any output',
	args: {
		command: {type: 'string', description: 'Command to run', required: true},
	},
	fn(args: {command: string}) {
		const {command} = args;
		return $Sync`${command}`;
	},
}
|
|
|
|
/** Tool that reports the current date & time as formatted by `Date#toString()` */
export const DateTimeTool: AiTool = {
	name: 'get_datetime',
	description: 'Get local date / time',
	args: {},
	async fn() {
		const now = new Date();
		return now.toString();
	},
}
|
|
|
|
/** Tool that reports the current date & time as formatted by `Date#toUTCString()` */
export const DateTimeUTCTool: AiTool = {
	name: 'get_datetime_utc',
	description: 'Get current UTC date / time',
	args: {},
	async fn() {
		const now = new Date();
		return now.toUTCString();
	},
}
|
|
|
|
/**
 * Tool that runs code by delegating to `CliTool`, `JSTool` or `PythonTool` based on `language`.
 *
 * The `language` argument description embeds the value of `getShell()`, evaluated once when this object is created.
 * Failures (including an unsupported language) are returned as `{error: string}` rather than thrown.
 */
export const ExecTool: AiTool = {
	name: 'exec',
	description: 'Run code/scripts',
	args: {
		language: {type: 'string', description: `Execution language (CLI: ${getShell()})`, enum: ['cli', 'node', 'python'], required: true},
		code: {type: 'string', description: 'Code to execute', required: true}
	},
	fn: async (args, stream, ai) => {
		try {
			switch(args.language) {
				case 'cli':
					return await CliTool.fn({command: args.code}, stream, ai);
				case 'node':
					return await JSTool.fn({code: args.code}, stream, ai);
				case 'python':
					return await PythonTool.fn({code: args.code}, stream, ai);
				default:
					throw new Error(`Unsupported language: ${args.language}`);
			}
		} catch(err: any) {
			// String() copes with null/undefined rejections, where `err.toString()` would itself throw
			return {error: err?.message || String(err)};
		}
	}
}
|
|
|
|
/** Tool that performs an HTTP request through the `Http` client and returns the result of its `request()` call */
export const FetchTool: AiTool = {
	name: 'fetch',
	description: 'Make HTTP request to URL',
	args: {
		url: {type: 'string', description: 'URL to fetch', required: true},
		method: {type: 'string', description: 'HTTP method to use', enum: ['GET', 'POST', 'PUT', 'DELETE'], default: 'GET'},
		headers: {type: 'object', description: 'HTTP headers to send', default: {}},
		body: {type: 'object', description: 'HTTP body to send'},
	},
	fn(args: {
		url: string;
		method: 'GET' | 'POST' | 'PUT' | 'DELETE';
		headers: {[key: string]: string};
		body: any;
	}) {
		const {url, headers, method, body} = args;
		const client = new Http({url, headers});
		// Falls back to GET when no method was supplied
		return client.request({method: method || 'GET', body});
	},
}
|
|
|
|
/**
 * Tool that evaluates JavaScript with an intercepted `console`.
 *
 * Resolves to the interceptor's captured output plus a `return` key holding the script's result.
 * When the script fails, the error is appended to the captured `error` output and `return` is left undefined.
 * `stdout` / `stderr` are always blanked out in the result.
 */
export const JSTool: AiTool = {
	name: 'exec_javascript',
	description: 'Execute commonjs javascript',
	args: {
		code: {type: 'string', description: 'CommonJS javascript', required: true}
	},
	fn: async (args: {code: string}) => {
		const c = consoleInterceptor(null);
		let resp: any;
		try {
			resp = await Fn<any>({console: c}, args.code, true);
		} catch(err: any) {
			// Record the failure only; previously the array length returned by push() leaked into `return`
			c.output.error.push(err);
		}
		return {...c.output, return: resp, stdout: undefined, stderr: undefined};
	}
}
|
|
|
|
/**
 * Tool that runs Python source through `python -c` via `$Sync` and returns `{result}`.
 *
 * NOTE(review): the code is interpolated between double quotes; whether embedded quotes are escaped
 * depends on how `$Sync` treats interpolated values - confirm before relying on it for arbitrary scripts.
 */
export const PythonTool: AiTool = {
	// Was a copy of JSTool's metadata, which registered two tools under the name `exec_javascript`
	name: 'exec_python',
	description: 'Execute python',
	args: {
		code: {type: 'string', description: 'Python code', required: true}
	},
	fn: async (args: {code: string}) => ({result: $Sync`python -c "${args.code}"`})
}
|
|
|
|
/**
 * Tool that downloads a URL and returns a markdown summary of what it found.
 *
 * After the optional `mimeRegex` filter, handling depends on the response MIME type (checked in this order):
 * - image / audio / video: type, size & a truncated base64 data-URL preview
 * - text/plain, text/csv, text/xml or a URL ending in a known text extension: body capped at 50,000 characters
 * - application/json, application/xml, application/csv: body capped at 50,000 characters inside a code fence
 * - any other non-HTML `application/*`: same summary as media files
 * - everything else is parsed as HTML and reduced to title, description, author & long paragraphs
 *
 * Media / binary files over 10MB are rejected; when the server declares an oversized `Content-Length`
 * the body is not downloaded at all.
 *
 * @throws Error when the request itself fails (`Failed to fetch: ...`)
 */
export const ReadWebpageTool: AiTool = {
	name: 'read_webpage',
	description: 'Extract clean content from webpages, or convert media/documents to accessible formats',
	args: {
		url: {type: 'string', description: 'URL to read', required: true},
		mimeRegex: {type: 'string', description: 'Optional regex to filter MIME types (e.g., "^image/", "text/")'}
	},
	fn: async (args: {url: string; mimeRegex?: string}) => {
		const ua = 'AiTools-Webpage/1.0';
		const maxSize = 10 * 1024 * 1024;
		const maxChars = 50000;

		let response: Response;
		try {
			response = await fetch(args.url, {
				headers: {
					'User-Agent': ua,
					'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
					'Accept-Language': 'en-US,en;q=0.5'
				},
				redirect: 'follow'
			});
		} catch(err: any) {
			throw new Error(`Failed to fetch: ${err?.message ?? err}`);
		}

		const contentType = response.headers.get('content-type') || '';
		const mimeType = contentType.split(';')[0].trim().toLowerCase();

		if(args.mimeRegex && !new RegExp(args.mimeRegex, 'i').test(mimeType)) {
			return `❌ MIME type rejected: ${mimeType} (filter: ${args.mimeRegex})`;
		}

		/** Rejection message for files above the size limit */
		const tooLarge = (bytes: number): string =>
			`❌ File too large: ${(bytes / 1024 / 1024).toFixed(1)}MB (max 10MB)\nType: ${mimeType}`;

		/** Summarise a non-text body: type, size & the first 100 base64 characters as a data URL */
		const describeBinary = async (heading: string): Promise<string> => {
			// Trust an oversized Content-Length so a huge file is never pulled into memory
			const declared = Number(response.headers.get('content-length'));
			if(declared > maxSize) {
				response.body?.cancel().catch(() => {});
				return tooLarge(declared);
			}
			const buffer = await response.arrayBuffer();
			if(buffer.byteLength > maxSize) return tooLarge(buffer.byteLength);
			// 75 bytes encode to exactly 100 base64 characters, so only the head needs encoding
			const preview = Buffer.from(buffer, 0, Math.min(buffer.byteLength, 75)).toString('base64');
			return `## ${heading}\n**Type:** ${mimeType}\n**Size:** ${(buffer.byteLength / 1024).toFixed(1)}KB\n**Data URL:** \`data:${mimeType};base64,${preview.slice(0, 100)}...\``;
		};

		/** Read the body as text, capped at `maxChars` characters */
		const readText = async (): Promise<string> => (await response.text()).slice(0, maxChars);

		if(mimeType.match(/^(image|audio|video)\//)) return describeBinary('Media File');

		if(mimeType.match(/^text\/(plain|csv|xml)/) || args.url.match(/\.(txt|csv|xml|md|yaml|yml)$/i)) {
			return `## Text File\n**Type:** ${mimeType}\n**URL:** ${args.url}\n\n${await readText()}`;
		}

		if(mimeType.match(/application\/(json|xml|csv)/)) {
			return `## Structured Data\n**Type:** ${mimeType}\n**URL:** ${args.url}\n\n\`\`\`\n${await readText()}\n\`\`\``;
		}

		if(mimeType === 'application/pdf' || (mimeType.startsWith('application/') && !mimeType.includes('html'))) {
			return describeBinary('Binary File');
		}

		// HTML - strip scripts, page chrome, hidden elements, ads & social widgets before extracting text
		const html = await response.text();
		const $ = cheerio.load(html);
		$('script, style, nav, footer, header, aside, iframe, noscript, svg').remove();
		$('[role="navigation"], [role="banner"], [role="complementary"]').remove();
		$('[aria-hidden="true"], [hidden], .visually-hidden, .sr-only, .screen-reader-text').remove();
		$('.ad, .ads, .advertisement, .cookie, .popup, .modal, .sidebar, .related, .comments, .social-share').remove();
		$('button, [class*="share"], [class*="follow"], [class*="social"]').remove();

		const title = $('meta[property="og:title"]').attr('content') || $('title').text().trim() || '';
		const description = $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || '';
		const author = $('meta[name="author"]').attr('content') || '';

		/** Trimmed text of every paragraph longer than 80 characters */
		const longParagraphs = (nodes: {toArray(): any[]}): string[] =>
			nodes.toArray().map(p => $(p).text().trim()).filter(text => text.length > 80);

		// Prefer the first main-content container that yields more than two long paragraphs
		let content = '';
		const selectors = ['article', 'main', '[role="main"]', '.content', '.post-content', '.entry-content', '.article-content'];
		for(const sel of selectors) {
			const el = $(sel).first();
			if(!el.length || el.text().trim().length <= 200) continue;
			const paragraphs = longParagraphs(el.find('p'));
			if(paragraphs.length > 2) {
				content = paragraphs.join('\n\n');
				break;
			}
		}

		// Otherwise fall back to the first 30 long paragraphs anywhere in the body
		if(!content) content = longParagraphs($('body p')).slice(0, 30).join('\n\n');

		// Assemble the markdown, collapse runs of blank lines & decode HTML entities
		const parts = [`## ${title || 'Webpage'}`];
		if(description) parts.push(`_${description}_`);
		if(author) parts.push(`👤 ${author}`);
		parts.push(`🔗 ${args.url}\n`);
		parts.push(content);
		return decodeHtml(parts.join('\n\n').replace(/\n{3,}/g, '\n\n'));
	}
};
|
|
|
|
/**
 * Tool that queries DuckDuckGo's HTML endpoint and collects the destination URLs of result links.
 *
 * Result anchors point at a redirect whose `uddg` query parameter holds the percent-encoded target;
 * that value is extracted and decoded once, de-duplicated through `ASet`, and collection stops once
 * `length` URLs (default 5) have been gathered.
 */
export const WebSearchTool: AiTool = {
	name: 'web_search',
	description: 'Use duckduckgo (anonymous) to find find relevant online resources. Returns a list of URLs that works great with the `read_webpage` tool',
	args: {
		query: {type: 'string', description: 'Search string', required: true},
		length: {type: 'number', description: 'Number of results to return', default: 5},
	},
	fn: async (args: {
		query: string;
		length: number;
	}) => {
		// Accept numeric strings too; anything unusable falls back to the default of 5
		const limit = Number(args.length) || 5;
		const resp = await fetch(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(args.query)}`, {
			headers: {"User-Agent": UA, "Accept-Language": "en-US,en;q=0.9"}
		});
		const html = await resp.text();
		const results = new ASet<string>();
		for(const [, href] of html.matchAll(/<a .*?href="(.+?)".+?<\/a>/g)) {
			// Stop at the next parameter so trailing tracking params (e.g. `&amp;rut=`) are not part of the URL
			const encoded = /[?&]uddg=([^&]+)/.exec(href.replace(/&amp;/g, '&'))?.[1];
			if(encoded) {
				try {
					// Decode exactly once - a second pass would corrupt literal `%` escapes in the target
					results.add(decodeURIComponent(encoded));
				} catch {
					// Malformed percent-encoding, skip this link
				}
			}
			if(results.size >= limit) break;
		}
		return results;
	}
}
|
|
|
|
/** Small client around the English Wikipedia `api.php` endpoint that renders results as markdown */
class WikipediaClient {
	/** GET a URL with the shared User-Agent and parse the body as JSON */
	private async get(url: string): Promise<any> {
		const response = await fetch(url, {headers: {'User-Agent': UA}});
		return response.json();
	}

	/** Call `api.php` with the given parameters plus `format=json` & `utf8=1` */
	private api(params: Record<string, any>): Promise<any> {
		const query = new URLSearchParams({...params, format: 'json', utf8: '1'});
		return this.get(`https://en.wikipedia.org/w/api.php?${query.toString()}`);
	}

	/** Collapse runs of blank lines & spaces and drop `[n]` markers */
	private clean(text: string): string {
		const fewerBreaks = text.replace(/\n{3,}/g, '\n\n');
		const fewerSpaces = fewerBreaks.replace(/ {2,}/g, ' ');
		return fewerSpaces.replace(/\[\d+\]/g, '').trim();
	}

	/** Cut text to `max` characters, backing up to a paragraph break when one lies in the last 30% */
	private truncate(text: string, max: number): string {
		if(text.length <= max) return text;
		const head = text.slice(0, max);
		const paragraphEnd = head.lastIndexOf('\n\n');
		if(paragraphEnd > max * 0.7) return head.slice(0, paragraphEnd);
		return head;
	}

	/** Run a search query and return the raw result entries (empty when none) */
	private async searchTitles(query: string, limit = 6): Promise<any[]> {
		const response = await this.api({action: 'query', list: 'search', srsearch: query, srlimit: limit, srprop: 'snippet'});
		const hits = response.query?.search;
		return hits || [];
	}

	/** Fetch the plain-text extract of a page, optionally only its intro, cleaned up */
	private async fetchExtract(title: string, intro = false): Promise<string> {
		const params: any = {
			action: 'query',
			prop: 'extracts',
			titles: title,
			explaintext: 1,
			redirects: 1,
			...(intro ? {exintro: 1} : {}),
		};
		const response = await this.api(params);
		const [page] = Object.values(response.query?.pages || {}) as any[];
		return this.clean(page?.extract || '');
	}

	/** Build the article URL for a title (spaces become underscores) */
	private pageUrl(title: string): string {
		const slug = title.replace(/ /g, '_');
		return `https://en.wikipedia.org/wiki/${encodeURIComponent(slug)}`;
	}

	/** Remove anything that looks like an HTML tag */
	private stripHtml(text: string): string {
		return text.replace(/<[^>]+>/g, '');
	}

	/**
	 * Look up the top search hit for a query and return its content as markdown
	 * @param query Topic or title to search for
	 * @param detail `intro` limits output to the intro (2000 chars), `full` allows up to 8000
	 */
	async lookup(query: string, detail: 'intro' | 'full' = 'intro'): Promise<string> {
		const hits = await this.searchTitles(query, 6);
		if(!hits.length) return `❌ No Wikipedia articles found for "${query}"`;
		const {title} = hits[0];
		const link = this.pageUrl(title);
		const introOnly = detail === 'intro';
		const extract = await this.fetchExtract(title, introOnly);
		const body = this.truncate(extract, introOnly ? 2000 : 8000);
		return `## ${title}\n🔗 ${link}\n\n${body}`;
	}

	/**
	 * List up to 8 matching articles with a short snippet & link each
	 * @param query Search terms
	 */
	async search(query: string): Promise<string> {
		const hits = await this.searchTitles(query, 8);
		if(!hits.length) return `❌ No results for "${query}"`;
		const entries = hits.map((hit, index) => {
			const snippet = this.truncate(this.stripHtml(hit.snippet || ''), 150);
			return `**${index + 1}. ${hit.title}**\n${snippet}\n${this.pageUrl(hit.title)}`;
		});
		return [`### Search results for "${query}"\n`, ...entries].join('\n\n');
	}
}
|
|
|
|
/** Tool that returns the content of the best-matching Wikipedia article via `WikipediaClient.lookup` */
export const WikipediaLookupTool: AiTool = {
	name: 'wikipedia_lookup',
	description: 'Get Wikipedia article content',
	args: {
		query: {type: 'string', description: 'Topic or article title', required: true},
		detail: {type: 'string', description: 'Content level: "intro" (summary, default) or "full" (complete article)', enum: ['intro', 'full'], default: 'intro'}
	},
	async fn(args: {query: string; detail?: 'intro' | 'full'}) {
		// A missing detail level means the intro only
		const level = args.detail || 'intro';
		return new WikipediaClient().lookup(args.query, level);
	}
};
|
|
|
|
/** Tool that lists Wikipedia articles matching a query via `WikipediaClient.search` */
export const WikipediaSearchTool: AiTool = {
	name: 'wikipedia_search',
	description: 'Search Wikipedia for matching articles',
	args: {
		query: {type: 'string', description: 'Search terms', required: true}
	},
	async fn(args: {query: string}) {
		const client = new WikipediaClient();
		return client.search(args.query);
	}
};
|