From 52a3e734844192c5cf09df5732cf1e10b61a14ea Mon Sep 17 00:00:00 2001 From: ztimson Date: Sat, 21 Mar 2026 14:34:24 -0400 Subject: [PATCH] Improved read_webpage tool --- package.json | 2 +- src/tools.ts | 104 ++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 96 insertions(+), 10 deletions(-) diff --git a/package.json b/package.json index 1672e21..347d2f2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@ztimson/ai-utils", - "version": "0.8.10", + "version": "0.8.11", "description": "AI Utility library", "author": "Zak Timson", "license": "MIT", diff --git a/src/tools.ts b/src/tools.ts index 8d70b78..b4d830f 100644 --- a/src/tools.ts +++ b/src/tools.ts @@ -129,24 +129,82 @@ export const PythonTool: AiTool = { export const ReadWebpageTool: AiTool = { name: 'read_webpage', - description: 'Extract clean, structured content from a webpage. Use after web_search to read specific URLs', + description: 'Extract clean, structured content from a webpage or convert media/documents to accessible formats', args: { url: {type: 'string', description: 'URL to extract content from', required: true}, - focus: {type: 'string', description: 'Optional: What aspect to focus on (e.g., "pricing", "features", "contact info")'} + mimeRegex: {type: 'string', description: 'Optional: Regex pattern to filter MIME types (e.g., "^image/", "text/", "application/pdf")'}, + maxSize: {type: 'number', description: 'Optional: Max file size in bytes for binary content (default: 10MB)'} }, - fn: async (args: {url: string; focus?: string}) => { - const html = await fetch(args.url, {headers: {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}}) - .then(r => r.text()).catch(err => {throw new Error(`Failed to fetch: ${err.message}`)}); + fn: async (args: {url: string; mimeRegex?: string;}) => { + const maxSize = 10 * 1024 * 1024; // 10 MB + const response = await fetch(args.url, { + headers: { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5" + }, + redirect: 'follow' + }).catch(err => {throw new Error(`Failed to fetch: ${err.message}`)}); + + const contentType = response.headers.get('content-type') || ''; + const mimeType = contentType.split(';')[0].trim().toLowerCase(); + const charset = contentType.match(/charset=([^;]+)/)?.[1] || 'utf-8'; + + // Filter by MIME type if specified + if (args.mimeRegex) { + const regex = new RegExp(args.mimeRegex, 'i'); + if (!regex.test(mimeType)) { + return {url: args.url, error: 'MIME type rejected', mimeType, filter: args.mimeRegex}; + } + } + + // Handle images, audio, video -> data URL + if (mimeType.startsWith('image/') || mimeType.startsWith('audio/') || mimeType.startsWith('video/')) { + const buffer = await response.arrayBuffer(); + if (buffer.byteLength > maxSize) { + return {url: args.url, type: 'media', mimeType, error: 'File too large', size: buffer.byteLength, maxSize}; + } + const base64 = Buffer.from(buffer).toString('base64'); + return {url: args.url, type: 'media', mimeType, dataUrl: `data:${mimeType};base64,${base64}`, size: buffer.byteLength}; + } + + // Handle plain text, json, xml, csv + if (mimeType.match(/^(text\/(plain|csv|xml)|application\/(json|xml|csv|x-yaml))/) || + args.url.match(/\.(txt|json|xml|csv|yaml|yml|md)$/i)) { + const text = await response.text(); + return {url: args.url, type: 'text', mimeType, content: text.slice(0, 100000)}; + } + + // Handle PDFs and other binaries -> data URL + if (mimeType === 'application/pdf' || mimeType.startsWith('application/') && !mimeType.includes('html')) { + const buffer = await response.arrayBuffer(); + if (buffer.byteLength > maxSize) { + return {url: args.url, type: 'binary', mimeType, error: 'File too large', size: buffer.byteLength, maxSize}; + } + const base64 = Buffer.from(buffer).toString('base64'); + return {url: args.url, type: 'binary', mimeType, dataUrl: `data:${mimeType};base64,${base64}`, size: buffer.byteLength}; + } + + // Default HTML handling + const html = await response.text(); const $ = cheerio.load(html); - $('script, style, nav, footer, header, aside, iframe, noscript, [role="navigation"], [role="banner"], .ad, .ads, .cookie, .popup').remove(); + + // Remove noise + $('script, style, nav, footer, header, aside, iframe, noscript, svg, [role="navigation"], [role="banner"], [role="complementary"], .ad, .ads, .advertisement, .cookie, .popup, .modal, .sidebar, .related, .comments, .social-share').remove(); + + // Extract metadata const metadata = { title: $('meta[property="og:title"]').attr('content') || $('title').text() || '', description: $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || '', + author: $('meta[name="author"]').attr('content') || '', + published: $('meta[property="article:published_time"]').attr('content') || $('time').attr('datetime') || '', + image: $('meta[property="og:image"]').attr('content') || '' }; + // Extract structured content let content = ''; - const contentSelectors = ['article', 'main', '[role="main"]', '.content', '.post', '.entry', 'body']; + const contentSelectors = ['article', 'main', '[role="main"]', '.content', '.post-content', '.entry-content', '.article-content', 'body']; for (const selector of contentSelectors) { const el = $(selector).first(); if (el.length && el.text().trim().length > 200) { @@ -155,9 +213,37 @@ export const ReadWebpageTool: AiTool = { } } if (!content) content = $('body').text(); - content = content.replace(/\s+/g, ' ').trim().slice(0, 8000); - return {url: args.url, title: metadata.title.trim(), description: metadata.description.trim(), content, focus: args.focus}; + // Clean whitespace but preserve structure + content = content + .replace(/\n\s*\n\s*\n/g, '\n\n') + .replace(/[ \t]+/g, ' ') + .trim() + .slice(0, 50000); + + // Extract links if minimal content + let links: any[] = []; + if (content.length < 500) { + $('a[href]').each((_, el) => { + const href = $(el).attr('href'); + const text = $(el).text().trim(); + if (href && text && !href.startsWith('#')) { + links.push({text, href}); + } + }); + links = links.slice(0, 50); + } + + return { + url: args.url, + type: 'html', + title: metadata.title.trim(), + description: metadata.description.trim(), + author: metadata.author.trim(), + published: metadata.published, + content, + links: links.length ? links : undefined, + }; } }