From 52a3e734844192c5cf09df5732cf1e10b61a14ea Mon Sep 17 00:00:00 2001
From: ztimson <zaktimson@gmail.com>
Date: Sat, 21 Mar 2026 14:34:24 -0400
Subject: [PATCH] Improved read_webpage tool

---
 package.json |   2 +-
 src/tools.ts | 104 ++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/package.json b/package.json
index 1672e21..347d2f2 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@ztimson/ai-utils",
-	"version": "0.8.10",
+	"version": "0.8.11",
 	"description": "AI Utility library",
 	"author": "Zak Timson",
 	"license": "MIT",
diff --git a/src/tools.ts b/src/tools.ts
index 8d70b78..b4d830f 100644
--- a/src/tools.ts
+++ b/src/tools.ts
@@ -129,24 +129,82 @@ export const PythonTool: AiTool = {
 
 export const ReadWebpageTool: AiTool = {
 	name: 'read_webpage',
-	description: 'Extract clean, structured content from a webpage. Use after web_search to read specific URLs',
+	description: 'Extract clean, structured content from a webpage or convert media/documents to accessible formats',
 	args: {
 		url: {type: 'string', description: 'URL to extract content from', required: true},
-		focus: {type: 'string', description: 'Optional: What aspect to focus on (e.g., "pricing", "features", "contact info")'}
+		mimeRegex: {type: 'string', description: 'Optional: Regex pattern to filter MIME types (e.g., "^image/", "text/", "application/pdf")'},
+		maxSize: {type: 'number', description: 'Optional: Max file size in bytes for binary content (default: 10MB)'}
 	},
-	fn: async (args: {url: string; focus?: string}) => {
-		const html = await fetch(args.url, {headers: {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}})
-			.then(r => r.text()).catch(err => {throw new Error(`Failed to fetch: ${err.message}`)});
+	fn: async (args: {url: string; mimeRegex?: string; maxSize?: number}) => {
+		const maxSize = args.maxSize ?? 10 * 1024 * 1024; // Honour the advertised maxSize argument; default 10 MB
 
+		const response = await fetch(args.url, {
+			headers: {
+				"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+				"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+				"Accept-Language": "en-US,en;q=0.5"
+			},
+			redirect: 'follow'
+		}).catch(err => {throw new Error(`Failed to fetch: ${err.message}`)});
+
+		const contentType = response.headers.get('content-type') || '';
+		const mimeType = contentType.split(';')[0].trim().toLowerCase();
+		const charset = contentType.match(/charset=([^;]+)/)?.[1] || 'utf-8';
+
+		// Filter by MIME type if specified
+		if (args.mimeRegex) {
+			const regex = new RegExp(args.mimeRegex, 'i');
+			if (!regex.test(mimeType)) {
+				return {url: args.url, error: 'MIME type rejected', mimeType, filter: args.mimeRegex};
+			}
+		}
+
+		// Handle images, audio, video -> data URL
+		if (mimeType.startsWith('image/') || mimeType.startsWith('audio/') || mimeType.startsWith('video/')) {
+			const buffer = await response.arrayBuffer();
+			if (buffer.byteLength > maxSize) {
+				return {url: args.url, type: 'media', mimeType, error: 'File too large', size: buffer.byteLength, maxSize};
+			}
+			const base64 = Buffer.from(buffer).toString('base64');
+			return {url: args.url, type: 'media', mimeType, dataUrl: `data:${mimeType};base64,${base64}`, size: buffer.byteLength};
+		}
+
+		// Handle plain text, json, xml, csv
+		if (mimeType.match(/^(text\/(plain|csv|xml)|application\/(json|xml|csv|x-yaml))/) ||
+			(!mimeType.includes('html') && args.url.match(/\.(txt|json|xml|csv|yaml|yml|md)$/i))) { // Extension is only a fallback; never override an HTML response
+			const text = await response.text();
+			return {url: args.url, type: 'text', mimeType, content: text.slice(0, 100000)};
+		}
+
+		// Handle PDFs and other binaries -> data URL
+		if (mimeType === 'application/pdf' || (mimeType.startsWith('application/') && !mimeType.includes('html'))) {
+			const buffer = await response.arrayBuffer();
+			if (buffer.byteLength > maxSize) {
+				return {url: args.url, type: 'binary', mimeType, error: 'File too large', size: buffer.byteLength, maxSize};
+			}
+			const base64 = Buffer.from(buffer).toString('base64');
+			return {url: args.url, type: 'binary', mimeType, dataUrl: `data:${mimeType};base64,${base64}`, size: buffer.byteLength};
+		}
+
+		// Default HTML handling
+		const html = await response.text();
 		const $ = cheerio.load(html);
-		$('script, style, nav, footer, header, aside, iframe, noscript, [role="navigation"], [role="banner"], .ad, .ads, .cookie, .popup').remove();
+
+		// Remove noise
+		$('script, style, nav, footer, header, aside, iframe, noscript, svg, [role="navigation"], [role="banner"], [role="complementary"], .ad, .ads, .advertisement, .cookie, .popup, .modal, .sidebar, .related, .comments, .social-share').remove();
+
+		// Extract metadata
 		const metadata = {
 			title: $('meta[property="og:title"]').attr('content') || $('title').text() || '',
 			description: $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || '',
+			author: $('meta[name="author"]').attr('content') || '',
+			published: $('meta[property="article:published_time"]').attr('content') || $('time').attr('datetime') || '',
+			image: $('meta[property="og:image"]').attr('content') || ''
 		};
 
+		// Extract structured content
 		let content = '';
-		const contentSelectors = ['article', 'main', '[role="main"]', '.content', '.post', '.entry', 'body'];
+		const contentSelectors = ['article', 'main', '[role="main"]', '.content', '.post-content', '.entry-content', '.article-content', 'body'];
 		for (const selector of contentSelectors) {
 			const el = $(selector).first();
 			if (el.length && el.text().trim().length > 200) {
@@ -155,9 +213,37 @@ export const ReadWebpageTool: AiTool = {
 			}
 		}
 		if (!content) content = $('body').text();
-		content = content.replace(/\s+/g, ' ').trim().slice(0, 8000);
 
-		return {url: args.url, title: metadata.title.trim(), description: metadata.description.trim(), content, focus: args.focus};
+		// Clean whitespace but preserve structure
+		content = content
+			.replace(/\n\s*\n\s*\n/g, '\n\n')
+			.replace(/[ \t]+/g, ' ')
+			.trim()
+			.slice(0, 50000);
+
+		// Extract links if minimal content
+		let links: {text: string; href: string}[] = []; // Concrete element type instead of any[]
+		if (content.length < 500) {
+			$('a[href]').each((_, el) => {
+				const href = $(el).attr('href');
+				const text = $(el).text().trim();
+				if (href && text && !href.startsWith('#')) {
+					links.push({text, href});
+				}
+			});
+			links = links.slice(0, 50);
+		}
+
+		return {
+			url: args.url,
+			type: 'html',
+			title: metadata.title.trim(),
+			description: metadata.description.trim(),
+			author: metadata.author.trim(),
+			published: metadata.published,
+			content,
+			links: links.length ? links : undefined,
+		};
 	}
 }