feat: Add WebSearchTool and URLFetcherTool for web content access

- Add WebSearchTool with DuckDuckGo integration for web search
    - Provides titles, snippets, and links for current information

  - Add URLFetcherTool for AI-powered web content analysis
    - Fetches and converts HTML content to markdown
    - Processes content using AI with user-provided prompts
    - Includes 15-minute caching for efficiency
    - Uses queryQuick for fast content analysis

  - Register both tools in the tools registry
  - Update documentation to reflect new web capabilities
This commit is contained in:
Yulong Li 2025-08-28 17:50:02 +08:00
parent 3d963fb4a6
commit e3d903e7bc
11 changed files with 529 additions and 8 deletions

View File

@ -92,6 +92,7 @@ Standardized tool interface enabling:
- File operations (read, write, edit)
- Shell command execution
- Code searching and analysis
- Web search and content analysis
- Task management and planning
- External tool integration via MCP
@ -151,6 +152,7 @@ Permission checks are mandatory for potentially dangerous operations, with clear
- **File Manipulation**: Read, write, and edit files with validation
- **Command Execution**: Run shell commands with output capture
- **Search & Analysis**: Find code patterns and dependencies
- **Web Search & Content**: Search the web and analyze web content with AI
- **Task Management**: Plan and track development tasks
### AI-Enhanced Features

View File

@ -120,6 +120,39 @@ export abstract class Tool {
- Size and permission information
- Pattern filtering
#### WebSearchTool
- **Purpose**: Search the web for current information
- **Key Features**:
- DuckDuckGo integration for web search
- Returns all available results (no artificial limits)
- Provides titles, snippets, and links
- Fast search results for current events and data
- **Use Cases**: Finding recent news, current documentation, product updates
- **Implementation**: HTML parsing of DuckDuckGo search results
#### URLFetcherTool
- **Purpose**: Fetch and analyze web content using AI
- **Key Features**:
- Fetches content from any URL
- Converts HTML to clean markdown
- AI-powered content analysis based on user prompts
- 15-minute caching of raw content for efficiency
- Smart content truncation for large pages
- Uses quick model for fast analysis
- **Input Schema**:
```typescript
{
url: string (URI format, required)
prompt: string (analysis instruction, required)
}
```
- **Use Cases**:
- Summarizing articles or documentation
- Extracting specific information from web pages
- Analyzing pricing, features, or technical requirements
- Content research and analysis
- **Implementation**: Combines web fetching with AI analysis using queryQuick
### 3. System Execution Tools
#### BashTool
@ -173,14 +206,6 @@ export abstract class Tool {
- Error propagation
- **Implementation**: JSON-RPC over stdio/SSE
#### WebFetchTool
- **Purpose**: Fetch and process web content
- **Key Features**:
- HTML to markdown conversion
- Content extraction
- Caching support
- Redirect handling
## Tool Implementation Guide
### Creating a New Tool

View File

@ -76,12 +76,15 @@
"lru-cache": "^11.1.0",
"marked": "^15.0.12",
"nanoid": "^5.1.5",
"node-fetch": "^3.3.2",
"node-html-parser": "^7.0.1",
"openai": "^4.104.0",
"react": "18.3.1",
"semver": "^7.7.2",
"shell-quote": "^1.8.3",
"spawn-rx": "^5.1.2",
"tsx": "^4.20.3",
"turndown": "^7.2.1",
"undici": "^7.11.0",
"wrap-ansi": "^9.0.0",
"zod": "^3.25.76",

View File

@ -16,6 +16,8 @@ import { NotebookEditTool } from './tools/NotebookEditTool/NotebookEditTool'
import { NotebookReadTool } from './tools/NotebookReadTool/NotebookReadTool'
import { ThinkTool } from './tools/ThinkTool/ThinkTool'
import { TodoWriteTool } from './tools/TodoWriteTool/TodoWriteTool'
import { WebSearchTool } from './tools/WebSearchTool/WebSearchTool'
import { URLFetcherTool } from './tools/URLFetcherTool/URLFetcherTool'
import { getMCPTools } from './services/mcpClient'
import { memoize } from 'lodash-es'
@ -38,6 +40,8 @@ export const getAllTools = (): Tool[] => {
NotebookEditTool as unknown as Tool,
ThinkTool as unknown as Tool,
TodoWriteTool as unknown as Tool,
WebSearchTool as unknown as Tool,
URLFetcherTool as unknown as Tool,
...ANT_ONLY_TOOLS,
]
}

View File

@ -0,0 +1,178 @@
import { Box, Text } from 'ink'
import React from 'react'
import { z } from 'zod'
import fetch from 'node-fetch'
import { Cost } from '../../components/Cost'
import { FallbackToolUseRejectedMessage } from '../../components/FallbackToolUseRejectedMessage'
import { Tool, ToolUseContext } from '../../Tool'
import { DESCRIPTION, TOOL_NAME_FOR_PROMPT } from './prompt'
import { convertHtmlToMarkdown } from './htmlToMarkdown'
import { urlCache } from './cache'
import { queryQuick } from '../../services/claude'
// Zod schema for the tool's input: both fields are required.
const inputSchema = z.strictObject({
  url: z.string().url().describe('The URL to fetch content from'),
  prompt: z.string().describe('The prompt to run on the fetched content'),
})
type Input = z.infer<typeof inputSchema>
// Shape of the data yielded by `call` and passed to the render helpers.
type Output = {
  url: string // the normalized (possibly HTTPS-upgraded) URL that was fetched
  fromCache: boolean // true when raw content was served from the cache
  aiAnalysis: string // AI answer to the prompt; empty string on fetch/analysis error
}
/**
 * Normalize a URL before fetching: plain-HTTP URLs are upgraded to HTTPS.
 * Anything else (already-HTTPS or other schemes) passes through unchanged.
 */
function normalizeUrl(url: string): string {
  const insecurePrefix = 'http://'
  return url.startsWith(insecurePrefix)
    ? `https://${url.slice(insecurePrefix.length)}`
    : url
}
/**
 * URLFetcherTool — fetches a web page, converts the HTML to markdown, and
 * answers the user's prompt about it via the quick model (queryQuick).
 *
 * Raw page content is cached for 15 minutes (see ./cache); the AI analysis
 * is always run fresh so different prompts against the same URL work.
 */
export const URLFetcherTool = {
  name: TOOL_NAME_FOR_PROMPT,
  async description() {
    return DESCRIPTION
  },
  userFacingName: () => 'URL Fetcher',
  inputSchema,
  // Read-only: fetches remote content, never touches the local filesystem.
  isReadOnly: () => true,
  isConcurrencySafe: () => true,
  async isEnabled() {
    return true
  },
  needsPermissions() {
    return false
  },
  async prompt() {
    return DESCRIPTION
  },
  renderToolUseMessage({ url, prompt }: Input) {
    return `Fetching content from ${url} and analyzing with prompt: "${prompt}"`
  },
  renderToolUseRejectedMessage() {
    return <FallbackToolUseRejectedMessage />
  },
  renderToolResultMessage(output: Output) {
    const statusText = output.fromCache ? 'from cache' : 'fetched'
    return (
      <Box justifyContent="space-between" width="100%">
        <Box flexDirection="row">
          <Text>&nbsp;&nbsp; &nbsp;Content </Text>
          <Text bold>{statusText} </Text>
          <Text>and analyzed</Text>
        </Box>
        <Cost costUSD={0} durationMs={0} debug={false} />
      </Box>
    )
  },
  renderResultForAssistant(output: Output) {
    if (!output.aiAnalysis.trim()) {
      return `No content could be analyzed from URL: ${output.url}`
    }
    return output.aiAnalysis
  },
  async *call({ url, prompt }: Input, {}: ToolUseContext) {
    const normalizedUrl = normalizeUrl(url)
    try {
      let content: string
      let fromCache = false
      // Serve raw page content from the 15-minute cache when possible.
      const cachedContent = urlCache.get(normalizedUrl)
      if (cachedContent) {
        content = cachedContent
        fromCache = true
      } else {
        // Abort the request after 30s. The timer is cleared in .finally() so
        // it cannot leak: the previous version only cleared it on the success
        // path, leaving a live 30s timer whenever fetch itself rejected.
        const abortController = new AbortController()
        const timeout = setTimeout(() => abortController.abort(), 30000)
        const response = await fetch(normalizedUrl, {
          method: 'GET',
          headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; URLFetcher/1.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
          },
          signal: abortController.signal,
          redirect: 'follow',
        }).finally(() => clearTimeout(timeout))
        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`)
        }
        // Only text-like payloads can be meaningfully converted to markdown.
        const contentType = response.headers.get('content-type') || ''
        if (!contentType.includes('text/') && !contentType.includes('application/')) {
          throw new Error(`Unsupported content type: ${contentType}`)
        }
        const html = await response.text()
        content = convertHtmlToMarkdown(html)
        // Cache the converted markdown for subsequent requests.
        urlCache.set(normalizedUrl, content)
      }
      // Truncate content if too large (keep within reasonable token limits)
      const maxContentLength = 50000 // ~15k tokens approximately
      const truncatedContent = content.length > maxContentLength
        ? content.substring(0, maxContentLength) + '\n\n[Content truncated due to length]'
        : content
      // AI analysis — always performed fresh, even with cached content.
      const systemPrompt = [
        'You are analyzing web content based on a user\'s specific request.',
        'The content has been extracted from a webpage and converted to markdown.',
        'Provide a focused response that directly addresses the user\'s prompt.',
      ]
      const userPrompt = `Here is the content from ${normalizedUrl}:
${truncatedContent}
User request: ${prompt}`
      const aiResponse = await queryQuick({
        systemPrompt,
        userPrompt,
        enablePromptCaching: false,
      })
      const output: Output = {
        url: normalizedUrl,
        fromCache,
        aiAnalysis: aiResponse.message.content[0]?.text || 'Unable to analyze content',
      }
      yield {
        type: 'result' as const,
        resultForAssistant: this.renderResultForAssistant(output),
        data: output,
      }
    } catch (error: unknown) {
      // Anything can be thrown; narrow before reading .message.
      const message = error instanceof Error ? error.message : String(error)
      const output: Output = {
        url: normalizedUrl,
        fromCache: false,
        aiAnalysis: '',
      }
      yield {
        type: 'result' as const,
        resultForAssistant: `Error processing URL ${normalizedUrl}: ${message}`,
        data: output,
      }
    }
  },
} satisfies Tool<typeof inputSchema, Output>

View File

@ -0,0 +1,55 @@
// Cache entry: raw page content plus its insertion time.
interface CacheEntry {
  content: string
  timestamp: number
}

/**
 * URLCache — a tiny in-memory TTL cache for fetched page content.
 *
 * Entries expire 15 minutes after insertion. Expired entries are evicted
 * lazily on read, and a background sweep removes stale entries every 5
 * minutes. The sweep timer is unref'd so the module-level singleton does
 * not keep the Node process alive (the original setInterval did, which
 * prevented clean process exit).
 */
class URLCache {
  private cache = new Map<string, CacheEntry>()
  private readonly CACHE_DURATION = 15 * 60 * 1000 // 15 minutes in milliseconds

  constructor() {
    // Auto-clean expired entries every 5 minutes.
    const timer = setInterval(() => this.cleanExpired(), 5 * 60 * 1000)
    // Under Node, unref the timer so it never blocks process exit. The cast
    // covers DOM-style typings where setInterval returns a plain number.
    ;(timer as unknown as { unref?: () => unknown }).unref?.()
  }

  /** Store content for a URL, stamped with the current time. */
  set(url: string, content: string): void {
    this.cache.set(url, {
      content,
      timestamp: Date.now(),
    })
  }

  /** Return cached content, or null when absent or older than the TTL. */
  get(url: string): string | null {
    const entry = this.cache.get(url)
    if (!entry) {
      return null
    }
    // Lazy eviction: expired entries are deleted on read.
    if (Date.now() - entry.timestamp > this.CACHE_DURATION) {
      this.cache.delete(url)
      return null
    }
    return entry.content
  }

  /** Drop every entry, fresh or stale. */
  clear(): void {
    this.cache.clear()
  }

  /** Remove all entries past the TTL (called by the periodic sweep). */
  private cleanExpired(): void {
    const now = Date.now()
    for (const [url, entry] of this.cache.entries()) {
      if (now - entry.timestamp > this.CACHE_DURATION) {
        this.cache.delete(url)
      }
    }
  }
}

// Export singleton instance shared by the URLFetcher tool.
export const urlCache = new URLCache()

View File

@ -0,0 +1,55 @@
import TurndownService from 'turndown'
// Shared Turndown converter: ATX headings (#), fenced ``` code blocks,
// '-' bullets, '---' horizontal rules, _emphasis_ and **strong**.
const turndownService = new TurndownService({
  headingStyle: 'atx',
  hr: '---',
  bulletListMarker: '-',
  codeBlockStyle: 'fenced',
  fence: '```',
  emDelimiter: '_',
  strongDelimiter: '**'
})
// Configure rules to handle common HTML elements
// Drop non-content elements entirely from the markdown output.
turndownService.addRule('removeScripts', {
  filter: ['script', 'style', 'noscript'],
  replacement: () => ''
})
// Strip HTML comment nodes (nodeType 8) from the output.
turndownService.addRule('removeComments', {
  filter: (node) => node.nodeType === 8, // Comment nodes
  replacement: () => ''
})
// Keep real hyperlinks as [text](href); unwrap javascript: and
// fragment-only (#...) links down to their text content.
turndownService.addRule('cleanLinks', {
  filter: 'a',
  replacement: (content, node) => {
    const href = node.getAttribute('href')
    if (!href || href.startsWith('javascript:') || href.startsWith('#')) {
      return content
    }
    return `[${content}](${href})`
  }
})
/**
 * Convert raw HTML to cleaned-up markdown.
 *
 * Strips scripts, styles, and comments before handing the document to
 * Turndown, then compacts the resulting markdown.
 *
 * The previous version collapsed ALL whitespace runs to single spaces before
 * conversion and trimmed every output line afterwards, which destroyed the
 * content of <pre>/<code> elements (fenced code blocks lost their newlines
 * and indentation). Turndown already applies standard HTML whitespace
 * handling for normal text, so no pre-collapse is needed.
 *
 * @param html raw HTML document or fragment
 * @returns markdown text
 * @throws Error when Turndown fails to convert the input
 */
export function convertHtmlToMarkdown(html: string): string {
  try {
    // Remove non-content elements before conversion. (The Turndown rules
    // also drop these; doing it here keeps large pages cheaper to parse.)
    const cleanHtml = html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') // Remove script tags
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') // Remove style tags
      .replace(/<!--[\s\S]*?-->/g, '') // Remove HTML comments
      .trim()
    const markdown = turndownService.turndown(cleanHtml)
    // Collapse runs of 3+ newlines and trim only the outer whitespace, so
    // indentation inside fenced code blocks is preserved.
    return markdown
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  } catch (error) {
    throw new Error(`Failed to convert HTML to markdown: ${error instanceof Error ? error.message : String(error)}`)
  }
}

View File

@ -0,0 +1,17 @@
// Tool id as exposed to the model.
export const TOOL_NAME_FOR_PROMPT = 'URLFetcher'

// Tool description shown to the model. Keep the usage notes in sync with the
// implementation in URLFetcherTool.tsx (30s timeout, redirect: 'follow',
// 15-minute raw-content cache). The previous final note claimed redirects
// were reported back "in a special format" requiring a follow-up request,
// but the fetch is configured with redirect: 'follow', so redirects are
// followed transparently.
export const DESCRIPTION = `- Fetches content from a specified URL and processes it using an AI model
- Takes a URL and a prompt as input
- Fetches the URL content, converts HTML to markdown
- Processes the content with the prompt using a small, fast model
- Returns the model's response about the content
- Use this tool when you need to retrieve and analyze web content
Usage notes:
- IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".
- The URL must be a fully-formed valid URL (e.g., https://example.com)
- HTTP URLs will be automatically upgraded to HTTPS
- The prompt should describe what information you want to extract from the page
- This tool is read-only and does not modify any files
- Results may be summarized if the content is very large
- Includes a self-cleaning 15-minute cache for faster responses when repeatedly accessing the same URL
- Redirects are followed automatically, so the analysis reflects the final destination page`

View File

@ -0,0 +1,103 @@
import { Box, Text } from 'ink'
import React from 'react'
import { z } from 'zod'
import { Cost } from '../../components/Cost'
import { FallbackToolUseRejectedMessage } from '../../components/FallbackToolUseRejectedMessage'
import { Tool, ToolUseContext } from '../../Tool'
import { DESCRIPTION, TOOL_NAME_FOR_PROMPT } from './prompt'
import { SearchResult, searchProviders } from './searchProviders'
// Zod schema for the tool's input: a single required query string.
const inputSchema = z.strictObject({
  query: z.string().describe('The search query'),
})
type Input = z.infer<typeof inputSchema>
// Shape of the data yielded by `call` and passed to the render helpers.
type Output = {
  durationMs: number // wall-clock search duration, shown in the Cost footer
  results: SearchResult[] // scraped DuckDuckGo hits (empty on error or no matches)
}
/**
 * WebSearchTool — searches the web via DuckDuckGo (see ./searchProviders)
 * and returns titles, snippets, and links for the assistant to cite.
 */
export const WebSearchTool = {
  name: TOOL_NAME_FOR_PROMPT,
  async description() {
    return DESCRIPTION
  },
  userFacingName: () => 'Web Search',
  inputSchema,
  // Read-only: performs a remote search, never touches the local filesystem.
  isReadOnly: () => true,
  isConcurrencySafe: () => true,
  async isEnabled() {
    return true
  },
  needsPermissions() {
    return false
  },
  async prompt() {
    return DESCRIPTION
  },
  renderToolUseMessage({ query }: Input) {
    return `Searching for: "${query}" using DuckDuckGo`
  },
  renderToolUseRejectedMessage() {
    return <FallbackToolUseRejectedMessage />
  },
  renderToolResultMessage(output: Output) {
    return (
      <Box justifyContent="space-between" width="100%">
        <Box flexDirection="row">
          <Text>&nbsp;&nbsp; &nbsp;Found </Text>
          <Text bold>{output.results.length} </Text>
          <Text>
            {output.results.length === 1 ? 'result' : 'results'} using DuckDuckGo
          </Text>
        </Box>
        <Cost costUSD={0} durationMs={output.durationMs} debug={false} />
      </Box>
    )
  },
  renderResultForAssistant(output: Output) {
    if (output.results.length === 0) {
      return `No results found using DuckDuckGo.`
    }
    // Numbered list: title, snippet, and link per result.
    let result = `Found ${output.results.length} search results using DuckDuckGo:\n\n`
    output.results.forEach((item, index) => {
      result += `${index + 1}. **${item.title}**\n`
      result += `   ${item.snippet}\n`
      result += `   Link: ${item.link}\n\n`
    })
    result += `You can reference these results to provide current, accurate information to the user.`
    return result
  },
  async *call({ query }: Input, {}: ToolUseContext) {
    const start = Date.now()
    try {
      const searchResults = await searchProviders.duckduckgo.search(query)
      const output: Output = {
        results: searchResults,
        durationMs: Date.now() - start,
      }
      yield {
        type: 'result' as const,
        resultForAssistant: this.renderResultForAssistant(output),
        data: output,
      }
    } catch (error: unknown) {
      // Anything can be thrown; narrow before reading .message.
      const message = error instanceof Error ? error.message : String(error)
      const output: Output = {
        results: [],
        durationMs: Date.now() - start,
      }
      yield {
        type: 'result' as const,
        resultForAssistant: `An error occurred during web search with DuckDuckGo: ${message}`,
        data: output,
      }
    }
  },
} satisfies Tool<typeof inputSchema, Output>

View File

@ -0,0 +1,13 @@
// Tool id as exposed to the model.
export const TOOL_NAME_FOR_PROMPT = 'WebSearch'

// Tool description shown to the model. Keep in sync with WebSearchTool.tsx,
// which queries DuckDuckGo's HTML endpoint directly (there is no native
// search API call, so the previous "within a single API call" wording was
// inaccurate; "the Kode's" was also a grammar slip).
export const DESCRIPTION = `- Allows Kode to search the web and use the results to inform responses
- Provides up-to-date information for current events and recent data
- Returns search results as a numbered list with titles, snippets, and links
- Use this tool for accessing information beyond Kode's knowledge cutoff
- Searches are performed by querying DuckDuckGo directly
Usage notes:
- Use when you need current information not in training data
- Effective for recent news, current events, product updates, or real-time data
- Search queries should be specific and well-targeted for best results
- Results include both title and snippet content for context`

View File

@ -0,0 +1,66 @@
import fetch from 'node-fetch'
import { parse } from 'node-html-parser'
// A single scraped search hit.
export interface SearchResult {
  title: string
  snippet: string
  link: string
}
// Pluggable search backend. `apiKey` is optional so keyless providers
// (like DuckDuckGo HTML scraping) can ignore it.
export interface SearchProvider {
  search: (query: string, apiKey?: string) => Promise<SearchResult[]>
  isEnabled: (apiKey?: string) => boolean
}
/**
 * Search provider backed by DuckDuckGo's HTML endpoint
 * (html.duckduckgo.com). Requires no API key; results are scraped from the
 * returned page markup.
 */
const duckDuckGoSearchProvider: SearchProvider = {
  isEnabled: () => true,
  search: async (query: string): Promise<SearchResult[]> => {
    const response = await fetch(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`, {
      headers: {
        // A browser-like UA avoids DuckDuckGo's bot interstitial.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });
    if (!response.ok) {
      throw new Error(`DuckDuckGo search failed with status: ${response.status}`);
    }
    const html = await response.text();
    const root = parse(html);
    const results: SearchResult[] = [];
    const resultNodes = root.querySelectorAll('.result.web-result');
    for (const node of resultNodes) {
      const titleNode = node.querySelector('.result__a');
      const snippetNode = node.querySelector('.result__snippet');
      if (!titleNode || !snippetNode) {
        continue;
      }
      const title = titleNode.text;
      const link = titleNode.getAttribute('href');
      const snippet = snippetNode.text;
      if (title && link && snippet) {
        results.push({
          title: title.trim(),
          snippet: snippet.trim(),
          link: cleanResultLink(link),
        });
      }
    }
    return results;
  },
}

/**
 * DuckDuckGo wraps result hrefs in a redirect of the form
 * "//duckduckgo.com/l/?uddg=<encoded target>" (sometimes with an explicit
 * https: scheme). Extract the real target URL; fall back to the raw link
 * when the wrapper can't be parsed. The previous check only matched the
 * "https://duckduckgo.com/l/?uddg=" form, so the common protocol-relative
 * form passed through unresolved (and its comment contradicted the code).
 */
function cleanResultLink(link: string): string {
  if (!link.includes('duckduckgo.com/l/')) {
    return link;
  }
  try {
    // The base URL resolves protocol-relative ("//duckduckgo.com/...") links,
    // which `new URL(link)` alone would reject.
    const url = new URL(link, 'https://duckduckgo.com');
    return url.searchParams.get('uddg') || link;
  } catch {
    return link;
  }
}
// Registry of available search providers, keyed by provider id.
// WebSearchTool currently uses only `duckduckgo`.
export const searchProviders = {
  duckduckgo: duckDuckGoSearchProvider,
}