feat: Add WebSearchTool and URLFetcherTool for web content access

- Add WebSearchTool with DuckDuckGo integration for web search
    - Provides titles, snippets, and links for current information

  - Add URLFetcherTool for AI-powered web content analysis
    - Fetches and converts HTML content to markdown
    - Processes content using AI with user-provided prompts
    - Includes 15-minute caching for efficiency
    - Uses queryQuick for fast content analysis

  - Register both tools in the tools registry
  - Update documentation to reflect new web capabilities
This commit is contained in:
Yulong Li 2025-08-28 17:50:02 +08:00
parent 3d963fb4a6
commit e3d903e7bc
11 changed files with 529 additions and 8 deletions

View File

@ -92,6 +92,7 @@ Standardized tool interface enabling:
- File operations (read, write, edit)
- Shell command execution
- Code searching and analysis
- Web search and content analysis
- Task management and planning
- External tool integration via MCP
@ -151,6 +152,7 @@ Permission checks are mandatory for potentially dangerous operations, with clear
- **File Manipulation**: Read, write, and edit files with validation
- **Command Execution**: Run shell commands with output capture
- **Search & Analysis**: Find code patterns and dependencies
- **Web Search & Content**: Search the web and analyze web content with AI
- **Task Management**: Plan and track development tasks
### AI-Enhanced Features

View File

@ -120,6 +120,39 @@ export abstract class Tool {
- Size and permission information
- Pattern filtering
#### WebSearchTool
- **Purpose**: Search the web for current information
- **Key Features**:
- DuckDuckGo integration for web search
- Returns all available results (no artificial limits)
- Provides titles, snippets, and links
- Fast search results for current events and data
- **Use Cases**: Finding recent news, current documentation, product updates
- **Implementation**: HTML parsing of DuckDuckGo search results
#### URLFetcherTool
- **Purpose**: Fetch and analyze web content using AI
- **Key Features**:
- Fetches content from any URL
- Converts HTML to clean markdown
- AI-powered content analysis based on user prompts
- 15-minute caching of raw content for efficiency
- Smart content truncation for large pages
- Uses quick model for fast analysis
- **Input Schema**:
```typescript
{
url: string (URI format, required)
prompt: string (analysis instruction, required)
}
```
- **Use Cases**:
- Summarizing articles or documentation
- Extracting specific information from web pages
- Analyzing pricing, features, or technical requirements
- Content research and analysis
- **Implementation**: Combines web fetching with AI analysis using queryQuick
### 3. System Execution Tools
#### BashTool
@ -173,14 +206,6 @@ export abstract class Tool {
- Error propagation
- **Implementation**: JSON-RPC over stdio/SSE
#### WebFetchTool
- **Purpose**: Fetch and process web content
- **Key Features**:
- HTML to markdown conversion
- Content extraction
- Caching support
- Redirect handling
## Tool Implementation Guide
### Creating a New Tool

View File

@ -76,12 +76,15 @@
"lru-cache": "^11.1.0",
"marked": "^15.0.12",
"nanoid": "^5.1.5",
"node-fetch": "^3.3.2",
"node-html-parser": "^7.0.1",
"openai": "^4.104.0",
"react": "18.3.1",
"semver": "^7.7.2",
"shell-quote": "^1.8.3",
"spawn-rx": "^5.1.2",
"tsx": "^4.20.3",
"turndown": "^7.2.1",
"undici": "^7.11.0",
"wrap-ansi": "^9.0.0",
"zod": "^3.25.76",

View File

@ -16,6 +16,8 @@ import { NotebookEditTool } from './tools/NotebookEditTool/NotebookEditTool'
import { NotebookReadTool } from './tools/NotebookReadTool/NotebookReadTool'
import { ThinkTool } from './tools/ThinkTool/ThinkTool'
import { TodoWriteTool } from './tools/TodoWriteTool/TodoWriteTool'
import { WebSearchTool } from './tools/WebSearchTool/WebSearchTool'
import { URLFetcherTool } from './tools/URLFetcherTool/URLFetcherTool'
import { getMCPTools } from './services/mcpClient'
import { memoize } from 'lodash-es'
@ -38,6 +40,8 @@ export const getAllTools = (): Tool[] => {
NotebookEditTool as unknown as Tool,
ThinkTool as unknown as Tool,
TodoWriteTool as unknown as Tool,
WebSearchTool as unknown as Tool,
URLFetcherTool as unknown as Tool,
...ANT_ONLY_TOOLS,
]
}

View File

@ -0,0 +1,178 @@
import { Box, Text } from 'ink'
import React from 'react'
import { z } from 'zod'
import fetch from 'node-fetch'
import { Cost } from '../../components/Cost'
import { FallbackToolUseRejectedMessage } from '../../components/FallbackToolUseRejectedMessage'
import { Tool, ToolUseContext } from '../../Tool'
import { DESCRIPTION, TOOL_NAME_FOR_PROMPT } from './prompt'
import { convertHtmlToMarkdown } from './htmlToMarkdown'
import { urlCache } from './cache'
import { queryQuick } from '../../services/claude'
// Zod schema for the tool's input: both fields are required.
const inputSchema = z.strictObject({
  url: z.string().url().describe('The URL to fetch content from'),
  prompt: z.string().describe('The prompt to run on the fetched content'),
})
type Input = z.infer<typeof inputSchema>
// Shape of the data yielded by `call` and passed to the render helpers.
type Output = {
  url: string // the normalized (possibly HTTPS-upgraded) URL that was fetched
  fromCache: boolean // true when raw content was served from the cache
  aiAnalysis: string // AI answer to the prompt; empty string on fetch/analysis error
}
/**
 * Normalize a URL before fetching: plain-HTTP URLs are upgraded to HTTPS.
 * Anything else (already-HTTPS or other schemes) passes through unchanged.
 */
function normalizeUrl(url: string): string {
  const insecurePrefix = 'http://'
  return url.startsWith(insecurePrefix)
    ? `https://${url.slice(insecurePrefix.length)}`
    : url
}
/**
 * URLFetcherTool — fetches a web page, converts the HTML to markdown, and
 * answers the user's prompt about it via the quick model (queryQuick).
 *
 * Raw page content is cached for 15 minutes (see ./cache); the AI analysis
 * is always run fresh so different prompts against the same URL work.
 */
export const URLFetcherTool = {
  name: TOOL_NAME_FOR_PROMPT,
  async description() {
    return DESCRIPTION
  },
  userFacingName: () => 'URL Fetcher',
  inputSchema,
  // Read-only: fetches remote content, never touches the local filesystem.
  isReadOnly: () => true,
  isConcurrencySafe: () => true,
  async isEnabled() {
    return true
  },
  needsPermissions() {
    return false
  },
  async prompt() {
    return DESCRIPTION
  },
  renderToolUseMessage({ url, prompt }: Input) {
    return `Fetching content from ${url} and analyzing with prompt: "${prompt}"`
  },
  renderToolUseRejectedMessage() {
    return <FallbackToolUseRejectedMessage />
  },
  renderToolResultMessage(output: Output) {
    const statusText = output.fromCache ? 'from cache' : 'fetched'
    return (
      <Box justifyContent="space-between" width="100%">
        <Box flexDirection="row">
          <Text>&nbsp;&nbsp; &nbsp;Content </Text>
          <Text bold>{statusText} </Text>
          <Text>and analyzed</Text>
        </Box>
        <Cost costUSD={0} durationMs={0} debug={false} />
      </Box>
    )
  },
  renderResultForAssistant(output: Output) {
    if (!output.aiAnalysis.trim()) {
      return `No content could be analyzed from URL: ${output.url}`
    }
    return output.aiAnalysis
  },
  async *call({ url, prompt }: Input, {}: ToolUseContext) {
    const normalizedUrl = normalizeUrl(url)
    try {
      let content: string
      let fromCache = false
      // Serve raw page content from the 15-minute cache when possible.
      const cachedContent = urlCache.get(normalizedUrl)
      if (cachedContent) {
        content = cachedContent
        fromCache = true
      } else {
        // Abort the request after 30s. The timer is cleared in .finally() so
        // it cannot leak: the previous version only cleared it on the success
        // path, leaving a live 30s timer whenever fetch itself rejected.
        const abortController = new AbortController()
        const timeout = setTimeout(() => abortController.abort(), 30000)
        const response = await fetch(normalizedUrl, {
          method: 'GET',
          headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; URLFetcher/1.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
          },
          signal: abortController.signal,
          redirect: 'follow',
        }).finally(() => clearTimeout(timeout))
        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`)
        }
        // Only text-like payloads can be meaningfully converted to markdown.
        const contentType = response.headers.get('content-type') || ''
        if (!contentType.includes('text/') && !contentType.includes('application/')) {
          throw new Error(`Unsupported content type: ${contentType}`)
        }
        const html = await response.text()
        content = convertHtmlToMarkdown(html)
        // Cache the converted markdown for subsequent requests.
        urlCache.set(normalizedUrl, content)
      }
      // Truncate content if too large (keep within reasonable token limits)
      const maxContentLength = 50000 // ~15k tokens approximately
      const truncatedContent = content.length > maxContentLength
        ? content.substring(0, maxContentLength) + '\n\n[Content truncated due to length]'
        : content
      // AI analysis — always performed fresh, even with cached content.
      const systemPrompt = [
        'You are analyzing web content based on a user\'s specific request.',
        'The content has been extracted from a webpage and converted to markdown.',
        'Provide a focused response that directly addresses the user\'s prompt.',
      ]
      const userPrompt = `Here is the content from ${normalizedUrl}:
${truncatedContent}
User request: ${prompt}`
      const aiResponse = await queryQuick({
        systemPrompt,
        userPrompt,
        enablePromptCaching: false,
      })
      const output: Output = {
        url: normalizedUrl,
        fromCache,
        aiAnalysis: aiResponse.message.content[0]?.text || 'Unable to analyze content',
      }
      yield {
        type: 'result' as const,
        resultForAssistant: this.renderResultForAssistant(output),
        data: output,
      }
    } catch (error: unknown) {
      // Anything can be thrown; narrow before reading .message.
      const message = error instanceof Error ? error.message : String(error)
      const output: Output = {
        url: normalizedUrl,
        fromCache: false,
        aiAnalysis: '',
      }
      yield {
        type: 'result' as const,
        resultForAssistant: `Error processing URL ${normalizedUrl}: ${message}`,
        data: output,
      }
    }
  },
} satisfies Tool<typeof inputSchema, Output>

View File

@ -0,0 +1,55 @@
// Cache entry: raw page content plus its insertion time.
interface CacheEntry {
  content: string
  timestamp: number
}

/**
 * URLCache — a tiny in-memory TTL cache for fetched page content.
 *
 * Entries expire 15 minutes after insertion. Expired entries are evicted
 * lazily on read, and a background sweep removes stale entries every 5
 * minutes. The sweep timer is unref'd so the module-level singleton does
 * not keep the Node process alive (the original setInterval did, which
 * prevented clean process exit).
 */
class URLCache {
  private cache = new Map<string, CacheEntry>()
  private readonly CACHE_DURATION = 15 * 60 * 1000 // 15 minutes in milliseconds

  constructor() {
    // Auto-clean expired entries every 5 minutes.
    const timer = setInterval(() => this.cleanExpired(), 5 * 60 * 1000)
    // Under Node, unref the timer so it never blocks process exit. The cast
    // covers DOM-style typings where setInterval returns a plain number.
    ;(timer as unknown as { unref?: () => unknown }).unref?.()
  }

  /** Store content for a URL, stamped with the current time. */
  set(url: string, content: string): void {
    this.cache.set(url, {
      content,
      timestamp: Date.now(),
    })
  }

  /** Return cached content, or null when absent or older than the TTL. */
  get(url: string): string | null {
    const entry = this.cache.get(url)
    if (!entry) {
      return null
    }
    // Lazy eviction: expired entries are deleted on read.
    if (Date.now() - entry.timestamp > this.CACHE_DURATION) {
      this.cache.delete(url)
      return null
    }
    return entry.content
  }

  /** Drop every entry, fresh or stale. */
  clear(): void {
    this.cache.clear()
  }

  /** Remove all entries past the TTL (called by the periodic sweep). */
  private cleanExpired(): void {
    const now = Date.now()
    for (const [url, entry] of this.cache.entries()) {
      if (now - entry.timestamp > this.CACHE_DURATION) {
        this.cache.delete(url)
      }
    }
  }
}

// Export singleton instance shared by the URLFetcher tool.
export const urlCache = new URLCache()

View File

@ -0,0 +1,55 @@
import TurndownService from 'turndown'
// Shared Turndown converter: ATX headings (#), fenced ``` code blocks,
// '-' bullets, '---' horizontal rules, _emphasis_ and **strong**.
const turndownService = new TurndownService({
  headingStyle: 'atx',
  hr: '---',
  bulletListMarker: '-',
  codeBlockStyle: 'fenced',
  fence: '```',
  emDelimiter: '_',
  strongDelimiter: '**'
})
// Configure rules to handle common HTML elements
// Drop non-content elements entirely from the markdown output.
turndownService.addRule('removeScripts', {
  filter: ['script', 'style', 'noscript'],
  replacement: () => ''
})
// Strip HTML comment nodes (nodeType 8) from the output.
turndownService.addRule('removeComments', {
  filter: (node) => node.nodeType === 8, // Comment nodes
  replacement: () => ''
})
// Keep real hyperlinks as [text](href); unwrap javascript: and
// fragment-only (#...) links down to their text content.
turndownService.addRule('cleanLinks', {
  filter: 'a',
  replacement: (content, node) => {
    const href = node.getAttribute('href')
    if (!href || href.startsWith('javascript:') || href.startsWith('#')) {
      return content
    }
    return `[${content}](${href})`
  }
})
/**
 * Convert raw HTML to cleaned-up markdown.
 *
 * Strips scripts, styles, and comments before handing the document to
 * Turndown, then compacts the resulting markdown.
 *
 * The previous version collapsed ALL whitespace runs to single spaces before
 * conversion and trimmed every output line afterwards, which destroyed the
 * content of <pre>/<code> elements (fenced code blocks lost their newlines
 * and indentation). Turndown already applies standard HTML whitespace
 * handling for normal text, so no pre-collapse is needed.
 *
 * @param html raw HTML document or fragment
 * @returns markdown text
 * @throws Error when Turndown fails to convert the input
 */
export function convertHtmlToMarkdown(html: string): string {
  try {
    // Remove non-content elements before conversion. (The Turndown rules
    // also drop these; doing it here keeps large pages cheaper to parse.)
    const cleanHtml = html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') // Remove script tags
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') // Remove style tags
      .replace(/<!--[\s\S]*?-->/g, '') // Remove HTML comments
      .trim()
    const markdown = turndownService.turndown(cleanHtml)
    // Collapse runs of 3+ newlines and trim only the outer whitespace, so
    // indentation inside fenced code blocks is preserved.
    return markdown
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  } catch (error) {
    throw new Error(`Failed to convert HTML to markdown: ${error instanceof Error ? error.message : String(error)}`)
  }
}

View File

@ -0,0 +1,17 @@
// Tool id as exposed to the model.
export const TOOL_NAME_FOR_PROMPT = 'URLFetcher'

// Tool description shown to the model. Keep the usage notes in sync with the
// implementation in URLFetcherTool.tsx (30s timeout, redirect: 'follow',
// 15-minute raw-content cache). The previous final note claimed redirects
// were reported back "in a special format" requiring a follow-up request,
// but the fetch is configured with redirect: 'follow', so redirects are
// followed transparently.
export const DESCRIPTION = `- Fetches content from a specified URL and processes it using an AI model
- Takes a URL and a prompt as input
- Fetches the URL content, converts HTML to markdown
- Processes the content with the prompt using a small, fast model
- Returns the model's response about the content
- Use this tool when you need to retrieve and analyze web content
Usage notes:
- IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".
- The URL must be a fully-formed valid URL (e.g., https://example.com)
- HTTP URLs will be automatically upgraded to HTTPS
- The prompt should describe what information you want to extract from the page
- This tool is read-only and does not modify any files
- Results may be summarized if the content is very large
- Includes a self-cleaning 15-minute cache for faster responses when repeatedly accessing the same URL
- Redirects are followed automatically, so the analysis reflects the final destination page`

View File

@ -0,0 +1,103 @@
import { Box, Text } from 'ink'
import React from 'react'
import { z } from 'zod'
import { Cost } from '../../components/Cost'
import { FallbackToolUseRejectedMessage } from '../../components/FallbackToolUseRejectedMessage'
import { Tool, ToolUseContext } from '../../Tool'
import { DESCRIPTION, TOOL_NAME_FOR_PROMPT } from './prompt'
import { SearchResult, searchProviders } from './searchProviders'
// Zod schema for the tool's input: a single required query string.
const inputSchema = z.strictObject({
  query: z.string().describe('The search query'),
})
type Input = z.infer<typeof inputSchema>
// Shape of the data yielded by `call` and passed to the render helpers.
type Output = {
  durationMs: number // wall-clock search duration, shown in the Cost footer
  results: SearchResult[] // scraped DuckDuckGo hits (empty on error or no matches)
}
/**
 * WebSearchTool — searches the web via DuckDuckGo (see ./searchProviders)
 * and returns titles, snippets, and links for the assistant to cite.
 */
export const WebSearchTool = {
  name: TOOL_NAME_FOR_PROMPT,
  async description() {
    return DESCRIPTION
  },
  userFacingName: () => 'Web Search',
  inputSchema,
  // Read-only: performs a remote search, never touches the local filesystem.
  isReadOnly: () => true,
  isConcurrencySafe: () => true,
  async isEnabled() {
    return true
  },
  needsPermissions() {
    return false
  },
  async prompt() {
    return DESCRIPTION
  },
  renderToolUseMessage({ query }: Input) {
    return `Searching for: "${query}" using DuckDuckGo`
  },
  renderToolUseRejectedMessage() {
    return <FallbackToolUseRejectedMessage />
  },
  renderToolResultMessage(output: Output) {
    return (
      <Box justifyContent="space-between" width="100%">
        <Box flexDirection="row">
          <Text>&nbsp;&nbsp; &nbsp;Found </Text>
          <Text bold>{output.results.length} </Text>
          <Text>
            {output.results.length === 1 ? 'result' : 'results'} using DuckDuckGo
          </Text>
        </Box>
        <Cost costUSD={0} durationMs={output.durationMs} debug={false} />
      </Box>
    )
  },
  renderResultForAssistant(output: Output) {
    if (output.results.length === 0) {
      return `No results found using DuckDuckGo.`
    }
    // Numbered list: title, snippet, and link per result.
    let result = `Found ${output.results.length} search results using DuckDuckGo:\n\n`
    output.results.forEach((item, index) => {
      result += `${index + 1}. **${item.title}**\n`
      result += `   ${item.snippet}\n`
      result += `   Link: ${item.link}\n\n`
    })
    result += `You can reference these results to provide current, accurate information to the user.`
    return result
  },
  async *call({ query }: Input, {}: ToolUseContext) {
    const start = Date.now()
    try {
      const searchResults = await searchProviders.duckduckgo.search(query)
      const output: Output = {
        results: searchResults,
        durationMs: Date.now() - start,
      }
      yield {
        type: 'result' as const,
        resultForAssistant: this.renderResultForAssistant(output),
        data: output,
      }
    } catch (error: unknown) {
      // Anything can be thrown; narrow before reading .message.
      const message = error instanceof Error ? error.message : String(error)
      const output: Output = {
        results: [],
        durationMs: Date.now() - start,
      }
      yield {
        type: 'result' as const,
        resultForAssistant: `An error occurred during web search with DuckDuckGo: ${message}`,
        data: output,
      }
    }
  },
} satisfies Tool<typeof inputSchema, Output>

View File

@ -0,0 +1,13 @@
// Tool id as exposed to the model.
export const TOOL_NAME_FOR_PROMPT = 'WebSearch'

// Tool description shown to the model. Keep in sync with WebSearchTool.tsx,
// which queries DuckDuckGo's HTML endpoint directly (there is no native
// search API call, so the previous "within a single API call" wording was
// inaccurate; "the Kode's" was also a grammar slip).
export const DESCRIPTION = `- Allows Kode to search the web and use the results to inform responses
- Provides up-to-date information for current events and recent data
- Returns search results as a numbered list with titles, snippets, and links
- Use this tool for accessing information beyond Kode's knowledge cutoff
- Searches are performed by querying DuckDuckGo directly
Usage notes:
- Use when you need current information not in training data
- Effective for recent news, current events, product updates, or real-time data
- Search queries should be specific and well-targeted for best results
- Results include both title and snippet content for context`

View File

@ -0,0 +1,66 @@
import fetch from 'node-fetch'
import { parse } from 'node-html-parser'
// A single scraped search hit.
export interface SearchResult {
  title: string
  snippet: string
  link: string
}
// Pluggable search backend. `apiKey` is optional so keyless providers
// (like DuckDuckGo HTML scraping) can ignore it.
export interface SearchProvider {
  search: (query: string, apiKey?: string) => Promise<SearchResult[]>
  isEnabled: (apiKey?: string) => boolean
}
/**
 * Search provider backed by DuckDuckGo's HTML endpoint
 * (html.duckduckgo.com). Requires no API key; results are scraped from the
 * returned page markup.
 */
const duckDuckGoSearchProvider: SearchProvider = {
  isEnabled: () => true,
  search: async (query: string): Promise<SearchResult[]> => {
    const response = await fetch(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`, {
      headers: {
        // A browser-like UA avoids DuckDuckGo's bot interstitial.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });
    if (!response.ok) {
      throw new Error(`DuckDuckGo search failed with status: ${response.status}`);
    }
    const html = await response.text();
    const root = parse(html);
    const results: SearchResult[] = [];
    const resultNodes = root.querySelectorAll('.result.web-result');
    for (const node of resultNodes) {
      const titleNode = node.querySelector('.result__a');
      const snippetNode = node.querySelector('.result__snippet');
      if (!titleNode || !snippetNode) {
        continue;
      }
      const title = titleNode.text;
      const link = titleNode.getAttribute('href');
      const snippet = snippetNode.text;
      if (title && link && snippet) {
        results.push({
          title: title.trim(),
          snippet: snippet.trim(),
          link: cleanResultLink(link),
        });
      }
    }
    return results;
  },
}

/**
 * DuckDuckGo wraps result hrefs in a redirect of the form
 * "//duckduckgo.com/l/?uddg=<encoded target>" (sometimes with an explicit
 * https: scheme). Extract the real target URL; fall back to the raw link
 * when the wrapper can't be parsed. The previous check only matched the
 * "https://duckduckgo.com/l/?uddg=" form, so the common protocol-relative
 * form passed through unresolved (and its comment contradicted the code).
 */
function cleanResultLink(link: string): string {
  if (!link.includes('duckduckgo.com/l/')) {
    return link;
  }
  try {
    // The base URL resolves protocol-relative ("//duckduckgo.com/...") links,
    // which `new URL(link)` alone would reject.
    const url = new URL(link, 'https://duckduckgo.com');
    return url.searchParams.get('uddg') || link;
  } catch {
    return link;
  }
}
// Registry of available search providers, keyed by provider id.
// WebSearchTool currently uses only `duckduckgo`.
export const searchProviders = {
  duckduckgo: duckDuckGoSearchProvider,
}