Merge branch 'refs/heads/main' into mcp

2025-03-19 13:27:24 +08:00 · 2025-03-19 13:27:24 +08:00 · 167b1acd5c
commit 167b1acd5c
parent 8f3a60f52b b7dcbfecb3
6 changed files with 287 additions and 72 deletions
--- a/app/config.py
+++ b/app/config.py
@ -25,7 +25,7 @@ class LLMSettings(BaseModel):
        description="Maximum input tokens to use across all requests (None for unlimited)",
    )
    temperature: float = Field(1.0, description="Sampling temperature")
-    api_type: str = Field(..., description="AzureOpenai or Openai")
+    api_type: str = Field(..., description="Azure, Openai, or Ollama")
    api_version: str = Field(..., description="Azure Openai version if AzureOpenai")


--- a/app/llm.py
+++ b/app/llm.py
@ -30,6 +30,14 @@ from app.schema import (


 REASONING_MODELS = ["o1", "o3-mini"]
+MULTIMODAL_MODELS = [  # exact model names for which image inputs are enabled (tested with `self.model in MULTIMODAL_MODELS`)
+    "gpt-4-vision-preview",
+    "gpt-4o",
+    "gpt-4o-mini",
+    "claude-3-opus-20240229",
+    "claude-3-sonnet-20240229",
+    "claude-3-haiku-20240307",
+]


 class TokenCounter:
@ -259,12 +267,15 @@ class LLM:
        return "Token limit exceeded"

    @staticmethod
-    def format_messages(messages: List[Union[dict, Message]]) -> List[dict]:
+    def format_messages(
+        messages: List[Union[dict, Message]], supports_images: bool = False
+    ) -> List[dict]:
        """
        Format messages for LLM by converting them to OpenAI message format.

        Args:
            messages: List of messages that can be either dict or Message objects
+            supports_images: Flag indicating if the target model supports image inputs

        Returns:
            List[dict]: List of formatted messages in OpenAI format
@ -288,54 +299,58 @@ class LLM:
            if isinstance(message, Message):
                message = message.to_dict()

-            if not isinstance(message, dict):
+            if isinstance(message, dict):
+                # If message is a dict, ensure it has required fields
+                if "role" not in message:
+                    raise ValueError("Message dict must contain 'role' field")
+
+                # Process base64 images if present and model supports images
+                if supports_images and message.get("base64_image"):
+                    # Initialize or convert content to appropriate format
+                    if not message.get("content"):
+                        message["content"] = []
+                    elif isinstance(message["content"], str):
+                        message["content"] = [
+                            {"type": "text", "text": message["content"]}
+                        ]
+                    elif isinstance(message["content"], list):
+                        # Convert string items to proper text objects
+                        message["content"] = [
+                            (
+                                {"type": "text", "text": item}
+                                if isinstance(item, str)
+                                else item
+                            )
+                            for item in message["content"]
+                        ]
+
+                    # Add the image to content
+                    message["content"].append(
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{message['base64_image']}"
+                            },
+                        }
+                    )
+
+                    # Remove the base64_image field
+                    del message["base64_image"]
+                # If model doesn't support images but message has base64_image, handle gracefully
+                elif not supports_images and message.get("base64_image"):
+                    # Just remove the base64_image field and keep the text content
+                    del message["base64_image"]
+
+                if "content" in message or "tool_calls" in message:
+                    formatted_messages.append(message)
+                # else: do not include the message
+            else:
                raise TypeError(f"Unsupported message type: {type(message)}")

-            # Validate required fields
-            if "role" not in message:
-                raise ValueError("Message dict must contain 'role' field")
-
-            # Process base64 images if present
-            if message.get("base64_image"):
-                # Initialize or convert content to appropriate format
-                if not message.get("content"):
-                    message["content"] = []
-                elif isinstance(message["content"], str):
-                    message["content"] = [{"type": "text", "text": message["content"]}]
-                elif isinstance(message["content"], list):
-                    # Convert string items to proper text objects
-                    message["content"] = [
-                        (
-                            {"type": "text", "text": item}
-                            if isinstance(item, str)
-                            else item
-                        )
-                        for item in message["content"]
-                    ]
-
-                # Add the image to content
-                message["content"].append(
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{message['base64_image']}"
-                        },
-                    }
-                )
-
-                # Remove the base64_image field
-                del message["base64_image"]
-
-            # Only include messages with content or tool_calls
-            if "content" in message or "tool_calls" in message:
-                formatted_messages.append(message)
-
-        # Validate all roles
-        invalid_roles = [
-            msg for msg in formatted_messages if msg["role"] not in ROLE_VALUES
-        ]
-        if invalid_roles:
-            raise ValueError(f"Invalid role: {invalid_roles[0]['role']}")
+        # Validate all messages have required fields
+        for msg in formatted_messages:
+            if msg["role"] not in ROLE_VALUES:
+                raise ValueError(f"Invalid role: {msg['role']}")

        return formatted_messages

@ -372,12 +387,15 @@ class LLM:
            Exception: For unexpected errors
        """
        try:
-            # Format system and user messages
+            # Check if the model supports images
+            supports_images = self.model in MULTIMODAL_MODELS
+
+            # Format system and user messages with image support check
            if system_msgs:
-                system_msgs = self.format_messages(system_msgs)
-                messages = system_msgs + self.format_messages(messages)
+                system_msgs = self.format_messages(system_msgs, supports_images)
+                messages = system_msgs + self.format_messages(messages, supports_images)
            else:
-                messages = self.format_messages(messages)
+                messages = self.format_messages(messages, supports_images)

            # Calculate input token count
            input_tokens = self.count_message_tokens(messages)
@ -499,8 +517,15 @@ class LLM:
            Exception: For unexpected errors
        """
        try:
-            # Format messages
-            formatted_messages = self.format_messages(messages)
+            # For ask_with_images, we always set supports_images to True because
+            # this method should only be called with models that support images
+            if self.model not in MULTIMODAL_MODELS:
+                raise ValueError(
+                    f"Model {self.model} does not support images. Use a model from {MULTIMODAL_MODELS}"
+                )
+
+            # Format messages with image support
+            formatted_messages = self.format_messages(messages, supports_images=True)

            # Ensure the last message is from the user to attach images
            if not formatted_messages or formatted_messages[-1]["role"] != "user":
@ -539,7 +564,10 @@ class LLM:

            # Add system messages if provided
            if system_msgs:
-                all_messages = self.format_messages(system_msgs) + formatted_messages
+                all_messages = (
+                    self.format_messages(system_msgs, supports_images=True)
+                    + formatted_messages
+                )
            else:
                all_messages = formatted_messages

@ -653,12 +681,15 @@ class LLM:
            if tool_choice not in TOOL_CHOICE_VALUES:
                raise ValueError(f"Invalid tool_choice: {tool_choice}")

+            # Check if the model supports images
+            supports_images = self.model in MULTIMODAL_MODELS
+
            # Format messages
            if system_msgs:
-                system_msgs = self.format_messages(system_msgs)
-                messages = system_msgs + self.format_messages(messages)
+                system_msgs = self.format_messages(system_msgs, supports_images)
+                messages = system_msgs + self.format_messages(messages, supports_images)
            else:
-                messages = self.format_messages(messages)
+                messages = self.format_messages(messages, supports_images)

            # Calculate input token count
            input_tokens = self.count_message_tokens(messages)
--- a/app/tool/browser_use_tool.py
+++ b/app/tool/browser_use_tool.py
@ -418,17 +418,7 @@ class BrowserUseTool(BaseTool, Generic[Context]):

                        # Create prompt for LLM
                        prompt_text = """
-Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page.
-
-Examples of extraction goals:
- Extract all company names
- Extract specific descriptions
- Extract all information about a topic
- Extract links with companies in structured format
- Extract all links
-
-If the goal is vague, summarize the page. Respond in JSON format.
-
+Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format.
 Extraction goal: {goal}

 Page content:
@ -445,10 +435,54 @@ Page content:

                        messages = [Message.user_message(formatted_prompt)]

-                        # Use LLM to extract content based on the goal
-                        response = await self.llm.ask(messages)
+                        # Define extraction function for the tool
+                        extraction_function = {
+                            "type": "function",
+                            "function": {
+                                "name": "extract_content",
+                                "description": "Extract specific information from a webpage based on a goal",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "extracted_content": {
+                                            "type": "object",
+                                            "description": "The content extracted from the page according to the goal",
+                                        }
+                                    },
+                                    "required": ["extracted_content"],
+                                },
+                            },
+                        }
+
+                        # Use LLM to extract content with required function calling
+                        response = await self.llm.ask_tool(
+                            messages,
+                            tools=[extraction_function],
+                            tool_choice="required",
+                        )
+
+                        # Extract content from function call response
+                        if (
+                            response
+                            and response.tool_calls
+                            and len(response.tool_calls) > 0
+                        ):
+                            # Get the first tool call arguments
+                            tool_call = response.tool_calls[0]
+                            # Parse the JSON arguments
+                            try:
+                                args = json.loads(tool_call.function.arguments)
+                                extracted_content = args.get("extracted_content", {})
+                                # Format extracted content as JSON string
+                                content_json = json.dumps(
+                                    extracted_content, indent=2, ensure_ascii=False
+                                )
+                                msg = f"Extracted from page:\n{content_json}\n"
+                            except Exception as e:
+                                msg = f"Error parsing extraction result: {str(e)}\nRaw response: {tool_call.function.arguments}"
+                        else:
+                            msg = "No content was extracted from the page."

-                        msg = f"Extracted from page:\n{response}\n"
                        return ToolResult(output=msg)
                    except Exception as e:
                        # Provide a more helpful error message
--- a/app/tool/search/init.py
+++ b/app/tool/search/init.py
@ -1,5 +1,6 @@
 from app.tool.search.baidu_search import BaiduSearchEngine
 from app.tool.search.base import WebSearchEngine
+from app.tool.search.bing_search import BingSearchEngine
 from app.tool.search.duckduckgo_search import DuckDuckGoSearchEngine
 from app.tool.search.google_search import GoogleSearchEngine

@ -9,4 +10,5 @@ __all__ = [
    "BaiduSearchEngine",
    "DuckDuckGoSearchEngine",
    "GoogleSearchEngine",
+    "BingSearchEngine",
 ]
--- a/app/tool/search/bing_search.py
+++ b/app/tool/search/bing_search.py
@ -0,0 +1,146 @@
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+from app.logger import logger
+from app.tool.search.base import WebSearchEngine
+
+
+ABSTRACT_MAX_LENGTH = 300  # result abstracts longer than this many characters are truncated
+
+USER_AGENTS = [  # candidate User-Agent strings; only USER_AGENTS[0] is referenced in this file (see HEADERS)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
+    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 (KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net",
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) Gecko/20070404 K-Ninja/2.1.3",
+    "Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; en-US) iNet Browser 4.7",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866",
+]
+
+HEADERS = {  # default request headers installed on the session in BingSearchEngine.__init__
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+    "Content-Type": "application/x-www-form-urlencoded",
+    "User-Agent": USER_AGENTS[0],
+    "Referer": "https://www.bing.com/",
+    "Accept-Encoding": "gzip, deflate",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+}
+
+BING_HOST_URL = "https://www.bing.com"  # base URL used to build the next-page URL from the "Next page" link's href
+BING_SEARCH_URL = "https://www.bing.com/search?q="  # the search query is appended to this to form the first page URL
+
+
+class BingSearchEngine(WebSearchEngine):
+    """Search engine that scrapes Bing's HTML result pages and returns result URLs."""
+
+    # HTTP session reused for every request; assigned a real Session in __init__.
+    session: requests.Session = None
+
+    def __init__(self, **data):
+        """Initialize the BingSearch tool with a requests session."""
+        super().__init__(**data)
+        self.session = requests.Session()
+        self.session.headers.update(HEADERS)
+
+    def _search_sync(self, query: str, num_results: int = 10) -> List[str]:
+        """
+        Synchronous Bing search implementation to retrieve a list of URLs matching a query.
+
+        Args:
+            query (str): The search query to submit to Bing. Must not be empty.
+            num_results (int, optional): The maximum number of URLs to return. Defaults to 10.
+
+        Returns:
+            List[str]: A list of URLs from the search results, capped at `num_results`.
+                       Returns an empty list if the query is empty or no results are found.
+
+        Notes:
+            - The query is percent-encoded before being placed in the URL.
+            - Pagination follows the `next_url` link returned by `_parse_html`; the
+              `first` counter is passed along but `_parse_html` does not use it.
+            - If fewer results than `num_results` are available, all found URLs are returned.
+        """
+        # Local import keeps this method self-contained.
+        from urllib.parse import quote_plus
+
+        if not query:
+            return []
+
+        list_result = []
+        first = 1
+        # Encode the query so characters such as ' ', '&', '#' and '+' remain
+        # part of the `q` parameter instead of altering the URL structure.
+        next_url = BING_SEARCH_URL + quote_plus(query)
+
+        while len(list_result) < num_results:
+            data, next_url = self._parse_html(
+                next_url, rank_start=len(list_result), first=first
+            )
+            if data:
+                list_result.extend(item["url"] for item in data)
+            if not next_url:
+                break
+            first += 10
+
+        return list_result[:num_results]
+
+    def _parse_html(self, url: str, rank_start: int = 0, first: int = 1) -> tuple:
+        """
+        Fetch a Bing search results page and extract its results and the next page URL.
+
+        Args:
+            url (str): The URL of the Bing search results page to fetch and parse.
+            rank_start (int, optional): Rank of the last result already collected; results
+                on this page are numbered from `rank_start + 1`. Defaults to 0.
+            first (int, optional): Unused; kept for backward compatibility. Defaults to 1.
+
+        Returns:
+            tuple: A tuple containing:
+                - list: Dictionaries with keys 'title', 'abstract', 'url', and 'rank',
+                  one per result that has a link. Results without a link are skipped.
+                - str or None: The URL of the next results page, or None if there is no next page.
+              If the request or the page parsing fails, a warning is logged and
+              `([], None)` is returned.
+        """
+        # Local import keeps this method self-contained.
+        from urllib.parse import urljoin
+
+        try:
+            # Bound the request so an unresponsive server cannot block the search forever.
+            res = self.session.get(url=url, timeout=10)
+            # Report HTTP errors through the warning below instead of silently
+            # parsing an error page that contains no results.
+            res.raise_for_status()
+            res.encoding = "utf-8"
+            root = BeautifulSoup(res.text, "lxml")
+
+            list_data = []
+            ol_results = root.find("ol", id="b_results")
+            if not ol_results:
+                return [], None
+
+            for li in ol_results.find_all("li", class_="b_algo"):
+                try:
+                    h2 = li.find("h2")
+                    anchor = h2.a if h2 else None
+                    link = (anchor.get("href") or "").strip() if anchor else ""
+                    if not link:
+                        # Callers only consume URLs, so a result without one is dropped
+                        # rather than returned as an empty string.
+                        continue
+
+                    title = h2.text.strip()
+
+                    p = li.find("p")
+                    abstract = p.text.strip() if p else ""
+                    if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
+                        abstract = abstract[:ABSTRACT_MAX_LENGTH]
+
+                    rank_start += 1
+                    list_data.append(
+                        {
+                            "title": title,
+                            "abstract": abstract,
+                            "url": link,
+                            "rank": rank_start,
+                        }
+                    )
+                except Exception as e:
+                    # Best effort: one malformed entry must not discard the whole page.
+                    logger.debug(f"Skipping malformed Bing result: {e}")
+                    continue
+
+            next_btn = root.find("a", title="Next page")
+            next_href = next_btn.get("href") if next_btn else None
+            if not next_href:
+                return list_data, None
+
+            # urljoin handles both relative hrefs and already-absolute ones.
+            return list_data, urljoin(BING_HOST_URL, next_href)
+        except Exception as e:
+            logger.warning(f"Error parsing HTML: {e}")
+            return [], None
+
+    def perform_search(self, query, num_results=10, *args, **kwargs):
+        """Bing search engine: return up to `num_results` result URLs for `query`."""
+        return self._search_sync(query, num_results=num_results)
--- a/app/tool/web_search.py
+++ b/app/tool/web_search.py
@ -7,6 +7,7 @@ from app.config import config
 from app.tool.base import BaseTool
 from app.tool.search import (
    BaiduSearchEngine,
+    BingSearchEngine,
    DuckDuckGoSearchEngine,
    GoogleSearchEngine,
    WebSearchEngine,
@ -37,6 +38,7 @@ class WebSearch(BaseTool):
        "google": GoogleSearchEngine(),
        "baidu": BaiduSearchEngine(),
        "duckduckgo": DuckDuckGoSearchEngine(),
+        "bing": BingSearchEngine(),
    }

    async def execute(self, query: str, num_results: int = 10) -> List[str]: