diff --git a/app/config.py b/app/config.py
index 0be771b..011ceb1 100644
--- a/app/config.py
+++ b/app/config.py
@@ -25,7 +25,7 @@ class LLMSettings(BaseModel):
         description="Maximum input tokens to use across all requests (None for unlimited)",
     )
     temperature: float = Field(1.0, description="Sampling temperature")
-    api_type: str = Field(..., description="AzureOpenai or Openai")
+    api_type: str = Field(..., description="Azure, Openai, or Ollama")
     api_version: str = Field(..., description="Azure Openai version if AzureOpenai")
 
 
diff --git a/app/llm.py b/app/llm.py
index 37de566..a67a86f 100644
--- a/app/llm.py
+++ b/app/llm.py
@@ -30,6 +30,14 @@ from app.schema import (
 
 
 REASONING_MODELS = ["o1", "o3-mini"]
+MULTIMODAL_MODELS = [
+    "gpt-4-vision-preview",
+    "gpt-4o",
+    "gpt-4o-mini",
+    "claude-3-opus-20240229",
+    "claude-3-sonnet-20240229",
+    "claude-3-haiku-20240307",
+]
 
 
 class TokenCounter:
@@ -259,12 +267,15 @@ class LLM:
         return "Token limit exceeded"
 
     @staticmethod
-    def format_messages(messages: List[Union[dict, Message]]) -> List[dict]:
+    def format_messages(
+        messages: List[Union[dict, Message]], supports_images: bool = False
+    ) -> List[dict]:
         """
         Format messages for LLM by converting them to OpenAI message format.
 
         Args:
             messages: List of messages that can be either dict or Message objects
+            supports_images: Flag indicating if the target model supports image inputs
 
         Returns:
             List[dict]: List of formatted messages in OpenAI format
@@ -288,54 +299,58 @@ class LLM:
             if isinstance(message, Message):
                 message = message.to_dict()
 
-            if not isinstance(message, dict):
+            if isinstance(message, dict):
+                # If message is a dict, ensure it has required fields
+                if "role" not in message:
+                    raise ValueError("Message dict must contain 'role' field")
+
+                # Process base64 images if present and model supports images
+                if supports_images and message.get("base64_image"):
+                    # Initialize or convert content to appropriate format
+                    if not message.get("content"):
+                        message["content"] = []
+                    elif isinstance(message["content"], str):
+                        message["content"] = [
+                            {"type": "text", "text": message["content"]}
+                        ]
+                    elif isinstance(message["content"], list):
+                        # Convert string items to proper text objects
+                        message["content"] = [
+                            (
+                                {"type": "text", "text": item}
+                                if isinstance(item, str)
+                                else item
+                            )
+                            for item in message["content"]
+                        ]
+
+                    # Add the image to content
+                    message["content"].append(
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{message['base64_image']}"
+                            },
+                        }
+                    )
+
+                    # Remove the base64_image field
+                    del message["base64_image"]
+                # If model doesn't support images but message has base64_image, handle gracefully
+                elif not supports_images and message.get("base64_image"):
+                    # Just remove the base64_image field and keep the text content
+                    del message["base64_image"]
+
+                if "content" in message or "tool_calls" in message:
+                    formatted_messages.append(message)
+                    # else: do not include the message
+            else:
                 raise TypeError(f"Unsupported message type: {type(message)}")
 
-            # Validate required fields
-            if "role" not in message:
-                raise ValueError("Message dict must contain 'role' field")
-
-            # Process base64 images if present
-            if message.get("base64_image"):
-                # Initialize or convert content to appropriate format
-                if not message.get("content"):
-                    message["content"] = []
-                elif isinstance(message["content"], str):
-                    message["content"] = [{"type": "text", "text": message["content"]}]
-                elif isinstance(message["content"], list):
-                    # Convert string items to proper text objects
-                    message["content"] = [
-                        (
-                            {"type": "text", "text": item}
-                            if isinstance(item, str)
-                            else item
-                        )
-                        for item in message["content"]
-                    ]
-
-                # Add the image to content
-                message["content"].append(
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{message['base64_image']}"
-                        },
-                    }
-                )
-
-                # Remove the base64_image field
-                del message["base64_image"]
-
-            # Only include messages with content or tool_calls
-            if "content" in message or "tool_calls" in message:
-                formatted_messages.append(message)
-
-        # Validate all roles
-        invalid_roles = [
-            msg for msg in formatted_messages if msg["role"] not in ROLE_VALUES
-        ]
-        if invalid_roles:
-            raise ValueError(f"Invalid role: {invalid_roles[0]['role']}")
+        # Validate all messages have required fields
+        for msg in formatted_messages:
+            if msg["role"] not in ROLE_VALUES:
+                raise ValueError(f"Invalid role: {msg['role']}")
 
         return formatted_messages
 
@@ -372,12 +387,15 @@ class LLM:
             Exception: For unexpected errors
         """
         try:
-            # Format system and user messages
+            # Check if the model supports images
+            supports_images = self.model in MULTIMODAL_MODELS
+
+            # Format system and user messages with image support check
             if system_msgs:
-                system_msgs = self.format_messages(system_msgs)
-                messages = system_msgs + self.format_messages(messages)
+                system_msgs = self.format_messages(system_msgs, supports_images)
+                messages = system_msgs + self.format_messages(messages, supports_images)
             else:
-                messages = self.format_messages(messages)
+                messages = self.format_messages(messages, supports_images)
 
             # Calculate input token count
             input_tokens = self.count_message_tokens(messages)
@@ -499,8 +517,15 @@ class LLM:
             Exception: For unexpected errors
         """
         try:
-            # Format messages
-            formatted_messages = self.format_messages(messages)
+            # For ask_with_images, we always set supports_images to True because
+            # this method should only be called with models that support images
+            if self.model not in MULTIMODAL_MODELS:
+                raise ValueError(
+                    f"Model {self.model} does not support images. Use a model from {MULTIMODAL_MODELS}"
+                )
+
+            # Format messages with image support
+            formatted_messages = self.format_messages(messages, supports_images=True)
 
             # Ensure the last message is from the user to attach images
             if not formatted_messages or formatted_messages[-1]["role"] != "user":
@@ -539,7 +564,10 @@ class LLM:
 
             # Add system messages if provided
             if system_msgs:
-                all_messages = self.format_messages(system_msgs) + formatted_messages
+                all_messages = (
+                    self.format_messages(system_msgs, supports_images=True)
+                    + formatted_messages
+                )
             else:
                 all_messages = formatted_messages
 
@@ -653,12 +681,15 @@ class LLM:
             if tool_choice not in TOOL_CHOICE_VALUES:
                 raise ValueError(f"Invalid tool_choice: {tool_choice}")
 
+            # Check if the model supports images
+            supports_images = self.model in MULTIMODAL_MODELS
+
             # Format messages
             if system_msgs:
-                system_msgs = self.format_messages(system_msgs)
-                messages = system_msgs + self.format_messages(messages)
+                system_msgs = self.format_messages(system_msgs, supports_images)
+                messages = system_msgs + self.format_messages(messages, supports_images)
             else:
-                messages = self.format_messages(messages)
+                messages = self.format_messages(messages, supports_images)
 
             # Calculate input token count
             input_tokens = self.count_message_tokens(messages)
diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py
index 7817aef..7fe8f16 100644
--- a/app/tool/browser_use_tool.py
+++ b/app/tool/browser_use_tool.py
@@ -418,17 +418,7 @@ class BrowserUseTool(BaseTool, Generic[Context]):
 
                         # Create prompt for LLM
                         prompt_text = """
-Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page.
-
-Examples of extraction goals:
-- Extract all company names
-- Extract specific descriptions
-- Extract all information about a topic
-- Extract links with companies in structured format
-- Extract all links
-
-If the goal is vague, summarize the page. Respond in JSON format.
-
+Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format.
 Extraction goal: {goal}
 
 Page content:
@@ -445,10 +435,54 @@ Page content:
 
                         messages = [Message.user_message(formatted_prompt)]
 
-                        # Use LLM to extract content based on the goal
-                        response = await self.llm.ask(messages)
+                        # Define extraction function for the tool
+                        extraction_function = {
+                            "type": "function",
+                            "function": {
+                                "name": "extract_content",
+                                "description": "Extract specific information from a webpage based on a goal",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "extracted_content": {
+                                            "type": "object",
+                                            "description": "The content extracted from the page according to the goal",
+                                        }
+                                    },
+                                    "required": ["extracted_content"],
+                                },
+                            },
+                        }
+
+                        # Use LLM to extract content with required function calling
+                        response = await self.llm.ask_tool(
+                            messages,
+                            tools=[extraction_function],
+                            tool_choice="required",
+                        )
+
+                        # Extract content from function call response
+                        if (
+                            response
+                            and response.tool_calls
+                            and len(response.tool_calls) > 0
+                        ):
+                            # Get the first tool call arguments
+                            tool_call = response.tool_calls[0]
+                            # Parse the JSON arguments
+                            try:
+                                args = json.loads(tool_call.function.arguments)
+                                extracted_content = args.get("extracted_content", {})
+                                # Format extracted content as JSON string
+                                content_json = json.dumps(
+                                    extracted_content, indent=2, ensure_ascii=False
+                                )
+                                msg = f"Extracted from page:\n{content_json}\n"
+                            except Exception as e:
+                                msg = f"Error parsing extraction result: {str(e)}\nRaw response: {tool_call.function.arguments}"
+                        else:
+                            msg = "No content was extracted from the page."
 
-                        msg = f"Extracted from page:\n{response}\n"
                         return ToolResult(output=msg)
                     except Exception as e:
                         # Provide a more helpful error message
diff --git a/app/tool/search/__init__.py b/app/tool/search/__init__.py
index 4f486ac..fe127ae 100644
--- a/app/tool/search/__init__.py
+++ b/app/tool/search/__init__.py
@@ -1,5 +1,6 @@
 from app.tool.search.baidu_search import BaiduSearchEngine
 from app.tool.search.base import WebSearchEngine
+from app.tool.search.bing_search import BingSearchEngine
 from app.tool.search.duckduckgo_search import DuckDuckGoSearchEngine
 from app.tool.search.google_search import GoogleSearchEngine
 
@@ -9,4 +10,5 @@ __all__ = [
     "BaiduSearchEngine",
     "DuckDuckGoSearchEngine",
     "GoogleSearchEngine",
+    "BingSearchEngine",
 ]
diff --git a/app/tool/search/bing_search.py b/app/tool/search/bing_search.py
new file mode 100644
index 0000000..46955b5
--- /dev/null
+++ b/app/tool/search/bing_search.py
@@ -0,0 +1,163 @@
+from typing import List
+from urllib.parse import quote_plus
+
+import requests
+from bs4 import BeautifulSoup
+
+from app.logger import logger
+from app.tool.search.base import WebSearchEngine
+
+
+ABSTRACT_MAX_LENGTH = 300
+
+# Seconds to wait for a Bing response; requests waits indefinitely when no timeout is given.
+REQUEST_TIMEOUT = 10
+
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
+    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 (KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net",
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) Gecko/20070404 K-Ninja/2.1.3",
+    "Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; en-US) iNet Browser 4.7",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866",
+]
+
+HEADERS = {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+    "Content-Type": "application/x-www-form-urlencoded",
+    "User-Agent": USER_AGENTS[0],
+    "Referer": "https://www.bing.com/",
+    "Accept-Encoding": "gzip, deflate",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+}
+
+BING_HOST_URL = "https://www.bing.com"
+BING_SEARCH_URL = "https://www.bing.com/search?q="
+
+
+class BingSearchEngine(WebSearchEngine):
+    """Search engine backend that scrapes Bing's HTML result pages."""
+
+    session: requests.Session = None
+
+    def __init__(self, **data):
+        """Initialize the BingSearch tool with a requests session."""
+        super().__init__(**data)
+        self.session = requests.Session()
+        self.session.headers.update(HEADERS)
+
+    def _search_sync(self, query: str, num_results: int = 10) -> List[str]:
+        """
+        Synchronous Bing search implementation to retrieve a list of URLs matching a query.
+
+        Args:
+            query (str): The search query to submit to Bing. Must not be empty.
+            num_results (int, optional): The maximum number of URLs to return. Defaults to 10.
+
+        Returns:
+            List[str]: A list of URLs from the search results, capped at `num_results`.
+                Returns an empty list if the query is empty or no results are found.
+
+        Notes:
+            - The query is URL-encoded before it is appended to the search URL.
+            - Pagination is handled by following the `next_url` link of each result page;
+              `first` is tracked and passed along but `_parse_html` does not use it.
+            - If fewer results than `num_results` are available, all found URLs are returned.
+        """
+        if not query:
+            return []
+
+        list_result = []
+        first = 1
+        # Encode the query so spaces, "&", "#", "+" and non-ASCII text cannot corrupt the URL.
+        next_url = BING_SEARCH_URL + quote_plus(query)
+
+        while len(list_result) < num_results:
+            data, next_url = self._parse_html(
+                next_url, rank_start=len(list_result), first=first
+            )
+            if data:
+                list_result.extend([item["url"] for item in data])
+            if not next_url:
+                break
+            first += 10
+
+        return list_result[:num_results]
+
+    def _parse_html(self, url: str, rank_start: int = 0, first: int = 1) -> tuple:
+        """
+        Parse Bing search result HTML synchronously to extract search results and the next page URL.
+
+        Args:
+            url (str): The URL of the Bing search results page to parse.
+            rank_start (int, optional): The starting rank for numbering the search results. Defaults to 0.
+            first (int, optional): Unused parameter (possibly legacy). Defaults to 1.
+
+        Returns:
+            tuple: A tuple containing:
+                - list: A list of dictionaries with keys 'title', 'abstract', 'url', and 'rank' for each result.
+                  Results without a link are skipped.
+                - str or None: The URL of the next results page, or None if there is no next page.
+                On any request or parsing error, a warning is logged and ([], None) is returned.
+        """
+        try:
+            res = self.session.get(url=url, timeout=REQUEST_TIMEOUT)
+            res.encoding = "utf-8"
+            root = BeautifulSoup(res.text, "lxml")
+
+            list_data = []
+            ol_results = root.find("ol", id="b_results")
+            if not ol_results:
+                return [], None
+
+            for li in ol_results.find_all("li", class_="b_algo"):
+                title = ""
+                result_url = ""
+                abstract = ""
+                try:
+                    h2 = li.find("h2")
+                    if h2:
+                        title = h2.text.strip()
+                        result_url = h2.a["href"].strip()
+
+                    # Callers only consume URLs, so a result without a link is skipped.
+                    if not result_url:
+                        continue
+
+                    p = li.find("p")
+                    if p:
+                        abstract = p.text.strip()
+
+                    if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
+                        abstract = abstract[:ABSTRACT_MAX_LENGTH]
+
+                    rank_start += 1
+                    list_data.append(
+                        {
+                            "title": title,
+                            "abstract": abstract,
+                            "url": result_url,
+                            "rank": rank_start,
+                        }
+                    )
+                except Exception:
+                    # Best effort: a malformed result entry must not abort the whole page.
+                    continue
+
+            next_btn = root.find("a", title="Next page")
+            if not next_btn:
+                return list_data, None
+
+            next_url = BING_HOST_URL + next_btn["href"]
+            return list_data, next_url
+        except Exception as e:
+            logger.warning(f"Error parsing HTML: {e}")
+            return [], None
+
+    def perform_search(self, query, num_results=10, *args, **kwargs):
+        """Bing search engine."""
+        return self._search_sync(query, num_results=num_results)
diff --git a/app/tool/web_search.py b/app/tool/web_search.py
index 7b1018b..cb13934 100644
--- a/app/tool/web_search.py
+++ b/app/tool/web_search.py
@@ -7,6 +7,7 @@ from app.config import config
 from app.tool.base import BaseTool
 from app.tool.search import (
     BaiduSearchEngine,
+    BingSearchEngine,
     DuckDuckGoSearchEngine,
     GoogleSearchEngine,
     WebSearchEngine,
@@ -37,6 +38,7 @@ class WebSearch(BaseTool):
         "google": GoogleSearchEngine(),
         "baidu": BaiduSearchEngine(),
         "duckduckgo": DuckDuckGoSearchEngine(),
+        "bing": BingSearchEngine(),
     }
 
     async def execute(self, query: str, num_results: int = 10) -> List[str]: