add bing search

2025-03-18 15:40:25 +08:00 · 2025-03-18 15:40:25 +08:00 · b95244a60b
commit b95244a60b
parent 3d7d553476
3 changed files with 157 additions and 0 deletions
--- a/app/tool/search/__init__.py
+++ b/app/tool/search/__init__.py
@@ -2,6 +2,7 @@ from app.tool.search.baidu_search import BaiduSearchEngine
 from app.tool.search.base import WebSearchEngine
 from app.tool.search.duckduckgo_search import DuckDuckGoSearchEngine
 from app.tool.search.google_search import GoogleSearchEngine
+from app.tool.search.bing_search import BingSearchEngine


 __all__ = [
@@ -9,4 +10,5 @@ __all__ = [
    "BaiduSearchEngine",
    "DuckDuckGoSearchEngine",
    "GoogleSearchEngine",
+    "BingSearchEngine",
 ]
--- a/app/tool/search/bing_search.py
+++ b/app/tool/search/bing_search.py
@@ -0,0 +1,153 @@
+import asyncio
+from typing import List
+import requests
+from app.logger import logger
+from bs4 import BeautifulSoup
+from app.tool.search.base import WebSearchEngine
+
+ABSTRACT_MAX_LENGTH = 300  # Abstracts longer than this many characters get truncated.
+
+USER_AGENTS = [  # Candidate User-Agent strings; HEADERS below sends the first one.
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
+    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 (KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net",
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) Gecko/20070404 K-Ninja/2.1.3",
+    "Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; en-US) iNet Browser 4.7",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866",
+]
+
+HEADERS = {  # Default request headers sent with every Bing request.
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+    "Content-Type": "application/x-www-form-urlencoded",
+    "User-Agent": USER_AGENTS[0],
+    "Referer": "https://www.bing.com/",
+    "Accept-Encoding": "gzip, deflate",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+}
+
+BING_HOST_URL = "https://www.bing.com"  # Origin that relative next-page links are resolved against.
+BING_SEARCH_URL = f"{BING_HOST_URL}/search?q="  # The search term is appended to this prefix.
+
+
+class BingSearchEngine(WebSearchEngine):
+    """Search engine that scrapes Bing's HTML result pages for result URLs."""
+
+    # Replaced with a live requests.Session by __init__.
+    session: requests.Session = None
+
+    def __init__(self, **data):
+        """Initialize the BingSearch tool with a requests session."""
+        super().__init__(**data)
+        self.session = requests.Session()
+        self.session.headers.update(HEADERS)
+
+    def _search_sync(self, query: str, num_results: int = 10) -> List[str]:
+        """
+        Synchronous Bing search implementation to retrieve a list of URLs matching a query.
+
+        Args:
+            query (str): The search query to submit to Bing; it is URL-encoded here.
+            num_results (int, optional): The maximum number of URLs to return. Defaults to 10.
+
+        Returns:
+            List[str]: A list of URLs from the search results, capped at `num_results`.
+                       Returns an empty list if the query is empty or no results are found.
+
+        Notes:
+            - Pagination follows the `next_url` link returned by `_parse_html`.
+            - Paging stops at the first page without results, so it cannot loop forever.
+        """
+        from urllib.parse import quote_plus  # local import keeps this block self-contained
+
+        if not query:
+            return []
+
+        list_result = []
+        first = 1
+        # Encode the query so spaces, '&', '#' or '+' cannot corrupt the request URL.
+        next_url = BING_SEARCH_URL + quote_plus(query)
+
+        while next_url and len(list_result) < num_results:
+            data, next_url = self._parse_html(next_url, rank_start=len(list_result), first=first)
+            if not data:
+                break
+            list_result.extend(item["url"] for item in data)
+            first += 10
+
+        return list_result[:num_results]
+
+    def _parse_html(self, url: str, rank_start: int = 0, first: int = 1) -> tuple:
+        """
+        Fetch one Bing results page and extract its results and the next page URL.
+
+        Args:
+            url (str): The URL of the Bing search results page to fetch and parse.
+            rank_start (int, optional): Rank of the last result already collected; results
+                found here are numbered from `rank_start + 1`. Defaults to 0.
+            first (int, optional): Unused; kept so existing callers keep working. Defaults to 1.
+
+        Returns:
+            tuple: A tuple containing:
+                - list: A list of dictionaries with keys 'title', 'abstract', 'url', and 'rank',
+                  one for each result that carries a link.
+                - str or None: The URL of the next results page, or None if there is no next page.
+            If fetching or parsing fails, a warning is logged and `([], None)` is returned.
+        """
+        from urllib.parse import urljoin  # local import keeps this block self-contained
+
+        try:
+            # Without a timeout a stalled connection would block the worker thread forever.
+            res = self.session.get(url=url, timeout=10)
+            res.raise_for_status()
+            res.encoding = "utf-8"
+            root = BeautifulSoup(res.text, "lxml")
+
+            ol_results = root.find("ol", id="b_results")
+            if not ol_results:
+                return [], None
+
+            list_data = []
+            for li in ol_results.find_all("li", class_="b_algo"):
+                h2 = li.find("h2")
+                anchor = h2.find("a") if h2 else None
+                link = (anchor.get("href") or "").strip() if anchor else ""
+                if not link:
+                    continue  # a result without a link is useless to URL consumers
+
+                p = li.find("p")
+                abstract = p.text.strip() if p else ""
+                if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
+                    abstract = abstract[:ABSTRACT_MAX_LENGTH]
+
+                rank_start += 1
+                title = h2.text.strip()
+                list_data.append({"title": title, "abstract": abstract, "url": link, "rank": rank_start})
+
+            next_btn = root.find("a", title="Next page")
+            next_href = next_btn.get("href") if next_btn else None
+            # urljoin handles relative and absolute hrefs alike.
+            next_url = urljoin(BING_HOST_URL, next_href) if next_href else None
+            return list_data, next_url
+        except Exception as e:
+            logger.warning(f"Error parsing HTML: {e}")
+            return [], None
+
+    async def execute(self, query: str, num_results: int = 10) -> List[str]:
+        """
+        Execute a Bing search and return a list of URLs asynchronously.
+
+        Args:
+            query (str): The search query to submit to Bing.
+            num_results (int, optional): The number of search results to return. Default is 10.
+
+        Returns:
+            List[str]: A list of URLs matching the search query.
+        """
+        # Inside a coroutine get_running_loop() is the preferred way to reach the loop.
+        loop = asyncio.get_running_loop()
+        # The blocking scrape runs in the default executor so the event loop stays free.
+        return await loop.run_in_executor(None, self._search_sync, query, num_results)
--- a/app/tool/web_search.py
+++ b/app/tool/web_search.py
@@ -10,6 +10,7 @@ from app.tool.search import (
    DuckDuckGoSearchEngine,
    GoogleSearchEngine,
    WebSearchEngine,
+    BingSearchEngine
 )


@@ -37,6 +38,7 @@ class WebSearch(BaseTool):
        "google": GoogleSearchEngine(),
        "baidu": BaiduSearchEngine(),
        "duckduckgo": DuckDuckGoSearchEngine(),
+        "bing": BingSearchEngine()
    }

    async def execute(self, query: str, num_results: int = 10) -> List[str]: