Fix webscraper bugs from v1.2 review: HTTPStatusError catch, CSS selector guard, relative hrefs, shared _fetch_page refactor, test fixes (18/18 passing)

2026-04-03 13:43:44 +02:00
parent 38a2b89bd3
commit bbeca4e27e
4 changed files with 228 additions and 79 deletions
@@ -1,15 +1,22 @@
 """Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""

 import httpx
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup
+from soupsieve import SelectorSyntaxError
 from html2text import html2text
-from urllib.parse import urljoin, urlparse
-from typing import List, Dict, Optional
+from urllib.parse import urljoin
+from typing import List, Dict, Tuple
 import re
 from fastmcp import FastMCP

 mcp = FastMCP("webscraper")

+def _fetch_page(url: str, parser: str = 'lxml') -> Tuple[httpx.Response, BeautifulSoup]:
+    """GET `url` (10 s timeout) and return `(response, soup)`; `parser` is the BeautifulSoup parser name (default 'lxml', pass 'xml' for XML documents such as sitemaps)."""
+    response = httpx.get(url, timeout=10.0)
+    # raise_for_status() raises httpx.HTTPStatusError; the tool functions catch it and return an error message.
+    response.raise_for_status()
+    return response, BeautifulSoup(response.text, parser)
+
 def clean_soup(soup):
    """Remove script, style, and other junk from soup before extraction."""
    for element in soup(["script", "style", "nav", "footer", "header"]):
@@ -33,10 +40,7 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
        Markdown string with title, body, and metadata
    """
    try:
-        response = httpx.get(url, timeout=10.0)
-        response.raise_for_status()
-        
-        soup = BeautifulSoup(response.text, 'lxml')
+        response, soup = _fetch_page(url)
        title = soup.title.string if soup.title else "No Title"
        soup = clean_soup(soup)
        body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
@@ -45,7 +49,7 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
        metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
        
        return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
-    except httpx.RequestError as e:
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"# Error fetching {url}\n\n{str(e)}"

@mcp.tool()
@@ -60,23 +64,19 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
        List of unique href URLs
    """
    try:
-        response = httpx.get(url, timeout=10.0)
-        response.raise_for_status()
-        
-        soup = BeautifulSoup(response.text, 'lxml')
+        _, soup = _fetch_page(url)
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
-            if href.startswith('http') or href.startswith('/'):
-                full_url = urljoin(url, href)
-                if filter_junk_links(full_url):
-                    links.append(full_url)
+            full_url = urljoin(url, href)
+            if filter_junk_links(full_url):
+                links.append(full_url)
        
        if deduplicate:
            links = list(set(links))
        
        return links
-    except httpx.RequestError as e:
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

@mcp.tool()
@@ -90,16 +90,13 @@ def webscraper_fetch_tables(url: str) -> List[str]:
        List of markdown tables
    """
    try:
-        response = httpx.get(url, timeout=10.0)
-        response.raise_for_status()
-        
-        soup = BeautifulSoup(response.text, 'lxml')
+        _, soup = _fetch_page(url)
        tables = []
        for table in soup.find_all('table'):
            markdown_table = html2text(str(table), bodywidth=0)
            tables.append(markdown_table)
        return tables if tables else ["No tables found."]
-    except httpx.RequestError as e:
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

@mcp.tool()
@@ -113,17 +110,50 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'
    """
-    markdown = webscraper_fetch(url, max_chars)
-    links = webscraper_fetch_links(url)
-    tables = webscraper_fetch_tables(url)
-    meta = webscraper_fetch_meta(url)
-    
-    return {
-        "markdown": markdown,
-        "links": links,
-        "tables": tables,
-        "meta": meta
-    }
+    try:
+        response, soup = _fetch_page(url)
+        
+        # Markdown
+        title = soup.title.string if soup.title else "No Title"
+        soup_clean = clean_soup(soup)
+        body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
+        body = body[:max_chars] + "..." if len(body) > max_chars else body
+        markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
+        
+        # Links
+        links = []
+        for a in soup.find_all('a', href=True):
+            href = a['href']
+            full_url = urljoin(url, href)
+            if filter_junk_links(full_url):
+                links.append(full_url)
+        links = list(set(links))
+        
+        # Tables
+        tables = []
+        for table in soup.find_all('table'):
+            markdown_table = html2text(str(table), bodywidth=0)
+            tables.append(markdown_table)
+        tables = tables if tables else ["No tables found."]
+        
+        # Meta
+        meta = {}
+        meta['title'] = title
+        desc_tag = soup.find('meta', attrs={'name': 'description'})
+        meta['description'] = desc_tag['content'] if desc_tag else "No description"
+        og_title = soup.find('meta', attrs={'property': 'og:title'})
+        meta['og:title'] = og_title['content'] if og_title else title
+        og_desc = soup.find('meta', attrs={'property': 'og:description'})
+        meta['og:description'] = og_desc['content'] if og_desc else meta['description']
+        
+        return {
+            "markdown": markdown,
+            "links": links,
+            "tables": tables,
+            "meta": meta
+        }
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
+        return {"error": str(e)}

@mcp.tool()
 def webscraper_fetch_section(url: str, selector: str) -> str:
@@ -137,18 +167,19 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
        Markdown of the selected section
    """
    try:
-        response = httpx.get(url, timeout=10.0)
-        response.raise_for_status()
+        _, soup = _fetch_page(url)
+        try:
+            section = soup.select_one(selector)
+        except SelectorSyntaxError:
+            return f"Invalid CSS selector '{selector}' on {url}"
        
-        soup = BeautifulSoup(response.text, 'lxml')
-        section = soup.select_one(selector)
        if not section:
            return f"No element found for selector '{selector}' on {url}"
        
-        soup = clean_soup(section)
-        markdown = html2text(str(soup), bodywidth=0)
+        soup_clean = clean_soup(section)
+        markdown = html2text(str(soup_clean), bodywidth=0)
        return markdown
-    except httpx.RequestError as e:
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"Error: {str(e)}"

@mcp.tool()
@@ -162,10 +193,7 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
        Dict of metadata
    """
    try:
-        response = httpx.get(url, timeout=10.0)
-        response.raise_for_status()
-        
-        soup = BeautifulSoup(response.text, 'lxml')
+        _, soup = _fetch_page(url)
        meta = {}
        meta['title'] = soup.title.string if soup.title else "No Title"
        
@@ -179,7 +207,7 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
        meta['og:description'] = og_desc['content'] if og_desc else meta['description']
        
        return meta
-    except httpx.RequestError as e:
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

@mcp.tool()
@@ -194,10 +222,7 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
        List of sitemap URLs
    """
    try:
-        response = httpx.get(url, timeout=10.0)
-        response.raise_for_status()
-        
-        soup = BeautifulSoup(response.text, 'xml')
+        response, soup = _fetch_page(url)
        urls = []
        for loc in soup.find_all('loc')[:max_urls]:
            urls.append(loc.text.strip())
@@ -207,7 +232,7 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
            urls.remove(url)
        
        return urls if urls else [f"No URLs in sitemap {url}"]
-    except httpx.RequestError as e:
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

 if __name__ == "__main__":