Fix webscraper bugs from v1.2 review: HTTPStatusError catch, CSS selector guard, relative hrefs, shared _fetch_page refactor, test fixes (18/18 passing)

This commit is contained in:
Patrick Plate
2026-04-03 13:43:44 +02:00
parent 38a2b89bd3
commit bbeca4e27e
4 changed files with 228 additions and 79 deletions
+75 -50
View File
@@ -1,15 +1,22 @@
"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
import httpx
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, SelectorSyntaxError
from html2text import html2text
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
from urllib.parse import urljoin
from typing import List, Dict, Tuple
import re
from fastmcp import FastMCP
mcp = FastMCP("webscraper")
def _fetch_page(
    url: str,
    parser: str = 'lxml',
    timeout: float = 10.0,
) -> Tuple[httpx.Response, BeautifulSoup]:
    """Shared fetch helper — returns the HTTP response and the parsed soup.

    Args:
        url: Address to request with an HTTP GET.
        parser: Parser name handed to BeautifulSoup. Defaults to 'lxml'
            (HTML parsing); pass 'xml' for XML documents such as sitemaps.
        timeout: Request timeout in seconds, forwarded to ``httpx.get``.

    Returns:
        A ``(response, soup)`` tuple: the ``httpx.Response`` and a
        ``BeautifulSoup`` tree built from ``response.text``.

    Raises:
        httpx.RequestError: Propagated from ``httpx.get`` when the request
            itself fails.
        httpx.HTTPStatusError: Raised by ``response.raise_for_status()``
            when the server answers with an error status.
    """
    response = httpx.get(url, timeout=timeout)
    # Fail before parsing so callers never extract content from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, parser)
    return response, soup
def clean_soup(soup):
"""Remove script, style, and other junk from soup before extraction."""
for element in soup(["script", "style", "nav", "footer", "header"]):
@@ -33,10 +40,7 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
Markdown string with title, body, and metadata
"""
try:
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
response, soup = _fetch_page(url)
title = soup.title.string if soup.title else "No Title"
soup = clean_soup(soup)
body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
@@ -45,7 +49,7 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
except httpx.RequestError as e:
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return f"# Error fetching {url}\n\n{str(e)}"
@mcp.tool()
@@ -60,23 +64,19 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
List of unique href URLs
"""
try:
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
_, soup = _fetch_page(url)
links = []
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('http') or href.startswith('/'):
full_url = urljoin(url, href)
if filter_junk_links(full_url):
links.append(full_url)
full_url = urljoin(url, href)
if filter_junk_links(full_url):
links.append(full_url)
if deduplicate:
links = list(set(links))
return links
except httpx.RequestError as e:
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
@@ -90,16 +90,13 @@ def webscraper_fetch_tables(url: str) -> List[str]:
List of markdown tables
"""
try:
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
_, soup = _fetch_page(url)
tables = []
for table in soup.find_all('table'):
markdown_table = html2text(str(table), bodywidth=0)
tables.append(markdown_table)
return tables if tables else ["No tables found."]
except httpx.RequestError as e:
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
@@ -113,17 +110,50 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
Returns:
Dict with 'markdown', 'links', 'tables', 'meta'
"""
markdown = webscraper_fetch(url, max_chars)
links = webscraper_fetch_links(url)
tables = webscraper_fetch_tables(url)
meta = webscraper_fetch_meta(url)
return {
"markdown": markdown,
"links": links,
"tables": tables,
"meta": meta
}
try:
response, soup = _fetch_page(url)
# Markdown
title = soup.title.string if soup.title else "No Title"
soup_clean = clean_soup(soup)
body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
body = body[:max_chars] + "..." if len(body) > max_chars else body
markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
# Links
links = []
for a in soup.find_all('a', href=True):
href = a['href']
full_url = urljoin(url, href)
if filter_junk_links(full_url):
links.append(full_url)
links = list(set(links))
# Tables
tables = []
for table in soup.find_all('table'):
markdown_table = html2text(str(table), bodywidth=0)
tables.append(markdown_table)
tables = tables if tables else ["No tables found."]
# Meta
meta = {}
meta['title'] = title
desc_tag = soup.find('meta', attrs={'name': 'description'})
meta['description'] = desc_tag['content'] if desc_tag else "No description"
og_title = soup.find('meta', attrs={'property': 'og:title'})
meta['og:title'] = og_title['content'] if og_title else title
og_desc = soup.find('meta', attrs={'property': 'og:description'})
meta['og:description'] = og_desc['content'] if og_desc else meta['description']
return {
"markdown": markdown,
"links": links,
"tables": tables,
"meta": meta
}
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"error": str(e)}
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
@@ -137,18 +167,19 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
Markdown of the selected section
"""
try:
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
_, soup = _fetch_page(url)
try:
section = soup.select_one(selector)
except SelectorSyntaxError:
return f"Invalid CSS selector '{selector}' on {url}"
soup = BeautifulSoup(response.text, 'lxml')
section = soup.select_one(selector)
if not section:
return f"No element found for selector '{selector}' on {url}"
soup = clean_soup(section)
markdown = html2text(str(soup), bodywidth=0)
soup_clean = clean_soup(section)
markdown = html2text(str(soup_clean), bodywidth=0)
return markdown
except httpx.RequestError as e:
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return f"Error: {str(e)}"
@mcp.tool()
@@ -162,10 +193,7 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
Dict of metadata
"""
try:
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
_, soup = _fetch_page(url)
meta = {}
meta['title'] = soup.title.string if soup.title else "No Title"
@@ -179,7 +207,7 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
meta['og:description'] = og_desc['content'] if og_desc else meta['description']
return meta
except httpx.RequestError as e:
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"error": str(e)}
@mcp.tool()
@@ -194,10 +222,7 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
List of sitemap URLs
"""
try:
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'xml')
response, soup = _fetch_page(url)
urls = []
for loc in soup.find_all('loc')[:max_urls]:
urls.append(loc.text.strip())
@@ -207,7 +232,7 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
urls.remove(url)
return urls if urls else [f"No URLs in sitemap {url}"]
except httpx.RequestError as e:
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
if __name__ == "__main__":