Files
pi_mcps/mcp/webscraper/src/server.py
T
Patrick Plate 2ab847f51d feat(webscraper): add Brave Search hint tool and User-Agent header
- Add webscraper_search_hint() tool using Brave Search as backend
  (no CAPTCHA/GDPR consent wall, works with plain httpx)
- Add User-Agent header to _fetch_page() — fixes 403 on Wikipedia,
  Feynman Lectures, and other sites that block headless requests
- Add 5 new tests for search hint (23 total, 90% coverage)

Brave Search URL: https://search.brave.com/search?q={query}&source=web
Use sparingly — once per research task as orientation, not in loops
2026-04-05 09:37:30 +02:00

313 lines
11 KiB
Python

"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
import httpx
from bs4 import BeautifulSoup
from html2text import html2text
from urllib.parse import urljoin
from typing import List, Dict, Tuple
import re
import ssl
import os
import certifi
from pathlib import Path
from fastmcp import FastMCP
mcp = FastMCP("webscraper")
# Build a single SSL context at module load — certifi bundle + any extra certs
# shipped in the certs/ directory next to this file.
_EXTRA_CERTS_DIR = Path(__file__).resolve().parent.parent / "certs"
def _build_ssl_context() -> ssl.SSLContext:
"""Build an SSL context from certifi + extra bundled root certs."""
ctx = ssl.create_default_context(cafile=certifi.where())
if _EXTRA_CERTS_DIR.is_dir():
for pem in _EXTRA_CERTS_DIR.glob("*.pem"):
ctx.load_verify_locations(cafile=str(pem))
return ctx
_SSL_CTX = _build_ssl_context()
_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
}
def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
"""Shared fetch helper — returns response and parsed soup."""
response = httpx.get(url, timeout=10.0, verify=_SSL_CTX, headers=_HEADERS)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
return response, soup
def clean_soup(soup):
"""Remove script, style, and other junk from soup before extraction."""
for element in soup(["script", "style", "nav", "footer", "header"]):
element.decompose()
return soup
def filter_junk_links(href: str) -> bool:
"""Filter out junk links: mailto, javascript, tel, data."""
junk_patterns = [r'^mailto:', r'^javascript:', r'^tel:', r'^data:']
return not any(re.match(pattern, href.lower()) for pattern in junk_patterns)
@mcp.tool()
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
"""Fetch a URL and return title + markdown body + metadata.
Args:
url: The URL to fetch
max_chars: Maximum characters in the markdown body (default: 5000)
Returns:
Markdown string with title, body, and metadata
"""
try:
response, soup = _fetch_page(url)
title = soup.title.string if soup.title else "No Title"
soup = clean_soup(soup)
body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
body = body[:max_chars] + "..." if len(body) > max_chars else body
metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return f"# Error fetching {url}\n\n{str(e)}"
@mcp.tool()
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
"""Fetch a URL and extract all href links.
Args:
url: The URL to fetch
deduplicate: Remove duplicate links (default: True)
Returns:
List of unique href URLs
"""
try:
_, soup = _fetch_page(url)
links = []
for a in soup.find_all('a', href=True):
href = a['href']
full_url = urljoin(url, href)
if filter_junk_links(full_url):
links.append(full_url)
if deduplicate:
links = list(set(links))
return links
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
def webscraper_fetch_tables(url: str) -> List[str]:
"""Fetch a URL and extract all HTML tables as markdown.
Args:
url: The URL to fetch
Returns:
List of markdown tables
"""
try:
_, soup = _fetch_page(url)
tables = []
for table in soup.find_all('table'):
markdown_table = html2text(str(table), bodywidth=0)
tables.append(markdown_table)
return tables if tables else ["No tables found."]
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
"""Fetch everything: markdown + links + tables + meta.
Args:
url: The URL to fetch
max_chars: Maximum characters (default: 5000)
Returns:
Dict with 'markdown', 'links', 'tables', 'meta'
"""
try:
response, soup = _fetch_page(url)
# Markdown
title = soup.title.string if soup.title else "No Title"
soup_clean = clean_soup(soup)
body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
body = body[:max_chars] + "..." if len(body) > max_chars else body
markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
# Links
links = []
for a in soup.find_all('a', href=True):
href = a['href']
full_url = urljoin(url, href)
if filter_junk_links(full_url):
links.append(full_url)
links = list(set(links))
# Tables
tables = []
for table in soup.find_all('table'):
markdown_table = html2text(str(table), bodywidth=0)
tables.append(markdown_table)
tables = tables if tables else ["No tables found."]
# Meta
meta = {}
meta['title'] = title
desc_tag = soup.find('meta', attrs={'name': 'description'})
meta['description'] = desc_tag['content'] if desc_tag else "No description"
og_title = soup.find('meta', attrs={'property': 'og:title'})
meta['og:title'] = og_title['content'] if og_title else title
og_desc = soup.find('meta', attrs={'property': 'og:description'})
meta['og:description'] = og_desc['content'] if og_desc else meta['description']
return {
"markdown": markdown,
"links": links,
"tables": tables,
"meta": meta
}
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"error": str(e)}
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
"""Fetch a URL and extract specific section by CSS selector.
Args:
url: The URL to fetch
selector: CSS selector (e.g., '.content')
Returns:
Markdown of the selected section
"""
try:
_, soup = _fetch_page(url)
try:
section = soup.select_one(selector)
except Exception as e:
if "selector" in str(e).lower():
return f"Invalid CSS selector '{selector}' on {url}"
raise
if not section:
return f"No element found for selector '{selector}' on {url}"
soup_clean = clean_soup(section)
markdown = html2text(str(soup_clean), bodywidth=0)
return markdown
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return f"Error: {str(e)}"
@mcp.tool()
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
"""Fetch a URL and return page metadata: title, description, OG tags.
Args:
url: The URL to fetch
Returns:
Dict of metadata
"""
try:
_, soup = _fetch_page(url)
meta = {}
meta['title'] = soup.title.string if soup.title else "No Title"
desc_tag = soup.find('meta', attrs={'name': 'description'})
meta['description'] = desc_tag['content'] if desc_tag else "No description"
og_title = soup.find('meta', attrs={'property': 'og:title'})
meta['og:title'] = og_title['content'] if og_title else meta['title']
og_desc = soup.find('meta', attrs={'property': 'og:description'})
meta['og:description'] = og_desc['content'] if og_desc else meta['description']
return meta
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"error": str(e)}
@mcp.tool()
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
"""Fetch sitemap.xml and return list of URLs.
Args:
url: Sitemap URL (or auto-discover)
max_urls: Maximum URLs to return (default: 100)
Returns:
List of sitemap URLs
"""
try:
response, soup = _fetch_page(url)
urls = []
for loc in soup.find_all('loc')[:max_urls]:
urls.append(loc.text.strip())
# Simple loop protection: check for self-reference
if url in urls:
urls.remove(url)
return urls if urls else [f"No URLs in sitemap {url}"]
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
"""Search Brave Search and return top results as a scraping hint.
Use this sparingly — once per research task — to get oriented before
scraping individual pages. Returns top result URLs + snippets so you
can decide which pages are worth scraping deeply.
Args:
query: Search query (e.g. "MacBook Pro M4 price Germany")
max_results: Maximum number of results to return (default: 5)
Returns:
Dict with 'query', 'results' (list of {title, url, snippet}), 'hint'
"""
try:
search_url = f"https://search.brave.com/search?q={query.replace(' ', '+')}&source=web"
_, soup = _fetch_page(search_url)
results = []
# Brave Search result cards: each <a> with class snippet contains title + description
for card in soup.select('.snippet')[:max_results]:
title_el = card.select_one('.snippet-title')
url_el = card.select_one('a')
desc_el = card.select_one('.snippet-description')
title = title_el.get_text(strip=True) if title_el else ""
url = url_el['href'] if url_el and url_el.get('href') else ""
snippet = desc_el.get_text(strip=True) if desc_el else ""
if url and url.startswith('http'):
results.append({"title": title, "url": url, "snippet": snippet})
hint = "; ".join(
f"{r['title']}: {r['url']}" for r in results
) if results else "No results found"
return {
"query": query,
"results": results,
"hint": hint,
}
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"query": query, "results": [], "hint": f"Error: {str(e)}"}
if __name__ == "__main__":
mcp.run(transport="stdio")