Fix webscraper bugs from v1.2 review: HTTPStatusError catch, CSS selector guard, relative hrefs, shared _fetch_page refactor, test fixes (18/18 passing)

This commit is contained in:
Patrick Plate
2026-04-03 13:43:44 +02:00
parent 38a2b89bd3
commit bbeca4e27e
4 changed files with 228 additions and 79 deletions
+14 -6
View File
@@ -1,6 +1,7 @@
"""Comprehensive tests for webscraper server."""
import pytest
import httpx
from unittest.mock import MagicMock, patch
from src.server import (
webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
@@ -25,6 +26,8 @@ def mock_response():
<a href="https://example.com/link1">Link 1</a>
<a href="mailto:foo@bar.com">Junk Mail</a>
<a href="javascript:alert()">Junk JS</a>
<a href="relative.html">Relative Link</a>
<a href="../dir/page.html">Parent Relative</a>
<table><tr><td>Cell1</td><td>Cell2</td></tr></table>
<div class="content">Selected content</div>
</body>
@@ -72,14 +75,16 @@ def test_webscraper_fetch_links(mock_get, mock_response):
result = webscraper_fetch_links("https://example.com", deduplicate=True)
assert isinstance(result, list)
assert "https://example.com/link1" in result
assert len(result) == 1 # Only valid link
assert "https://example.com/relative.html" in result
assert "https://example.com/dir/page.html" in result
assert len(result) == 3 # Valid links only
@patch('httpx.get')
def test_webscraper_fetch_links_no_dedup(mock_get, mock_response):
    """Test link extraction without deduplication.

    The patched ``httpx.get`` returns the ``mock_response`` fixture, and
    ``webscraper_fetch_links`` is called with ``deduplicate=False``.
    """
    mock_get.return_value = mock_response
    result = webscraper_fetch_links("https://example.com", deduplicate=False)
    # Same count as the deduplicated test expects: three links.
    assert len(result) == 3  # Still three unique
@patch('httpx.get')
def test_webscraper_fetch_tables(mock_get, mock_response):
@@ -87,7 +92,8 @@ def test_webscraper_fetch_tables(mock_get, mock_response):
mock_get.return_value = mock_response
result = webscraper_fetch_tables("https://example.com")
assert isinstance(result, list)
assert "| Cell1 | Cell2 |" in result[0]
assert "Cell1" in result[0]
assert "Cell2" in result[0]
@patch('httpx.get')
def test_webscraper_fetch_all(mock_get, mock_response):
@@ -176,9 +182,11 @@ def test_404(mock_get):
"""Test 404 response."""
mock_resp = MagicMock()
mock_resp.status_code = 404
mock_get.side_effect = lambda *args, **kwargs: mock_resp
mock_resp.text = "Not Found"
mock_get.side_effect = httpx.HTTPStatusError("Client Error", response=mock_resp)
result = webscraper_fetch("https://notfound.com")
assert "404" in str(mock_resp.status_code) # Error raised
assert "Error fetching" in result
assert "404" in result
@patch('httpx.get')
def test_invalid_selector(mock_get, mock_response):
@@ -194,4 +202,4 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):
result = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=1)
assert len(result) == 1
# Total: 18 tests covering all tools and edge cases