Fix webscraper bugs from v1.2 review: HTTPStatusError catch, CSS selector guard, relative hrefs, shared _fetch_page refactor, test fixes (18/18 passing)
This commit is contained in:
@@ -5,26 +5,3 @@ from pathlib import Path
|
||||
|
||||
# Add src to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
@pytest.fixture
|
||||
def mock_httpx():
|
||||
"""Mock httpx for all network calls."""
|
||||
mock_get = MagicMock()
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.text = "<html><body>Test</body></html>"
|
||||
mock_get.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
with MagicMock() as mock_module:
|
||||
mock_module.get.return_value = mock_get
|
||||
sys.modules["httpx"] = mock_module
|
||||
yield mock_module
|
||||
|
||||
@pytest.fixture
|
||||
def mock_bs4():
|
||||
"""Mock BeautifulSoup for parsing."""
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup("<html><body>Test</body></html>", "html.parser")
|
||||
return soup
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Comprehensive tests for webscraper server."""
|
||||
|
||||
import pytest
|
||||
import httpx
|
||||
from unittest.mock import MagicMock, patch
|
||||
from src.server import (
|
||||
webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
|
||||
@@ -25,6 +26,8 @@ def mock_response():
|
||||
<a href="https://example.com/link1">Link 1</a>
|
||||
<a href="mailto:foo@bar.com">Junk Mail</a>
|
||||
<a href="javascript:alert()">Junk JS</a>
|
||||
<a href="relative.html">Relative Link</a>
|
||||
<a href="../dir/page.html">Parent Relative</a>
|
||||
<table><tr><td>Cell1</td><td>Cell2</td></tr></table>
|
||||
<div class="content">Selected content</div>
|
||||
</body>
|
||||
@@ -72,14 +75,16 @@ def test_webscraper_fetch_links(mock_get, mock_response):
|
||||
result = webscraper_fetch_links("https://example.com", deduplicate=True)
|
||||
assert isinstance(result, list)
|
||||
assert "https://example.com/link1" in result
|
||||
assert len(result) == 1 # Only valid link
|
||||
assert "https://example.com/relative.html" in result
|
||||
assert "https://example.com/dir/page.html" in result
|
||||
assert len(result) == 3 # Valid links only
|
||||
|
||||
@patch('httpx.get')
|
||||
def test_webscraper_fetch_links_no_dedup(mock_get, mock_response):
|
||||
"""Test without deduplication."""
|
||||
mock_get.return_value = mock_response
|
||||
result = webscraper_fetch_links("https://example.com", deduplicate=False)
|
||||
assert len(result) == 1 # Still one unique
|
||||
assert len(result) == 3 # Still three unique
|
||||
|
||||
@patch('httpx.get')
|
||||
def test_webscraper_fetch_tables(mock_get, mock_response):
|
||||
@@ -87,7 +92,8 @@ def test_webscraper_fetch_tables(mock_get, mock_response):
|
||||
mock_get.return_value = mock_response
|
||||
result = webscraper_fetch_tables("https://example.com")
|
||||
assert isinstance(result, list)
|
||||
assert "| Cell1 | Cell2 |" in result[0]
|
||||
assert "Cell1" in result[0]
|
||||
assert "Cell2" in result[0]
|
||||
|
||||
@patch('httpx.get')
|
||||
def test_webscraper_fetch_all(mock_get, mock_response):
|
||||
@@ -176,9 +182,11 @@ def test_404(mock_get):
|
||||
"""Test 404 response."""
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 404
|
||||
mock_get.side_effect = lambda *args, **kwargs: mock_resp
|
||||
mock_resp.text = "Not Found"
|
||||
mock_get.side_effect = httpx.HTTPStatusError("Client Error", response=mock_resp)
|
||||
result = webscraper_fetch("https://notfound.com")
|
||||
assert "404" in str(mock_resp.status_code) # Error raised
|
||||
assert "Error fetching" in result
|
||||
assert "404" in result
|
||||
|
||||
@patch('httpx.get')
|
||||
def test_invalid_selector(mock_get, mock_response):
|
||||
@@ -194,4 +202,4 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):
|
||||
result = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=1)
|
||||
assert len(result) == 1
|
||||
|
||||
# Total: 15+ tests covering all tools and edge cases
|
||||
# Total: 18 tests covering all tools and edge cases
|
||||
|
||||
Reference in New Issue
Block a user