
Commit 4a6ef1e

Merge pull request #8 from mathiasesn/7-extract-website-as-markdown
Extract website as markdown
2 parents fd3a2de + a90ee7e commit 4a6ef1e

File tree

5 files changed: +369 −6 lines

pyproject.toml

Lines changed: 7 additions & 5 deletions
@@ -1,6 +1,10 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
 [project]
 name = "repo-context"
-version = "0.3.0"
+version = "0.4.0"
 description = "Convert Git repositories into LLM-friendly context format"
 authors = [{ name = "Mathias Nielsen", email = "[email protected]" }]
 maintainers = [{ name = "Mathias Nielsen", email = "[email protected]" }]
@@ -9,7 +13,9 @@ license = { file = "LICENSE" }
 requires-python = ">=3.10"
 dependencies = [
     "gitpython>=3.1.43",
+    "markdownify>=0.14.1",
     "python-dotenv>=1.0.1",
+    "requests>=2.32.3",
     "rich>=13.9.4",
     "tqdm>=4.67.1",
 ]
@@ -23,7 +29,3 @@ dev-dependencies = [
 
 [project.scripts]
 repo-context = "repo_context.cli:main"
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
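
The two new runtime dependencies back the webpage feature added below: requests fetches the page and markdownify converts the HTML to Markdown. A minimal, self-contained sketch of the conversion step alone (the HTML string is a made-up example, not taken from this repository):

from markdownify import markdownify

html = "<h1>Title</h1><p>Some <strong>bold</strong> text.</p>"
print(markdownify(html).strip())
# Expected output (roughly), with markdownify's default underlined heading style:
# Title
# =====
#
# Some **bold** text.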

repo_context/cli.py

Lines changed: 22 additions & 0 deletions
@@ -42,6 +42,11 @@ def parse_args() -> Namespace:
         default=None,
         help="Maximum number of lines in context files",
     )
+    parser.add_argument(
+        "--web",
+        action="store_true",
+        help="Convert a webpage instead of a repository",
+    )
     args = parser.parse_args()
     return args
 
@@ -50,6 +55,23 @@ def main():
     # Parse arguments
     args = parse_args()
 
+    if args.web:
+        from repo_context.webpage import Webpage
+
+        # Create the webpage converter and get markdown
+        webpage = Webpage()
+        context = webpage.get_markdown(args.source)
+
+        # Get the filename from the URL
+        fname = urlparse(args.source).path.strip("/").replace("/", "-")
+
+        # Write context to file
+        output_path = Path(f"{args.output}/{fname}.md")
+        output_path.write_text(context)
+
+        logger.info(f"Context written to {output_path}")
+        return
+
     # Concat ignore patterns
     ignore_patterns = args.ignore.copy() if args.ignore else []
     if args.ignore_file:
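
The output filename is derived by slugifying the URL path. A small sketch of that step in isolation, using a hypothetical URL:

from urllib.parse import urlparse

url = "https://example.com/docs/getting-started"  # hypothetical example URL
fname = urlparse(url).path.strip("/").replace("/", "-")
print(fname)  # -> "docs-getting-started"
# main() then writes the converted page to f"{args.output}/{fname}.md"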

repo_context/webpage.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+import logging
+import re
+from functools import lru_cache
+from urllib.parse import urlparse
+
+import requests
+from markdownify import markdownify
+from requests.exceptions import RequestException, Timeout
+
+logger = logging.getLogger("repo_context.webpage")
+
+
+class Webpage:
+    """A class for fetching and converting webpages to markdown format."""
+
+    def __init__(
+        self,
+        timeout: int = 20,
+        allowed_schemes: tuple[str] = ("http", "https"),
+        max_retries: int = 3,
+    ) -> None:
+        self.timeout = timeout
+        self.allowed_schemes = allowed_schemes
+        self.max_retries = max_retries
+
+        self.user_agent: str = "Mozilla/5.0 (compatible; WebpageFetcher/1.0)"
+
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": self.user_agent})
+
+    def _validate_url(self, url: str) -> None:
+        """Validates URL scheme and format."""
+        parsed = urlparse(url)
+        if parsed.scheme not in self.allowed_schemes:
+            raise ValueError(f"Invalid URL scheme. Allowed: {self.allowed_schemes}")
+
+    def _fetch_content(self, url: str) -> str:
+        """Fetches webpage content with retries."""
+        for attempt in range(self.max_retries):
+            try:
+                response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+                return response.text
+            except Timeout:
+                if attempt == self.max_retries - 1:
+                    raise RuntimeError("Request timed out after retries")
+            except RequestException as e:
+                raise RuntimeError(f"Failed to fetch webpage: {e}")
+
+    def _convert_to_markdown(self, html: str) -> str:
+        """Converts HTML to clean markdown format."""
+        try:
+            markdown = markdownify(html).strip()
+            return re.sub(r"\n{3,}", "\n\n", markdown)
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert HTML to markdown: {e}")
+
+    @lru_cache(maxsize=100)
+    def get_markdown(self, url: str) -> str:
+        """
+        Fetches webpage and converts to markdown format with caching.
+
+        Args:
+            url: Webpage URL to fetch
+
+        Returns:
+            Converted markdown content
+
+        Raises:
+            WebpageError: If fetching or conversion fails
+            ValueError: If URL is invalid
+        """
+        try:
+            self._validate_url(url)
+            content = self._fetch_content(url)
+            return self._convert_to_markdown(content)
+        except Exception as e:
+            logger.error(f"Failed to process {url}: {e}")
+            raise
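
A minimal usage sketch of the new class (the URL is a placeholder); repeated calls with the same URL on the same instance are served from the lru_cache rather than refetched:

from repo_context.webpage import Webpage

page = Webpage(timeout=10, max_retries=2)  # defaults are timeout=20, max_retries=3
markdown = page.get_markdown("https://example.com")  # validates, fetches, converts
print(markdown[:200])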

tests/test_webpage.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+from unittest.mock import Mock, patch
+
+import pytest
+from requests.exceptions import RequestException, Timeout
+from repo_context.webpage import Webpage
+
+
+@pytest.fixture
+def webpage():
+    """Base webpage instance with default settings."""
+    return Webpage()
+
+
+@pytest.fixture
+def mock_response():
+    """Mock successful response fixture."""
+    response = Mock()
+    response.text = "<h1>Test</h1><p>Content</p>"
+    response.raise_for_status.return_value = None
+    return response
+
+
+class TestWebpage:
+    def test_init_default_values(self):
+        """Test initialization with default values."""
+        webpage = Webpage()
+        assert webpage.timeout == 20
+        assert webpage.allowed_schemes == ("http", "https")
+        assert webpage.max_retries == 3
+        assert "Mozilla" in webpage.user_agent
+
+    @pytest.mark.parametrize(
+        "url,valid",
+        [
+            ("https://example.com", True),
+            ("http://test.com", True),
+            ("ftp://invalid.com", False),
+            ("invalid-url", False),
+        ],
+    )
+    def test_validate_url(self, webpage, url, valid):
+        """Test URL validation with various inputs."""
+        if valid:
+            webpage._validate_url(url)
+        else:
+            with pytest.raises(ValueError):
+                webpage._validate_url(url)
+
+    @patch("requests.Session.get")
+    def test_fetch_content_success(self, mock_get, webpage, mock_response):
+        """Test successful content fetching."""
+        mock_get.return_value = mock_response
+        content = webpage._fetch_content("https://example.com")
+        assert content == "<h1>Test</h1><p>Content</p>"
+        mock_get.assert_called_once()
+
+    @patch("requests.Session.get")
+    def test_fetch_content_timeout_retry(self, mock_get, webpage):
+        """Test timeout handling with retries."""
+        mock_get.side_effect = Timeout()
+        with pytest.raises(RuntimeError, match="timed out"):
+            webpage._fetch_content("https://example.com")
+        assert mock_get.call_count == webpage.max_retries
+
+    @patch("requests.Session.get")
+    def test_fetch_content_request_error(self, mock_get, webpage):
+        """Test request exception handling."""
+        mock_get.side_effect = RequestException("Network error")
+        with pytest.raises(RuntimeError, match="Failed to fetch"):
+            webpage._fetch_content("https://example.com")
+
+    def test_convert_to_markdown(self, webpage):
+        """Test HTML to markdown conversion."""
+        html = "<h1>Test</h1><p>Content</p>\n\n\n<p>More</p>"
+        markdown = webpage._convert_to_markdown(html)
+        assert "Test\n====" in markdown
+        assert "\n\n\n" not in markdown
+
+    @patch("requests.Session.get")
+    def test_get_markdown_integration(self, mock_get, webpage, mock_response):
+        """Test complete markdown conversion flow."""
+        mock_get.return_value = mock_response
+        result = webpage.get_markdown("https://example.com")
+        assert "Test\n====" in result
+        assert "Content" in result
+
+    @patch("requests.Session.get")
+    def test_get_markdown_caching(self, mock_get, webpage, mock_response):
+        """Test LRU caching functionality."""
+        mock_get.return_value = mock_response
+        url = "https://example.com"
+
+        # First call
+        webpage.get_markdown(url)
+        # Second call (should use cache)
+        webpage.get_markdown(url)
+
+        mock_get.assert_called_once()
+
+    def test_custom_timeout(self):
+        """Test custom timeout configuration."""
+        webpage = Webpage(timeout=30)
+        assert webpage.timeout == 30
+
+    def test_custom_schemes(self):
+        """Test custom allowed schemes."""
+        webpage = Webpage(allowed_schemes=("https",))
+        assert webpage.allowed_schemes == ("https",)
+        with pytest.raises(ValueError):
+            webpage._validate_url("http://example.com")
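
The caching test passes because both calls reuse the same Webpage instance and URL, so lru_cache (keyed on the bound self and url arguments) answers the second call without touching the network. A small sketch of the same behaviour outside pytest, with the HTTP call mocked:

from unittest.mock import Mock, patch

from repo_context.webpage import Webpage

with patch("requests.Session.get") as mock_get:
    response = Mock()
    response.text = "<h1>Hello</h1>"
    response.raise_for_status.return_value = None
    mock_get.return_value = response

    page = Webpage()
    page.get_markdown("https://example.com")
    page.get_markdown("https://example.com")  # second call is served from the cache
    assert mock_get.call_count == 1  # only one real fetch was made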

0 commit comments