diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 83fa1b8..5c09b82 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -34,4 +34,4 @@ RUN curl -sSL https://install.python-poetry.org | python3 - # Verify installations RUN exiftool -ver && \ - ffmpeg -version \ No newline at end of file + ffmpeg -version diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 7d4a8ef..23dbaf6 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,7 +8,6 @@ } }, - // Configure tool-specific properties "customizations": { "vscode": { "settings": { @@ -46,10 +45,8 @@ } }, - // Install project dependencies and dev tools "postCreateCommand": "pip install --user -e '.[dev]' && pip install hatch pre-commit pytest mypy black isort pytest-mock", - // Comment out to connect as root instead "remoteUser": "vscode", "features": { @@ -57,4 +54,4 @@ "ghcr.io/devcontainers/features/github-cli:1": {}, "ghcr.io/devcontainers-contrib/features/hatch:2": {} } -} \ No newline at end of file +} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad87db7..d7e6637 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,3 +21,4 @@ repos: - id: trailing-whitespace - id: check-yaml - id: check-json + exclude: "^.devcontainer/" diff --git a/README.md b/README.md index bb4abce..4e5375e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 📚 Readium -A powerful Python tool for extracting, analyzing, and converting documentation from repositories and directories into accessible formats. +A powerful Python tool for extracting, analyzing, and converting documentation from repositories, directories, and URLs into accessible formats.

Readium @@ -12,6 +12,10 @@ A powerful Python tool for extracting, analyzing, and converting documentation f - Support for private repositories using tokens - Branch selection for Git repositories - Secure token handling and masking +- 🌐 **Process webpages and URLs** to convert directly to Markdown + - Extract main content from documentation websites + - Convert HTML to well-formatted Markdown + - Support for tables, links, and images in converted content - 🔄 **Convert multiple document formats** to Markdown using MarkItDown integration - 🎯 **Target specific subdirectories** for focused analysis - ⚡ **Process a wide range of file types**: @@ -19,6 +23,7 @@ A powerful Python tool for extracting, analyzing, and converting documentation f - Code files (`.py`, `.js`, `.java`, etc.) - Configuration files (`.yml`, `.toml`, `.json`, etc.) - Office documents with MarkItDown (`.pdf`, `.docx`, `.xlsx`, `.pptx`) + - Webpages and HTML via direct URL processing - 🎛️ **Highly configurable**: - Customizable file size limits - Flexible file extension filtering @@ -59,6 +64,9 @@ readium https://github.com/username/repository -b feature-branch # Process a private Git repository with token readium https://token@github.com/username/repository +# Process a webpage and convert to Markdown +readium https://example.com/documentation + # Save output to a file readium /path/to/directory -o output.md @@ -85,6 +93,12 @@ readium /path/to/directory --debug # Generate split files for fine-tuning readium /path/to/directory --split-output ./training-data/ + +# Process URL with content preservation mode +readium https://example.com/docs --url-mode full + +# Process URL with main content extraction (default) +readium https://example.com/docs --url-mode clean ``` ### Python API @@ -118,12 +132,68 @@ summary, tree, content = reader.read_docs( # Process private Git repository with token summary, tree, content = reader.read_docs('https://token@github.com/username/repo') +# Process a webpage and convert to Markdown +summary, tree, content = reader.read_docs('https://example.com/documentation') + # Access results print("Summary:", summary) print("\nFile Tree:", tree) print("\nContent:", content) ``` +## 🌐 URL to Markdown + +Readium can process web pages and convert them directly to Markdown: + +```bash +# Process a webpage +readium https://example.com/documentation + +# Save the output to a file +readium https://example.com/documentation -o docs.md + +# Process URL preserving more content +readium https://example.com/documentation --url-mode full + +# Process URL extracting only main content (default) +readium https://example.com/documentation --url-mode clean +``` + +### URL Conversion Configuration + +The URL to Markdown conversion can be configured with several options: + +- `--url-mode`: Processing mode (`clean` or `full`) + - `clean` (default): Extracts only the main content, ignoring menus, ads, etc. 
+ - `full`: Attempts to preserve most of the page content + +### Python API for URLs + +```python +from readium import Readium, ReadConfig + +# Configure with URL options +config = ReadConfig( + url_mode="clean", # 'clean' or 'full' + include_tables=True, + include_images=True, + include_links=True, + include_comments=False, + debug=True +) + +reader = Readium(config) + +# Process a URL +summary, tree, content = reader.read_docs('https://example.com/documentation') + +# Save the content +with open('documentation.md', 'w', encoding='utf-8') as f: + f.write(content) +``` + +Readium uses [trafilatura](https://github.com/adbar/trafilatura) for web content extraction and conversion, which is especially effective for extracting the main content from technical documentation, tutorials, and other web resources. + ## 🔧 Configuration The `ReadConfig` class supports the following options: @@ -151,6 +221,15 @@ config = ReadConfig( # Specify extensions for MarkItDown processing markitdown_extensions={'.pdf', '.docx', '.xlsx'}, + # URL processing mode: 'clean' or 'full' + url_mode='clean', + + # URL content options + include_tables=True, + include_images=True, + include_links=True, + include_comments=False, + # Enable debug mode debug=False ) @@ -268,6 +347,11 @@ readium /path/to/repository \ --target-dir docs \ --use-markitdown \ --debug + +# Process a URL and create split files +readium https://example.com/docs \ + --split-output ./training-data/ \ + --url-mode clean ``` Python API: @@ -286,6 +370,9 @@ reader.split_output_dir = "./training-data/" # Process and generate split files summary, tree, content = reader.read_docs('/path/to/repository') + +# Process a URL and generate split files +summary, tree, content = reader.read_docs('https://example.com/docs') ``` ## 🛠️ Development @@ -340,5 +427,6 @@ This project is licensed under the MIT License - see the LICENSE file for detail ## 🙏 Acknowledgments - Microsoft and MarkItDown for their powerful document conversion tool +- Trafilatura for excellent web content extraction capabilities - Rich library for beautiful console output - Click for the powerful CLI interface diff --git a/poetry.lock b/poetry.lock index c104bdf..706239b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -81,6 +81,22 @@ files = [ {file = "audioop_lts-0.2.1.tar.gz", hash = "sha256:e81268da0baa880431b68b1308ab7257eb33f356e57a5f9b1f915dfb13dd1387"}, ] +[[package]] +name = "babel" +version = "2.17.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, + {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, +] + +[package.extras] +dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"] + [[package]] name = "backports-tarfile" version = "1.2.0" @@ -421,6 +437,27 @@ files = [ ] markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and platform_system == \"Windows\"", dev = "(platform_system == \"Windows\" or sys_platform == \"win32\") and (python_version <= \"3.11\" or python_version >= \"3.12\")"} +[[package]] +name = "courlan" +version = "1.3.2" +description = "Clean, filter and sample URLs to optimize data collection – includes spam, content type and language 
filters." +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be"}, + {file = "courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190"}, +] + +[package.dependencies] +babel = ">=2.16.0" +tld = ">=0.13" +urllib3 = ">=1.26,<3" + +[package.extras] +dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-urllib3"] + [[package]] name = "cryptography" version = "44.0.0" @@ -472,6 +509,30 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==44.0.0)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] +[[package]] +name = "dateparser" +version = "1.2.1" +description = "Date parsing library designed to parse dates from HTML pages" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "dateparser-1.2.1-py3-none-any.whl", hash = "sha256:bdcac262a467e6260030040748ad7c10d6bacd4f3b9cdb4cfd2251939174508c"}, + {file = "dateparser-1.2.1.tar.gz", hash = "sha256:7e4919aeb48481dbfc01ac9683c8e20bfe95bb715a38c1e9f6af889f4f30ccc3"}, +] + +[package.dependencies] +python-dateutil = ">=2.7.0" +pytz = ">=2024.2" +regex = ">=2015.06.24,<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27" +tzlocal = ">=0.2" + +[package.extras] +calendars = ["convertdate (>=2.2.1)", "hijridate"] +fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"] +langdetect = ["langdetect (>=1.0.0)"] + [[package]] name = "distlib" version = "0.3.9" @@ -609,6 +670,31 @@ pluggy = ">=1.0.0" tomli = {version = ">=1.2.2", markers = "python_version < \"3.11\""} trove-classifiers = "*" +[[package]] +name = "htmldate" +version = "1.9.3" +description = "Fast and robust extraction of original and updated publication dates from URLs and web pages." 
+optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "htmldate-1.9.3-py3-none-any.whl", hash = "sha256:3fadc422cf3c10a5cdb5e1b914daf37ec7270400a80a1b37e2673ff84faaaff8"}, + {file = "htmldate-1.9.3.tar.gz", hash = "sha256:ac0caf4628c3ded4042011e2d60dc68dfb314c77b106587dd307a80d77e708e9"}, +] + +[package.dependencies] +charset_normalizer = ">=3.4.0" +dateparser = ">=1.1.2" +lxml = {version = ">=5.3.0,<6", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""} +python-dateutil = ">=2.9.0.post0" +urllib3 = ">=1.26,<3" + +[package.extras] +all = ["htmldate[dev]", "htmldate[speed]"] +dev = ["black", "flake8", "mypy", "pytest", "pytest-cov", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"] +speed = ["backports-datetime-fromisoformat", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"] + [[package]] name = "httpcore" version = "1.0.7" @@ -928,6 +1014,22 @@ files = [ {file = "jiter-0.8.2.tar.gz", hash = "sha256:cd73d3e740666d0e639f678adb176fad25c1bcbdae88d8d7b857e1783bb4212d"}, ] +[[package]] +name = "justext" +version = "3.0.2" +description = "Heuristic based boilerplate removal tool" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"}, + {file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"}, +] + +[package.dependencies] +lxml = {version = ">=4.4.2", extras = ["html-clean"]} + [[package]] name = "keyring" version = "25.6.0" @@ -1108,6 +1210,9 @@ files = [ {file = "lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f"}, ] +[package.dependencies] +lxml-html-clean = {version = "*", optional = true, markers = "extra == \"html-clean\""} + [package.extras] cssselect = ["cssselect (>=0.7)"] html-clean = ["lxml-html-clean"] @@ -1115,6 +1220,22 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.11)"] +[[package]] +name = "lxml-html-clean" +version = "0.4.1" +description = "HTML cleaner from lxml project" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "lxml_html_clean-0.4.1-py3-none-any.whl", hash = "sha256:b704f2757e61d793b1c08bf5ad69e4c0b68d6696f4c3c1429982caf90050bcaf"}, + {file = "lxml_html_clean-0.4.1.tar.gz", hash = "sha256:40c838bbcf1fc72ba4ce811fbb3135913017b27820d7c16e8bc412ae1d8bc00b"}, +] + +[package.dependencies] +lxml = "*" + [[package]] name = "mammoth" version = "1.9.0" @@ -2134,6 +2255,111 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "regex" +version = "2024.11.6" +description = "Alternative regular expression module, to replace re." 
+optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62"}, + {file = "regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e"}, + {file = "regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = "sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3"}, + {file = 
"regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45"}, + {file = "regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9"}, + {file = "regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad"}, + {file = "regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54"}, + {file = "regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d"}, + {file = "regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff"}, + {file = "regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf"}, + {file = 
"regex-2024.11.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f"}, + {file = "regex-2024.11.6-cp38-cp38-win32.whl", hash = "sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4"}, + {file = "regex-2024.11.6-cp38-cp38-win_amd64.whl", hash = "sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95"}, + 
{file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b"}, + {file = "regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57"}, + {file = "regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983"}, + {file = "regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519"}, +] + [[package]] name = "requests" version = "2.32.3" @@ -2305,6 +2531,19 @@ files = [ {file = "standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654"}, ] +[[package]] +name = "tld" +version = "0.13" +description = "Extract the top-level domain (TLD) from the URL given." +optional = false +python-versions = ">=3.7, <4" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "tld-0.13-py2.py3-none-any.whl", hash = "sha256:f75b2be080f767ed17c2338a339eaa4fab5792586319ca819119da252f9f3749"}, + {file = "tld-0.13.tar.gz", hash = "sha256:93dde5e1c04bdf1844976eae440706379d21f4ab235b73c05d7483e074fb5629"}, +] + [[package]] name = "tomli" version = "2.2.1" @@ -2397,6 +2636,32 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "trafilatura" +version = "1.12.2" +description = "Python package and command-line tool designed to gather text on the Web, includes all necessary discovery and text processing components to perform web crawling, downloads, scraping, and extraction of main texts, metadata and comments." 
+optional = false +python-versions = ">=3.6" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "trafilatura-1.12.2-py3-none-any.whl", hash = "sha256:6df5b666f625c9579a50d7cc715005f450fa75606696aceab73eeda0a76dbe96"}, + {file = "trafilatura-1.12.2.tar.gz", hash = "sha256:4c9cb1434f7e13ef0b16cb44ee1d44e84523ec7268940b9559c374e7effc9a96"}, +] + +[package.dependencies] +certifi = "*" +charset-normalizer = {version = ">=3.2.0", markers = "python_version >= \"3.7\""} +courlan = ">=1.2.0" +htmldate = ">=1.8.1" +justext = ">=3.0.1" +lxml = {version = ">=5.2.2", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""} +urllib3 = {version = ">=1.26,<3", markers = "python_version >= \"3.7\""} + +[package.extras] +all = ["brotli", "cchardet (>=2.1.7)", "faust-cchardet (>=2.1.19)", "htmldate[speed] (>=1.8.1)", "py3langid (>=0.2.2)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.20.0)"] +gui = ["Gooey (>=1.0.1)"] + [[package]] name = "trove-classifiers" version = "2025.1.10.15" @@ -2436,6 +2701,25 @@ files = [ {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, ] +[[package]] +name = "tzlocal" +version = "5.3.1" +description = "tzinfo object for the local timezone" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"}, + {file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"}, +] + +[package.dependencies] +tzdata = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"] + [[package]] name = "urllib3" version = "2.3.0" @@ -2687,5 +2971,5 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" -python-versions = ">=3.10" -content-hash = "aeec2661763c0e6ad520a15051997c1e2ae4a38ac65f6b64e9a0b77818f5d27d" +python-versions = ">=3.10,<4.0" +content-hash = "2a5a451c8f536f90a8952a947e277b070c4a36e3a66f8a867cbf42af6a10c129" diff --git a/pr.sh b/pr.sh new file mode 100644 index 0000000..43bac9d --- /dev/null +++ b/pr.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# Script to generate a PR analysis prompt based on the diff against main/master + +# Determine whether the main branch is 'main' or 'master' +MAIN_BRANCH="main" +if ! git show-ref --verify --quiet refs/heads/main; then + if git show-ref --verify --quiet refs/heads/master; then + MAIN_BRANCH="master" + else + echo "Error: Neither 'main' nor 'master' branch found." + exit 1 + fi +fi + +# Get the current branch +CURRENT_BRANCH=$(git branch --show-current) + +if [ "$CURRENT_BRANCH" == "$MAIN_BRANCH" ]; then + echo "Error: You are currently on the $MAIN_BRANCH branch. Please checkout a feature branch." + exit 1 +fi + +# Get the diff +DIFF=$(git diff $MAIN_BRANCH..$CURRENT_BRANCH) + +# Build the prompt +PROMPT="# Pull Request Analysis Request + +Please analyze the following Git diff from a pull request and provide a detailed contribution analysis in English. + +## Expected Analysis Structure + +1. **Summary of Changes** + - Brief overview of the main modifications + - Key components affected + +2. **Technical Details** + - Files modified/added/deleted + - Key functions or methods changed + - Code quality observations + +3. 
**Implementation Analysis** + - Approach taken + - Design patterns used + - Potential improvements + +4. **Testing Considerations** + - Tests added/modified + - Test coverage + - Areas that might need additional testing + +5. **Documentation** + - Documentation quality + - Areas that might need more documentation + +6. **Impact Assessment** + - Potential impact on existing functionality + - Performance considerations + - Security implications (if any) + +7. **Conclusion** + - Overall assessment of the PR + - Recommendations (approve, request changes, etc.) + +## Git Diff + +\`\`\`diff +$DIFF +\`\`\` +" + +# Print the prompt or save it to a file +if [ -n "$1" ]; then + echo "$PROMPT" > "$1" + echo "Prompt saved to $1" +else + echo "$PROMPT" +fi diff --git a/pyproject.toml b/pyproject.toml index 3f6e701..8bceb15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "readium" -version = "0.2.0" -description = "A tool to extract and analyze documentation from repositories and directories" +version = "0.3.0" +description = "A tool to extract and analyze documentation from repositories, directories, and URLs" authors = [ {name = "Pablo Toledo", email = "pablotoledo@users.noreply.github.com"} ] license = {text = "MIT"} readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.10,<4.0" classifiers = [ "Development Status :: 3 - Alpha", "Environment :: Console", @@ -20,14 +20,16 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Documentation", ] + [tool.poetry.dependencies] -python = ">=3.10" +python = ">=3.10,<4.0" # Updated here as well click = ">=8.1.8,<9.0.0" rich = ">=13.9.4,<14.0.0" black = ">=24.10.0,<25.0.0" isort = ">=5.12.0,<6.0.0" markitdown = ">=0.0.1a3,<0.0.2" pypdf = ">=3.0.1,<4.0.0" +trafilatura = ">=1.6.0,<2.0.0" [tool.poetry.group.dev.dependencies] pytest = "*" @@ -50,7 +52,6 @@ readium = "readium.cli:main" requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" - [tool.isort] profile = "black" multi_line_output = 3 diff --git a/src/readium/cli.py b/src/readium/cli.py index d3355a7..afa9097 100644 --- a/src/readium/cli.py +++ b/src/readium/cli.py @@ -1,15 +1,17 @@ from pathlib import Path +from typing import Literal, cast # Add cast for typing import click from rich.console import Console from rich.table import Table +from .config import URL_MODES # Import URL_MODES for typing from .config import ( DEFAULT_EXCLUDE_DIRS, DEFAULT_INCLUDE_EXTENSIONS, MARKITDOWN_EXTENSIONS, ) -from .core import ReadConfig, Readium +from .core import ReadConfig, Readium, is_url from .utils.error_handling import print_error console = Console() @@ -17,7 +19,7 @@ @click.command( help=""" -Read and analyze documentation from directories or repositories. +Read and analyze documentation from directories, repositories, or URLs. 
Examples: # Process a local directory @@ -26,21 +28,23 @@ # Process a Git repository readium https://github.com/username/repository - # Process a specific branch of a Git repository - readium https://github.com/username/repository -b feature-branch + # Process a webpage and convert to Markdown + readium https://example.com/docs + + # Process a webpage with custom output + readium https://example.com/docs -o docs.md # Save output to a file readium /path/to/directory -o output.md - # Process specific subdirectory - readium /path/to/directory -t python - - # Generate split files for fine-tuning - readium /path/to/directory --split-output ./fine-tuning-data + # Generate split files from a webpage + readium https://example.com/docs --split-output ./markdown-files/ """ ) @click.argument("path", type=str) -@click.option("--target-dir", "-t", help="Target subdirectory to analyze") +@click.option( + "--target-dir", "-t", help="Target subdirectory to analyze (for directories)" +) @click.option( "--branch", "-b", help="Specific Git branch to clone (only for Git repositories)" ) @@ -52,10 +56,7 @@ help="Maximum file size in bytes (default: 5MB)", ) @click.option( - "--output", - "-o", - type=click.Path(), - help="Output file path for combined results", + "--output", "-o", type=click.Path(), help="Output file path for combined results" ) @click.option( "--split-output", @@ -63,22 +64,22 @@ help="Directory path for split output files (each file gets its own UUID-named file)", ) @click.option( - "--exclude-dir", "-x", multiple=True, help="Additional directories to exclude" -) -@click.option( - "--include-ext", "-i", multiple=True, help="Additional extensions to include" + "--exclude-dir", + "-x", + multiple=True, + help="Additional directories to exclude (for directories)", ) @click.option( - "--use-markitdown/--no-markitdown", - "-m/-M", - default=False, - help="Use MarkItDown for compatible file formats", + "--include-ext", + "-i", + multiple=True, + help="Additional extensions to include (for directories)", ) @click.option( - "--markitdown-ext", - "-k", - multiple=True, - help="Specific extensions to process with MarkItDown (default: all supported)", + "--url-mode", + type=click.Choice(["full", "clean"]), + default="clean", + help="URL processing mode: 'full' preserves all content, 'clean' extracts main content only (default: clean)", ) @click.option( "--debug/--no-debug", @@ -95,21 +96,25 @@ def main( split_output: str, exclude_dir: tuple, include_ext: tuple, - use_markitdown: bool, - markitdown_ext: tuple, + url_mode: str, debug: bool, ): - """Read and analyze documentation from a directory or repository""" + """Read and analyze documentation from a directory, repository, or URL""" try: + # Validate that url_mode is one of the allowed values + if url_mode not in ("full", "clean"): + url_mode = "clean" # Default value if it is not valid + config = ReadConfig( max_file_size=max_size, exclude_dirs=DEFAULT_EXCLUDE_DIRS | set(exclude_dir), include_extensions=DEFAULT_INCLUDE_EXTENSIONS | set(include_ext), target_dir=target_dir, + url_mode=cast( + URL_MODES, url_mode + ), # Use cast so mypy understands the type + use_markitdown=False, + markitdown_extensions=set(), debug=debug, ) diff --git a/src/readium/config.py b/src/readium/config.py index 57e1313..084e5c4 100644 --- a/src/readium/config.py +++ b/src/readium/config.py @@ -1,5 +1,5 @@ from dataclasses import 
dataclass, field -from typing import Optional, Set +from typing import Literal, Optional, Set, Tuple # Add Tuple for function return type DEFAULT_EXCLUDE_DIRS = { ".git", @@ -154,6 +154,9 @@ ".msg", } +# Add a new constant for URL processing modes +URL_MODES = Literal["full", "clean"] + @dataclass class ReadConfig: @@ -173,3 +176,72 @@ class ReadConfig: default_factory=lambda: MARKITDOWN_EXTENSIONS.copy() ) debug: bool = False + url_mode: URL_MODES = "clean" # URL processing mode (new) + include_comments: bool = False # Include web page comments (new) + include_tables: bool = True # Include tables from web pages (new) + include_images: bool = True # Include image references (new) + include_links: bool = True # Include links (new) + + +def convert_url_to_markdown(url: str, config: ReadConfig) -> Tuple[str, str]: + """ + Convert a URL to Markdown using trafilatura. + + Parameters + ---------- + url : str + URL to convert. + config : ReadConfig + Configuration for processing. + + Returns + ------- + Tuple[str, str]: + Extracted title, content in Markdown format. + """ + import trafilatura + from trafilatura.settings import use_config + + try: + # Configure trafilatura for Markdown output + trafilatura_config = use_config() + trafilatura_config.set("DEFAULT", "output_format", "markdown") + + # Adjust extraction settings based on URL mode + if config.url_mode == "full": + # Disable aggressive filtering + trafilatura_config.set("DEFAULT", "extraction_timeout", "30") + trafilatura_config.set("DEFAULT", "min_extracted_size", "10") + trafilatura_config.set( + "EXTRACTION", + "list_tags", + "p, blockquote, q, dl, ul, ol, h1, h2, h3, h4, h5, h6, div, section, article", + ) + + # Download and extract content + downloaded = trafilatura.fetch_url(url) + if not downloaded: + raise ValueError(f"Failed to download content from {url}") + + # Extract metadata and content + metadata = trafilatura.extract_metadata(downloaded) + title = metadata.title if metadata and metadata.title else "Untitled" + + # Extract content as Markdown + markdown = trafilatura.extract( + downloaded, + output_format="markdown", + include_tables=config.include_tables, + include_images=config.include_images, + include_links=config.include_links, + include_comments=config.include_comments, + config=trafilatura_config, + ) + + if not markdown: + raise ValueError(f"Failed to extract content from {url}") + + return title, markdown + + except Exception as e: + raise ValueError(f"Error converting URL to Markdown: {str(e)}") diff --git a/src/readium/core.py b/src/readium/core.py index 4c77aee..ce8ad7e 100644 --- a/src/readium/core.py +++ b/src/readium/core.py @@ -1,6 +1,7 @@ import os import subprocess import tempfile +import urllib.parse import uuid from dataclasses import dataclass, field from pathlib import Path @@ -19,9 +20,115 @@ def is_git_url(url: str) -> bool: """Check if the given string is a git URL""" - return url.startswith(("http://", "https://")) and ( - url.endswith(".git") or "github.com" in url or "gitlab.com" in url - ) + if not url.startswith(("http://", "https://")): + return False + + # Detect Git-specific URLs + if url.endswith(".git"): + return True + + # Detect GitHub/GitLab style paths + if "github.com/" in url or "gitlab.com/" in url: + parts = url.split("/") + # Basic user/repo format (at least 4 parts) + if len(parts) >= 4: + return True + + return False + + +def is_url(url: str) -> bool: + """Check if a string is a valid URL (but not a git URL)""" + try: + result = urllib.parse.urlparse(url) + # It is an 
HTTP/HTTPS URL but NOT a git URL + is_valid_url = all([result.scheme, result.netloc]) and result.scheme in ( + "http", + "https", + ) + return is_valid_url and not is_git_url(url) + except ValueError: + return False + + +def convert_url_to_markdown( + url: str, config: Optional[ReadConfig] = None +) -> Tuple[str, str]: + """ + Convert a URL to Markdown using trafilatura + + Parameters + ---------- + url : str + URL to convert. + config : Optional[ReadConfig] + Configuration for processing, defaults to None + + Returns + ------- + Tuple[str, str]: + Extracted title, content in Markdown format. + """ + if config is None: + config = ReadConfig() + + try: + # Attempt to import trafilatura here to handle import errors + import trafilatura + from trafilatura.settings import use_config + + # Configure trafilatura for Markdown output + trafilatura_config = use_config() + trafilatura_config.set("DEFAULT", "output_format", "markdown") + + # Adjust extraction settings based on URL mode + if config.url_mode == "full": + # Disable aggressive filtering + trafilatura_config.set("DEFAULT", "extraction_timeout", "30") + trafilatura_config.set("DEFAULT", "min_extracted_size", "10") + trafilatura_config.set( + "EXTRACTION", + "list_tags", + "p, blockquote, q, dl, ul, ol, h1, h2, h3, h4, h5, h6, div, section, article", + ) + + # Download and extract content + downloaded = trafilatura.fetch_url(url) + if not downloaded: + raise ValueError(f"Failed to download content from {url}") + + # Extract metadata and content + metadata = trafilatura.extract_metadata(downloaded) + title = metadata.title if metadata and metadata.title else "Untitled" + + # Extract content as Markdown + markdown = trafilatura.extract( + downloaded, + output_format="markdown", + include_tables=config.include_tables, + include_images=config.include_images, + include_links=config.include_links, + include_comments=config.include_comments, + config=trafilatura_config, + ) + + if not markdown: + raise ValueError(f"Failed to extract content from {url}") + + return title, markdown + + except ImportError: + # If trafilatura is not installed, return an error message + print( + "Warning: Trafilatura is not installed. URL to Markdown conversion is disabled." + ) + # Return generic error content + return ( + "Error", + f"# Error\n\nUnable to convert URL: {url}. 
The required package 'trafilatura' is not installed.", + ) + except Exception as e: + raise ValueError(f"Error converting URL to Markdown: {str(e)}") def clone_repository(url: str, target_dir: str, branch: Optional[str] = None) -> None: @@ -172,12 +279,12 @@ def read_docs( self, path: Union[str, Path], branch: Optional[str] = None ) -> Tuple[str, str, str]: """ - Read documentation from a directory or git repository + Read documentation from a directory, git repository, or URL Parameters ---------- path : Union[str, Path] - Local path or git URL + Local path, git URL, or web URL branch : Optional[str] Specific branch to clone for git repositories (default: None) @@ -196,6 +303,57 @@ def read_docs( return self._process_directory(Path(temp_dir), original_path=path) except Exception as e: raise ValueError(f"Error processing git repository: {str(e)}") + # If it's a regular URL, process it + elif isinstance(path, str) and is_url(path): + try: + self.log_debug(f"URL detected: {path}") + + # Extract title and Markdown content + title, markdown_content = convert_url_to_markdown(path, self.config) + + # Generate file name from the URL + file_name = ( + os.path.basename(urllib.parse.urlparse(path).path) or "index.md" + ) + if not file_name.endswith(".md"): + file_name += ".md" + + # Generate result + file_info = [ + {"path": file_name, "content": markdown_content, "title": title} + ] + + # Write split files if output directory is specified + if self.split_output_dir: + self.write_split_files( + file_info, Path(urllib.parse.urlparse(path).netloc) + ) + + # Generate tree structure + tree = "Documentation Structure:\n" + tree += f"└── {file_name} (from {path})\n" + + # Generate content + content = f"================================================\n" + content += f"File: {file_name}\n" + content += f"Source: {path}\n" + content += f"Title: {title}\n" + content += f"================================================\n\n" + content += markdown_content + + # Generate summary + summary = f"URL processed: {path}\n" + summary += f"Title: {title}\n" + summary += f"Output file: {file_name}\n" + if self.split_output_dir: + summary += ( + f"Split files output directory: {self.split_output_dir}\n" + ) + + return summary, tree, content + + except Exception as e: + raise ValueError(f"Error processing URL: {str(e)}") else: path_obj = Path(path) if not path_obj.exists(): diff --git a/tests/test_url_handling.py b/tests/test_url_handling.py new file mode 100644 index 0000000..0032b63 --- /dev/null +++ b/tests/test_url_handling.py @@ -0,0 +1,162 @@ +# tests/test_url_handling.py +import os +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest + +from readium import ReadConfig, Readium +from readium.core import convert_url_to_markdown, is_url + + +def test_is_url(): + """Test URL detection""" + assert is_url("https://github.com") + assert is_url("http://example.com/docs") + assert not is_url("github.com") # No scheme + assert not is_url("/local/path") + assert not is_url("git@github.com:user/repo.git") # SSH URL + assert not is_url("https://github.com/user/repo.git") # Git URL + + +@patch("trafilatura.fetch_url") +@patch("trafilatura.extract") +@patch("trafilatura.extract_metadata") +def test_convert_url_to_markdown(mock_metadata, mock_extract, mock_fetch): + """Test converting a URL to Markdown""" + # Setup mocks + mock_fetch.return_value = "
<html><head><title>Test</title></head><body><p>Content</p></body></html>
" + mock_extract.return_value = "# Test\n\nContent" + + # Setup metadata mock + metadata_mock = Mock() + metadata_mock.title = "Test Page" + mock_metadata.return_value = metadata_mock + + # Test conversion + title, markdown = convert_url_to_markdown("https://example.com/docs") + + # Assertions + assert title == "Test Page" + assert markdown == "# Test\n\nContent" + mock_fetch.assert_called_once_with("https://example.com/docs") + mock_extract.assert_called_once() + + +@patch("readium.core.convert_url_to_markdown") +def test_read_docs_url(mock_convert): + """Test reading a URL directly""" + # Setup mock + mock_convert.return_value = ( + "Test Document", + "# Test Document\n\nThis is test content.", + ) + + # Setup reader + reader = Readium(ReadConfig(debug=True)) + + # Test URL processing + summary, tree, content = reader.read_docs("https://example.com/documentation") + + # Assertions + assert "URL processed: https://example.com/documentation" in summary + assert "Title: Test Document" in summary + assert "Documentation Structure:" in tree + assert "documentation.md" in tree + assert "# Test Document" in content + assert "This is test content." in content + + +@patch("readium.core.convert_url_to_markdown") +def test_read_docs_url_with_output(mock_convert, tmp_path): + """Test reading a URL with output file""" + # Setup mock + mock_convert.return_value = ("Test Document", "# Test Document\n\nContent.") + + # Setup reader with split output + output_dir = tmp_path / "output" + reader = Readium(ReadConfig(debug=True)) + reader.split_output_dir = str(output_dir) + + # Test URL processing + summary, tree, content = reader.read_docs("https://example.com/page.html") + + # Assertions + assert "Split files output directory:" in summary + assert os.path.exists(output_dir) + + # Check if at least one file was created + files = list(output_dir.glob("*.txt")) + assert len(files) > 0 + + # Check content of first file + with open(files[0], "r", encoding="utf-8") as f: + file_content = f.read() + assert "Original Path:" in file_content + assert "# Test Document" in file_content + + +# Check if trafilatura is installed +try: + import trafilatura + + trafilatura_installed = True +except ImportError: + trafilatura_installed = False + + +@pytest.mark.skipif(not trafilatura_installed, reason="Trafilatura not installed") +@patch("trafilatura.fetch_url") +def test_convert_url_error_handling(mock_fetch): + """Test error handling when fetching URL fails""" + # Setup mock to return None (failed download) + mock_fetch.return_value = None + + # Import locally to avoid errors when trafilatura isn't installed + from readium.core import convert_url_to_markdown + + # Test with invalid URL + with pytest.raises(ValueError) as excinfo: + convert_url_to_markdown("https://example.com/nonexistent") + + assert "Failed to download content" in str(excinfo.value) + + +def test_cli_url_processing(): + """Test CLI with URL processing""" + import os + + from click.testing import CliRunner + + from readium.cli import main + + # Patch convert_url_to_markdown to avoid actual network requests + with patch("readium.core.convert_url_to_markdown") as mock_convert: + mock_convert.return_value = ("Test Title", "# Test Content") + + runner = CliRunner() + with runner.isolated_filesystem(): + output_file = "docs.md" + result = runner.invoke( + main, + [ + "https://example.com/docs", + "--output", + output_file, + "--url-mode", + "clean", + ], + ) + + # Verify successful execution + assert result.exit_code == 0 + + # Verify that the file was 
created successfully + assert f"Results saved to {output_file}" in result.output + assert os.path.exists(output_file) + + # Verify the content of the file + with open(output_file, "r") as f: + content = f.read() + # Verify that the file content includes some of the expected content + assert "# Test Content" in content
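
The diff reworks the `is_git_url` heuristics in `src/readium/core.py`, but the new test module above only covers `is_url` and the URL conversion path. A minimal companion test sketch, assuming `is_git_url` can be imported from `readium.core` the same way `is_url` is imported in the tests above:

```python
# Illustrative sketch, not part of the PR above.
from readium.core import is_git_url  # assumed importable alongside is_url


def test_is_git_url_heuristics():
    """Spot-check the Git URL detection reworked in this PR."""
    # A .git suffix is always treated as a Git URL
    assert is_git_url("https://example.com/some/repo.git")
    # GitHub/GitLab URLs with a user/repo path qualify
    assert is_git_url("https://github.com/username/repository")
    assert is_git_url("https://gitlab.com/group/project")
    # Plain documentation pages fall through to the URL-to-Markdown path
    assert not is_git_url("https://example.com/documentation")
    # Non-HTTP(S) schemes are rejected outright
    assert not is_git_url("git@github.com:user/repo.git")
```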