Skip to content

Commit edb4579

Browse files
committed
build: Bump Azure PDF Parser
Added an explicit dependency on lxml (with the `html-clean` extra). This resolves the following `ImportError`: ``` ______________________________ ERROR collecting src/html_parser/test/test_readability_parser.py ______________________________ ImportError while importing test module '/Users/jesse/src/github.com/climatepolicyradar/navigator-document-parser/src/html_parser/test/test_readability_parser.py'. Hint: make sure your test modules/packages have valid Python names. Traceback: ../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/_pytest/python.py:617: in _importtestmodule mod = import_path(self.path, mode=importmode, root=self.config.rootpath) ../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/_pytest/pathlib.py:567: in import_path importlib.import_module(module_name) /opt/homebrew/Cellar/python@3.12/3.12.4/Frameworks/Python.framework/Versions/3.12/lib/python3.12/importlib/__init__.py:90: in import_module return _bootstrap._gcd_import(name[level:], package, level) <frozen importlib._bootstrap>:1387: in _gcd_import ??? <frozen importlib._bootstrap>:1360: in _find_and_load ??? <frozen importlib._bootstrap>:1331: in _find_and_load_unlocked ??? <frozen importlib._bootstrap>:935: in _load_unlocked ??? 
../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/_pytest/assertion/rewrite.py:186: in exec_module exec(co, module.__dict__) src/html_parser/test/test_readability_parser.py:5: in <module> from src.html_parser.readability import ReadabilityParser src/html_parser/readability.py:13: in <module> from readability import Document ../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/readability/__init__.py:3: in <module> from .readability import Document ../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/readability/readability.py:11: in <module> from .cleaners import clean_attributes ../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/readability/cleaners.py:3: in <module> from lxml.html.clean import Cleaner ../../../../Library/Caches/pypoetry/virtualenvs/navigator-document-parser-vnqSsywO-py3.12/lib/python3.12/site-packages/lxml/html/clean.py:18: in <module> raise ImportError( E ImportError: lxml.html.clean module is now a separate project lxml_html_clean. E Install lxml[html_clean] or lxml_html_clean directly. ``` wip
1 parent 364966b commit edb4579

16 files changed

+1237
-483
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ To run in Docker with the `data/raw` folder as input and `data/processed` as out
4040
make run_docker
4141
```
4242

43-
The CLI operates on an input folder of tasks defined by JSON files in the following format as defined in `cpr_data_access` library dependency. This can be found [here](https://github.com/climatepolicyradar/data-access).
43+
The CLI operates on an input folder of tasks defined by JSON files in the following format, as defined in the `cpr_sdk` library dependency, which can be found [here](https://github.com/climatepolicyradar/data-access).
4444

4545
``` python
4646
class ParserInput(BaseModel):

cli/parse_htmls.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import cloudpathlib.exceptions
88
from cloudpathlib import CloudPath
9-
from cpr_data_access.parser_models import ParserInput, ParserOutput, HTMLData
9+
from cpr_sdk.parser_models import ParserInput, ParserOutput, HTMLData
1010
from tqdm import tqdm
1111

1212
sys.path.append("..")

cli/parse_no_content_type.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import cloudpathlib.exceptions
77
from cloudpathlib import CloudPath
8-
from cpr_data_access.parser_models import ParserInput, ParserOutput
8+
from cpr_sdk.parser_models import ParserInput, ParserOutput
99
from tqdm.auto import tqdm
1010

1111
sys.path.append("..")

cli/parse_pdfs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from azure.ai.formrecognizer import AnalyzeResult
1818
from azure.core.exceptions import ServiceRequestError, HttpResponseError
1919
from cloudpathlib import CloudPath, S3Path
20-
from cpr_data_access.parser_models import (
20+
from cpr_sdk.parser_models import (
2121
ParserInput,
2222
ParserOutput,
2323
PDFData,

cli/run_parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import click
1010
import pydantic
1111
from cloudpathlib import S3Path, CloudPath
12-
from cpr_data_access.parser_models import (
12+
from cpr_sdk.parser_models import (
1313
ParserInput,
1414
CONTENT_TYPE_HTML,
1515
CONTENT_TYPE_PDF,

cli/test/test_run_parser.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,13 @@
99
from azure.core.exceptions import HttpResponseError, ServiceRequestError
1010
from click.testing import CliRunner
1111
from cloudpathlib.local import LocalS3Path
12-
from cpr_data_access.parser_models import (
12+
from cpr_sdk.parser_models import (
1313
ParserOutput,
1414
HTMLData,
1515
CONTENT_TYPE_HTML,
1616
CONTENT_TYPE_PDF,
1717
)
18-
from cpr_data_access.pipeline_general_models import BackendDocument
18+
from cpr_sdk.pipeline_general_models import BackendDocument
1919
from azure_pdf_parser.base import PDFPagesBatchExtracted
2020
from azure.ai.formrecognizer import AnalyzeResult
2121
from mock import patch

cli/translate_outputs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import cloudpathlib.exceptions
77
from cloudpathlib import CloudPath
8-
from cpr_data_access.parser_models import ParserOutput
8+
from cpr_sdk.parser_models import ParserOutput
99
from tqdm.auto import tqdm
1010

1111
from src.config import TARGET_LANGUAGES

poetry.lock

+1,208-456
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+13-12
Original file line numberDiff line numberDiff line change
@@ -9,37 +9,38 @@ packages = [
99

1010

1111
[tool.poetry.dependencies]
12-
python = "^3.10"
12+
python = ">=3.10,<4.0"
1313
awscli = "^1.26.16"
14-
news-please = "^1.5.22"
15-
pandas = "^1.4.4"
14+
news-please = "^1.5.48"
15+
pandas = "^2.2.2"
16+
numpy = "^2.0.0"
1617
tqdm = "^4.64.1"
1718
pydantic = "^2.4.0"
1819
click = "^8.1.3"
1920
langdetect = "^1.0.9"
2021
playwright = "^1.35.0"
2122
readability-lxml = "^0.8.1"
22-
bleach = "^5.0.1"
23+
bleach = "^6.1.0"
2324
python-json-logger = "^2.0.4"
24-
cloudpathlib = { extras = ["s3"], version = "^0.10.0" }
25+
cloudpathlib = { extras = ["s3"], version = "^0.18.1" }
2526
PyMuPDF = "^1.20.2"
2627
google-cloud-translate = "^3.8.2"
27-
psutil = "^5.9.2"
28+
psutil = "^6.0.0"
2829
multiprocessing-logging = "^0.3.3"
2930
json-logging = "^1.3.0"
3031
tenacity = "^8.2.1"
31-
pillow = "<10" # to fix import of Image.LINEAR by dependency
32+
pillow = "^10.4.0" # NOTE(review): was pinned "<10" to fix import of Image.LINEAR by a dependency; confirm that dependency works with Pillow 10+
3233
azure-ai-formrecognizer = "^3.2.1"
33-
pytest = "^7.4.0"
3434
mock = "^5.1.0"
3535
pypdf2 = "^3.0.1"
36-
azure-pdf-parser = { git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.3.1" }
36+
azure-pdf-parser = { git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.4.1" }
37+
lxml = {extras = ["html-clean"], version = "^5.2.2"}
3738

3839
[tool.poetry.dev-dependencies]
39-
pre-commit = "^2.20.0"
40+
pre-commit = "^3.7.1"
4041
pyright = "^1.1.270"
41-
pytest = "^7.1.3"
42-
black = "^22.10.0"
42+
pytest = "^8.3.1"
43+
black = "^24.4.2"
4344

4445
[build-system]
4546
requires = ["poetry-core>=1.0.0"]

src/base.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""Base classes for parsing."""
2+
23
import logging
34
import logging.config
45
from abc import ABC, abstractmethod
56

6-
from cpr_data_access.parser_models import HTMLData, ParserOutput, ParserInput
7+
from cpr_sdk.parser_models import HTMLData, ParserOutput, ParserInput
78

89
logger = logging.getLogger(__name__)
910

src/html_parser/combined.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import logging
44
import requests
5-
from cpr_data_access.parser_models import ParserInput, ParserOutput
5+
from cpr_sdk.parser_models import ParserInput, ParserOutput
66
from playwright.sync_api import sync_playwright
77
from playwright.sync_api._generated import Playwright
88
from src.html_parser.newsplease import NewsPleaseParser

src/html_parser/newsplease.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from src.config import HTML_HTTP_REQUEST_TIMEOUT, HTML_MIN_NO_LINES_FOR_VALID_TEXT
99
from src.base import HTMLParser
10-
from cpr_data_access.parser_models import (
10+
from cpr_sdk.parser_models import (
1111
ParserInput,
1212
ParserOutput,
1313
HTMLData,

src/html_parser/readability.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import List
55
import re
66
import requests
7-
from cpr_data_access.parser_models import (
7+
from cpr_sdk.parser_models import (
88
ParserInput,
99
ParserOutput,
1010
HTMLData,

src/html_parser/test/test_parsers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pytest
2-
from cpr_data_access.parser_models import ParserInput, ParserOutput
2+
from cpr_sdk.parser_models import ParserInput, ParserOutput
33

44
from src.base import HTMLParser
55
from src.html_parser.newsplease import NewsPleaseParser

src/translator/test/test_translate.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from pathlib import Path
33
from unittest import mock
44

5-
from cpr_data_access.parser_models import ParserOutput
5+
from cpr_sdk.parser_models import ParserOutput
66
from src.translator.translate import translate_parser_output
77

88

src/translator/translate.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from tenacity import retry, stop_after_attempt, wait_random_exponential
55

66

7-
from cpr_data_access.parser_models import ParserOutput
7+
from cpr_sdk.parser_models import ParserOutput
88
from google.cloud import translate_v2
99

1010

0 commit comments

Comments
 (0)