Skip to content

Commit

Permalink
Merge pull request #138 from climatepolicyradar/feature/pods-1497-bump-azure-pdf-parser
Browse files Browse the repository at this point in the history

Bump azure-pdf-parser
  • Loading branch information
jesse-c authored Jul 23, 2024
2 parents 364966b + edb4579 commit ad45f95
Show file tree
Hide file tree
Showing 16 changed files with 1,237 additions and 483 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ To run in Docker with the `data/raw` folder as input and `data/processed` as out
make run_docker
```

The CLI operates on an input folder of tasks defined by JSON files in the following format as defined in `cpr_data_access` library dependency. This can be found [here](https://github.com/climatepolicyradar/data-access).
The CLI operates on an input folder of tasks defined by JSON files in the following format, as defined in the `cpr_sdk` library dependency. This can be found [here](https://github.com/climatepolicyradar/data-access).

``` python
class ParserInput(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion cli/parse_htmls.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cloudpathlib.exceptions
from cloudpathlib import CloudPath
from cpr_data_access.parser_models import ParserInput, ParserOutput, HTMLData
from cpr_sdk.parser_models import ParserInput, ParserOutput, HTMLData
from tqdm import tqdm

sys.path.append("..")
Expand Down
2 changes: 1 addition & 1 deletion cli/parse_no_content_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import cloudpathlib.exceptions
from cloudpathlib import CloudPath
from cpr_data_access.parser_models import ParserInput, ParserOutput
from cpr_sdk.parser_models import ParserInput, ParserOutput
from tqdm.auto import tqdm

sys.path.append("..")
Expand Down
2 changes: 1 addition & 1 deletion cli/parse_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from azure.ai.formrecognizer import AnalyzeResult
from azure.core.exceptions import ServiceRequestError, HttpResponseError
from cloudpathlib import CloudPath, S3Path
from cpr_data_access.parser_models import (
from cpr_sdk.parser_models import (
ParserInput,
ParserOutput,
PDFData,
Expand Down
2 changes: 1 addition & 1 deletion cli/run_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import click
import pydantic
from cloudpathlib import S3Path, CloudPath
from cpr_data_access.parser_models import (
from cpr_sdk.parser_models import (
ParserInput,
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
Expand Down
4 changes: 2 additions & 2 deletions cli/test/test_run_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from azure.core.exceptions import HttpResponseError, ServiceRequestError
from click.testing import CliRunner
from cloudpathlib.local import LocalS3Path
from cpr_data_access.parser_models import (
from cpr_sdk.parser_models import (
ParserOutput,
HTMLData,
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
)
from cpr_data_access.pipeline_general_models import BackendDocument
from cpr_sdk.pipeline_general_models import BackendDocument
from azure_pdf_parser.base import PDFPagesBatchExtracted
from azure.ai.formrecognizer import AnalyzeResult
from mock import patch
Expand Down
2 changes: 1 addition & 1 deletion cli/translate_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import cloudpathlib.exceptions
from cloudpathlib import CloudPath
from cpr_data_access.parser_models import ParserOutput
from cpr_sdk.parser_models import ParserOutput
from tqdm.auto import tqdm

from src.config import TARGET_LANGUAGES
Expand Down
1,664 changes: 1,208 additions & 456 deletions poetry.lock

Large diffs are not rendered by default.

25 changes: 13 additions & 12 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,38 @@ packages = [


[tool.poetry.dependencies]
python = "^3.10"
python = ">=3.10,<4.0"
awscli = "^1.26.16"
news-please = "^1.5.22"
pandas = "^1.4.4"
news-please = "^1.5.48"
pandas = "^2.2.2"
numpy = "^2.0.0"
tqdm = "^4.64.1"
pydantic = "^2.4.0"
click = "^8.1.3"
langdetect = "^1.0.9"
playwright = "^1.35.0"
readability-lxml = "^0.8.1"
bleach = "^5.0.1"
bleach = "^6.1.0"
python-json-logger = "^2.0.4"
cloudpathlib = { extras = ["s3"], version = "^0.10.0" }
cloudpathlib = { extras = ["s3"], version = "^0.18.1" }
PyMuPDF = "^1.20.2"
google-cloud-translate = "^3.8.2"
psutil = "^5.9.2"
psutil = "^6.0.0"
multiprocessing-logging = "^0.3.3"
json-logging = "^1.3.0"
tenacity = "^8.2.1"
pillow = "<10" # to fix import of Image.LINEAR by dependency
pillow = "^10.4.0" # NOTE(review): the previous "<10" pin was there to fix a dependency's import of Image.LINEAR; confirm that workaround is no longer needed
azure-ai-formrecognizer = "^3.2.1"
pytest = "^7.4.0"
mock = "^5.1.0"
pypdf2 = "^3.0.1"
azure-pdf-parser = { git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.3.1" }
azure-pdf-parser = { git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.4.1" }
lxml = {extras = ["html-clean"], version = "^5.2.2"}

[tool.poetry.dev-dependencies]
pre-commit = "^2.20.0"
pre-commit = "^3.7.1"
pyright = "^1.1.270"
pytest = "^7.1.3"
black = "^22.10.0"
pytest = "^8.3.1"
black = "^24.4.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
3 changes: 2 additions & 1 deletion src/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Base classes for parsing."""

import logging
import logging.config
from abc import ABC, abstractmethod

from cpr_data_access.parser_models import HTMLData, ParserOutput, ParserInput
from cpr_sdk.parser_models import HTMLData, ParserOutput, ParserInput

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion src/html_parser/combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
import requests
from cpr_data_access.parser_models import ParserInput, ParserOutput
from cpr_sdk.parser_models import ParserInput, ParserOutput
from playwright.sync_api import sync_playwright
from playwright.sync_api._generated import Playwright
from src.html_parser.newsplease import NewsPleaseParser
Expand Down
2 changes: 1 addition & 1 deletion src/html_parser/newsplease.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from src.config import HTML_HTTP_REQUEST_TIMEOUT, HTML_MIN_NO_LINES_FOR_VALID_TEXT
from src.base import HTMLParser
from cpr_data_access.parser_models import (
from cpr_sdk.parser_models import (
ParserInput,
ParserOutput,
HTMLData,
Expand Down
2 changes: 1 addition & 1 deletion src/html_parser/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List
import re
import requests
from cpr_data_access.parser_models import (
from cpr_sdk.parser_models import (
ParserInput,
ParserOutput,
HTMLData,
Expand Down
2 changes: 1 addition & 1 deletion src/html_parser/test/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from cpr_data_access.parser_models import ParserInput, ParserOutput
from cpr_sdk.parser_models import ParserInput, ParserOutput

from src.base import HTMLParser
from src.html_parser.newsplease import NewsPleaseParser
Expand Down
2 changes: 1 addition & 1 deletion src/translator/test/test_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pathlib import Path
from unittest import mock

from cpr_data_access.parser_models import ParserOutput
from cpr_sdk.parser_models import ParserOutput
from src.translator.translate import translate_parser_output


Expand Down
2 changes: 1 addition & 1 deletion src/translator/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from tenacity import retry, stop_after_attempt, wait_random_exponential


from cpr_data_access.parser_models import ParserOutput
from cpr_sdk.parser_models import ParserOutput
from google.cloud import translate_v2


Expand Down

0 comments on commit ad45f95

Please sign in to comment.