Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
Merge pull request #155 from climatepolicyradar/feature/rnd-1136-add-…
Browse files Browse the repository at this point in the history
…a-from_flat_json-method-to-the-dal-for-the-parser-output

Adding method to instantiate from flat json.
  • Loading branch information
THOR300 authored Mar 27, 2024
2 parents cbbb1e2 + 96e062e commit 9e0b9e9
Show file tree
Hide file tree
Showing 6 changed files with 6,709 additions and 7 deletions.
20 changes: 17 additions & 3 deletions src/cpr_data_access/parser_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
from enum import Enum
from typing import List, Optional, Sequence, Tuple, TypeVar, Union

from langdetect import DetectorFactory, LangDetectException, detect
from pydantic import BaseModel, AnyHttpUrl, Field, model_validator

from cpr_data_access.pipeline_general_models import (
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
BackendDocument,
Json,
)
from cpr_data_access.utils import remove_key_if_all_nested_vals_none, unflatten_json
from langdetect import DetectorFactory, LangDetectException, detect
from pydantic import AnyHttpUrl, BaseModel, Field, model_validator

_LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -359,3 +359,17 @@ class ParserOutput(BaseParserOutput):
"""Output to a parser with the metadata format used by the CPR backend."""

document_metadata: BackendDocument

@classmethod
def from_flat_json(cls, data: dict) -> "ParserOutput":
    """
    Instantiate a parser output object from flat json.

    :param data: flat dictionary whose keys are dot-separated paths
        (e.g. "html_data.has_valid_text") into the nested model structure.
    :return: an instance of the class this is called on, validated from the
        unflattened data.

    Any validation error raised by ``model_validate`` propagates to the caller.
    """
    unflattened = unflatten_json(data)

    # Optional fields with complex nested structures are dropped when every
    # nested value is None. E.g. if html_data had a value of None for
    # has_valid_text, keeping it would throw a validation error rather than
    # being treated as an absent optional field.
    for optional_field in ("html_data", "pdf_data"):
        unflattened = remove_key_if_all_nested_vals_none(unflattened, optional_field)

    # Validate through `cls` (not a hard-coded class) so that subclasses get
    # instances of their own type from this alternate constructor.
    return cls.model_validate(unflattened)
30 changes: 30 additions & 0 deletions src/cpr_data_access/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,33 @@ def dig(obj: Union[list, dict], *fields: Any, default: Any = None) -> Any:
elif not obj:
return default
return obj


def unflatten_json(data: dict) -> dict:
    """
    Build a nested dictionary from one whose keys are dot-separated paths.

    I.e. the key "metadata.data" represents {"metadata": {"data": <value>}}.
    Keys without a dot are copied to the top level unchanged.
    """
    nested: dict = {}
    for dotted_key, leaf in data.items():
        # Everything before the final segment names an intermediate dict;
        # the final segment is the key that holds the value itself.
        *parents, last = dotted_key.split(".")
        node = nested
        for name in parents:
            node = node.setdefault(name, {})
        node[last] = leaf
    return nested


def remove_key_if_all_nested_vals_none(data: dict, key: str) -> dict:
    """
    Drop `key` from `data` when its value is a dict containing only None values.

    E.g. {"key": {"a": None, "b": None}} -> {}

    Only the immediate values of the dict stored under `key` are inspected.
    The dictionary is modified in place and the same object is returned; it is
    returned untouched when `key` is absent or its value is not a dict.
    """
    candidate = data.get(key)
    if isinstance(candidate, dict) and not any(
        item is not None for item in candidate.values()
    ):
        del data[key]
    return data
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ def parser_output_json_html() -> dict:
return json.load(f)


@pytest.fixture()
def parser_output_json_flat() -> dict:
    """A dictionary representation of a parser output that is flat"""
    # Explicit UTF-8 so the fixture loads the same way regardless of the
    # platform's default locale encoding.
    with open(
        "tests/test_data/huggingface/flat_hf_parser_output.json", encoding="utf-8"
    ) as f:
        return json.load(f)


@pytest.fixture()
def backend_document_json() -> dict:
"""A dictionary representation of a backend document"""
Expand Down
Loading

0 comments on commit 9e0b9e9

Please sign in to comment.