Feature/pdct 1399 map document to new json structure (#9)
* WIP: feat - update documents function to map data from CSV

* feat: extend document function

- separate functionality for mapping translated files, performing
  validation checks, etc.
- check for missing columns in the data frame

* refactor: update document parse function

- add new column headers to titles
- update validate function to check for invalid paths
- small updates for readability

* test: add tests for document function

* chore: ask trunk to ignore spelling mistakes in tests file

* chore: add unknown words to cspell dictionary

rather than telling trunk to ignore potential misspellings

* fix: tell cspell to ignore conftest file because it does not like foreign languages

* refactor: update invalid urls to check for reserved and unreserved characters in path

* fix: missing key in copy

* test: update test data to use mock data

* chore: add blurb explaining what the code is doing

* refactor: change name of enum to better illustrate what it is doing

* Bump to 0.1.7

* refactor: update copy

* test: update tests

move fixtures into their own file

* fix: update mentions of mcf to gcf

---------

Co-authored-by: Katy Baulch <[email protected]>
Co-authored-by: Osneil Drakes <[email protected]>
Co-authored-by: Osneil Drakes <[email protected]>
4 people authored Sep 3, 2024
1 parent e26532a commit bab7919
Showing 6 changed files with 576 additions and 9 deletions.
6 changes: 5 additions & 1 deletion .trunk/configs/cspell.json
@@ -31,7 +31,11 @@
"SCRIPTDIR",
"chunksize",
"dataframe",
"dataframes"
"dataframes",
"iloc",
"iterrows",
"notna",
"conftest"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
1 change: 1 addition & 0 deletions .trunk/trunk.yaml
@@ -41,6 +41,7 @@ lint:
       paths:
         - .trunk/configs/cspell.json
         - .gitignore
+        - tests/unit_tests/parsers/document/conftest.py
     - linters: [pre-commit-hooks, prettier]
       paths:
         - tests/unit_tests/fixtures/malformed_data.json
148 changes: 145 additions & 3 deletions gcf_data_mapper/parsers/document.py
@@ -1,19 +1,161 @@
+from enum import Enum
 from typing import Any, Optional
+from urllib.parse import urlparse

 import click
 import pandas as pd


-def document(mcf_docs: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
+class RequiredColumns(Enum):
+    SOURCE_URL = "Document page permalink"
+    TITLE = "Title"
+    TRANSLATED_FILES = "Translated files"
+    TRANSLATED_TITLES = "Translated titles"
+    TYPE = "Type"


+def contains_duplicate_urls(urls: list[str]) -> bool:
+    """
+    Checks a list of urls for any duplicate entries
+    param: list[str] urls: A list of urls
+    return bool: Returns true if duplicate urls are present
+    """

+    # Convert all URLs to lowercase for case-insensitive comparison
+    lowercase_urls = [url.lower() for url in urls]
+    return len(lowercase_urls) != len(set(lowercase_urls))


+def contains_empty_urls(urls: list[str]) -> bool:
+    """
+    Checks a list of urls for any empty entries
+    param: list[str] urls: a list of urls
+    return bool: Returns true if empty urls are present, or false if not
+    """
+    for url in urls:
+        if not url.strip():
+            return True
+    return False


+def contains_invalid_paths(urls: list[str]) -> bool:
+    """
+    Checks a list of urls for any malformed entries
+    param: list[str] urls: A list of urls
+    return bool: Returns true if malformed urls are present, or false if not
+    """
+    for url in urls:
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        # Reserved and unreserved characters per RFC 3986
+        reserved_and_unreserved_characters = ":/?#[]@!$&'()*+,;=-_.~"
+        if any(
+            not (c.isalnum() or c in reserved_and_unreserved_characters) for c in path
+        ):
+            return True
+    return False


+def validate_urls(urls: list[str], doc_id: str) -> None:
+    """
+    Validates a list of URLs for empty, duplicate, and malformed entries.
+    param: list[str] urls: A list of urls
+    param: str doc_id: The document id of the invalid source url/s
+    raises ValueError: If the list contains duplicate, empty or malformed url/s
+    """
+    if contains_empty_urls(urls):
+        raise ValueError(
+            f"Empty URL found in list of translated urls. DocumentId : {doc_id}"
+        )
+    if contains_duplicate_urls(urls):
+        raise ValueError(
+            f"Duplicate URLs found in list of translated urls. DocumentId : {doc_id}"
+        )
+    if contains_invalid_paths(urls):
+        raise ValueError(
+            f"Malformed url found in list of translated urls. DocumentId : {doc_id}"
+        )


+def map_translated_files(translated_files_row: pd.Series) -> list[dict]:
+    """
+    Maps the GCF document with translated versions into the new json structure
+    :param pd.Series translated_files_row: A row from the DataFrame containing the
+        'Translated files' field, which holds a string of translated source URLs
+        separated by the pipe (|) character. This string includes multiple URLs for
+        translated documents in various languages.
+    :return: A list of gcf document objects, each with a different source url
+        reflecting the translated version of the original document
+    """

+    mapped_documents = []

+    concatenated_string_of_url_docs = str(
+        translated_files_row[RequiredColumns.TRANSLATED_FILES.value]
+    )
+    url_docs = concatenated_string_of_url_docs.split("|")

+    doc_id = translated_files_row.iloc[0]

+    try:
+        validate_urls(url_docs, doc_id)
+        for url in url_docs:
+            mapped_documents.append(
+                {
+                    "metadata": {
+                        "type": translated_files_row[RequiredColumns.TYPE.value]
+                    },
+                    "title": translated_files_row[RequiredColumns.TITLE.value],
+                    "source_url": url.strip(),
+                    "variant_name": "Translated",
+                }
+            )
+        return mapped_documents
+    except Exception as e:
+        raise e


+def document(gcf_docs: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
     """Map the GCF document info to new structure.
-    :param pd.DataFrame mcf_docs: The MCF documents data.
+    :param pd.DataFrame gcf_docs: The GCF documents data.
     :param bool debug: Whether debug mode is on.
+    :raises ValueError: If the DataFrame is missing one or more of the required column headers
     :return list[Optional[dict[str, Any]]]: A list of GCF families in
         the 'destination' format described in the GCF Data Mapper Google
         Sheet.
     """

+    required_columns = [column.value for column in RequiredColumns]
+    missing_columns = [col for col in required_columns if col not in gcf_docs.columns]

+    if missing_columns:
+        click.echo("Missing required columns: {}".format(", ".join(missing_columns)))
+        raise ValueError("Missing required columns in GCF data frame")

     if debug:
         click.echo("📝 Wrangling GCF document data.")

-    return []
+    mapped_docs = []
+    # We iterate over each row in the DataFrame gcf_docs using iterrows();
+    # the underscore indicates that the index of the row will not be used in this loop.
+    # We check if the field in the 'TRANSLATED_TITLES' column is not NaN.
+    # Note - empty entries return as NaN.
+    # Then we create a dictionary for each row with metadata type, title, source URL,
+    # and variant name, which is appended to the list.
+    # Separately, if that row also contains a value in the translated titles column,
+    # we will map a separate object for each of the translated versions, using the
+    # translated url as the source url, and add those translated versions to the list.
+    for _, row in gcf_docs.iterrows():
+        has_translated_files = pd.notna(row.at[RequiredColumns.TRANSLATED_TITLES.value])
+        mapped_docs.append(
+            {
+                "metadata": {"type": row[RequiredColumns.TYPE.value]},
+                "title": row[RequiredColumns.TITLE.value],
+                "source_url": row[RequiredColumns.SOURCE_URL.value],
+                "variant_name": "Original Translation",
+            }
+        )
+        if has_translated_files:
+            mapped_docs.extend(map_translated_files(row))
+    return mapped_docs
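
As a quick, illustrative sketch (not part of the commit), the new mapping could be exercised roughly as below. The "ApprovedRef" id column, URLs and values are hypothetical; the other headers come from the RequiredColumns enum in the diff above, and the import path assumes the package layout shown (gcf_data_mapper/parsers/document.py).

    import pandas as pd

    from gcf_data_mapper.parsers.document import document

    # Toy row using the RequiredColumns headers; the first column of the row is
    # read as the document id (doc_id comes from iloc[0]).
    docs = pd.DataFrame(
        [
            {
                "ApprovedRef": "FP001",  # hypothetical id column, not in RequiredColumns
                "Title": "Funding proposal",
                "Type": "Approved funding proposal",
                "Document page permalink": "https://example.org/document/fp001",
                "Translated titles": "Proposition de financement",
                "Translated files": (
                    "https://example.org/document/fp001-fr.pdf|"
                    "https://example.org/document/fp001-es.pdf"
                ),
            }
        ]
    )

    mapped = document(docs, debug=True)
    print(len(mapped))  # 3: one "Original Translation" plus one "Translated" per URL

Each row maps to one "Original Translation" entry; rows with a value in "Translated titles" additionally yield one "Translated" entry per pipe-separated URL in "Translated files".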
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gcf-data-mapper"
-version = "0.1.6"
+version = "0.1.7"
 description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
 authors = ["CPR-dev-team <[email protected]>"]
 license = "Apache-2.0"
