Feature/pdct 1399 map document to new json structure (#9)
* WIP: feat - update documents function to map data from csv
* feat: extend document function - separate functionality for mapping translated files, checking validation etc.; check for missing columns in the data frame
* refactor: update document parse function - add new column headers to titles; update validate function to check for invalid paths; small updates for readability
* test: add tests for document function
* chore: ask trunk to ignore spelling mistakes in tests file
* chore: add unknown words to cspell dictionary rather than telling trunk to ignore potential misspellings
* fix: tell cspell to ignore conftest file because it does not like foreign languages
* refactor: update invalid urls to check for reserved and unreserved characters in path
* fix: missing key in copy
* test: update test data to use mock data
* chore: add blurb explaining what the code is doing
* refactor: change name of enum to better illustrate what it is doing
* Bump to 0.1.7
* refactor: update copy
* test: update tests; move fixtures into their own file
* fix: update mentions of gcf to mcf

---------

Co-authored-by: Katy Baulch <[email protected]>
Co-authored-by: Osneil Drakes <[email protected]>
Co-authored-by: Osneil Drakes <[email protected]>
1 parent e26532a · commit bab7919
Showing 6 changed files with 576 additions and 9 deletions.
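Each document in the new json structure is mapped to a flat dict. For orientation, a minimal sketch of the shape the diff below produces (all field values here are invented for illustration):

{
    "metadata": {"type": "Funding proposal"},
    "title": "Example project document",
    "source_url": "https://example.com/docs/example.pdf",
    "variant_name": "Original Translation",  # "Translated" for translated-file variants
}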
@@ -1,19 +1,161 @@
+from enum import Enum
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 import click
 import pandas as pd
 
 
-def document(mcf_docs: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
+class RequiredColumns(Enum):
+    SOURCE_URL = "Document page permalink"
+    TITLE = "Title"
+    TRANSLATED_FILES = "Translated files"
+    TRANSLATED_TITLES = "Translated titles"
+    TYPE = "Type"
+
+
+def contains_duplicate_urls(urls: list[str]) -> bool:
+    """
+    Checks a list of urls for any duplicate entries
+    param: list[str] urls: A list of urls
+    return bool: Returns true if duplicate urls are present
+    """
+
+    # Convert all URLs to lowercase for case-insensitive comparison
+    lowercase_urls = [url.lower() for url in urls]
+    return len(lowercase_urls) != len(set(lowercase_urls))
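A quick behavioural sketch of contains_duplicate_urls (the import path is assumed for illustration and may differ from the real module layout):

from gcf_data_mapper.parsers.document import contains_duplicate_urls  # assumed path

urls = ["https://example.com/Doc.pdf", "https://example.com/doc.pdf"]
print(contains_duplicate_urls(urls))  # True: URLs are lowercased before comparison
print(contains_duplicate_urls(["https://example.com/a.pdf"]))  # False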
+
+
+def contains_empty_urls(urls: list[str]) -> bool:
+    """
+    Checks a list of urls for any empty entries
+    param: list[str] urls: a list of urls
+    return bool: Returns true if empty urls are present, or false if not
+    """
+    for url in urls:
+        if not url.strip():
+            return True
+    return False
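Likewise for contains_empty_urls: whitespace-only entries count as empty, because each URL is stripped before the check (same assumed import path):

from gcf_data_mapper.parsers.document import contains_empty_urls  # assumed path

print(contains_empty_urls(["https://example.com/a.pdf", "   "]))  # True
print(contains_empty_urls(["https://example.com/a.pdf"]))  # False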
+
+
+def contains_invalid_paths(urls: list[str]) -> bool:
+    """
+    Checks a list of urls for any malformed entries
+    param: list[str] urls: A list of urls
+    return bool: Returns true if malformed urls are present, or false if not
+    """
+    for url in urls:
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        # Reserved and unreserved characters per RFC 3986
+        reserved_and_unreserved_characters = ":/?#[]@!$&'()*+,;=-_.~"
+        if any(
+            not (c.isalnum() or c in reserved_and_unreserved_characters) for c in path
+        ):
+            return True
+    return False
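contains_invalid_paths inspects only the path component of each URL and allows exactly alphanumerics plus the RFC 3986 reserved and unreserved characters, so a space in the path is rejected. An illustrative check (assumed import path):

from gcf_data_mapper.parsers.document import contains_invalid_paths  # assumed path

print(contains_invalid_paths(["https://example.com/docs/file-1.pdf"]))  # False
print(contains_invalid_paths(["https://example.com/docs/file 1.pdf"]))  # True: space is not allowed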
+
+
+def validate_urls(urls: list[str], doc_id: str) -> None:
+    """
+    Validates a list of URLs for empty, duplicate, and malformed entries.
+    param: list[str] urls: A list of urls
+    param: str doc_id: The document id of the invalid source url/s
+    raises ValueError: If the list contains duplicate, empty or malformed url/s
+    """
+    if contains_empty_urls(urls):
+        raise ValueError(
+            f"Empty URL found in list of translated urls. DocumentId : {doc_id}"
+        )
+    if contains_duplicate_urls(urls):
+        raise ValueError(
+            f"Duplicate URLs found in list of translated urls. DocumentId : {doc_id}"
+        )
+    if contains_invalid_paths(urls):
+        raise ValueError(
+            f"Malformed url found in list of translated urls. DocumentId : {doc_id}"
+        )
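validate_urls chains the three checks in order, so an empty entry is reported before a duplicate or malformed one. A sketch (the doc id and import path are invented for illustration):

from gcf_data_mapper.parsers.document import validate_urls  # assumed path

try:
    validate_urls(["https://example.com/a.pdf", "https://example.com/A.pdf"], doc_id="FP001")
except ValueError as e:
    print(e)  # Duplicate URLs found in list of translated urls. DocumentId : FP001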
+
+
+def map_translated_files(translated_files_row: pd.Series) -> list[dict]:
+    """
+    Maps the GCF document with translated versions into the new json structure
+    :param pd.Series translated_files_row: A row from the DataFrame containing
+        the 'Translated files' field, which holds a string of translated source
+        URLs separated by the pipe (|) character. This string includes multiple
+        URLs for translated documents in various languages.
+    :return: A list of gcf document objects, each with a different source url
+        reflecting the translated version of the original document
+    """
+
+    mapped_documents = []
+
+    concatenated_string_of_url_docs = str(
+        translated_files_row[RequiredColumns.TRANSLATED_FILES.value]
+    )
+    url_docs = concatenated_string_of_url_docs.split("|")
+
+    doc_id = translated_files_row.iloc[0]
+
+    try:
+        validate_urls(url_docs, doc_id)
+        for url in url_docs:
+            mapped_documents.append(
+                {
+                    "metadata": {
+                        "type": translated_files_row[RequiredColumns.TYPE.value]
+                    },
+                    "title": translated_files_row[RequiredColumns.TITLE.value],
+                    "source_url": url.strip(),
+                    "variant_name": "Translated",
+                }
+            )
+        return mapped_documents
+    except Exception as e:
+        raise e
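map_translated_files takes doc_id from the row's first field (iloc[0]) and emits one document per pipe-separated URL. A sketch with an invented row (the first column name is illustrative, as is the import path):

import pandas as pd
from gcf_data_mapper.parsers.document import map_translated_files  # assumed path

row = pd.Series(
    {
        "FP number": "FP001",  # invented first column; supplies doc_id via iloc[0]
        "Type": "Funding proposal",
        "Title": "Example project document",
        "Translated files": "https://example.com/doc_es.pdf|https://example.com/doc_fr.pdf",
    }
)
print(map_translated_files(row))  # two "Translated" documents, one per URL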
+
+
+def document(gcf_docs: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
     """Map the GCF document info to new structure.
-    :param pd.DataFrame mcf_docs: The MCF documents data.
+    :param pd.DataFrame gcf_docs: The GCF documents data.
     :param bool debug: Whether debug mode is on.
+    :raises ValueError: If the DataFrame is missing one or more of the required column headers
     :return list[Optional[dict[str, Any]]]: A list of GCF families in
         the 'destination' format described in the GCF Data Mapper Google
         Sheet.
     """
 
+    required_columns = [column.value for column in RequiredColumns]
+    missing_columns = [col for col in required_columns if col not in gcf_docs.columns]
+
+    if missing_columns:
+        click.echo("Missing required columns: {}".format(", ".join(missing_columns)))
+        raise ValueError("Missing required columns in GCF data frame")
+
     if debug:
         click.echo("📝 Wrangling GCF document data.")
 
-    return []
+    mapped_docs = []
+    # We iterate over each row in the DataFrame gcf_docs using iterrows();
+    # the underscore indicates that the index of the row is not used in this loop.
+    # We check whether the field in the 'TRANSLATED_TITLES' column is not NaN
+    # (empty entries come back as NaN), then create a dictionary for each row with
+    # metadata type, title, source URL, and variant name, which is appended to the list.
+    # Separately, if that row also contains a value in the translated titles column,
+    # we map a separate object for each of the translated versions, using the
+    # translated url as the source url, and add those translated versions to the list.
+    for _, row in gcf_docs.iterrows():
+        has_translated_files = pd.notna(row.at[RequiredColumns.TRANSLATED_TITLES.value])
+        mapped_docs.append(
+            {
+                "metadata": {"type": row[RequiredColumns.TYPE.value]},
+                "title": row[RequiredColumns.TITLE.value],
+                "source_url": row[RequiredColumns.SOURCE_URL.value],
+                "variant_name": "Original Translation",
+            }
+        )
+        if has_translated_files:
+            mapped_docs.extend(map_translated_files(row))
+    return mapped_docs
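End to end, document() validates the column headers and then maps every row; a row whose 'Translated titles' cell is NaN produces only the original document. A sketch with a one-row frame (values and import path invented for illustration):

import pandas as pd
from gcf_data_mapper.parsers.document import document  # assumed path

gcf_docs = pd.DataFrame(
    [
        {
            "Document page permalink": "https://example.com/docs/example.pdf",
            "Title": "Example project document",
            "Translated files": None,
            "Translated titles": None,  # NaN/None: no translated variants mapped
            "Type": "Funding proposal",
        }
    ]
)
print(document(gcf_docs, debug=False))  # one document with variant_name "Original Translation"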
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gcf-data-mapper"
-version = "0.1.6"
+version = "0.1.7"
 description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
 authors = ["CPR-dev-team <[email protected]>"]
 license = "Apache-2.0"