Skip to content

Commit 6fb760c

Browse files
authored
Read data using pandas (#8)
2 parents a486468 + c8a1d33 commit 6fb760c

29 files changed

+720
-257
lines changed

.env.example

Whitespace-only changes.

.github/workflows/ci-cd.yml

+7-7
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ permissions: read-all
1616

1717
jobs:
1818
code-quality:
19-
uses: climatepolicyradar/reusable-workflows/.github/workflows/python-precommit-validator-without-version.yml@v12
19+
uses: climatepolicyradar/reusable-workflows/.github/workflows/python-precommit-validator-without-version.yml@v13
20+
with:
21+
python-version: "3.10"
2022

2123
test:
2224
runs-on: ubuntu-latest
@@ -36,15 +38,13 @@ jobs:
3638
- name: Install dependencies
3739
run: |
3840
python -m pip install "poetry==1.7.1"
39-
poetry config virtualenvs.create false
40-
poetry install --no-cache
41+
poetry config virtualenvs.prefer-active-python true
42+
poetry config virtualenvs.create true
4143
poetry install --only-root
44+
poetry install --with dev --no-cache
4245
4346
- name: Export PYTHONPATH
4447
run: echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
4548

46-
- name: Create .env
47-
run: cp .env.example .env
48-
4949
- name: Run Tests
50-
run: pytest -vvv tests/unit_tests
50+
run: poetry run pytest -vvv tests/unit_tests

.trunk/configs/cspell.json

+4-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828
"PYTHONPATH",
2929
"markdownlint",
3030
"shellcheck",
31-
"SCRIPTDIR"
31+
"SCRIPTDIR",
32+
"chunksize",
33+
"dataframe",
34+
"dataframes"
3235
],
3336
"flagWords": ["hte"],
3437
"suggestionsTimeout": 5000

.trunk/trunk.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ lint:
4343
- .gitignore
4444
- linters: [pre-commit-hooks, prettier]
4545
paths:
46-
- tests/unit_tests/test_fixtures/malformed_data.json
46+
- tests/unit_tests/fixtures/malformed_data.json
4747

4848
enabled:
4949

README.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,13 @@ to find the latest version.
2424

2525
## Usage
2626

27-
TBD
27+
If `--output_file` is not passed, by default an output file called `output.json`
28+
will be created in the current directory if it does not already exist.
29+
30+
If `--gcf_projects_file`, `--mcf_projects_file` or `--mcf_docs_file` is not passed,
31+
by default the GCF mapper tool will look for a sub-folder in the current working
32+
directory called `data` with the following files in it:
33+
34+
```bash
35+
gcf_data_mapper --gcf_projects_file FILENAME --mcf_projects_file FILENAME --mcf_docs_file FILENAME --output_file FILENAME
36+
```

gcf_data_mapper/cli.py

+65-9
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,64 @@
1+
import os
12
import sys
23
from typing import Any, Optional
34

45
import click
6+
import pandas as pd
57

68
from gcf_data_mapper.parsers.collection import collection
79
from gcf_data_mapper.parsers.document import document
810
from gcf_data_mapper.parsers.family import family
11+
from gcf_data_mapper.read import read
912

1013

1114
@click.command()
12-
@click.option("--debug/--no-debug", default=False)
15+
@click.option(
16+
"--gcf_projects_file",
17+
default=os.path.join(os.getcwd(), "data", "gcf-projects.json"),
18+
type=click.Path(exists=True),
19+
)
20+
@click.option(
21+
"--mcf_projects_file",
22+
# trunk-ignore(cspell/error)
23+
default=os.path.join(os.getcwd(), "data", "MCFprojects.csv"),
24+
type=click.Path(exists=True),
25+
)
26+
@click.option(
27+
"--mcf_docs_file",
28+
# trunk-ignore(cspell/error)
29+
default=os.path.join(os.getcwd(), "data", "MCFdocuments-v2.csv"),
30+
type=click.Path(exists=True),
31+
)
32+
@click.option(
33+
"--output_file",
34+
default=os.path.join(os.getcwd(), "output.json"),
35+
type=click.Path(exists=False),
36+
)
37+
@click.option("--debug/--no-debug", default=True)
1338
@click.version_option("0.1.0", "--version", "-v", help="Show the version and exit.")
14-
def entrypoint(debug: bool):
39+
def entrypoint(
40+
gcf_projects_file, mcf_projects_file, mcf_docs_file, output_file, debug: bool
41+
):
1542
"""Simple program that wrangles GCF data into bulk import format.
1643
44+
:param str gcf_projects_file: The GCF projects filename.
45+
:param str mcf_projects_file: The MCF projects filename.
46+
:param str mcf_docs_file: The MCF docs filename.
47+
:param str output_file: The output filename.
1748
:param bool debug: Whether debug mode is on.
1849
"""
1950
click.echo("🚀 Starting the GCF data mapping process.")
51+
if debug:
52+
click.echo("📝 Input files:")
53+
click.echo(f"- {click.format_filename(gcf_projects_file)}")
54+
click.echo(f"- {click.format_filename(mcf_projects_file)}")
55+
click.echo(f"- {click.format_filename(mcf_docs_file)}")
2056

2157
try:
22-
wrangle_to_json(debug)
58+
project_info, doc_info = read(
59+
gcf_projects_file, mcf_projects_file, mcf_docs_file, debug
60+
)
61+
mapped_data = wrangle_to_json(project_info, doc_info, debug)
2362
except Exception as e:
2463
click.echo(f"❌ Failed to map GCF data to expected JSON. Error: {e}.")
2564
sys.exit(1)
@@ -28,31 +67,48 @@ def entrypoint(debug: bool):
2867

2968
click.echo()
3069
click.echo("🚀 Dumping GCF data to output file")
31-
dump_output()
70+
dump_output(mapped_data, output_file, debug)
3271
click.echo("✅ Finished dumping mapped GCF data.")
3372

3473

35-
def wrangle_to_json(debug) -> dict[str, list[Optional[dict[str, Any]]]]:
74+
def wrangle_to_json(
75+
project_info: pd.DataFrame, doc_info: pd.DataFrame, debug: bool
76+
) -> dict[str, list[Optional[dict[str, Any]]]]:
3677
"""Put the mapped GCF data into a dictionary ready for dumping.
3778
3879
The output of this function will get dumped as JSON to the output
3980
file.
4081
82+
:param pd.DataFrame project_info: The GCF and MCF joined project
83+
info.
84+
:param pd.DataFrame doc_info: The MCF docs info.
4185
:param bool debug: Whether debug mode is on.
4286
:return dict[str, list[Optional[dict[str, Any]]]]: The GCF data
4387
mapped to the Document-Family-Collection-Event entity it
4488
corresponds to.
4589
"""
4690
return {
4791
"collections": collection(debug),
48-
"families": family(debug),
49-
"documents": document(debug),
92+
"families": family(project_info, debug),
93+
"documents": document(doc_info, debug),
5094
"events": [],
5195
}
5296

5397

54-
def dump_output():
55-
pass
98+
def dump_output(
99+
mapped_data: dict[str, list[Optional[dict[str, Any]]]],
100+
output_file: str,
101+
debug: bool,
102+
):
103+
"""Dump the wrangled JSON to the output file.
104+
105+
:param dict[str, list[Optional[dict[str, Any]]]] mapped_data: The
106+
mapped GCF data.
107+
:param str output_file: The output filename.
108+
:param bool debug: Whether debug mode is on.
109+
"""
110+
if debug:
111+
click.echo(f"📝 Output file {click.format_filename(output_file)}")
56112

57113

58114
if __name__ == "__main__":

gcf_data_mapper/parsers/collection.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@ def collection(debug: bool) -> list[Optional[dict[str, Any]]]:
1515
Sheet.
1616
"""
1717
if debug:
18-
click.echo("📝 Wrangling GCF collection data.")
18+
click.echo("📝 No GCF collection data to wrangle.")
1919

2020
return []

gcf_data_mapper/parsers/document.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from typing import Any, Optional
22

33
import click
4+
import pandas as pd
45

56

6-
def document(debug: bool) -> list[Optional[dict[str, Any]]]:
7+
def document(mcf_docs: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
78
"""Map the GCF document info to new structure.
89
10+
:param pd.DataFrame mcf_docs: The MCF documents data.
911
:param bool debug: Whether debug mode is on.
1012
:return list[Optional[dict[str, Any]]]: A list of GCF documents in
1113
the 'destination' format described in the GCF Data Mapper Google

gcf_data_mapper/parsers/family.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from typing import Any, Optional
22

33
import click
4+
import pandas as pd
45

56

6-
def family(debug: bool) -> list[Optional[dict[str, Any]]]:
7+
def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
78
"""Map the GCF family info to new structure.
89
10+
:param pd.DataFrame projects_data: The MCF and GCF project data,
11+
joined on FP num.
912
:param bool debug: Whether debug mode is on.
1013
:return list[Optional[dict[str, Any]]]: A list of GCF families in
1114
the 'destination' format described in the GCF Data Mapper Google

0 commit comments

Comments
 (0)