Skip to content

Commit

Permalink
Add mock_data test configuration
Browse files (browse the repository at this point in the history)
  • Loading branch information
mattsolo1 committed Sep 29, 2023
1 parent b5dcca2 commit 9552b62
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 25 deletions.
10 changes: 9 additions & 1 deletion data-pipeline/check.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env bash
#
echo "┏━━━ Clean ━━━━━━━━━━━━━━━━━━━"
find . -type d -name "__pycache__" -exec rm -r {} +
find . -name "*.pyc" -exec rm -f {} +

echo "┏━━━ Running pyright ━━━━━━━━━━━━━━━━━━━"
pyright
Expand All @@ -12,4 +16,8 @@ ruff src/data_pipeline --fix
ruff tests --fix

echo "┏━━━ Running pytest ━━━━━━━━━━━━━━━━━━━"
pytest -m only
if [[ "$1" == "--mock-data" ]]; then
pytest -k "mock_data"
else
pytest
fi
5 changes: 3 additions & 2 deletions data-pipeline/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
[pytest]
testpaths =
tests/pipeline
addopts = --strict -W ignore -v -s --durations=0
tests/v4
addopts = --strict -W ignore -v -s --durations=0 -k "not mock_data and not broken"
markers =
only: marked with "only"
long: takes a long time
mock_data: requires mock datasets to be available
broken: test is broken
9 changes: 4 additions & 5 deletions data-pipeline/src/data_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ def get_inputs(self):
if isinstance(v, (Task, DownloadTask)):
paths.update({k: v.get_output_path()})
else:
logger.info(k)
paths.update({k: os.path.join(self._config.input_root, v)})

return paths
Expand Down Expand Up @@ -174,7 +173,7 @@ def run(self, force=False):
elapsed = stop - start
logger.info(f"Finished {self._name} in {elapsed // 60}m{elapsed % 60:02}s")
else:
logger.info("Skipping %s", self._name)
logger.info(f"Skipping {self._name}")


@attr.define
Expand Down Expand Up @@ -213,17 +212,17 @@ def get_task(self, name: str) -> Union[Task, DownloadTask]:
def get_all_task_names(self) -> List[str]:
return list(self._tasks.keys())

def run(self, force_tasks=None):
def run(self, force_tasks=None) -> None:
for task_name, task in self._tasks.items():
task.run(force=force_tasks and task_name in force_tasks)

def set_outputs(self, outputs):
def set_outputs(self, outputs) -> None:
for output_name, task_name in outputs.items():
assert task_name in self._tasks, f"Unable to set output '{output_name}', no task named '{task_name}'"

self._outputs = outputs

def get_output(self, output_name):
def get_output(self, output_name) -> str:
task_name = self._outputs[output_name]
return self._tasks[task_name]

Expand Down
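8 changes: 5 additions & 3 deletions data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py (file header missing from this capture; counts derived from the commit totals, path inferred from the `data_pipeline.pipelines.gnomad_v4_variants` import — confirm)
Original file line number Diff line number Diff line change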
@@ -1,5 +1,6 @@
import os


from data_pipeline.config import PipelineConfig, get_data_environment, DataEnvironment
from data_pipeline.pipeline import Pipeline, PipelineMock, run_pipeline

Expand All @@ -14,6 +15,8 @@

DATA_ENV = os.getenv("DATA_ENV", "mock")

pipeline_name = "gnomad_v4_variants"

data_environment = get_data_environment(DATA_ENV)

if data_environment == DataEnvironment.mock:
Expand All @@ -30,7 +33,7 @@
}
)
config = PipelineConfig(
name="gnomad_v4_variants",
name=pipeline_name,
input_root="data/v4_mock/inputs",
output_root="data/v4_mock/outputs",
)
Expand All @@ -39,7 +42,7 @@
from data_pipeline.pipelines.genes import pipeline as genes_pipeline

config = PipelineConfig(
name="gnomad_v4_variants",
name=pipeline_name,
input_root="gs://gnomad-matt-data-pipeline/2023-09-26/inputs",
output_root="gs://gnomad-matt-data-pipeline/2023-09-26/outputs",
)
Expand All @@ -48,7 +51,6 @@
f"Data environment invalid. Set DATA_ENV to one of {', '.join([e.name for e in DataEnvironment])}"
)


pipeline = Pipeline(config=config)

pipeline.add_task(
Expand Down
15 changes: 4 additions & 11 deletions data-pipeline/tests/pipeline/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
from data_pipeline.config import DataEnvironment, PipelineConfig, get_data_environment
from data_pipeline.pipeline import Pipeline

# from data_pipeline.pipeline import Pipeline


@pytest.fixture
def input_tmp():
Expand All @@ -26,27 +24,23 @@ def output_tmp():
yield temp_dir


@pytest.mark.only
def test_get_data_environment_defaults_mock():
data_environment = get_data_environment("mock")
assert data_environment == DataEnvironment.mock


@pytest.mark.only
def test_get_data_environment_raises_if_invalid_environment():
with pytest.raises(ValueError, match="Invalid value 'nonexisting_environment'. Allowed values are"):
get_data_environment("nonexisting_environment")


@pytest.mark.only
def test_config_created(input_tmp, output_tmp):
config = PipelineConfig(name="test", input_root=input_tmp, output_root=output_tmp)
assert isinstance(config, PipelineConfig)
assert isinstance(config.input_root, str)
assert isinstance(config.output_root, str)


@pytest.mark.only
def test_config_read_input_file(input_tmp, output_tmp):
config = PipelineConfig(
name="test",
Expand All @@ -70,7 +64,6 @@ def write(self, path, overwrite=False):
f.write(self.text)


@pytest.mark.only
def test_pipeline_tasks(input_tmp, output_tmp):
def task_1_fn(input_file_path):
with open(input_file_path, "r") as f:
Expand All @@ -79,11 +72,11 @@ def task_1_fn(input_file_path):
output_data.update_text(f"{input_data} processed")
return output_data

config = PipelineConfig(name="pipeline1", input_root=input_tmp, output_root=output_tmp)
test_config = PipelineConfig(name="pipeline1", input_root=input_tmp, output_root=output_tmp)

pipeline = Pipeline(config=config)
test_pipeline = Pipeline(config=test_config)

pipeline.add_task(
test_pipeline.add_task(
name="process_data",
task_function=task_1_fn,
output_path="my_output.txt",
Expand All @@ -92,7 +85,7 @@ def task_1_fn(input_file_path):
},
)

pipeline.run()
test_pipeline.run()

with open(os.path.join(output_tmp, "my_output.txt"), "r") as f:
assert f.read() == "tiny dataset processed"
1 change: 1 addition & 0 deletions data-pipeline/tests/pipeline/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def output_dir():
yield temp_dir


@pytest.mark.broken
def test_config(input_dir, output_dir):
logger.info(input_dir)
logger.info(output_dir)
Expand Down
9 changes: 6 additions & 3 deletions data-pipeline/tests/v4/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from cattrs import structure, structure_attrs_fromdict
import hail as hl
import json
from loguru import logger

from data_pipeline.pipelines.gnomad_v4_variants import (
pipeline as gnomad_v4_variant_pipeline,
Expand Down Expand Up @@ -33,22 +32,24 @@ def ht_to_json(ht: hl.Table, field: str = "row"):
return objs


@pytest.mark.only
@pytest.mark.mock_data
def test_globals_input_validation():
input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"]
ht = hl.read_table(input_path)
result = ht_to_json(ht, "globals")[0]
logger.info(result)
# logger.info(result)
structure(result, Globals)


@pytest.mark.mock_data
def test_validate_variant_input():
input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"]
ht = hl.read_table(input_path)
result = ht_to_json(ht)
[structure_attrs_fromdict(variant, InitialVariant) for variant in result]


@pytest.mark.mock_data
def test_validate_step1_output():
output_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_output_path()
ht = hl.read_table(output_path)
Expand All @@ -57,13 +58,15 @@ def test_validate_step1_output():
[structure_attrs_fromdict(variant, Step1Variant) for variant in result]


@pytest.mark.mock_data
def test_validate_step2_output():
output_path = gnomad_v4_variant_pipeline.get_task("annotate_gnomad_v4_exome_variants").get_output_path()
ht = hl.read_table(output_path)
result = ht_to_json(ht)
[structure_attrs_fromdict(variant, Step2Variant) for variant in result]


@pytest.mark.mock_data
def test_validate_step3_output():
output_path = gnomad_v4_variant_pipeline.get_task(
"annotate_gnomad_v4_exome_transcript_consequences"
Expand Down

0 comments on commit 9552b62

Please sign in to comment.