From dcc59ba0bfe80229a054cb01177ebe0340ae84fe Mon Sep 17 00:00:00 2001 From: Julio Date: Fri, 9 Jun 2023 12:04:07 -0400 Subject: [PATCH] change to subworkflows in workflow instead of subgraph4 --- nvtabular/workflow/workflow.py | 6 ++-- .../unit/workflow/test_workflow_subgraphs.py | 29 +++---------------- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/nvtabular/workflow/workflow.py b/nvtabular/workflow/workflow.py index 38c4815952..9c5e944a37 100755 --- a/nvtabular/workflow/workflow.py +++ b/nvtabular/workflow/workflow.py @@ -143,8 +143,8 @@ def fit_schema(self, input_schema: Schema): return self @property - def subgraphs(self): - return self.graph.subgraphs.keys() + def subworkflows(self): + return list(self.graph.subgraphs.keys()) @property def input_dtypes(self): @@ -169,7 +169,7 @@ def output_node(self): def _input_columns(self): return self.graph._input_columns() - def get_subgraph(self, subgraph_name): + def get_subworkflow(self, subgraph_name): subgraph = self.graph.subgraph(subgraph_name) return Workflow(subgraph.output_node) diff --git a/tests/unit/workflow/test_workflow_subgraphs.py b/tests/unit/workflow/test_workflow_subgraphs.py index 72fae8ef47..40e169e997 100644 --- a/tests/unit/workflow/test_workflow_subgraphs.py +++ b/tests/unit/workflow/test_workflow_subgraphs.py @@ -14,18 +14,16 @@ # limitations under the License. # -import math import os import numpy as np import pytest from pandas.api.types import is_integer_dtype -from merlin.core.dispatch import HAS_GPU from merlin.core.utils import set_dask_client from merlin.dag.ops.subgraph import Subgraph from nvtabular import Workflow, ops -from tests.conftest import assert_eq, get_cats +from tests.conftest import assert_eq @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @@ -74,25 +72,6 @@ def get_norms(tar): concat_ops = "_FillMissing_1_LogOp_1" if replace: concat_ops = "" - assert math.isclose(get_norms(df.x).mean(), norms.means["x" + concat_ops], rel_tol=1e-1) - assert math.isclose(get_norms(df.y).mean(), norms.means["y" + concat_ops], rel_tol=1e-1) - - assert math.isclose(get_norms(df.x).std(), norms.stds["x" + concat_ops], rel_tol=1e-1) - assert math.isclose(get_norms(df.y).std(), norms.stds["y" + concat_ops], rel_tol=1e-1) - # Check that categories match - if engine == "parquet": - cats_expected0 = df["name-cat"].unique().values_host if HAS_GPU else df["name-cat"].unique() - cats0 = get_cats(workflow, "name-cat") - # adding the None entry as a string because of move from gpu - assert all(cat in sorted(cats_expected0.tolist()) for cat in cats0.tolist()) - assert len(cats0.tolist()) == len(cats_expected0.tolist()) - cats_expected1 = ( - df["name-string"].unique().values_host if HAS_GPU else df["name-string"].unique() - ) - cats1 = get_cats(workflow, "name-string") - # adding the None entry as a string because of move from gpu - assert all(cat in sorted(cats_expected1.tolist()) for cat in cats1.tolist()) - assert len(cats1.tolist()) == len(cats_expected1.tolist()) # Write to new "shuffled" and "processed" dataset df_pp = workflow.transform(dataset).to_ddf().compute() @@ -101,8 +80,8 @@ def get_norms(tar): assert is_integer_dtype(df_pp["name-cat"].dtype) assert is_integer_dtype(df_pp["name-string"].dtype) - subgraph_cat = workflow.get_subgraph("cat_graph") - subgraph_cont = workflow.get_subgraph("cont_graph") + subgraph_cat = workflow.get_subworkflow("cat_graph") + subgraph_cont = workflow.get_subworkflow("cont_graph") assert isinstance(subgraph_cat, Workflow) assert isinstance(subgraph_cont, Workflow) # will not be the same nodes of saved out and loaded back @@ -111,7 +90,7 @@ def get_norms(tar): assert subgraph_cont.output_node == cont_features # check failure path works as expected with pytest.raises(ValueError) as exc: - workflow.get_subgraph("not_exist") + workflow.get_subworkflow("not_exist") assert "No subgraph named" in str(exc.value) sub_cat_df = subgraph_cat.transform(dataset).to_ddf().compute()