Skip to content

Commit

Permalink
aliased stat operator with merlin core stat operator (#1825)
Browse files Browse the repository at this point in the history
* aliased stat operator with merlin core stat operator

* add correct stat operator import to nvt workflow
  • Loading branch information
jperez999 authored May 24, 2023
1 parent 86e443d commit 9233565
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 41 deletions.
41 changes: 1 addition & 40 deletions nvtabular/ops/stat_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,43 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Any

import dask.dataframe as dd

from nvtabular.ops.operator import ColumnSelector, Operator


class StatOperator(Operator):
"""
Base class for statistical operator classes. This adds a 'fit' and 'finalize' method
on top of the Operator class.
"""

def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame) -> Any:
"""Calculate statistics for this operator, and return a dask future
to these statistics, which will be computed by the workflow."""

raise NotImplementedError(
"""The dask operations needed to return a dictionary of uncomputed statistics."""
)

def fit_finalize(self, dask_stats):
"""Finalize statistics calculation - the workflow calls this function with
the computed statistics from the 'fit' object'"""

raise NotImplementedError(
"""Follow-up operations to convert dask statistics in to member variables"""
)

def clear(self):
"""zero and reinitialize all relevant statistical properties"""
raise NotImplementedError("clear isn't implemented for this op!")

def set_storage_path(self, new_path, copy=False):
"""Certain stat operators need external storage - for instance Categorify writes out
parquet files containing the categorical mapping. When we save the operator, we
also want to save these files as part of the bundle. Implementing this method
lets statoperators bundle their dependent files into the new path that we're writing
out (note that this could happen after the operator is created)
"""
from merlin.dag.ops.stat_operator import StatOperator # noqa pylint: disable=unused-import
3 changes: 2 additions & 1 deletion nvtabular/workflow/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
from merlin.dag import Graph
from merlin.dag.executors import DaskExecutor, LocalExecutor
from merlin.dag.node import iter_nodes
from merlin.dag.ops.stat_operator import StatOperator
from merlin.io import Dataset
from merlin.schema import Schema
from nvtabular.ops import LambdaOp, StatOperator
from nvtabular.ops import LambdaOp
from nvtabular.workflow.node import WorkflowNode

LOG = logging.getLogger("nvtabular")
Expand Down

0 comments on commit 9233565

Please sign in to comment.