From c83d679b7e9c87b048d40600a68caf4889a21be6 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 17 Apr 2024 13:47:45 -0700 Subject: [PATCH] avoid using p2p shuffle - sorting used to be unstable --- nvtabular/ops/categorify.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index cf5a486b54..4ebc878621 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -28,6 +28,7 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as pa_ds +from dask import config from dask.base import tokenize from dask.blockwise import BlockIndex from dask.core import flatten @@ -1251,7 +1252,8 @@ def _drop_first_row(part, index): if has_size: # Avoid using dask_cudf to calculate divisions # (since it may produce too-few partitions) - df = df.sort_values(name_size, ascending=False) + with config.set({"dataframe.shuffle.method": "tasks"}): + df = df.sort_values(name_size, ascending=False) unique_path = _save_encodings( df,