Update categorify to handle missing divisions

NVIDIA-Merlin · Apr 23, 2024 · 2e4bad4
1 parent bfb119b
commit 2e4bad4
Showing 1 changed file with 3 additions and 7 deletions.
diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
@@ -28,6 +28,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.dataset as pa_ds
+from dask import config
 from dask.base import tokenize
 from dask.blockwise import BlockIndex
 from dask.core import flatten
@@ -1251,13 +1252,8 @@ def _drop_first_row(part, index):
             if has_size:
                 # Avoid using dask_cudf to calculate divisions
                 # (since it may produce too-few partitions)
-                df = df.sort_values(
-                    name_size,
-                    ascending=False,
-                    divisions=dd.shuffle._calculate_divisions(
-                        df, df[name_size], False, df.npartitions
-                    )[0][::-1],
-                )
+                with config.set({"dataframe.shuffle.method": "tasks"}):
+                    df = df.sort_values(name_size, ascending=False)
 
             unique_path = _save_encodings(
                 df,