Avoid using p2p shuffle when sorting (force the "tasks" shuffle method) - sorting used to be unstable

NVIDIA-Merlin · Apr 17, 2024 · c83d679
1 parent e22a836
commit c83d679
Showing 1 changed file with 3 additions and 1 deletion.
diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
@@ -28,6 +28,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.dataset as pa_ds
+from dask import config
 from dask.base import tokenize
 from dask.blockwise import BlockIndex
 from dask.core import flatten
@@ -1251,7 +1252,8 @@ def _drop_first_row(part, index):
             if has_size:
                 # Avoid using dask_cudf to calculate divisions
                 # (since it may produce too-few partitions)
-                df = df.sort_values(name_size, ascending=False)
+                with config.set({"dataframe.shuffle.method": "tasks"}):
+                    df = df.sort_values(name_size, ascending=False)
 
             unique_path = _save_encodings(
                 df,