Skip to content

Commit

Permalink
avoid using p2p shuffle - sorting used to be unstable
Browse files (browse the repository at this point in the history)
  • Loading branch information
rjzamora committed Apr 17, 2024
1 parent e22a836 commit c83d679
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion nvtabular/ops/categorify.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as pa_ds
from dask import config
from dask.base import tokenize
from dask.blockwise import BlockIndex
from dask.core import flatten
Expand Down Expand Up @@ -1251,7 +1252,8 @@ def _drop_first_row(part, index):
if has_size:
# Avoid using dask_cudf to calculate divisions
# (since it may produce too-few partitions)
df = df.sort_values(name_size, ascending=False)
with config.set({"dataframe.shuffle.method": "tasks"}):
df = df.sort_values(name_size, ascending=False)

unique_path = _save_encodings(
df,
Expand Down

0 comments on commit c83d679

Please sign in to comment.