From 2e4bad48cdac811ebd31267a7d3818cb7c60ac0f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Tue, 23 Apr 2024 11:10:06 +0100 Subject: [PATCH] Update categorify to handle missing divisions --- nvtabular/ops/categorify.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index 556e2a005a..4ebc878621 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -28,6 +28,7 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as pa_ds +from dask import config from dask.base import tokenize from dask.blockwise import BlockIndex from dask.core import flatten @@ -1251,13 +1252,8 @@ def _drop_first_row(part, index): if has_size: # Avoid using dask_cudf to calculate divisions # (since it may produce too-few partitions) - df = df.sort_values( - name_size, - ascending=False, - divisions=dd.shuffle._calculate_divisions( - df, df[name_size], False, df.npartitions - )[0][::-1], - ) + with config.set({"dataframe.shuffle.method": "tasks"}): + df = df.sort_values(name_size, ascending=False) unique_path = _save_encodings( df,