Skip to content

Commit

Permalink
Update categorify to handle missing divisions
Browse files Browse the repository at this point in the history
  • Loading branch information
oliverholworthy committed Apr 23, 2024
1 parent bfb119b commit 2e4bad4
Showing 1 changed file with 3 additions and 7 deletions.
10 changes: 3 additions & 7 deletions nvtabular/ops/categorify.py
Original file line number Diff line number Diff line change
@@ -28,6 +28,7 @@
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as pa_ds
+from dask import config
from dask.base import tokenize
from dask.blockwise import BlockIndex
from dask.core import flatten
@@ -1251,13 +1252,8 @@ def _drop_first_row(part, index):
if has_size:
# Avoid using dask_cudf to calculate divisions
# (since it may produce too-few partitions)
-df = df.sort_values(
-    name_size,
-    ascending=False,
-    divisions=dd.shuffle._calculate_divisions(
-        df, df[name_size], False, df.npartitions
-    )[0][::-1],
-)
+with config.set({"dataframe.shuffle.method": "tasks"}):
+    df = df.sort_values(name_size, ascending=False)

unique_path = _save_encodings(
df,
Expand Down

0 comments on commit 2e4bad4

Please sign in to comment.