Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 117 additions & 70 deletions python/ray/data/_internal/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,112 @@
logger = logging.getLogger(__name__)


def _format_dataset_as_table(
schema, num_rows, num_blocks=None, sample_rows=None, max_rows=10
) -> str:
"""Format dataset information as a polars-style table.

Args:
schema: Dataset schema with .names and .types attributes
num_rows: Total number of rows (or "?" if unknown)
num_blocks: Number of blocks (only for MaterializedDataset)
sample_rows: Optional list of sample row dicts to display
max_rows: Maximum number of rows to show

Returns:
Formatted table string with box-drawing characters
"""
if not schema or not schema.names:
return f"Dataset(num_rows={num_rows})"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AttributeError when schema is a type object

The _format_dataset_as_table() function doesn't handle the case when schema is a Python type object. The schema() method returns Union[type, "pyarrow.lib.Schema"], and the old code at line 343 explicitly handled isinstance(schema, type). However, _format_dataset_as_table() accesses schema.names which will raise AttributeError for type objects since they don't have a .names attribute.

Additional Locations (1)

Fix in Cursor Fix in Web


col_names = schema.names
col_types = [str(t) if not hasattr(t, "__name__") else t.__name__
for t in schema.types]

# Calculate column widths
col_widths = []
for i, name in enumerate(col_names):
width = max(
len(name),
len(col_types[i]) + 4, # "--- " prefix
10 # minimum width
)
if sample_rows:
for row in sample_rows[:max_rows]:
if name in row:
val_str = str(row[name])
# Truncate long values
if len(val_str) > 50:
val_str = val_str[:47] + "..."
width = max(width, len(val_str))
Comment on lines +68 to +74
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for calculating column width contribution from sample rows can be made more direct and readable. Instead of creating a truncated string to then measure its length, you can directly use min() with the value's length and the cap of 50. This avoids unnecessary string manipulation and makes the intent clearer.

            for row in sample_rows[:max_rows]:
                if name in row:
                    val_len = len(str(row[name]))
                    # The width contribution from a value is capped at 50.
                    width = max(width, min(val_len, 50))

col_widths.append(min(width, 50)) # Cap at 50 chars

# Build table
lines = []

# Shape line (like polars)
shape_str = f"shape: ({num_rows}, {len(col_names)})"
if num_blocks is not None:
shape_str += f", num_blocks: {num_blocks}"
lines.append(shape_str)

# Top border
parts = ["┌"]
for i, width in enumerate(col_widths):
parts.append("─" * (width + 2))
parts.append("┬" if i < len(col_widths) - 1 else "┐")
lines.append("".join(parts))

# Column names
parts = ["│"]
for name, width in zip(col_names, col_widths):
parts.append(f" {name:{width}} │")
lines.append("".join(parts))

# Type separator with "---"
parts = ["│"]
for col_type, width in zip(col_types, col_widths):
type_str = f"--- {col_type}"
parts.append(f" {type_str:{width}} │")
lines.append("".join(parts))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Column names and types not truncated despite width cap

Low Severity

Column widths are capped at 50 characters on line 75, but when outputting column names (line 96) and types (line 103), Python's format specifier {name:{width}} only pads shorter strings - it doesn't truncate longer ones. If a column name or type exceeds 50 characters, it will overflow the allocated width, causing misalignment between the content and the box-drawing borders. The PR description claims "Long column names/types automatically truncated" but this truncation logic is missing.

Fix in Cursor Fix in Web


# Header separator
parts = ["╞"]
for i, width in enumerate(col_widths):
parts.append("═" * (width + 2))
parts.append("╪" if i < len(col_widths) - 1 else "╡")
lines.append("".join(parts))

# Data rows (if materialized)
if sample_rows:
show_rows = min(len(sample_rows), max_rows)
for row_idx, row in enumerate(sample_rows[:show_rows]):
parts = ["│"]
for name, width in zip(col_names, col_widths):
val = row.get(name, "")
val_str = str(val)
if len(val_str) > width:
val_str = val_str[:width-3] + "..."
parts.append(f" {val_str:{width}} │")
lines.append("".join(parts))

# Show ellipsis if there are more rows
if num_rows != "?" and isinstance(num_rows, int) and num_rows > show_rows:
parts = ["│"]
for width in col_widths:
parts.append(f" {'…':{width}} │")
lines.append("".join(parts))

# Bottom border
parts = ["└"]
for i, width in enumerate(col_widths):
parts.append("─" * (width + 2))
parts.append("┴" if i < len(col_widths) - 1 else "┘")
lines.append("".join(parts))

return "\n".join(lines)


class ExecutionPlan:
"""A lazy execution plan for a Dataset.

Expand Down Expand Up @@ -253,82 +359,23 @@ def get_plan_as_string(self, dataset_cls: Type["Dataset"]) -> str:
num_blocks = self.initial_num_blocks()
assert num_blocks is not None

name_str = (
"name={}, ".format(self._dataset_name)
if self._dataset_name is not None
else ""
)
num_blocks_str = f"num_blocks={num_blocks}, " if num_blocks else ""

dataset_str = "{}({}{}num_rows={}, schema={})".format(
dataset_cls.__name__,
name_str,
num_blocks_str,
count,
schema_str,
# Use table formatter for better UX (polars-style)
dataset_str = _format_dataset_as_table(
schema=schema,
num_rows=count,
num_blocks=num_blocks,
sample_rows=None # TODO: Add sample data fetching in future PR
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dataset name is lost in new table representation

The _format_dataset_as_table() function doesn't accept a name parameter and self._dataset_name is never passed to it. The old code displayed name={self._dataset_name} when set, but this information is now completely lost in the representation. Named datasets will no longer show their names, which is a regression in functionality.

Additional Locations (1)

Fix in Cursor Fix in Web

)

# If the resulting string representation fits in one line, use it directly.
SCHEMA_LINE_CHAR_LIMIT = 80
MIN_FIELD_LENGTH = 10
INDENT_STR = " " * 3
trailing_space = INDENT_STR * plan_max_depth

if len(dataset_str) > SCHEMA_LINE_CHAR_LIMIT:
# If the resulting string representation exceeds the line char limit,
# first try breaking up each `Dataset` parameter into its own line
# and check if each line fits within the line limit. We check the
# `schema` param's length, since this is likely the longest string.
schema_str_on_new_line = f"{trailing_space}{INDENT_STR}schema={schema_str}"
if len(schema_str_on_new_line) > SCHEMA_LINE_CHAR_LIMIT:
# If the schema cannot fit on a single line, break up each field
# into its own line.
schema_str = []
for n, t in zip(schema.names, schema.types):
if hasattr(t, "__name__"):
t = t.__name__
col_str = f"{trailing_space}{INDENT_STR * 2}{n}: {t}"
# If the field line exceeds the char limit, abbreviate
# the field name to fit while maintaining the full type
if len(col_str) > SCHEMA_LINE_CHAR_LIMIT:
shortened_suffix = f"...: {str(t)}"
# Show at least 10 characters of the field name, even if
# we have already hit the line limit with the type.
chars_left_for_col_name = max(
SCHEMA_LINE_CHAR_LIMIT - len(shortened_suffix),
MIN_FIELD_LENGTH,
)
col_str = (
f"{col_str[:chars_left_for_col_name]}{shortened_suffix}"
)
schema_str.append(col_str)
schema_str = ",\n".join(schema_str)
schema_str = (
"{\n" + schema_str + f"\n{trailing_space}{INDENT_STR}" + "}"
)
name_str = (
f"\n{trailing_space}{INDENT_STR}name={self._dataset_name},"
if self._dataset_name is not None
else ""
)
num_blocks_str = (
f"\n{trailing_space}{INDENT_STR}num_blocks={num_blocks},"
if num_blocks
else ""
)
dataset_str = (
f"{dataset_cls.__name__}("
f"{name_str}"
f"{num_blocks_str}"
f"\n{trailing_space}{INDENT_STR}num_rows={count},"
f"\n{trailing_space}{INDENT_STR}schema={schema_str}"
f"\n{trailing_space})"
)

if plan_max_depth == 0:
plan_str += dataset_str
else:
plan_str += f"{INDENT_STR * (plan_max_depth - 1)}+- {dataset_str}"
# Indent each line of the table for nested plans
indented_lines = []
for line in dataset_str.split("\n"):
indented_lines.append(f"{INDENT_STR * (plan_max_depth - 1)}+- {line}")
plan_str += "\n".join(indented_lines)
Comment on lines +375 to +378
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current indentation logic for nested plans incorrectly prefixes every line of the dataset's table representation with +- . This breaks the visual hierarchy of the plan tree. The +- should only prefix the first line of the nested dataset block, and subsequent lines should be indented to align with the content of that first line.

            lines = dataset_str.split("\n")
            indented_lines = [f"{INDENT_STR * (plan_max_depth - 1)}+- {lines[0]}"]
            subsequent_indent = INDENT_STR * plan_max_depth
            indented_lines.extend(subsequent_indent + line for line in lines[1:])
            plan_str += "\n".join(indented_lines)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tree prefix incorrectly added to every table line

When plan_max_depth > 0, the code adds the +- prefix to every line of the multi-line table output. The +- prefix is a tree structure indicator that should only appear on the first line. Subsequent lines should have only indentation (spaces), not the +- prefix. This produces malformed output like +- shape:... followed by +- ┌───... instead of proper continuation indentation.

Fix in Cursor Fix in Web

return plan_str

def link_logical_plan(self, logical_plan: "LogicalPlan"):
Expand Down