Improve dataset repr with polars-style table formatting #59748
python/ray/data/_internal/plan.py

@@ -34,6 +34,112 @@
logger = logging.getLogger(__name__)

def _format_dataset_as_table(
    schema, num_rows, num_blocks=None, sample_rows=None, max_rows=10
) -> str:
    """Format dataset information as a polars-style table.

    Args:
        schema: Dataset schema with .names and .types attributes
        num_rows: Total number of rows (or "?" if unknown)
        num_blocks: Number of blocks (only for MaterializedDataset)
        sample_rows: Optional list of sample row dicts to display
        max_rows: Maximum number of rows to show

    Returns:
        Formatted table string with box-drawing characters
    """
    if not schema or not schema.names:
        return f"Dataset(num_rows={num_rows})"

    col_names = schema.names
    col_types = [str(t) if not hasattr(t, "__name__") else t.__name__
                 for t in schema.types]

    # Calculate column widths
    col_widths = []
    for i, name in enumerate(col_names):
        width = max(
            len(name),
            len(col_types[i]) + 4,  # "--- " prefix
            10  # minimum width
        )
        if sample_rows:
            for row in sample_rows[:max_rows]:
                if name in row:
                    val_str = str(row[name])
                    # Truncate long values
                    if len(val_str) > 50:
                        val_str = val_str[:47] + "..."
                    width = max(width, len(val_str))
Comment on lines +68 to +74
Contributor: The logic for calculating the column width contribution from sample rows can be made more direct and readable. Instead of creating a truncated string just to measure its length, you can cap the measured length directly:

    for row in sample_rows[:max_rows]:
        if name in row:
            val_len = len(str(row[name]))
            # The width contribution from a value is capped at 50.
            width = max(width, min(val_len, 50))
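For intuition, a quick check (illustrative only, not part of the PR) that the suggestion is width-equivalent to the truncate-then-measure logic above for a long value:

    # Original approach: build the 50-character display string, then measure it.
    val = "x" * 80
    truncated = val[:47] + "..."
    assert len(truncated) == 50
    # Suggested approach: cap the measured length directly; same contribution.
    assert min(len(val), 50) == 50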
        col_widths.append(min(width, 50))  # Cap at 50 chars

    # Build table
    lines = []

    # Shape line (like polars)
    shape_str = f"shape: ({num_rows}, {len(col_names)})"
    if num_blocks is not None:
        shape_str += f", num_blocks: {num_blocks}"
    lines.append(shape_str)

    # Top border
    parts = ["┌"]
    for i, width in enumerate(col_widths):
        parts.append("─" * (width + 2))
        parts.append("┬" if i < len(col_widths) - 1 else "┐")
    lines.append("".join(parts))

    # Column names
    parts = ["│"]
    for name, width in zip(col_names, col_widths):
        parts.append(f" {name:{width}} │")
    lines.append("".join(parts))

    # Type separator with "---"
    parts = ["│"]
    for col_type, width in zip(col_types, col_widths):
        type_str = f"--- {col_type}"
        parts.append(f" {type_str:{width}} │")
    lines.append("".join(parts))
Column names and types not truncated despite width cap (Low Severity)
Column widths are capped at 50 characters on line 75, but when outputting column names (line 96) and types (line 103), Python's format specifier only pads values to the requested width and never truncates them, so a name or type longer than 50 characters will overflow its column and misalign the table borders.
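A minimal sketch of one way to close that gap (the _fit helper below is hypothetical, not something this PR defines): truncate header and type strings the same way cell values are truncated, before padding them.

    def _fit(text: str, width: int) -> str:
        # Pad to `width`; truncate with an ellipsis when the text is longer.
        if len(text) > width:
            return text[: width - 3] + "..."
        return f"{text:{width}}"

    # e.g. for the header row, instead of f" {name:{width}} │":
    parts.append(f" {_fit(name, width)} │")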

    # Header separator
    parts = ["╞"]
    for i, width in enumerate(col_widths):
        parts.append("═" * (width + 2))
        parts.append("╪" if i < len(col_widths) - 1 else "╡")
    lines.append("".join(parts))

    # Data rows (if materialized)
    if sample_rows:
        show_rows = min(len(sample_rows), max_rows)
        for row_idx, row in enumerate(sample_rows[:show_rows]):
            parts = ["│"]
            for name, width in zip(col_names, col_widths):
                val = row.get(name, "")
                val_str = str(val)
                if len(val_str) > width:
                    val_str = val_str[:width - 3] + "..."
                parts.append(f" {val_str:{width}} │")
            lines.append("".join(parts))

        # Show ellipsis if there are more rows
        if num_rows != "?" and isinstance(num_rows, int) and num_rows > show_rows:
            parts = ["│"]
            for width in col_widths:
                parts.append(f" {'…':{width}} │")
            lines.append("".join(parts))

    # Bottom border
    parts = ["└"]
    for i, width in enumerate(col_widths):
        parts.append("─" * (width + 2))
        parts.append("┴" if i < len(col_widths) - 1 else "┘")
    lines.append("".join(parts))

    return "\n".join(lines)
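To make the intended UX concrete, here is an illustrative way to exercise the formatter in isolation (the SimpleNamespace schema stand-in and the exact spacing of the output shown below are assumptions, not taken from the PR):

    from types import SimpleNamespace

    # Any object with .names and .types attributes works as a schema here.
    fake_schema = SimpleNamespace(names=["id", "text"], types=[int, str])
    print(_format_dataset_as_table(schema=fake_schema, num_rows=1000, num_blocks=8))
    # Roughly:
    # shape: (1000, 2), num_blocks: 8
    # ┌────────────┬────────────┐
    # │ id         │ text       │
    # │ --- int    │ --- str    │
    # ╞════════════╪════════════╡
    # └────────────┴────────────┘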

class ExecutionPlan:
    """A lazy execution plan for a Dataset.

@@ -253,82 +359,23 @@ def get_plan_as_string(self, dataset_cls: Type["Dataset"]) -> str:
        num_blocks = self.initial_num_blocks()
        assert num_blocks is not None

-        name_str = (
-            "name={}, ".format(self._dataset_name)
-            if self._dataset_name is not None
-            else ""
-        )
-        num_blocks_str = f"num_blocks={num_blocks}, " if num_blocks else ""
-
-        dataset_str = "{}({}{}num_rows={}, schema={})".format(
-            dataset_cls.__name__,
-            name_str,
-            num_blocks_str,
-            count,
-            schema_str,
+        # Use table formatter for better UX (polars-style)
+        dataset_str = _format_dataset_as_table(
+            schema=schema,
+            num_rows=count,
+            num_blocks=num_blocks,
+            sample_rows=None  # TODO: Add sample data fetching in future PR
        )

Dataset name is lost in new table representation
The previous representation included name={self._dataset_name} when a dataset name was set, but _format_dataset_as_table() takes no name argument, so the name no longer appears anywhere in the new output.
Additional Locations (1)
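One lightweight way to restore it, sketched under the assumption that prepending a name line to the table is acceptable (the wrapper below is hypothetical, not part of the PR):

    def _format_dataset_as_table_with_name(name, **kwargs):
        # Hypothetical wrapper: prepend the dataset name to the table output.
        table = _format_dataset_as_table(**kwargs)
        return f"name: {name}\n{table}" if name is not None else table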

-        # If the resulting string representation fits in one line, use it directly.
-        SCHEMA_LINE_CHAR_LIMIT = 80
-        MIN_FIELD_LENGTH = 10
-        INDENT_STR = " " * 3
-        trailing_space = INDENT_STR * plan_max_depth
-
-        if len(dataset_str) > SCHEMA_LINE_CHAR_LIMIT:
-            # If the resulting string representation exceeds the line char limit,
-            # first try breaking up each `Dataset` parameter into its own line
-            # and check if each line fits within the line limit. We check the
-            # `schema` param's length, since this is likely the longest string.
-            schema_str_on_new_line = f"{trailing_space}{INDENT_STR}schema={schema_str}"
-            if len(schema_str_on_new_line) > SCHEMA_LINE_CHAR_LIMIT:
-                # If the schema cannot fit on a single line, break up each field
-                # into its own line.
-                schema_str = []
-                for n, t in zip(schema.names, schema.types):
-                    if hasattr(t, "__name__"):
-                        t = t.__name__
-                    col_str = f"{trailing_space}{INDENT_STR * 2}{n}: {t}"
-                    # If the field line exceeds the char limit, abbreviate
-                    # the field name to fit while maintaining the full type
-                    if len(col_str) > SCHEMA_LINE_CHAR_LIMIT:
-                        shortened_suffix = f"...: {str(t)}"
-                        # Show at least 10 characters of the field name, even if
-                        # we have already hit the line limit with the type.
-                        chars_left_for_col_name = max(
-                            SCHEMA_LINE_CHAR_LIMIT - len(shortened_suffix),
-                            MIN_FIELD_LENGTH,
-                        )
-                        col_str = (
-                            f"{col_str[:chars_left_for_col_name]}{shortened_suffix}"
-                        )
-                    schema_str.append(col_str)
-                schema_str = ",\n".join(schema_str)
-                schema_str = (
-                    "{\n" + schema_str + f"\n{trailing_space}{INDENT_STR}" + "}"
-                )
-            name_str = (
-                f"\n{trailing_space}{INDENT_STR}name={self._dataset_name},"
-                if self._dataset_name is not None
-                else ""
-            )
-            num_blocks_str = (
-                f"\n{trailing_space}{INDENT_STR}num_blocks={num_blocks},"
-                if num_blocks
-                else ""
-            )
-            dataset_str = (
-                f"{dataset_cls.__name__}("
-                f"{name_str}"
-                f"{num_blocks_str}"
-                f"\n{trailing_space}{INDENT_STR}num_rows={count},"
-                f"\n{trailing_space}{INDENT_STR}schema={schema_str}"
-                f"\n{trailing_space})"
-            )

        if plan_max_depth == 0:
            plan_str += dataset_str
        else:
-            plan_str += f"{INDENT_STR * (plan_max_depth - 1)}+- {dataset_str}"
+            # Indent each line of the table for nested plans
+            indented_lines = []
+            for line in dataset_str.split("\n"):
+                indented_lines.append(f"{INDENT_STR * (plan_max_depth - 1)}+- {line}")
+            plan_str += "\n".join(indented_lines)
Comment on lines +375 to +378
Contributor: The current indentation logic for nested plans incorrectly prefixes every line of the dataset's table representation with the "+- " tree marker instead of only the first line. The following keeps the marker on the first line and indents the remaining lines to match:

    lines = dataset_str.split("\n")
    indented_lines = [f"{INDENT_STR * (plan_max_depth - 1)}+- {lines[0]}"]
    subsequent_indent = INDENT_STR * plan_max_depth
    indented_lines.extend(subsequent_indent + line for line in lines[1:])
    plan_str += "\n".join(indented_lines)

Tree prefix incorrectly added to every table line
When plan_max_depth > 0, every line of the multi-line table representation is prefixed with "+- " rather than only the first line, so the box-drawing rows no longer line up.
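For illustration only (the sample table string below is made up), the difference between the two behaviors:

    INDENT_STR = " " * 3
    table = "shape: (3, 1)\n┌─────┐\n│ id  │\n└─────┘"
    lines = table.split("\n")
    # PR logic: every line gets the "+- " prefix, shifting the table borders.
    print("\n".join(f"+- {line}" for line in lines))
    # Suggested logic: only the first line gets "+- "; the rest are indented.
    print("\n".join([f"+- {lines[0]}", *(INDENT_STR + line for line in lines[1:])]))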

        return plan_str

    def link_logical_plan(self, logical_plan: "LogicalPlan"):

AttributeError when schema is a type object
The _format_dataset_as_table() function doesn't handle the case when schema is a Python type object. The schema() method returns Union[type, "pyarrow.lib.Schema"], and the old code at line 343 explicitly handled isinstance(schema, type). However, _format_dataset_as_table() accesses schema.names, which will raise AttributeError for type objects since they don't have a .names attribute.
Additional Locations (1)
python/ray/data/_internal/plan.py#L362-L364
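A minimal guard along these lines would avoid the crash (a sketch of one option, not the PR's or the reviewer's chosen fix; falling back to the plain repr for type schemas is an assumption):

    # At the top of _format_dataset_as_table, before schema.names is accessed:
    if isinstance(schema, type) or not getattr(schema, "names", None):
        return f"Dataset(num_rows={num_rows})"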
python/ray/data/_internal/plan.py#L362-L364