Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
499c000
attempt 1
hadia206 Jul 15, 2025
c192a6e
add collection back
hadia206 Jul 16, 2025
77f63d1
Merge branch 'main' of https://github.com/bodo-ai/PyDough into Hadia/…
hadia206 Jul 16, 2025
721eecc
range collections base, initial try
hadia206 Jul 14, 2025
4a53e01
add range_collection to pydough top
hadia206 Jul 15, 2025
d0ce87f
add test
hadia206 Jul 15, 2025
3c5ed66
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2025
8893d79
address comments
hadia206 Jul 16, 2025
a7469d3
merge conflict
hadia206 Jul 16, 2025
9e66de2
update test
hadia206 Jul 16, 2025
025b556
make range inside UnqualifiedGeneratedCollection
hadia206 Jul 16, 2025
29e1e74
[run CI] hybrid and execute
hadia206 Jul 18, 2025
dc3c686
fix uniqueness, singular, always exists, and add another test
hadia206 Jul 18, 2025
a626dc1
[run CI] more tests and fix empty table
hadia206 Jul 19, 2025
8491cc6
add other tests (skipped as they're not passing)
hadia206 Jul 22, 2025
6315f3d
Fixing test bugs
knassre-bodo Jul 22, 2025
d4230e4
Fixing hybrid/qualification/conversion bugs
knassre-bodo Jul 22, 2025
a465cbe
merge conflicts
hadia206 Oct 17, 2025
54c88ec
add SF support and test
hadia206 Oct 31, 2025
b55df23
remove unneeded code
hadia206 Oct 31, 2025
420cb5c
[run all] update test 6
hadia206 Nov 3, 2025
b6e11bb
[run all] docs
hadia206 Nov 3, 2025
b7a252f
[run all] missed file
hadia206 Nov 3, 2025
5145d4e
[run all] merge conflict
hadia206 Nov 3, 2025
b1e151b
[run all] docs, remove ansi/sqlite files, fix quoted check
hadia206 Nov 3, 2025
136d526
[run all] fix AST print
hadia206 Nov 3, 2025
40c7228
[run all] one more try
hadia206 Nov 3, 2025
3b6463d
replace base with not implemented error
hadia206 Nov 5, 2025
46daebe
address John comments
hadia206 Nov 6, 2025
0ae47e7
[run all] address Kian comments
hadia206 Nov 6, 2025
7c08883
Merge branch 'main' of https://github.com/bodo-ai/PyDough into Hadia/…
hadia206 Nov 7, 2025
1fb9c44
[run all]
hadia206 Nov 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pydough/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"get_logger",
"init_pydough_context",
"parse_json_metadata_from_file",
"range_collection",
"to_df",
"to_sql",
]
Expand All @@ -22,6 +23,7 @@
from .logger import get_logger
from .metadata import parse_json_metadata_from_file
from .unqualified import display_raw, from_string, init_pydough_context
from .user_collections.user_collection_apis import range_collection

# Create a default session for the user to interact with.
# In most situations users will just use this session and
Expand Down
3 changes: 2 additions & 1 deletion pydough/conversion/agg_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
CallExpression,
EmptySingleton,
Filter,
GeneratedTable,
Join,
JoinType,
Limit,
Expand Down Expand Up @@ -276,7 +277,7 @@ def aggregation_uniqueness_helper(
)
return node, final_uniqueness
# Empty singletons and generated tables don't have uniqueness information.
case EmptySingleton():
case EmptySingleton() | GeneratedTable():
return node, set()
case _:
raise NotImplementedError(
Expand Down
3 changes: 2 additions & 1 deletion pydough/conversion/filter_pushdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
ColumnReference,
EmptySingleton,
Filter,
GeneratedTable,
Join,
JoinType,
Limit,
Expand Down Expand Up @@ -143,7 +144,7 @@ def push_filters(
# be transposed beneath a limit without changing its output.
node._input = push_filters(node.input, set())
return build_filter(node, filters)
case EmptySingleton() | Scan():
case EmptySingleton() | Scan() | GeneratedTable():
# For remaining nodes, materialize all of the remaining filters.
return build_filter(node, filters)
case _:
Expand Down
31 changes: 31 additions & 0 deletions pydough/conversion/hybrid_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"HybridPartition",
"HybridPartitionChild",
"HybridRoot",
"HybridUserGeneratedCollection",
]


Expand All @@ -27,6 +28,9 @@
ColumnProperty,
PyDoughExpressionQDAG,
)
from pydough.qdag.collections.user_collection_qdag import (
PyDoughUserGeneratedCollectionQDag,
)

from .hybrid_connection import HybridConnection
from .hybrid_expressions import (
Expand Down Expand Up @@ -483,3 +487,30 @@ def __repr__(self):

def search_term_definition(self, name: str) -> HybridExpr | None:
return self.predecessor.search_term_definition(name)


class HybridUserGeneratedCollection(HybridOperation):
    """
    Class for HybridOperation corresponding to a user-generated collection.
    """

    def __init__(self, user_collection: PyDoughUserGeneratedCollectionQDag):
        """
        Args:
            `user_collection`: the QDAG node for the user-generated collection.
        """
        self._user_collection: PyDoughUserGeneratedCollectionQDag = user_collection
        # Expose every column of the underlying collection as a term that
        # refers to the column by its own name and type.
        terms: dict[str, HybridExpr] = {
            name: HybridRefExpr(name, typ)
            for name, typ in user_collection.collection.column_names_and_types
        }
        # The remaining constructor arguments of HybridOperation are all
        # passed empty for this operation.
        super().__init__(terms, {}, [], [])

    @property
    def user_collection(self) -> PyDoughUserGeneratedCollectionQDag:
        """
        The user-generated collection that this hybrid operation represents.
        """
        return self._user_collection

    def __repr__(self) -> str:
        return f"USER_GEN_COLLECTION[{self.user_collection.name}]"
26 changes: 26 additions & 0 deletions pydough/conversion/hybrid_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
Where,
WindowCall,
)
from pydough.qdag.collections.user_collection_qdag import (
PyDoughUserGeneratedCollectionQDag,
)
from pydough.types import BooleanType, NumericType

from .hybrid_connection import ConnectionType, HybridConnection
Expand All @@ -68,6 +71,7 @@
HybridPartition,
HybridPartitionChild,
HybridRoot,
HybridUserGeneratedCollection,
)
from .hybrid_syncretizer import HybridSyncretizer
from .hybrid_tree import HybridTree
Expand Down Expand Up @@ -1339,6 +1343,9 @@ def define_root_link(
case HybridRoot():
# A root does not need to be joined to its parent
join_keys = []
case HybridUserGeneratedCollection():
# A user-generated collection does not need to be joined to its parent
join_keys = []
case _:
raise NotImplementedError(f"{operation.__class__.__name__}")
if join_keys is not None:
Expand Down Expand Up @@ -1624,12 +1631,31 @@ def make_hybrid_tree(
successor_hybrid = HybridTree(
HybridRoot(), node.ancestral_mapping
)
# HA: TODO: handle the case where the child access is a
# user-generated collection.
case HybridUserGeneratedCollection():
raise NotImplementedError(
"User-generated collections are not supported in child access"
)
case _:
raise NotImplementedError(
f"{node.__class__.__name__} (child is {node.child_access.__class__.__name__})"
)
self.define_root_link(parent, successor_hybrid, is_aggregate)
return successor_hybrid
case PyDoughUserGeneratedCollectionQDag():
# A user-generated collection is a special case of a collection
# access that is not a sub-collection, but rather a user-defined
# collection that is defined in the PyDough user collections.
hybrid_collection = HybridUserGeneratedCollection(node)
# Create a new hybrid tree for the user-generated collection.
successor_hybrid = HybridTree(hybrid_collection, node.ancestral_mapping)
hybrid = self.make_hybrid_tree(
node.ancestor_context, parent, is_aggregate
)
hybrid.add_successor(successor_hybrid)
return successor_hybrid

case _:
raise NotImplementedError(f"{node.__class__.__name__}")

Expand Down
9 changes: 9 additions & 0 deletions pydough/conversion/hybrid_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
HybridPartition,
HybridPartitionChild,
HybridRoot,
HybridUserGeneratedCollection,
)


Expand Down Expand Up @@ -676,6 +677,9 @@ def always_exists(self) -> bool:
# Stepping into a partition child always has a matching data
# record for each parent, by definition.
pass
case HybridUserGeneratedCollection():
# User-generated collections are always guaranteed to exist.
pass
case _:
raise NotImplementedError(
f"Invalid start of pipeline: {start_operation.__class__.__name__}"
Expand Down Expand Up @@ -726,6 +730,11 @@ def is_singular(self) -> bool:
case HybridChildPullUp():
if not self.children[self.pipeline[0].child_idx].subtree.is_singular():
return False
# HA TODO: confirm whether this is correct.
case HybridUserGeneratedCollection():
# User-generated collections are always guaranteed to be
# singular.
pass
case _:
return False
# The current level is fine, so check any levels above it next.
Expand Down
43 changes: 41 additions & 2 deletions pydough/conversion/relational_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
EmptySingleton,
ExpressionSortInfo,
Filter,
GeneratedTable,
Join,
JoinCardinality,
JoinType,
Expand All @@ -49,6 +50,7 @@
WindowCallExpression,
)
from pydough.types import BooleanType, NumericType, UnknownType
from pydough.types.pydough_type import PyDoughType

from .agg_removal import remove_redundant_aggs
from .agg_split import split_partial_aggregates
Expand Down Expand Up @@ -79,6 +81,7 @@
HybridPartition,
HybridPartitionChild,
HybridRoot,
HybridUserGeneratedCollection,
)
from .hybrid_translator import HybridTranslator
from .hybrid_tree import HybridTree
Expand Down Expand Up @@ -1166,6 +1169,29 @@ def translate_hybridroot(self, context: TranslationOutput) -> TranslationOutput:
new_expressions[shifted_expr] = column_ref
return TranslationOutput(context.relational_node, new_expressions)

def build_user_generated_table(
    self, node: HybridUserGeneratedCollection
) -> TranslationOutput:
    """
    Builds a user-generated table from the given hybrid user-generated
    collection.

    Args:
        `node`: The user-generated collection node to translate.

    Returns:
        The translated output payload.
    """
    # Use the public accessor rather than reaching into the private field.
    collection = node.user_collection.collection
    # Map each hybrid reference to a column reference with the same name and
    # type, in the order the collection lists its columns.
    out_columns: dict[HybridExpr, ColumnReference] = {
        HybridRefExpr(column_name, column_type): ColumnReference(
            column_name, column_type
        )
        for column_name, column_type in collection.column_names_and_types
    }
    return TranslationOutput(GeneratedTable(collection), out_columns)

def rel_translation(
self,
hybrid: HybridTree,
Expand Down Expand Up @@ -1289,6 +1315,8 @@ def rel_translation(
case HybridRoot():
assert context is not None, "Malformed HybridTree pattern."
result = self.translate_hybridroot(context)
case HybridUserGeneratedCollection():
result = self.build_user_generated_table(operation)
case _:
raise NotImplementedError(
f"TODO: support relational conversion on {operation.__class__.__name__}"
Expand All @@ -1304,16 +1332,27 @@ def preprocess_root(
"""
Transforms the final PyDough collection by appending it with an extra
CALCULATE containing all of the columns that are output.
Args:
`node`: the PyDough QDAG collection node to be translated.
`output_cols`: a list of tuples in the form `(alias, column)`
describing every column that should be in the output, in the order
they should appear, and the alias they should be given. If None, uses
the most recent CALCULATE in the node to determine the columns.
Returns:
The PyDoughCollectionQDAG with an additional CALCULATE at the end
that contains all of the columns that should be in the output.
"""
# Fetch all of the expressions that should be kept in the final output
final_terms: list[tuple[str, PyDoughExpressionQDAG]] = []
if output_cols is None:
for name in node.calc_terms:
final_terms.append((name, Reference(node, name)))
name_typ: PyDoughType = node.get_expr(name).pydough_type
final_terms.append((name, Reference(node, name, name_typ)))
final_terms.sort(key=lambda term: node.get_expression_position(term[0]))
else:
for _, column in output_cols:
final_terms.append((column, Reference(node, column)))
column_typ: PyDoughType = node.get_expr(column).pydough_type
final_terms.append((column, Reference(node, column, column_typ)))
children: list[PyDoughCollectionQDAG] = []
final_calc: Calculate = Calculate(node, children).with_terms(final_terms)
return final_calc
Expand Down
17 changes: 13 additions & 4 deletions pydough/qdag/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ table_collection = builder.build_child_access("Nations", global_context_node)

# Build a reference node
# Equivalent PyDough code: `TPCH.Nations.name`
reference_node = builder.build_reference(table_collection, "name")
ref_name = "name"
pydough_type = table_collection.get_expr(ref_name).pydough_type
reference_node = builder.build_reference(table_collection, ref_name, pydough_type)

# Build an expression function call node
# Equivalent PyDough code: `LOWER(TPCH.Nations.name)`
Expand Down Expand Up @@ -99,7 +101,10 @@ regions_collection = builder.build_child_access("Regions", global_context_node)
# Access nations sub-collection
nations_sub_collection = builder.build_child_access("nations", regions_collection)
# Create WHERE(key == 4) condition
key_ref = builder.build_reference(nations_sub_collection, "key")

ref_name = "key"
pydough_type = nations_sub_collection.get_expr(ref_name).pydough_type
key_ref = builder.build_reference(nations_sub_collection, ref_name, pydough_type)
literal_4 = builder.build_literal(4, NumericType())
condition = builder.build_expression_function_call("EQU", [key_ref, literal_4])
# Build WHERE node with condition
Expand All @@ -108,7 +113,9 @@ where_node = where_node.with_condition(condition)
# Create SINGULAR node from filtered result
singular_node = builder.build_singular(where_node)
# Build reference node for name
reference_node = builder.build_reference(singular_node, "name")
ref_name = "name"
pydough_type = singular_node.get_expr(ref_name).pydough_type
reference_node = builder.build_reference(singular_node, ref_name, pydough_type)
# Build CALCULATE node with calculated term
calculate_node = builder.build_calc(regions_collection, [nations_sub_collection])
calculate_node = calculate_node.with_terms([("n_4_nation", reference_node)])
Expand All @@ -130,7 +137,9 @@ top_k_node = top_k_node.with_collation([collation_expression])
# Build a PARTITION BY node
# Equivalent PyDough code: `TPCH.PARTITION(Parts, name="p", by=part_type)`
part_collection = builder.build_child_access("Parts", global_context_node)
partition_key = builder.build_reference(part_collection, "part_type")
ref_name = "part_type"
pydough_type = part_collection.get_expr(ref_name).pydough_type
partition_key = builder.build_reference(part_collection, ref_name, pydough_type)
partition_by_node = builder.build_partition(part_collection, child_collection, "p")
partition_by_node = partition_by_node.with_keys([partition_key])

Expand Down
1 change: 1 addition & 0 deletions pydough/qdag/collections/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"TableCollection",
"TopK",
"Where",
"range_collection",
]

from .augmenting_child_operator import AugmentingChildOperator
Expand Down
3 changes: 2 additions & 1 deletion pydough/qdag/collections/augmenting_child_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ def get_term(self, term_name: str) -> PyDoughQDAG:
if isinstance(term, ChildAccess):
term = term.clone_with_parent(self)
elif isinstance(term, PyDoughExpressionQDAG):
term = Reference(self.preceding_context, term_name)
typ = self.preceding_context.get_expr(term_name).pydough_type
term = Reference(self.preceding_context, term_name, typ)
return term

@cache
Expand Down
4 changes: 3 additions & 1 deletion pydough/qdag/collections/collection_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ def get_term(self, term_name: str) -> PyDoughQDAG:
else:
assert context.ancestor_context is not None
context = context.ancestor_context
return Reference(context, term_name)
return Reference(
context, term_name, context.get_expr(term_name).pydough_type
)

if term_name not in self.all_terms:
raise PyDoughQDAGException(self.name_mismatch_error(term_name))
Expand Down
4 changes: 3 additions & 1 deletion pydough/qdag/collections/partition_child.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def get_term(self, term_name: str):
else:
assert context.ancestor_context is not None
context = context.ancestor_context
return Reference(context, term_name)
return Reference(
context, term_name, context.get_expr(term_name).pydough_type
)

elif term_name not in self.all_terms:
raise PyDoughQDAGException(self.name_mismatch_error(term_name))
Expand Down
Loading