[SPARK-50856][SS][PYTHON][CONNECT] Spark Connect Support for TransformWithStateInPandas In Python #49560
Changes from all commits
@@ -2546,6 +2546,74 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation:

```python
        return self._with_relations(plan, session)


class TransformWithStateInPandas(LogicalPlan):
```
Review comment: Could we add some comments here?

Reply: Do you mean the class docstring? I have a one-line docstring on line 2550.
"""Logical plan object for a TransformWithStateInPandas.""" | ||
|
||
def __init__( | ||
self, | ||
child: Optional["LogicalPlan"], | ||
grouping_cols: Sequence[Column], | ||
function: "UserDefinedFunction", | ||
output_schema: Union[DataType, str], | ||
output_mode: str, | ||
time_mode: str, | ||
event_time_col_name: str, | ||
cols: List[str], | ||
initial_state_plan: Optional["LogicalPlan"], | ||
initial_state_grouping_cols: Optional[Sequence[Column]], | ||
): | ||
assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols) | ||
if initial_state_plan is not None: | ||
assert isinstance(initial_state_grouping_cols, list) and all( | ||
isinstance(c, Column) for c in initial_state_grouping_cols | ||
) | ||
super().__init__( | ||
child, self._collect_references(grouping_cols + initial_state_grouping_cols) | ||
) | ||
else: | ||
super().__init__(child, self._collect_references(grouping_cols)) | ||
self._grouping_cols = grouping_cols | ||
self._output_schema: DataType = ( | ||
UnparsedDataType(output_schema) if isinstance(output_schema, str) else output_schema | ||
) | ||
self._output_mode = output_mode | ||
self._time_mode = time_mode | ||
self._event_time_col_name = event_time_col_name | ||
self._function = function._build_common_inline_user_defined_function(*cols) | ||
self._initial_state_plan = initial_state_plan | ||
self._initial_state_grouping_cols = initial_state_grouping_cols | ||
|
||
def plan(self, session: "SparkConnectClient") -> proto.Relation: | ||
assert self._child is not None | ||
plan = self._create_proto_relation() | ||
plan.group_map.input.CopyFrom(self._child.plan(session)) | ||
plan.group_map.grouping_expressions.extend( | ||
[c.to_plan(session) for c in self._grouping_cols] | ||
) | ||
plan.group_map.output_mode = self._output_mode | ||
|
||
# fill in initial state related fields | ||
if self._initial_state_plan is not None: | ||
plan.group_map.initial_input.CopyFrom(self._initial_state_plan.plan(session)) | ||
assert self._initial_state_grouping_cols is not None | ||
plan.group_map.initial_grouping_expressions.extend( | ||
[c.to_plan(session) for c in self._initial_state_grouping_cols] | ||
) | ||
|
||
# fill in transformWithStateInPandas related fields | ||
tws_info = proto.TransformWithStateInfo() | ||
tws_info.time_mode = self._time_mode | ||
tws_info.event_time_column_name = self._event_time_col_name | ||
tws_info.output_schema.CopyFrom(pyspark_types_to_proto_types(self._output_schema)) | ||
|
||
plan.group_map.transform_with_state_info.CopyFrom(tws_info) | ||
|
||
# wrap transformWithStateInPandasUdf in a function | ||
plan.group_map.func.CopyFrom(self._function.to_plan_udf(session)) | ||
|
||
return self._with_relations(plan, session) | ||
|
||
|
||
class PythonUDTF: | ||
"""Represents a Python user-defined table function.""" | ||
|
||
|
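For context, here is a minimal sketch (not part of this PR's diff) of the user-facing call path that this logical plan serves under Spark Connect. It assumes the `StatefulProcessor` interface and the `transformWithStateInPandas` signature from the existing non-Connect PySpark implementation; the processor class, state name, source, and schemas below are illustrative only.

```python
import pandas as pd
from typing import Any, Iterator

# Import path assumed from the existing non-Connect implementation.
from pyspark.sql.streaming import StatefulProcessor, StatefulProcessorHandle


class CountProcessor(StatefulProcessor):
    """Illustrative processor: keeps a running count per grouping key."""

    def init(self, handle: StatefulProcessorHandle) -> None:
        # Per-key running count stored in a ValueState.
        self._count = handle.getValueState("count", "count LONG")

    def handleInputRows(self, key: Any, rows, timer_values) -> Iterator[pd.DataFrame]:
        count = self._count.get()[0] if self._count.exists() else 0
        for batch in rows:
            count += len(batch)
        self._count.update((count,))
        yield pd.DataFrame({"id": [key[0]], "count": [count]})

    def close(self) -> None:
        pass


# `spark` is assumed to be a Spark Connect session; the rate source is illustrative.
df = spark.readStream.format("rate").load().selectExpr("CAST(value % 10 AS STRING) AS id")

# On a Spark Connect session, this call builds the TransformWithStateInPandas
# logical plan above: outputStructType becomes TransformWithStateInfo.output_schema,
# timeMode becomes time_mode, and outputMode is set on group_map.output_mode.
result = df.groupBy("id").transformWithStateInPandas(
    statefulProcessor=CountProcessor(),
    outputStructType="id STRING, count LONG",
    outputMode="Update",
    timeMode="None",
)
```

The optional initial-state and event-time-column arguments of the API would correspond to the `initial_state_plan` / `initial_state_grouping_cols` and `event_time_col_name` constructor parameters above.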
Large diffs are not rendered by default.
Review comment: Have we verified that this test actually runs in CI?

Reply: Yeah, I think so. I got several failed test cases for this suite in a previous CI run (https://github.com/jingz-db/spark/actions/runs/13039529632/job/36378113583#step:12:4144), which are now fixed, but that verifies the suite is actually running in CI.

Reply: Great, thanks for confirming!