diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md
index f5bec39686..bc5864c21d 100644
--- a/docs/api-reference/narwhals.md
+++ b/docs/api-reference/narwhals.md
@@ -45,6 +45,7 @@ Here are the top-level functions available in Narwhals.
         - scan_csv
         - scan_parquet
         - show_versions
+        - struct
         - sum
         - sum_horizontal
         - to_native
diff --git a/narwhals/__init__.py b/narwhals/__init__.py
index 52e0eb0506..cc124a18bf 100644
--- a/narwhals/__init__.py
+++ b/narwhals/__init__.py
@@ -75,6 +75,7 @@
     scan_csv,
     scan_parquet,
     show_versions,
+    struct,
     sum,
     sum_horizontal,
     when,
@@ -169,6 +170,7 @@
     "scan_parquet",
     "selectors",
     "show_versions",
+    "struct",
     "sum",
     "sum_horizontal",
     "to_native",
diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py
index 98282a575e..4dafefa3de 100644
--- a/narwhals/_arrow/namespace.py
+++ b/narwhals/_arrow/namespace.py
@@ -227,6 +227,32 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
             context=self,
         )
 
+    def struct(self, *exprs: ArrowExpr) -> ArrowExpr:
+        """Combine the outputs of `exprs` into a single struct column.
+
+        Each evaluated series becomes one field, named after that series. The
+        resulting column is named after the first evaluated series.
+        """
+
+        def func(df: ArrowDataFrame) -> list[ArrowSeries]:
+            # Align like `coalesce` does, so length-1 results (e.g. literals) are
+            # broadcast first: `pc.make_struct` needs equal-length children.
+            align = self._series._align_full_broadcast
+            series = align(*chain.from_iterable(expr(df) for expr in exprs))
+            arrays = [s._native_series.combine_chunks() for s in series]
+            field_names = [s.name for s in series]
+            struct_array = pc.make_struct(*arrays, field_names=field_names)
+            return [
+                self._series(struct_array, name=series[0].name, version=self._version)
+            ]
+
+        return self._expr._from_callable(
+            func=func,
+            evaluate_output_names=combine_evaluate_output_names(*exprs),
+            alias_output_names=combine_alias_output_names(*exprs),
+            context=self,
+        )
+
     def coalesce(self, *exprs: ArrowExpr) -> ArrowExpr:
         def func(df: ArrowDataFrame) -> list[ArrowSeries]:
             align = self._series._align_full_broadcast
diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py
index c4791a7e0d..cb28c37f73 100644
--- a/narwhals/_dask/namespace.py
+++ b/narwhals/_dask/namespace.py
@@ -22,7 +22,7 @@
     combine_alias_output_names,
     combine_evaluate_output_names,
 )
-from narwhals._utils
import Implementation, zip_strict +from narwhals._utils import Implementation, not_implemented, zip_strict if TYPE_CHECKING: from collections.abc import Iterable, Iterator @@ -255,6 +255,8 @@ def func(df: DaskLazyFrame) -> list[dx.Series]: version=self._version, ) + struct = not_implemented() + def coalesce(self, *exprs: DaskExpr) -> DaskExpr: def func(df: DaskLazyFrame) -> list[dx.Series]: series = align_series_full_broadcast( diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index ebc5041e68..be74089627 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -26,7 +26,7 @@ combine_evaluate_output_names, ) from narwhals._sql.namespace import SQLNamespace -from narwhals._utils import Implementation +from narwhals._utils import Implementation, not_implemented if TYPE_CHECKING: from collections.abc import Iterable @@ -119,6 +119,8 @@ def func(df: DuckDBLazyFrame) -> list[Expression]: version=self._version, ) + struct = not_implemented() + def mean_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr: def func(cols: Iterable[Expression]) -> Expression: cols = tuple(cols) diff --git a/narwhals/_ibis/namespace.py b/narwhals/_ibis/namespace.py index d3116edd4f..0535ad3f2b 100644 --- a/narwhals/_ibis/namespace.py +++ b/narwhals/_ibis/namespace.py @@ -17,7 +17,7 @@ from narwhals._ibis.selectors import IbisSelectorNamespace from narwhals._ibis.utils import function, lit, narwhals_to_native_dtype from narwhals._sql.namespace import SQLNamespace -from narwhals._utils import Implementation +from narwhals._utils import Implementation, not_implemented if TYPE_CHECKING: from collections.abc import Iterable, Sequence @@ -100,6 +100,8 @@ def func(df: IbisLazyFrame) -> list[ir.Value]: version=self._version, ) + struct = not_implemented() + def mean_horizontal(self, *exprs: IbisExpr) -> IbisExpr: def func(cols: Iterable[ir.Value]) -> ir.Value: cols = list(cols) diff --git a/narwhals/_pandas_like/namespace.py 
b/narwhals/_pandas_like/namespace.py
index 96b7a22290..c4ad49f187 100644
--- a/narwhals/_pandas_like/namespace.py
+++ b/narwhals/_pandas_like/namespace.py
@@ -335,6 +335,66 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
             context=self,
         )
 
+    def struct(self, *exprs: PandasLikeExpr) -> PandasLikeExpr:
+        """Combine the outputs of `exprs` into a single struct column.
+
+        The struct is built with pyarrow and returned as a `pd.ArrowDtype` series,
+        named after the first evaluated series (as the Arrow backend does).
+
+        Raises:
+            ModuleNotFoundError: If 'pandas' or 'pyarrow' cannot be imported.
+            TypeError: If a column mixes non-null values of different Python types.
+        """
+
+        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
+            try:
+                import pandas as pd
+                import pyarrow.compute as pc
+            except ModuleNotFoundError as exc:
+                msg = "'pyarrow' and 'pandas' are required to use `struct()` in this backend."
+                raise ModuleNotFoundError(msg) from exc
+
+            series_list = [s for _expr in exprs for s in _expr(df)]
+            # Bind the combined frame to its own name instead of shadowing `df`.
+            native = self.concat(
+                (s.to_frame() for s in series_list), how="horizontal"
+            )._native_frame
+
+            # Check for consistent types within each column
+            for col in native.columns:
+                values = native[col].tolist()
+                non_null_values = [v for v in values if not pd.isna(v)]
+                if not non_null_values:
+                    continue  # all nulls, skip
+                first_type = type(non_null_values[0])
+                for v in non_null_values[1:]:
+                    if not isinstance(v, first_type):
+                        msg = (
+                            f"unexpected value while building Series of type {first_type.__name__}; "
+                            f"found value of type {type(v).__name__}: {v}\n\n"
+                            f"Hint: ensure all values in each column have the same dtype."
+                        )
+                        raise TypeError(msg)
+
+            native_arrow = native.convert_dtypes(dtype_backend="pyarrow")
+            # NOTE: `_pa_array` is private pandas API and may change across releases.
+            arrays = [native_arrow[col].array._pa_array for col in native.columns]
+            struct_array = pc.make_struct(*arrays, field_names=list(native.columns))
+            struct_series = struct_array.to_pandas(types_mapper=pd.ArrowDtype)
+            # `to_pandas` builds a new default index; restore the rows' original index.
+            struct_series.index = native.index
+            result = PandasLikeSeries(
+                struct_series, implementation=self._implementation, version=self._version
+            ).alias(series_list[0].name)
+            return [result]
+
+        return self._expr._from_callable(
+            func=func,
+            evaluate_output_names=combine_evaluate_output_names(*exprs),
+            alias_output_names=combine_alias_output_names(*exprs),
+            context=self,
+        )
+
     def _if_then_else(
         self,
         when: NativeSeriesT,
diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py
index ac8da364be..0a57b749b1 100644
--- a/narwhals/_polars/namespace.py
+++ b/narwhals/_polars/namespace.py
@@ -198,6 +198,11 @@ def concat_str(
             version=self._version,
         )
 
+    def struct(self, *exprs: PolarsExpr) -> PolarsExpr:
+        """Combine `exprs` into a single struct column via `pl.struct`."""
+        native_exprs = [expr._native_expr for expr in exprs]
+        return self._expr(pl.struct(native_exprs), version=self._version)
+
     def when_then(
         self, when: PolarsExpr, then: PolarsExpr, otherwise: PolarsExpr | None = None
     ) -> PolarsExpr:
diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py
index c660b67298..bcb073b871 100644
--- a/narwhals/_spark_like/namespace.py
+++ b/narwhals/_spark_like/namespace.py
@@ -19,6 +19,7 @@
     true_divide,
 )
 from narwhals._sql.namespace import SQLNamespace
+from narwhals._utils import not_implemented
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -196,3 +197,5 @@ def func(df: SparkLikeLazyFrame) -> list[Column]:
             version=self._version,
             implementation=self._implementation,
         )
+
+    struct = not_implemented()
diff --git a/narwhals/_sql/namespace.py b/narwhals/_sql/namespace.py
index 94f61d7c65..0ab9131e6f 100644
--- a/narwhals/_sql/namespace.py
+++ b/narwhals/_sql/namespace.py
@@ -7,6 +7,7 @@
 from narwhals._compliant import LazyNamespace
 from
narwhals._compliant.typing import NativeExprT, NativeFrameT
 from narwhals._sql.typing import SQLExprT, SQLLazyFrameT
+from narwhals._utils import not_implemented
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -86,3 +87,5 @@ def func_with_otherwise(cols: list[NativeExprT]) -> NativeExprT:
         return self._expr._from_elementwise_horizontal_op(
             func_with_otherwise, then, predicate, otherwise
         )
+
+    struct = not_implemented()
diff --git a/narwhals/functions.py b/narwhals/functions.py
index bff3f27c85..64ae9a3ad2 100644
--- a/narwhals/functions.py
+++ b/narwhals/functions.py
@@ -1587,6 +1587,48 @@ def concat_str(
     )
 
 
+def struct(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
+    r"""Horizontally combine multiple columns into a single struct column.
+
+    Currently not implemented for the Dask, DuckDB, Ibis and Spark-like backends.
+
+    Arguments:
+        exprs: One or more expressions to combine into a struct. Strings are treated as column names.
+        *more_exprs: Additional columns or expressions, passed as positional arguments.
+
+    Returns:
+        An expression that produces a single struct column containing the given fields.
+
+    Example:
+        >>> import pandas as pd
+        >>> import narwhals as nw
+        >>>
+        >>> data = {
+        ...     "a": [1, 2, 3],
+        ...     "b": ["dogs", "cats", None],
+        ...     "c": ["play", "swim", "walk"],
+        ... }
+        >>> df_native = pd.DataFrame(data)
+        >>> (
+        ...     nw.from_native(df_native).select(
+        ...         nw.struct([nw.col("a") * 2, nw.col("b"), nw.col("c")]).alias(
+        ...             "my_struct"
+        ...         )
+        ...     )
+        ... )
+        ┌─────────────────────────────────────┐
+        |          Narwhals DataFrame         |
+        |-------------------------------------|
+        |                            my_struct|
+        |0  {'a': 2, 'b': 'dogs', 'c': 'play'}|
+        |1  {'a': 4, 'b': 'cats', 'c': 'swim'}|
+        |2    {'a': 6, 'b': None, 'c': 'walk'}|
+        └─────────────────────────────────────┘
+    """
+    flat_exprs = flatten([*flatten([exprs]), *more_exprs])
+    return _expr_with_horizontal_op("struct", *flat_exprs)
+
+
 def coalesce(
     exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr | NonNestedLiteral
 ) -> Expr:
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py
index 8e5a78672d..fbf57efdc2 100644
--- a/narwhals/stable/v1/__init__.py
+++ b/narwhals/stable/v1/__init__.py
@@ -1163,6 +1163,11 @@ def concat_str(
     )
 
 
+def struct(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
+    """Horizontally combine multiple columns into a single struct column."""
+    return _stableify(nw.struct(exprs, *more_exprs))
+
+
 def format(f_string: str, *args: IntoExpr) -> Expr:
     """Format expressions as a string."""
     return _stableify(nw.format(f_string, *args))
@@ -1440,6 +1445,7 @@ def scan_parquet(
     "scan_parquet",
     "selectors",
     "show_versions",
+    "struct",
     "sum",
     "sum_horizontal",
     "to_native",
diff --git a/narwhals/stable/v2/__init__.py b/narwhals/stable/v2/__init__.py
index cbc5ff21d3..9739bc4a7a 100644
--- a/narwhals/stable/v2/__init__.py
+++ b/narwhals/stable/v2/__init__.py
@@ -916,6 +916,47 @@ def concat_str(
     )
 
 
+def struct(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
+    r"""Horizontally combine multiple columns into a single struct column.
+
+    Currently not implemented for the Dask, DuckDB, Ibis and Spark-like backends.
+
+    Arguments:
+        exprs: One or more expressions to combine into a struct. Strings are treated as column names.
+        *more_exprs: Additional columns or expressions, passed as positional arguments.
+
+    Returns:
+        An expression that produces a single struct column containing the given fields.
+
+    Example:
+        >>> import pandas as pd
+        >>> import narwhals as nw
+        >>>
+        >>> data = {
+        ...     "a": [1, 2, 3],
+        ...     "b": ["dogs", "cats", None],
+        ...     "c": ["play", "swim", "walk"],
+        ... }
+        >>> df_native = pd.DataFrame(data)
+        >>> (
+        ...     nw.from_native(df_native).select(
+        ...         nw.struct([nw.col("a") * 2, nw.col("b"), nw.col("c")]).alias(
+        ...             "my_struct"
+        ...         )
+        ...     )
+        ... )
+        ┌─────────────────────────────────────┐
+        |          Narwhals DataFrame         |
+        |-------------------------------------|
+        |                            my_struct|
+        |0  {'a': 2, 'b': 'dogs', 'c': 'play'}|
+        |1  {'a': 4, 'b': 'cats', 'c': 'swim'}|
+        |2    {'a': 6, 'b': None, 'c': 'walk'}|
+        └─────────────────────────────────────┘
+    """
+    return _stableify(nw.struct(exprs, *more_exprs))
+
+
 def format(f_string: str, *args: IntoExpr) -> Expr:
     """Format expressions as a string.
 
@@ -1279,6 +1320,7 @@ def scan_parquet(
     "selectors",
     "selectors",
     "show_versions",
+    "struct",
     "sum",
     "sum_horizontal",
     "to_native",
diff --git a/tests/expr_and_series/struct_test.py b/tests/expr_and_series/struct_test.py
new file mode 100644
index 0000000000..a1177de082
--- /dev/null
+++ b/tests/expr_and_series/struct_test.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+import pytest
+
+import narwhals as nw
+from tests.utils import POLARS_VERSION, Constructor, assert_equal_data
+
+pytest.importorskip("pyarrow")
+
+data = {"a": [1, 2, 3], "b": ["dogs", "cats", None], "c": ["play", "swim", "walk"]}
+
+
+@pytest.mark.parametrize("as_iterable", [True, False], ids=["iterable", "positional"])
+def test_struct(
+    constructor: Constructor, as_iterable: bool, *, request: pytest.FixtureRequest
+) -> None:
+    """`nw.struct` packs columns into one struct, for list and positional inputs."""
+    if "polars" in str(constructor) and POLARS_VERSION < (1, 0, 0):
+        request.applymarker(pytest.mark.xfail)
+    if any(
+        x in str(constructor) for x in ("dask", "duckdb", "ibis", "pyspark", "sqlframe")
+    ):
+        request.applymarker(pytest.mark.xfail(reason="Not supported / not implemented"))
+
+    df = nw.from_native(constructor(data))
+    columns = [nw.col("a"), nw.col("b"), nw.col("c")]
+    # Exercise both call forms: a single iterable and `*more_exprs`.
+    expr = nw.struct(columns) if as_iterable else nw.struct(*columns)
+    result = df.select(expr.alias("struct"))
+
+    expected = {
+        "struct": [
+            {"a": 1, "b": "dogs", "c": "play"},
+            {"a": 2, "b": "cats", "c": "swim"},
+            {"a": 3, "b": None, "c": "walk"},
+        ]
+    }
+
+    assert_equal_data(result, expected)