Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
cb470b4
refactor: Use `temp.column_name(s)` some more
dangotbanned Oct 1, 2025
23e9d43
fix(typing): Resolve some cases for `flatten_hash_safe`
dangotbanned Oct 1, 2025
f77bb4c
feat(expr-ir): Impl `acero.sort_by`
dangotbanned Oct 2, 2025
36ddce0
test: Port over `is_first_distinct` tests
dangotbanned Oct 2, 2025
0e49f57
chore: Add `Compliant{Expr,Scalar}.is_{first,last}_distinct`
dangotbanned Oct 2, 2025
a5f192c
test: Update to cover `is_last_distinct` as well
dangotbanned Oct 2, 2025
6a1b08a
feat(DRAFT): Initial `is_first_distinct` impl
dangotbanned Oct 2, 2025
1c026bf
test: Port over more cases
dangotbanned Oct 3, 2025
e7e8a04
refactor: Generalize `is_first_distinct` impl
dangotbanned Oct 3, 2025
2d46521
feat: Add `is_last_distinct`
dangotbanned Oct 3, 2025
cfb775d
refactor: Make both `is_*_distinct` methods, aliases
dangotbanned Oct 3, 2025
9db603b
feat: (Properly) add `get_column`, `to_series`
dangotbanned Oct 3, 2025
f8255d3
chore: Add `pc.is_in` wrapper
dangotbanned Oct 3, 2025
6fe2a0a
docs: Add detail to `FunctionFlags.LENGTH_PRESERVING`
dangotbanned Oct 3, 2025
938befb
test: More test porting
dangotbanned Oct 3, 2025
516f4a6
typo
dangotbanned Oct 3, 2025
ead4e62
feat(DRAFT): Some progress on `hashjoin` port
dangotbanned Oct 4, 2025
273bdcc
fix: Correctly pass down join keys
dangotbanned Oct 5, 2025
ce37617
test: Port over inner, left & clean up
dangotbanned Oct 5, 2025
18ef26a
test: Add `test_suffix`
dangotbanned Oct 5, 2025
94baf1e
test: Add `how="cross"` tests
dangotbanned Oct 5, 2025
733b45a
test: Add `how={"anti","semi"}` tests
dangotbanned Oct 5, 2025
ce321e0
test: replace `"antananarivo"`->`"a"`, `"bob"`->`"b"`
dangotbanned Oct 5, 2025
cc0d379
test: Port the other duplicate test
dangotbanned Oct 5, 2025
dd40e3a
test: Make all the xfails more visible
dangotbanned Oct 5, 2025
d1a1785
feat(DRAFT): Initial acero cross-join impl
dangotbanned Oct 5, 2025
77e55b3
refactor: Only expose `acero.join_tables`
dangotbanned Oct 5, 2025
8f7d2f3
chore: Start factoring-out `Table` dependency
dangotbanned Oct 5, 2025
b0c2a4d
Merge branch 'oh-nodes' into expr-ir/acero-order-by
dangotbanned Oct 6, 2025
d42f5de
refactor(typing): Use `IntoExprColumn` some more
dangotbanned Oct 6, 2025
b8a58c1
refactor: Split up `_parse_sort_by`
dangotbanned Oct 6, 2025
05c63fd
Make a start on `DataFrame.filter`
dangotbanned Oct 6, 2025
025213d
fill out slightly more `filter`
dangotbanned Oct 6, 2025
3e94449
get typing working again (kinda)
dangotbanned Oct 6, 2025
a611bc9
feat(DRAFT): Support `filter(list[bool])`
dangotbanned Oct 6, 2025
d514ad0
feat: Support single `Series` as well
dangotbanned Oct 6, 2025
d452920
test: Use `parametrize`
dangotbanned Oct 6, 2025
4c7c23d
feat: Add predicate expansion
dangotbanned Oct 6, 2025
2ebca30
feat(expr-ir): Full `DataFrame.filter` support
dangotbanned Oct 6, 2025
1b66786
test: Merge the anti/semi tests
dangotbanned Oct 6, 2025
fd38911
test: parametrize exception messages
dangotbanned Oct 6, 2025
3537cac
test: relax more error messages
dangotbanned Oct 6, 2025
b5ef86b
typo
dangotbanned Oct 7, 2025
8433b2d
test: Add `test_filter_mask_mixed`
dangotbanned Oct 7, 2025
7668abb
fix: Raise on duplicate column names
dangotbanned Oct 7, 2025
3ca43d1
cov
dangotbanned Oct 7, 2025
0f06479
perf: Avoid multiple collections during cross join
dangotbanned Oct 7, 2025
7e9ee74
test: Stop repeating the same data so many times
dangotbanned Oct 7, 2025
1523dbb
test: Add some cases from polars
dangotbanned Oct 8, 2025
a479f32
fix: typing mypy
dangotbanned Oct 8, 2025
8e840e0
feat(expr-ir): Full-er `DataFrame.filter` support
dangotbanned Oct 8, 2025
af26916
refactor: Simplify the `NonCrossJoinStrategy` split
dangotbanned Oct 8, 2025
6aaf75d
test: Convert raising test into a conformance test
dangotbanned Oct 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions narwhals/_plan/_expr_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,7 @@ def is_column(self, *, allow_aliasing: bool = False) -> bool:

ir = self.expr
return isinstance(ir, Column) and ((self.name == ir.name) or allow_aliasing)


def named_ir(name: str, expr: ExprIRT, /) -> NamedIR[ExprIRT]:
    """Convenience constructor pairing ``expr`` with its output ``name``."""
    return NamedIR(name=name, expr=expr)
114 changes: 107 additions & 7 deletions narwhals/_plan/arrow/acero.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
import pyarrow.compute as pc # ignore-banned-import
from pyarrow.acero import Declaration as Decl

from narwhals._plan.common import flatten_hash_safe
from narwhals._plan.options import SortMultipleOptions
from narwhals._plan.typing import OneOrSeq
from narwhals.typing import SingleColSelector
from narwhals.typing import JoinStrategy, SingleColSelector

if TYPE_CHECKING:
from collections.abc import Callable, Collection, Iterable, Iterator
from collections.abc import Callable, Collection, Iterable, Iterator, Mapping

from typing_extensions import TypeAlias

Expand All @@ -38,7 +40,7 @@
Aggregation as _Aggregation,
)
from narwhals._plan.arrow.group_by import AggSpec
from narwhals._plan.arrow.typing import NullPlacement
from narwhals._plan.arrow.typing import JoinTypeSubset, NullPlacement
from narwhals._plan.typing import OneOrIterable, Order, Seq
from narwhals.typing import NonNestedLiteral

Expand All @@ -64,6 +66,14 @@
lit = cast("Callable[[NonNestedLiteral], Expr]", pc.scalar)
"""Alias for `pyarrow.compute.scalar`."""

_HOW_JOIN: Mapping[JoinStrategy, JoinTypeSubset] = {
"inner": "inner",
"left": "left outer",
"full": "full outer",
"anti": "left anti",
"semi": "left semi",
}


# NOTE: ATOW there are 304 valid function names, 46 can be used for some kind of agg
# Due to expr expansion, it is very likely that we have repeat runs
Expand Down Expand Up @@ -189,10 +199,81 @@ def _order_by(
return Decl("order_by", pac.OrderByNodeOptions(keys, null_placement=null_placement))


# TODO @dangotbanned: Utilize `SortMultipleOptions.to_arrow_acero`
def sort_by(*args: Any, **kwds: Any) -> Decl:
msg = "Should convert from polars args -> use `_order_by"
raise NotImplementedError(msg)
def sort_by(
    by: OneOrIterable[str],
    *more_by: str,
    descending: OneOrIterable[bool] = False,
    nulls_last: bool = False,
) -> Decl:
    """Build an acero ``order_by`` declaration from polars-style sort arguments."""
    keys = tuple(flatten_hash_safe((by, more_by)))
    options = SortMultipleOptions.parse(descending=descending, nulls_last=nulls_last)
    return options.to_arrow_acero(keys)
Comment on lines +261 to +269
Copy link
Member Author

@dangotbanned dangotbanned Oct 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As of feat(expr-ir): Impl acero.sort_by, I still need to make use of this in a plan.

A good candidate might be in either/both of

over(order_by=...)

def over_ordered(
self, node: ir.OrderedWindowExpr, frame: Frame, name: str
) -> Self | Scalar:
if node.partition_by:
msg = f"Need to implement `group_by`, `join` for:\n{node!r}"
raise NotImplementedError(msg)
# NOTE: Converting `over(order_by=..., options=...)` into the right shape for `DataFrame.sort`
sort_by = tuple(NamedIR.from_ir(e) for e in node.order_by)
options = node.sort_options.to_multiple(len(node.order_by))
idx_name = temp.column_name(frame)
sorted_context = frame.with_row_index(idx_name).sort(sort_by, options)
evaluated = node.expr.dispatch(self, sorted_context.drop([idx_name]), name)
if isinstance(evaluated, ArrowScalar):
# NOTE: We're already sorted, defer broadcasting to the outer context
# Wouldn't be suitable for partitions, but will be fine here
# - https://github.com/narwhals-dev/narwhals/pull/2528/commits/2ae42458cae91f4473e01270919815fcd7cb9667
# - https://github.com/narwhals-dev/narwhals/pull/2528/commits/b8066c4c57d4b0b6c38d58a0f5de05eefc2cae70
return self._with_native(evaluated.native, name)
indices = pc.sort_indices(sorted_context.get_column(idx_name).native)
height = len(sorted_context)
result = evaluated.broadcast(height).native.take(indices)
return self._with_native(result, name)

is_{first,last}_distinct

def is_first_distinct(self) -> Self:
import numpy as np # ignore-banned-import
row_number = pa.array(np.arange(len(self)))
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
first_distinct_index = (
pa.Table.from_arrays([self.native], names=[self.name])
.append_column(col_token, row_number)
.group_by(self.name)
.aggregate([(col_token, "min")])
.column(f"{col_token}_min")
)
return self._with_native(pc.is_in(row_number, first_distinct_index))
def is_last_distinct(self) -> Self:
import numpy as np # ignore-banned-import
row_number = pa.array(np.arange(len(self)))
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
last_distinct_index = (
pa.Table.from_arrays([self.native], names=[self.name])
.append_column(col_token, row_number)
.group_by(self.name)
.aggregate([(col_token, "max")])
.column(f"{col_token}_max")
)
return self._with_native(pc.is_in(row_number, last_distinct_index))



def join(
    left: pa.Table,
    right: pa.Table,
    how: JoinTypeSubset,
    left_on: OneOrIterable[str],
    right_on: OneOrIterable[str],
    suffix: str = "_right",
    *,
    coalesce_keys: bool = True,
) -> Decl:
    """Heavily based on [`pyarrow.acero._perform_join`].

    [`pyarrow.acero._perform_join`]: https://github.com/apache/arrow/blob/f7320c9a40082639f9e0cf8b3075286e3fc6c0b9/python/pyarrow/acero.py#L82-L260
    """
    keys_left = [left_on] if isinstance(left_on, str) else list(left_on)
    keys_right = [right_on] if isinstance(right_on, str) else list(right_on)

    # polars full join does not coalesce keys, so never coalesce for "full outer"
    if not (coalesce_keys and how != "full outer"):
        opts = _join_options(how, keys_left, keys_right, suffix=suffix)
        return _hashjoin(left, right, opts)

    # By default expose all columns on both left and right table
    out_left = left.schema.names
    if how in {"left semi", "left anti"}:
        # Filtering joins never emit right-side columns.
        out_right: list[str] = []
    elif how in {"inner", "left outer"}:
        # Keys are coalesced into the left columns; drop the right-side copies.
        out_right = [name for name in right.schema.names if name not in keys_right]
    else:
        out_right = right.schema.names
    options = _join_options(
        how,
        keys_left,
        keys_right,
        suffix=suffix,
        left_output=out_left,
        right_output=out_right,
    )
    return _hashjoin(left, right, options)


def _join_options(
    how: JoinTypeSubset,
    left_on: str | list[str],
    right_on: str | list[str],
    *,
    suffix: str = "_right",
    left_output: Iterable[str] | None = None,
    right_output: Iterable[str] | None = None,
) -> pac.HashJoinNodeOptions:
    """Assemble ``HashJoinNodeOptions`` for a single ``hashjoin`` node."""
    # NOTE: The upstream constructor is untyped, hence `Incomplete` + the ignore.
    constructor: Incomplete = pac.HashJoinNodeOptions
    return constructor(  # type: ignore[no-any-return]
        how,
        left_on,
        right_on,
        left_output=left_output,
        right_output=right_output,
        output_suffix_for_right=suffix,
    )


def _hashjoin(
    left: pa.Table, right: pa.Table, /, options: pac.HashJoinNodeOptions
) -> Decl:
    """Wrap both tables as sources feeding a single ``hashjoin`` node."""
    sources = [table_source(left), table_source(right)]
    return Decl("hashjoin", options, sources)


def collect(*declarations: Decl, use_threads: bool = True) -> pa.Table:
Expand Down Expand Up @@ -251,3 +332,22 @@ def select_names_table(
native: pa.Table, column_names: OneOrIterable[str], *more_names: str
) -> pa.Table:
return collect(table_source(native), select_names(column_names, *more_names))


def join_tables(
    left: pa.Table,
    right: pa.Table,
    how: JoinStrategy,
    left_on: OneOrIterable[str] | None,
    right_on: OneOrIterable[str] | None = (),
    suffix: str = "_right",
    *,
    coalesce_keys: bool = True,
) -> pa.Table:
    """Translate a narwhals join strategy into an acero plan and execute it eagerly."""
    # Raises KeyError for strategies absent from `_HOW_JOIN` (e.g. "cross").
    join_type = _HOW_JOIN[how]
    keys_left = left_on or ()
    # Missing right keys fall back to the left ones (same-named join columns).
    keys_right = right_on or keys_left
    declaration = join(
        left,
        right,
        join_type,
        keys_left,
        keys_right,
        suffix,
        coalesce_keys=coalesce_keys,
    )
    return collect(declaration)
21 changes: 19 additions & 2 deletions narwhals/_plan/arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pyarrow.compute as pc # ignore-banned-import

from narwhals._arrow.utils import native_to_narwhals_dtype
from narwhals._plan.arrow import functions as fn
from narwhals._plan.arrow import acero, functions as fn
from narwhals._plan.arrow.group_by import ArrowGroupBy as GroupBy
from narwhals._plan.arrow.series import ArrowSeries as Series
from narwhals._plan.compliant.dataframe import EagerDataFrame
Expand All @@ -31,7 +31,7 @@
from narwhals._plan.options import SortMultipleOptions
from narwhals._plan.typing import Seq
from narwhals.dtypes import DType
from narwhals.typing import IntoSchema
from narwhals.typing import IntoSchema, JoinStrategy


class ArrowDataFrame(EagerDataFrame[Series, "pa.Table", "ChunkedArrayAny"]):
Expand Down Expand Up @@ -144,3 +144,20 @@ def select_names(self, *column_names: str) -> Self:
def row(self, index: int) -> tuple[Any, ...]:
row = self.native.slice(index, 1)
return tuple(chain.from_iterable(row.to_pydict().values()))

def join(
    self,
    other: Self,
    *,
    how: JoinStrategy,
    left_on: Sequence[str] | None,
    right_on: Sequence[str] | None,
    suffix: str = "_right",
) -> Self:
    """Join with ``other`` via an acero hashjoin plan.

    NOTE(review): "cross" presumably takes a dedicated key-less path — confirm.
    """
    if how == "cross":
        msg = f"join(how={how!r})"
        raise NotImplementedError(msg)
    joined = acero.join_tables(
        self.native, other.native, how, left_on, right_on, suffix=suffix
    )
    return self._with_native(joined)
45 changes: 32 additions & 13 deletions narwhals/_plan/arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,17 @@
import pyarrow.compute as pc # ignore-banned-import

from narwhals._arrow.utils import narwhals_to_native_dtype
from narwhals._plan import expressions as ir
from narwhals._plan.arrow import functions as fn
from narwhals._plan.arrow.series import ArrowSeries as Series
from narwhals._plan.arrow.typing import ChunkedOrScalarAny, NativeScalar, StoresNativeT_co
from narwhals._plan.common import temp
from narwhals._plan.compliant.column import ExprDispatch
from narwhals._plan.compliant.expr import EagerExpr
from narwhals._plan.compliant.scalar import EagerScalar
from narwhals._plan.compliant.typing import namespace
from narwhals._plan.expressions import NamedIR
from narwhals._utils import (
Implementation,
Version,
_StoresNative,
generate_temporary_column_name,
not_implemented,
)
from narwhals._utils import Implementation, Version, _StoresNative, not_implemented
from narwhals.exceptions import InvalidOperationError, ShapeError

if TYPE_CHECKING:
Expand All @@ -29,7 +25,6 @@
from typing_extensions import Self, TypeAlias

from narwhals._arrow.typing import ChunkedArrayAny, Incomplete
from narwhals._plan import expressions as ir
from narwhals._plan.arrow.dataframe import ArrowDataFrame as Frame
from narwhals._plan.arrow.namespace import ArrowNamespace
from narwhals._plan.expressions.aggregation import (
Expand All @@ -53,6 +48,8 @@
All,
IsBetween,
IsFinite,
IsFirstDistinct,
IsLastDistinct,
IsNan,
IsNull,
Not,
Expand Down Expand Up @@ -198,6 +195,9 @@ def _with_native(self, result: ChunkedOrScalarAny, name: str, /) -> Scalar | Sel
return ArrowScalar.from_native(result, name, version=self.version)
return self.from_native(result, name or self.name, self.version)

# NOTE: I'm not sure what I meant by
# > "isn't natively supported on `ChunkedArray`"
# Was that supposed to say "is only supported on `ChunkedArray`"?
def _dispatch_expr(self, node: ir.ExprIR, frame: Frame, name: str) -> Series:
"""Use instead of `_dispatch` *iff* an operation isn't natively supported on `ChunkedArray`.

Expand Down Expand Up @@ -231,10 +231,8 @@ def sort(self, node: ir.Sort, frame: Frame, name: str) -> Expr:

def sort_by(self, node: ir.SortBy, frame: Frame, name: str) -> Expr:
series = self._dispatch_expr(node.expr, frame, name)
by = (
self._dispatch_expr(e, frame, f"<TEMP>_{idx}")
for idx, e in enumerate(node.by)
)
it_names = temp.column_names(frame)
by = (self._dispatch_expr(e, frame, nm) for e, nm in zip(node.by, it_names))
df = namespace(self)._concat_horizontal((series, *by))
names = df.columns[1:]
indices = pc.sort_indices(df.native, options=node.options.to_arrow(names))
Expand Down Expand Up @@ -342,7 +340,7 @@ def over_ordered(
# NOTE: Converting `over(order_by=..., options=...)` into the right shape for `DataFrame.sort`
sort_by = tuple(NamedIR.from_ir(e) for e in node.order_by)
options = node.sort_options.to_multiple(len(node.order_by))
idx_name = generate_temporary_column_name(8, frame.columns)
idx_name = temp.column_name(frame)
sorted_context = frame.with_row_index(idx_name).sort(sort_by, options)
evaluated = node.expr.dispatch(self, sorted_context.drop([idx_name]), name)
if isinstance(evaluated, ArrowScalar):
Expand Down Expand Up @@ -374,6 +372,27 @@ def map_batches(self, node: ir.AnonymousExpr, frame: Frame, name: str) -> Self:
def rolling_expr(self, node: ir.RollingExpr, frame: Frame, name: str) -> Self:
raise NotImplementedError

def _is_first_last_distinct(
    self,
    node: FunctionExpr[IsFirstDistinct | IsLastDistinct],
    frame: Frame,
    name: str,
) -> Self:
    """Shared impl behind ``is_first_distinct`` / ``is_last_distinct``.

    Marks each row ``True`` when it is the first (or last) occurrence of its
    value, by grouping on the value and keeping the extreme row index per group.
    """
    # Temporary column name for the row index; seeded with `name` to avoid collision.
    idx_name = temp.column_name([name])
    # Select the index aggregation matching the variant — presumably min for
    # "first", max for "last"; verify against `fn.IS_FIRST_LAST_DISTINCT`.
    expr_ir = fn.IS_FIRST_LAST_DISTINCT[type(node.function)](idx_name)
    series = self._dispatch_expr(node.input[0], frame, name)
    df = series.to_frame().with_row_index(idx_name)
    # One extreme row index per distinct value of the evaluated series.
    distinct_index = (
        df.group_by_names((name,))
        .agg((ir.named_ir(idx_name, expr_ir),))
        .get_column(idx_name)
        .native
    )
    # A row is first/last-distinct iff its value's row sits at one of those extremes.
    return self._with_native(fn.is_in(df.to_series().native, distinct_index), name)

# Both public methods share the implementation; the variant is recovered from
# `type(node.function)` above.
is_first_distinct = _is_first_last_distinct
is_last_distinct = _is_first_last_distinct


class ArrowScalar(
_ArrowDispatch["ArrowScalar"],
Expand Down
Loading
Loading