diff --git a/docs/api-reference/expr_str.md b/docs/api-reference/expr_str.md index fbd50d4848..be3d5002f0 100644 --- a/docs/api-reference/expr_str.md +++ b/docs/api-reference/expr_str.md @@ -18,6 +18,7 @@ - to_date - to_datetime - to_lowercase + - to_titlecase - to_uppercase - zfill show_source: false diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md index 4d36554c6d..2b5cef4178 100644 --- a/docs/api-reference/series_str.md +++ b/docs/api-reference/series_str.md @@ -18,6 +18,7 @@ - to_date - to_datetime - to_lowercase + - to_titlecase - to_uppercase - zfill show_source: false diff --git a/narwhals/_arrow/series_str.py b/narwhals/_arrow/series_str.py index 4b3fe0ee1d..6768a1d92a 100644 --- a/narwhals/_arrow/series_str.py +++ b/narwhals/_arrow/series_str.py @@ -79,6 +79,9 @@ def to_uppercase(self) -> ArrowSeries: def to_lowercase(self) -> ArrowSeries: return self.with_native(pc.utf8_lower(self.native)) + def to_titlecase(self) -> ArrowSeries: + return self.with_native(pc.utf8_title(self.native)) + def zfill(self, width: int) -> ArrowSeries: binary_join: Incomplete = pc.binary_join_element_wise native = self.native diff --git a/narwhals/_compliant/any_namespace.py b/narwhals/_compliant/any_namespace.py index 54df3160fd..7538c16155 100644 --- a/narwhals/_compliant/any_namespace.py +++ b/narwhals/_compliant/any_namespace.py @@ -100,6 +100,7 @@ def split(self, by: str) -> CompliantT_co: ... def to_datetime(self, format: str | None) -> CompliantT_co: ... def to_date(self, format: str | None) -> CompliantT_co: ... def to_lowercase(self) -> CompliantT_co: ... + def to_titlecase(self) -> CompliantT_co: ... def to_uppercase(self) -> CompliantT_co: ... def zfill(self, width: int) -> CompliantT_co: ... 
diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 52a5b4f6c5..2d92912b2c 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -1156,6 +1156,9 @@ def to_uppercase(self) -> EagerExprT: def zfill(self, width: int) -> EagerExprT: return self.compliant._reuse_series_namespace("str", "zfill", width=width) + def to_titlecase(self) -> EagerExprT: + return self.compliant._reuse_series_namespace("str", "to_titlecase") + class EagerExprStructNamespace( EagerExprNamespace[EagerExprT], StructNamespace[EagerExprT], Generic[EagerExprT] diff --git a/narwhals/_dask/expr_str.py b/narwhals/_dask/expr_str.py index 953f271cee..200a358297 100644 --- a/narwhals/_dask/expr_str.py +++ b/narwhals/_dask/expr_str.py @@ -113,6 +113,11 @@ def to_lowercase(self) -> DaskExpr: lambda expr: expr.str.lower(), "to_lowercase" ) + def to_titlecase(self) -> DaskExpr: + return self.compliant._with_callable( + lambda expr: expr.str.title(), "to_titlecase" + ) + def zfill(self, width: int) -> DaskExpr: return self.compliant._with_callable( lambda expr, width: expr.str.zfill(width), "zfill", width=width diff --git a/narwhals/_duckdb/expr_str.py b/narwhals/_duckdb/expr_str.py index e5708d19b0..24bc791c60 100644 --- a/narwhals/_duckdb/expr_str.py +++ b/narwhals/_duckdb/expr_str.py @@ -2,11 +2,13 @@ from typing import TYPE_CHECKING -from narwhals._duckdb.utils import F, lit +from narwhals._duckdb.utils import F, col, concat_str, lit from narwhals._sql.expr_str import SQLExprStringNamespace -from narwhals._utils import not_implemented +from narwhals._utils import not_implemented, requires if TYPE_CHECKING: + from duckdb import Expression + from narwhals._duckdb.expr import DuckDBExpr @@ -27,4 +29,25 @@ def to_date(self, format: str | None) -> DuckDBExpr: compliant_expr = self.compliant return compliant_expr.cast(compliant_expr._version.dtypes.Date()) + @requires.backend_version((1, 2)) + def to_titlecase(self) -> DuckDBExpr: + from narwhals._duckdb.utils 
import lambda_expr + + def _to_titlecase(expr: Expression) -> Expression: + extract_expr = F( + "regexp_extract_all", F("lower", expr), lit(r"[a-z0-9]*[^a-z0-9]*") + ) + elem = col("_") + capitalize = lambda_expr( + elem, + concat_str( + F("upper", F("array_extract", elem, lit(1))), + F("substring", elem, lit(2)), + ), + ) + capitalized_expr = F("list_transform", extract_expr, capitalize) + return F("list_aggregate", capitalized_expr, lit("string_agg"), lit("")) + + return self.compliant._with_elementwise(_to_titlecase) + replace = not_implemented() diff --git a/narwhals/_duckdb/utils.py b/narwhals/_duckdb/utils.py index d126bff4e9..243c9bf0bc 100644 --- a/narwhals/_duckdb/utils.py +++ b/narwhals/_duckdb/utils.py @@ -5,6 +5,7 @@ import duckdb import duckdb.typing as duckdb_dtypes +from duckdb import Expression from duckdb.typing import DuckDBPyType from narwhals._utils import Version, isinstance_or_issubclass, zip_strict @@ -13,7 +14,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence - from duckdb import DuckDBPyRelation, Expression + from duckdb import DuckDBPyRelation from narwhals._compliant.typing import CompliantLazyFrameAny from narwhals._duckdb.dataframe import DuckDBLazyFrame @@ -50,6 +51,22 @@ """Alias for `duckdb.FunctionExpression`.""" +def lambda_expr( + params: str | Expression | tuple[Expression, ...], expr: Expression, / +) -> Expression: + """Wrap [`duckdb.LambdaExpression`]. + + [`duckdb.LambdaExpression`]: https://duckdb.org/docs/stable/sql/functions/lambda + """ + try: + from duckdb import LambdaExpression + except ImportError as exc: # pragma: no cover + msg = f"DuckDB>=1.2.0 is required for this operation. Found: DuckDB {duckdb.__version__}" + raise NotImplementedError(msg) from exc + args = (params,) if isinstance(params, Expression) else params + return LambdaExpression(args, expr) + + def concat_str(*exprs: Expression, separator: str = "") -> Expression: """Concatenate many strings, NULL inputs are skipped. 
diff --git a/narwhals/_ibis/expr_str.py b/narwhals/_ibis/expr_str.py index adf84fe4e1..189db5823b 100644 --- a/narwhals/_ibis/expr_str.py +++ b/narwhals/_ibis/expr_str.py @@ -81,3 +81,4 @@ def fn(expr: ir.StringColumn) -> ir.DateValue: return self.compliant._with_callable(fn) replace = not_implemented() + to_titlecase = not_implemented() diff --git a/narwhals/_pandas_like/series_str.py b/narwhals/_pandas_like/series_str.py index 19e16f3c35..7fb820598a 100644 --- a/narwhals/_pandas_like/series_str.py +++ b/narwhals/_pandas_like/series_str.py @@ -88,5 +88,8 @@ def to_uppercase(self) -> PandasLikeSeries: def to_lowercase(self) -> PandasLikeSeries: return self.with_native(self.native.str.lower()) + def to_titlecase(self) -> PandasLikeSeries: + return self.with_native(self.native.str.title()) + def zfill(self, width: int) -> PandasLikeSeries: return self.with_native(self.native.str.zfill(width)) diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index b7cb739ffe..98002ca227 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -5,6 +5,7 @@ import polars as pl from narwhals._polars.utils import ( + BACKEND_VERSION, PolarsAnyNamespace, PolarsCatNamespace, PolarsDateTimeNamespace, @@ -411,6 +412,21 @@ class PolarsExprDateTimeNamespace( class PolarsExprStringNamespace( PolarsExprNamespace, PolarsStringNamespace[PolarsExpr, pl.Expr] ): + def to_titlecase(self) -> PolarsExpr: + native_expr = self.native + + if BACKEND_VERSION < (1, 5): + native_result = ( + native_expr.str.to_lowercase() + .str.extract_all(r"[a-z0-9]*[^a-z0-9]*") + .list.eval(pl.element().str.to_titlecase()) + .list.join("") + ) + else: + native_result = native_expr.str.to_titlecase() + + return self.compliant._with_native(native_result) + @requires.backend_version((0, 20, 5)) def zfill(self, width: int) -> PolarsExpr: backend_version = self.compliant._backend_version diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 9478beb94d..5eb22cbf73 100644 --- 
a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -786,6 +786,11 @@ class PolarsSeriesDateTimeNamespace( class PolarsSeriesStringNamespace( PolarsSeriesNamespace, PolarsStringNamespace[PolarsSeries, pl.Series] ): + def to_titlecase(self) -> PolarsSeries: + name = self.name + ns = self.__narwhals_namespace__() + return self.to_frame().select(ns.col(name).str.to_titlecase()).get_column(name) + def zfill(self, width: int) -> PolarsSeries: name = self.name ns = self.__narwhals_namespace__() diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 397016b765..d638c791fd 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -326,6 +326,9 @@ class PolarsStringNamespace(PolarsAnyNamespace[CompliantT, NativeT_co]): _accessor: ClassVar[Accessor] = "str" # NOTE: Use `abstractmethod` if we have defs to implement, but also `Method` usage + @abc.abstractmethod + def to_titlecase(self) -> CompliantT: ... + @abc.abstractmethod def zfill(self, width: int) -> CompliantT: ... 
diff --git a/narwhals/_spark_like/expr_str.py b/narwhals/_spark_like/expr_str.py index 3c36a2c534..ddcd9fbc3f 100644 --- a/narwhals/_spark_like/expr_str.py +++ b/narwhals/_spark_like/expr_str.py @@ -5,9 +5,11 @@ from narwhals._spark_like.utils import strptime_to_pyspark_format from narwhals._sql.expr_str import SQLExprStringNamespace -from narwhals._utils import _is_naive_format, not_implemented +from narwhals._utils import _is_naive_format, not_implemented, requires if TYPE_CHECKING: + from sqlframe.base.column import Column + from narwhals._spark_like.expr import SparkLikeExpr @@ -33,4 +35,30 @@ def to_date(self, format: str | None) -> SparkLikeExpr: lambda expr: F.to_date(expr, format=strptime_to_pyspark_format(format)) ) + def to_titlecase(self) -> SparkLikeExpr: + impl = self.compliant._implementation + sqlframe_required_version = (3, 43, 1) + if ( + impl.is_sqlframe() + and (version := impl._backend_version()) < sqlframe_required_version + ): # pragma: no cover + required_str = requires._unparse_version(sqlframe_required_version) + found_str = requires._unparse_version(version) + msg = ( + f"`str.to_titlecase` is only available in 'sqlframe>={required_str}', " + f"found version {found_str!r}." 
+ ) + raise NotImplementedError(msg) + + def _to_titlecase(expr: Column) -> Column: + F = self.compliant._F + lower_expr = F.lower(expr) + extract_expr = F.regexp_extract_all( + lower_expr, regexp=F.lit(r"[a-z0-9]*[^a-z0-9]*"), idx=0 + ) + capitalized_expr = F.transform(extract_expr, f=F.initcap) + return F.array_join(capitalized_expr, delimiter="") + + return self.compliant._with_elementwise(_to_titlecase) + replace = not_implemented() diff --git a/narwhals/expr_str.py b/narwhals/expr_str.py index 19edb12911..b64d4580dc 100644 --- a/narwhals/expr_str.py +++ b/narwhals/expr_str.py @@ -455,6 +455,62 @@ def to_lowercase(self) -> ExprT: lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase() ) + def to_titlecase(self) -> ExprT: + """Modify strings to their titlecase equivalent. + + Notes: + This is a form of case transform where the first letter of each word is + capitalized, with the rest of the word in lowercase. + + Warning: + Different backends might follow different rules to determine what a "word" is: + + - duckdb, polars and spark-like use non-**alphanumeric** characters to + define the word boundaries. + - pandas-like, pyarrow and dask use non-**alphabetic** characters to define + the word boundaries, matching the behavior of + [`str.title`](https://docs.python.org/3/library/stdtypes.html#str.title). + + We can observe the difference with the string `"with123numbers"`: + + - non-**alphanumeric** -> `"With123numbers"` + - notice lowercase **n** after the digits + - non-**alphabetic** -> `"With123Numbers"` + - notice uppercase **N** after the digits + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame( + ... { + ... "quotes": [ + ... "'e.t. phone home'", + ... "you talkin' to me?", + ... "to infinity,and BEYOND!", + ... ] + ... } + ... 
) + >>> df = nw.from_native(df_native) + >>> df.with_columns(quotes_title=nw.col("quotes").str.to_titlecase()) + ┌─────────────────────────────────────────────────────┐ + | Narwhals DataFrame | + |-----------------------------------------------------| + |shape: (3, 2) | + |┌─────────────────────────┬─────────────────────────┐| + |│ quotes ┆ quotes_title │| + |│ --- ┆ --- │| + |│ str ┆ str │| + |╞═════════════════════════╪═════════════════════════╡| + |│ 'e.t. phone home' ┆ 'E.T. Phone Home' │| + |│ you talkin' to me? ┆ You Talkin' To Me? │| + |│ to infinity,and BEYOND! ┆ To Infinity,And Beyond! │| + |└─────────────────────────┴─────────────────────────┘| + └─────────────────────────────────────────────────────┘ + """ + return self._expr._with_elementwise( + lambda plx: self._expr._to_compliant_expr(plx).str.to_titlecase() + ) + def zfill(self, width: int) -> ExprT: """Transform string to zero-padded variant. diff --git a/narwhals/series_str.py b/narwhals/series_str.py index 7469ad5c53..ae98d4db34 100644 --- a/narwhals/series_str.py +++ b/narwhals/series_str.py @@ -392,6 +392,52 @@ def to_date(self, format: str | None = None) -> SeriesT: self._narwhals_series._compliant_series.str.to_date(format=format) ) + def to_titlecase(self) -> SeriesT: + """Modify strings to their titlecase equivalent. + + Notes: + This is a form of case transform where the first letter of each word is + capitalized, with the rest of the word in lowercase. + + Warning: + Different backends might follow different rules to determine what a "word" is: + + - polars uses **non-alphanumeric** characters to define the word boundaries. + - pandas-like and pyarrow use **non-alphabetic** characters to define + the word boundaries, matching the behavior of + [`str.title`](https://docs.python.org/3/library/stdtypes.html#str.title). 
+ + As an example of such a difference, in the former case the string `"with123numbers"` + is mapped to `"With123numbers"` (notice lowercase **n** after the digits), while + in the latter to `"With123Numbers"` (notice uppercase **N** after the digits). + + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> s_native = pa.chunked_array( + ... [ + ... [ + ... "'e.t. phone home'", + ... "you talkin' to me?", + ... "to infinity,and BEYOND!", + ... ] + ... ] + ... ) + >>> s = nw.from_native(s_native, series_only=True) + >>> s.str.to_titlecase().to_native() # doctest: +ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + "'E.T. Phone Home'", + "You Talkin' To Me?", + "To Infinity,And Beyond!" + ] + ] + """ + return self._narwhals_series._with_compliant( + self._narwhals_series._compliant_series.str.to_titlecase() + ) + def zfill(self, width: int) -> SeriesT: r"""Pad strings with zeros on the left. diff --git a/pyproject.toml b/pyproject.toml index dff47118e7..6890d7ee71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -303,6 +303,7 @@ exclude_also = [ 'request.applymarker\(pytest.mark.xfail', 'backend_version <', '.*._backend_version\(\) <', + 'BACKEND_VERSION <', 'if ".*" in str\(constructor', 'pytest.skip\(', 'assert_never\(', diff --git a/tests/expr_and_series/str/to_titlecase_test.py b/tests/expr_and_series/str/to_titlecase_test.py new file mode 100644 index 0000000000..d3509b28aa --- /dev/null +++ b/tests/expr_and_series/str/to_titlecase_test.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import pytest + +import narwhals as nw +from tests.utils import DUCKDB_VERSION, Constructor, ConstructorEager, assert_equal_data + +data = { + "a": [ + "e.t. phone home", + "they're bill's friends from the UK", + "to infinity,and BEYOND!", + "with123numbers", + "__dunder__score_a1_.2b ?three", + ] +} + +expected_non_alphabetic = { + "a": [ + "E.T. 
Phone Home", + "They'Re Bill'S Friends From The Uk", + "To Infinity,And Beyond!", + "With123Numbers", + "__Dunder__Score_A1_.2B ?Three", + ] +} +expected_non_alphanumeric = { + "a": [ + "E.T. Phone Home", + "They'Re Bill'S Friends From The Uk", + "To Infinity,And Beyond!", + "With123numbers", + "__Dunder__Score_A1_.2b ?Three", + ] +} + +NON_ALPHANUMERIC_BACKENDS = ("duckdb", "polars", "pyspark") + + +def test_str_to_titlecase_expr( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 2): + reason = "version too old, duckdb 1.2 required for LambdaExpression." + pytest.skip(reason=reason) + + if "ibis" in str(constructor): + request.applymarker(pytest.mark.xfail) + + expected = ( + expected_non_alphanumeric + if any(x in str(constructor) for x in NON_ALPHANUMERIC_BACKENDS) + else expected_non_alphabetic + ) + + df = nw.from_native(constructor(data)) + result_frame = df.select(nw.col("a").str.to_titlecase()) + + assert_equal_data(result_frame, expected) + + +def test_str_to_titlecase_series(constructor_eager: ConstructorEager) -> None: + expected = ( + expected_non_alphanumeric + if any(x in str(constructor_eager) for x in NON_ALPHANUMERIC_BACKENDS) + else expected_non_alphabetic + ) + + df = nw.from_native(constructor_eager(data), eager_only=True) + result_series = df["a"].str.to_titlecase() + + assert_equal_data({"a": result_series}, expected)