Skip to content

Commit e0d570b

Browse files
committed
Add cat_slice
1 parent 10dd3dc commit e0d570b

File tree

8 files changed

+197
-16
lines changed

8 files changed

+197
-16
lines changed

crates/polars-ops/src/chunked_array/strings/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ use polars_core::prelude::*;
4646
pub use split::*;
4747
#[cfg(feature = "strings")]
4848
pub use strip::*;
49+
#[cfg(feature = "strings")]
50+
pub use substring::{substring_ternary_offsets_value, update_view};
4951

5052
pub trait AsString {
5153
fn as_string(&self) -> &StringChunked;

crates/polars-ops/src/chunked_array/strings/substring.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ fn substring_ternary_offsets(
9292
))
9393
}
9494

95-
fn substring_ternary_offsets_value(str_val: &str, offset: i64, length: u64) -> (usize, usize) {
95+
pub fn substring_ternary_offsets_value(str_val: &str, offset: i64, length: u64) -> (usize, usize) {
9696
// Fast-path: always empty string.
9797
if length == 0 || offset >= str_val.len() as i64 {
9898
return (0, 0);
@@ -137,7 +137,7 @@ fn substring_ternary(
137137
unsafe { opt_str_val.map(|str_val| str_val.get_unchecked(start..end)) }
138138
}
139139

140-
fn update_view(mut view: View, start: usize, end: usize, val: &str) -> View {
140+
pub fn update_view(mut view: View, start: usize, end: usize, val: &str) -> View {
141141
let length = (end - start) as u32;
142142
view.length = length;
143143

crates/polars-plan/src/dsl/cat.rs

+8
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,12 @@ impl CategoricalNameSpace {
3636
suffix,
3737
)))
3838
}
39+
40+
#[cfg(feature = "strings")]
41+
pub fn slice(self, offset: i64, length: Option<usize>) -> Expr {
42+
self.0
43+
.map_private(FunctionExpr::Categorical(CategoricalFunction::Slice(
44+
offset, length,
45+
)))
46+
}
3947
}

crates/polars-plan/src/dsl/function_expr/cat.rs

+38-14
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ pub enum CategoricalFunction {
1313
StartsWith(String),
1414
#[cfg(feature = "strings")]
1515
EndsWith(String),
16+
#[cfg(feature = "strings")]
17+
Slice(i64, Option<usize>),
1618
}
1719

1820
impl CategoricalFunction {
@@ -28,6 +30,8 @@ impl CategoricalFunction {
2830
StartsWith(_) => mapper.with_dtype(DataType::Boolean),
2931
#[cfg(feature = "strings")]
3032
EndsWith(_) => mapper.with_dtype(DataType::Boolean),
33+
#[cfg(feature = "strings")]
34+
Slice(_, _) => mapper.with_dtype(DataType::String),
3135
}
3236
}
3337
}
@@ -45,6 +49,8 @@ impl Display for CategoricalFunction {
4549
StartsWith(_) => "starts_with",
4650
#[cfg(feature = "strings")]
4751
EndsWith(_) => "ends_with",
52+
#[cfg(feature = "strings")]
53+
Slice(_, _) => "slice",
4854
};
4955
write!(f, "cat.{s}")
5056
}
@@ -63,6 +69,8 @@ impl From<CategoricalFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
6369
StartsWith(prefix) => map!(starts_with, prefix.as_str()),
6470
#[cfg(feature = "strings")]
6571
EndsWith(suffix) => map!(ends_with, suffix.as_str()),
72+
#[cfg(feature = "strings")]
73+
Slice(offset, length) => map!(slice, offset, length),
6674
}
6775
}
6876
}
@@ -101,12 +109,14 @@ fn _get_cat_phys_map(ca: &CategoricalChunked) -> (StringChunked, Series) {
101109

102110
/// Fast path: apply a string function to the categories of a categorical column and broadcast the
103111
/// result back to the array.
104-
fn apply_to_cats<F, T>(ca: &CategoricalChunked, mut op: F) -> PolarsResult<Column>
112+
// fn apply_to_cats<F, T>(ca: &CategoricalChunked, mut op: F) -> PolarsResult<Column>
113+
fn apply_to_cats<F, T>(c: &Column, mut op: F) -> PolarsResult<Column>
105114
where
106115
F: FnMut(&StringChunked) -> ChunkedArray<T>,
107116
ChunkedArray<T>: IntoSeries,
108117
T: PolarsDataType<HasViews = FalseT, IsStruct = FalseT, IsNested = FalseT>,
109118
{
119+
let ca = c.categorical()?;
110120
let (categories, phys) = _get_cat_phys_map(ca);
111121
let result = op(&categories);
112122
// SAFETY: physical idx array is valid.
@@ -116,12 +126,13 @@ where
116126

117127
/// Fast path: apply a binary function to the categories of a categorical column and broadcast the
118128
/// result back to the array.
119-
fn apply_to_cats_binary<F, T>(ca: &CategoricalChunked, mut op: F) -> PolarsResult<Column>
129+
fn apply_to_cats_binary<F, T>(c: &Column, mut op: F) -> PolarsResult<Column>
120130
where
121131
F: FnMut(&BinaryChunked) -> ChunkedArray<T>,
122132
ChunkedArray<T>: IntoSeries,
123133
T: PolarsDataType<HasViews = FalseT, IsStruct = FalseT, IsNested = FalseT>,
124134
{
135+
let ca = c.categorical()?;
125136
let (categories, phys) = _get_cat_phys_map(ca);
126137
let result = op(&categories.as_binary());
127138
// SAFETY: physical idx array is valid.
@@ -130,25 +141,38 @@ where
130141
}
131142

132143
#[cfg(feature = "strings")]
133-
fn len_bytes(s: &Column) -> PolarsResult<Column> {
134-
let ca = s.categorical()?;
135-
apply_to_cats(ca, |s| s.str_len_bytes())
144+
fn len_bytes(c: &Column) -> PolarsResult<Column> {
145+
apply_to_cats(c, |s| s.str_len_bytes())
136146
}
137147

138148
#[cfg(feature = "strings")]
139-
fn len_chars(s: &Column) -> PolarsResult<Column> {
140-
let ca = s.categorical()?;
141-
apply_to_cats(ca, |s| s.str_len_chars())
149+
fn len_chars(c: &Column) -> PolarsResult<Column> {
150+
apply_to_cats(c, |s| s.str_len_chars())
142151
}
143152

144153
#[cfg(feature = "strings")]
145-
fn starts_with(s: &Column, prefix: &str) -> PolarsResult<Column> {
146-
let ca = s.categorical()?;
147-
apply_to_cats(ca, |s| s.starts_with(prefix))
154+
fn starts_with(c: &Column, prefix: &str) -> PolarsResult<Column> {
155+
apply_to_cats(c, |s| s.starts_with(prefix))
148156
}
149157

150158
#[cfg(feature = "strings")]
151-
fn ends_with(s: &Column, suffix: &str) -> PolarsResult<Column> {
152-
let ca = s.categorical()?;
153-
apply_to_cats_binary(ca, |s| s.as_binary().ends_with(suffix.as_bytes()))
159+
fn ends_with(c: &Column, suffix: &str) -> PolarsResult<Column> {
160+
apply_to_cats_binary(c, |s| s.as_binary().ends_with(suffix.as_bytes()))
161+
}
162+
163+
#[cfg(feature = "strings")]
164+
fn slice(c: &Column, offset: i64, length: Option<usize>) -> PolarsResult<Column> {
165+
let length = length.unwrap_or(usize::MAX) as u64;
166+
let ca = c.categorical()?;
167+
let (categories, phys) = _get_cat_phys_map(ca);
168+
169+
let result = unsafe {
170+
categories.apply_views(|view, val| {
171+
let (start, end) = substring_ternary_offsets_value(val, offset, length);
172+
update_view(view, start, end, val)
173+
})
174+
};
175+
// SAFETY: physical idx array is valid.
176+
let out = unsafe { result.take_unchecked(phys.idx().unwrap()) };
177+
Ok(out.into_column())
154178
}

crates/polars-python/src/expr/categorical.rs

+5
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,9 @@ impl PyExpr {
2323
fn cat_ends_with(&self, suffix: String) -> Self {
2424
self.inner.clone().cat().ends_with(suffix).into()
2525
}
26+
27+
#[pyo3(signature = (offset, length=None))]
28+
fn cat_slice(&self, offset: i64, length: Option<usize>) -> Self {
29+
self.inner.clone().cat().slice(offset, length).into()
30+
}
2631
}

py-polars/polars/expr/categorical.py

+65
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,68 @@ def ends_with(self, suffix: str) -> Expr:
237237
msg = f"'suffix' must be a string; found {type(suffix)!r}"
238238
raise TypeError(msg)
239239
return wrap_expr(self._pyexpr.cat_ends_with(suffix))
240+
241+
def slice(self, offset: int, length: int | None = None) -> Expr:
242+
"""
243+
Extract a substring from each string value.
244+
245+
Parameters
246+
----------
247+
offset
248+
Start index. Negative indexing is supported.
249+
length
250+
Length of the slice. If set to `None` (default), the slice is taken to the
251+
end of the string.
252+
253+
Returns
254+
-------
255+
Expr
256+
Expression of data type :class:`String`.
257+
258+
Notes
259+
-----
260+
Both the `offset` and `length` inputs are defined in terms of the number
261+
of characters in the (UTF8) string. A character is defined as a
262+
`Unicode scalar value`_. A single character is represented by a single byte
263+
when working with ASCII text, and a maximum of 4 bytes otherwise.
264+
265+
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
266+
267+
Examples
268+
--------
269+
>>> df = pl.DataFrame(
270+
... {
271+
... "s": pl.Series(
272+
... ["pear", None, "papaya", "dragonfruit"],
273+
... dtype=pl.Categorical,
274+
... )
275+
... },
... )
276+
>>> df.with_columns(pl.col("s").cat.slice(-3).alias("slice"))
277+
shape: (4, 2)
278+
┌─────────────┬───────┐
279+
│ s ┆ slice │
280+
│ --- ┆ --- │
281+
│ cat ┆ str │
282+
╞═════════════╪═══════╡
283+
│ pear ┆ ear │
284+
│ null ┆ null │
285+
│ papaya ┆ aya │
286+
│ dragonfruit ┆ uit │
287+
└─────────────┴───────┘
288+
289+
Using the optional `length` parameter
290+
291+
>>> df.with_columns(pl.col("s").cat.slice(4, length=3).alias("slice"))
292+
shape: (4, 2)
293+
┌─────────────┬───────┐
294+
│ s ┆ slice │
295+
│ --- ┆ --- │
296+
│ cat ┆ str │
297+
╞═════════════╪═══════╡
298+
│ pear ┆ │
299+
│ null ┆ null │
300+
│ papaya ┆ ya │
301+
│ dragonfruit ┆ onf │
302+
└─────────────┴───────┘
303+
"""
304+
return wrap_expr(self._pyexpr.cat_slice(offset, length))

py-polars/polars/series/categorical.py

+52
Original file line numberDiff line numberDiff line change
@@ -239,3 +239,55 @@ def ends_with(self, suffix: str) -> Series:
239239
null
240240
]
241241
"""
242+
243+
def slice(self, offset: int, length: int | None = None) -> Series:
244+
"""
245+
Extract a substring from each string value.
246+
247+
Parameters
248+
----------
249+
offset
250+
Start index. Negative indexing is supported.
251+
length
252+
Length of the slice. If set to `None` (default), the slice is taken to the
253+
end of the string.
254+
255+
Returns
256+
-------
257+
Series
258+
Series of data type :class:`String`.
259+
260+
Notes
261+
-----
262+
Both the `offset` and `length` inputs are defined in terms of the number
263+
of characters in the (UTF8) string. A character is defined as a
264+
`Unicode scalar value`_. A single character is represented by a single byte
265+
when working with ASCII text, and a maximum of 4 bytes otherwise.
266+
267+
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
268+
269+
Examples
270+
--------
271+
>>> s = pl.Series(["pear", None, "papaya", "dragonfruit"], dtype=pl.Categorical)
272+
>>> s.cat.slice(-3)
273+
shape: (4,)
274+
Series: '' [str]
275+
[
276+
"ear"
277+
null
278+
"aya"
279+
"uit"
280+
]
281+
282+
Using the optional `length` parameter
283+
284+
>>> s.cat.slice(4, length=3)
285+
shape: (4,)
286+
Series: '' [str]
287+
[
288+
""
289+
null
290+
"ya"
291+
"onf"
292+
]
293+
"""

py-polars/tests/unit/operations/namespaces/test_categorical.py

+25
Original file line numberDiff line numberDiff line change
@@ -276,3 +276,28 @@ def test_starts_ends_with() -> None:
276276

277277
with pytest.raises(TypeError, match="'suffix' must be a string; found"):
278278
df.select(pl.col("a").cat.ends_with(None)) # type: ignore[arg-type]
279+
280+
281+
def test_cat_slice() -> None:
282+
df = pl.DataFrame(
283+
{
284+
"a": pl.Series(
285+
[
286+
"foobar",
287+
"barfoo",
288+
"foobar",
289+
"x",
290+
None,
291+
],
292+
dtype=pl.Categorical,
293+
)
294+
}
295+
)
296+
assert df["a"].cat.slice(-3).to_list() == ["bar", "foo", "bar", "x", None]
297+
assert df.select([pl.col("a").cat.slice(2, 4)])["a"].to_list() == [
298+
"obar",
299+
"rfoo",
300+
"obar",
301+
"",
302+
None,
303+
]

0 commit comments

Comments
 (0)