Skip to content

Commit

Permalink
SNOW-1819521: Add support for Series.dt.strftime (10 directives) (#2781)
Browse files Browse the repository at this point in the history
<!---
Please answer these questions before creating your pull request. Thanks!
--->

1. Which Jira issue is this PR addressing? Make sure that there is an
accompanying issue to your PR.

   <!---
   In this section, please add a Snowflake Jira issue number.

Note that if a corresponding GitHub issue exists, you should still
include
   the Snowflake Jira issue number. For example, for GitHub issue
#1400, you should
   add "SNOW-1335071" here.
    --->

   Fixes SNOW-1819521

2. Fill out the following pre-review checklist:

- [x] I am adding a new automated test(s) to verify correctness of my
new code
- [ ] If this test skips Local Testing mode, I'm requesting review from
@snowflakedb/local-testing
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency
- [ ] If this is a new feature/behavior, I'm adding the Local Testing
parity changes.
- [ ] I acknowledge that I have ensured my changes to be thread-safe.
Follow the link for more information: [Thread-safe Developer
Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development)

3. Please describe how your code solves the related issue.

   Add support for Series.dt.strftime (10 directives).
  • Loading branch information
sfc-gh-helmeleegy authored Dec 19, 2024
1 parent d2cc2b8 commit eacec5c
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 45 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@
- Added support for `DataFrame.from_dict` and `DataFrame.from_records`.
- Added support for mixed case field names in struct type columns.
- Added support for `SeriesGroupBy.unique`
- Added support for `Series.dt.strftime` with the following directives:
- %d: Day of the month as a zero-padded decimal number.
- %m: Month as a zero-padded decimal number.
- %Y: Year with century as a decimal number.
- %H: Hour (24-hour clock) as a zero-padded decimal number.
- %M: Minute as a zero-padded decimal number.
- %S: Second as a zero-padded decimal number.
- %f: Microsecond as a decimal number, zero-padded to 6 digits.
- %j: Day of the year as a zero-padded decimal number.
- %X: Locale’s appropriate time representation.
- %%: A literal '%' character.

#### Bug Fixes

Expand Down
4 changes: 3 additions & 1 deletion docs/source/modin/supported/series_dt_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ the method in the left column.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``normalize`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``strftime`` | N | |
| ``strftime`` | P | ``N`` if `date_format` contains directives other |
| | | than (`%d`, `%m`, `%Y`, `%H`, `%M`, `%S`, `%f`, |
| | | `%j`, `%X`, `%%`). |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``round`` | P | ``N`` if `ambiguous` or `nonexistent` are set to a |
| | | non-default value. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18162,7 +18162,7 @@ def dt_total_seconds(self, include_index: bool = False) -> "SnowflakeQueryCompil
)
)

def dt_strftime(self, date_format: str) -> None:
def dt_strftime(self, date_format: str) -> "SnowflakeQueryCompiler":
"""
Format underlying date-time data using specified format.

Expand All @@ -18172,8 +18172,102 @@ def dt_strftime(self, date_format: str) -> None:
Returns:
New QueryCompiler containing formatted date-time values.
"""
ErrorMessage.not_implemented(
"Snowpark pandas doesn't yet support the method 'Series.dt.strftime'"

def strftime_func(column: SnowparkColumn) -> SnowparkColumn:
    """
    Build the Snowpark expression that renders ``column`` using ``date_format``.

    ``date_format`` (captured from the enclosing ``dt_strftime`` call) is
    scanned left to right for two-character ``%<char>`` directives. Each
    directive is translated into a Snowpark column expression, the text
    between directives is emitted as string literals, and the pieces are
    concatenated in their original order. Rows where ``column`` is null
    produce null.

    Args:
        column: Snowpark column holding the date-time values to format.

    Returns:
        Snowpark column holding the formatted values.

    Raises:
        NotImplementedError: Raised through ``ErrorMessage.not_implemented``
            when ``date_format`` contains a directive that has no entry in
            the directive table below.
    """

    def zero_padded(value: SnowparkColumn, width: int) -> SnowparkColumn:
        # Left-pad the extracted date part with "0" up to `width` characters.
        return lpad(value, pandas_lit(width), pandas_lit("0"))

    directive_to_function_map: dict[str, Callable] = {
        # Day of the month as a zero-padded decimal number
        "d": lambda col: zero_padded(dayofmonth(col), 2),
        # Month as a zero-padded decimal number
        "m": lambda col: zero_padded(month(col), 2),
        # Year with century as a decimal number
        "Y": lambda col: zero_padded(year(col), 4),
        # Hour (24-hour clock) as a zero-padded decimal number
        "H": lambda col: zero_padded(hour(col), 2),
        # Minute as a zero-padded decimal number
        "M": lambda col: zero_padded(minute(col), 2),
        # Second as a zero-padded decimal number
        "S": lambda col: zero_padded(second(col), 2),
        # Microsecond as a decimal number, zero-padded to 6 digits
        # (the nanosecond part is floored to whole microseconds)
        "f": lambda col: zero_padded(floor(date_part("ns", col) / 1000), 6),
        # Day of the year as a zero-padded decimal number
        "j": lambda col: zero_padded(dayofyear(col), 3),
        # Locale's appropriate time representation
        "X": lambda col: trunc(to_time(col), pandas_lit("second")),
        # A literal '%' character
        "%": lambda col: pandas_lit("%"),
    }

    # Walk the format once, emitting literal text and directive expressions
    # in the order they appear. A trailing lone "%" never matches "%." and is
    # therefore kept as literal text.
    cols = []
    literal_start = 0
    for match in re.finditer("%.", date_format):
        if match.start() > literal_start:
            cols.append(pandas_lit(date_format[literal_start : match.start()]))
        directive = match.group()[1:]
        directive_function = directive_to_function_map.get(directive)
        if directive_function is None:
            raise ErrorMessage.not_implemented(
                f"Snowpark pandas 'Series.dt.strftime' method does not yet support the directive '%{directive}'"
            )
        cols.append(directive_function(column))
        literal_start = match.end()
    if literal_start < len(date_format):
        cols.append(pandas_lit(date_format[literal_start:]))
    if not cols:
        # An empty format string has neither literals nor directives; format
        # every non-null value as the empty string instead of failing.
        cols.append(pandas_lit(""))

    formatted = cols[0] if len(cols) == 1 else concat(*cols)
    return iff(column.is_null(), pandas_lit(None), formatted)

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
strftime_func,
include_index=False,
)
)

def topn(
Expand Down
44 changes: 43 additions & 1 deletion src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2218,7 +2218,49 @@ def normalize():
pass

def strftime():
pass
"""
Convert to Index using specified date_format.
Return an Index of formatted strings specified by date_format, which supports the same string format as the python standard library. Details of the string format can be found in python string format doc.
Formats supported by the C strftime API but not by the python string format doc (such as “%R”, “%r”) are not officially supported and should be preferably replaced with their supported equivalents (such as “%H:%M”, “%I:%M:%S %p”).
Note that PeriodIndex supports additional directives, detailed in Period.strftime.
Parameters
----------
date_format : str
Date format string (e.g. “%Y-%m-%d”).
Returns
-------
ndarray[object]
NumPy ndarray of formatted strings.
See also
--------
to_datetime
Convert the given argument to datetime.
DatetimeIndex.normalize
Return DatetimeIndex with times to midnight.
DatetimeIndex.round
Round the DatetimeIndex to the specified freq.
DatetimeIndex.floor
Floor the DatetimeIndex to the specified freq.
Timestamp.strftime
Format a single Timestamp.
Period.strftime
Format a single Period.
Examples
--------
>>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"),
... periods=3, freq='s')
>>> rng.strftime('%B %d, %Y, %r') # doctest: +SKIP
Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM',
'March 10, 2018, 09:00:02 AM'],
dtype='object')
"""

def round():
"""
Expand Down
68 changes: 68 additions & 0 deletions tests/integ/modin/series/test_dt_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,74 @@ def test_days_in_month(property):
)


@sql_count_checker(query_count=1)
@pytest.mark.parametrize(
    "date_format",
    [
        "a%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%b",
        "%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%b",
        "a%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%",
        "%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%",
        "%%%M",
        "%%M",
        "abc%",
    ],
)
def test_strftime(date_format):
    """
    Check that ``Series.dt.strftime`` gives the same result in Snowpark pandas
    and native pandas for formats built from the supported directives.

    The formats cover a literal or a directive at either end of the string,
    adjacent directives, and a trailing lone ``%``. The data includes
    sub-microsecond precision and a ``NaT`` value.
    """
    timestamps = [
        "2014-04-04 23:56:01.000000001",
        "2014-07-18 21:24:02.000000002",
        "2015-11-22 22:14:03.000000003",
        "2015-11-23 20:12:04.1234567890",
        pd.NaT,
    ]
    native_ser = native_pd.Series(native_pd.DatetimeIndex(timestamps))
    snow_ser = pd.Series(native_ser)

    def format_series(series):
        return series.dt.strftime(date_format=date_format)

    eval_snowpark_pandas_result(snow_ser, native_ser, format_series)


@sql_count_checker(query_count=0)
@pytest.mark.parametrize(
    "date_format",
    [
        "%a",
        "%A",
        "%w",
        "%b",
        "%B",
        "%y",
        "%I",
        "%p",
        "%z",
        "%Z",
        "%U",
        "%W",
        "%c",
        "%x",
    ],
)
def test_strftime_neg(date_format):
    """
    Check that ``Series.dt.strftime`` raises ``NotImplementedError`` naming
    the offending directive, without issuing a query, for each directive
    that is not yet supported.
    """
    datetime_index = native_pd.DatetimeIndex(
        [
            "2014-04-04 23:56:01.000000001",
            "2014-07-18 21:24:02.000000002",
            "2015-11-22 22:14:03.000000003",
            "2015-11-23 20:12:04.1234567890",
            pd.NaT,
        ],
    )
    native_ser = native_pd.Series(datetime_index)
    snow_ser = pd.Series(native_ser)
    # Match on the directive-specific message so an unrelated
    # NotImplementedError cannot satisfy the test. None of the parametrized
    # formats contain regex metacharacters, so they are embedded verbatim.
    with pytest.raises(
        NotImplementedError,
        match=f"does not yet support the directive '{date_format}'",
    ):
        snow_ser.dt.strftime(date_format=date_format)


@dt_properties
@sql_count_checker(query_count=1)
def test_dt_property_with_tz(property_name):
Expand Down
39 changes: 0 additions & 39 deletions tests/integ/modin/series/test_dt_accessor_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,42 +26,3 @@ def test_dt_namespace_accessor_datetime64(self, freq):
msg = "Snowpark pandas doesn't yet support the property 'Series.dt.freq'"
with pytest.raises(NotImplementedError, match=msg):
ser.dt.freq

@pytest.mark.parametrize(
"date, format_string, expected",
[
(
native_pd.date_range("20130101", periods=5),
"%Y/%m/%d",
native_pd.Series(
[
"2013/01/01",
"2013/01/02",
"2013/01/03",
"2013/01/04",
"2013/01/05",
]
),
),
(
native_pd.date_range("2015-02-03 11:22:33.4567", periods=5),
"%Y/%m/%d %H-%M-%S",
native_pd.Series(
[
"2015/02/03 11-22-33",
"2015/02/04 11-22-33",
"2015/02/05 11-22-33",
"2015/02/06 11-22-33",
"2015/02/07 11-22-33",
]
),
),
],
)
@sql_count_checker(query_count=0)
def test_strftime(self, date, format_string, expected):
# GH 10086
ser = pd.Series(date)
msg = "Snowpark pandas doesn't yet support the method 'Series.dt.strftime'"
with pytest.raises(NotImplementedError, match=msg):
ser.dt.strftime(format_string)
1 change: 0 additions & 1 deletion tests/unit/modin/test_series_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def mock_query_compiler_for_dt_series() -> SnowflakeQueryCompiler:
[
(lambda s: s.dt.timetz, "timetz"),
(lambda s: s.dt.to_period(), "to_period"),
(lambda s: s.dt.strftime(date_format="YY/MM/DD"), "strftime"),
(lambda s: s.dt.qyear, "qyear"),
(lambda s: s.dt.start_time, "start_time"),
(lambda s: s.dt.end_time, "end_time"),
Expand Down

0 comments on commit eacec5c

Please sign in to comment.