SNOW-1819521: Add support for Series.dt.strftime (10 directives) (#2781)

1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR.  Fixes SNOW-1819521 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [ ] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development) 3. Please describe how your code solves the related issue. Add support for Series.dt.strftime (10 directives).
snowflakedb · Dec 19, 2024 · eacec5c · eacec5c
1 parent d2cc2b8
commit eacec5c
Show file tree

Hide file tree

Showing 7 changed files with 222 additions and 45 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,17 @@
 - Added support for `DataFrame.from_dict` and `DataFrame.from_records`.
 - Added support for mixed case field names in struct type columns.
 - Added support for `SeriesGroupBy.unique`
+- Added support for `Series.dt.strftime` with the following directives:
+  - %d: Day of the month as a zero-padded decimal number.
+  - %m: Month as a zero-padded decimal number.
+  - %Y: Year with century as a decimal number.
+  - %H: Hour (24-hour clock) as a zero-padded decimal number.
+  - %M: Minute as a zero-padded decimal number.
+  - %S: Second as a zero-padded decimal number.
+  - %f: Microsecond as a decimal number, zero-padded to 6 digits.
+  - %j: Day of the year as a zero-padded decimal number.
+  - %X: Locale’s appropriate time representation.
+  - %%: A literal '%' character.
 
 #### Bug Fixes
 

diff --git a/docs/source/modin/supported/series_dt_supported.rst b/docs/source/modin/supported/series_dt_supported.rst
@@ -98,7 +98,9 @@ the method in the left column.
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``normalize``               | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``strftime``                | N                               |                                                    |
+| ``strftime``                | P                               | ``N`` if `date_format` contains directives other   |
+|                             |                                 | than (`%d`, `%m`, `%Y`, `%H`, `%M`, `%S`, `%f`,    |
+|                             |                                 | `%j`, `%X`, `%%`).                                 |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``round``                   | P                               | ``N`` if `ambiguous` or `nonexistent` are set to a |
 |                             |                                 | non-default value.                                 |

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -18162,7 +18162,7 @@ def dt_total_seconds(self, include_index: bool = False) -> "SnowflakeQueryCompil
             )
         )
 
-    def dt_strftime(self, date_format: str) -> None:
+    def dt_strftime(self, date_format: str) -> "SnowflakeQueryCompiler":
         """
         Format underlying date-time data using specified format.
 
@@ -18172,8 +18172,102 @@ def dt_strftime(self, date_format: str) -> None:
         Returns:
             New QueryCompiler containing formatted date-time values.
         """
-        ErrorMessage.not_implemented(
-            "Snowpark pandas doesn't yet support the method 'Series.dt.strftime'"
+
+        def strftime_func(column: SnowparkColumn) -> SnowparkColumn:
+            directive_to_function_map: dict[str, Callable] = {
+                "d": (
+                    # Day of the month as a zero-padded decimal number
+                    lambda column: lpad(
+                        dayofmonth(column), pandas_lit(2), pandas_lit("0")
+                    )
+                ),
+                "m": (
+                    # Month as a zero-padded decimal number
+                    lambda column: lpad(month(column), pandas_lit(2), pandas_lit("0"))
+                ),
+                "Y": (
+                    # Year with century as a decimal number
+                    lambda column: lpad(year(column), pandas_lit(4), pandas_lit("0"))
+                ),
+                "H": (
+                    # Hour (24-hour clock) as a zero-padded decimal number
+                    lambda column: lpad(hour(column), pandas_lit(2), pandas_lit("0"))
+                ),
+                "M": (
+                    # Minute as a zero-padded decimal number
+                    lambda column: lpad(minute(column), pandas_lit(2), pandas_lit("0"))
+                ),
+                "S": (
+                    # Second as a zero-padded decimal number
+                    lambda column: lpad(second(column), pandas_lit(2), pandas_lit("0"))
+                ),
+                "f": (
+                    # Microsecond as a decimal number, zero-padded to 6 digits
+                    lambda column: lpad(
+                        floor(date_part("ns", column) / 1000),
+                        pandas_lit(6),
+                        pandas_lit("0"),
+                    )
+                ),
+                "j": (
+                    # Day of the year as a zero-padded decimal number
+                    lambda column: lpad(
+                        dayofyear(column), pandas_lit(3), pandas_lit("0")
+                    )
+                ),
+                "X": (
+                    # Locale’s appropriate time representation
+                    lambda column: trunc(to_time(column), pandas_lit("second"))
+                ),
+                "%": (
+                    # A literal '%' character
+                    lambda column: pandas_lit("%")
+                ),
+            }
+
+            parts = re.split("%.", date_format)
+            directive_first = False
+            if parts[0] == "":
+                parts = parts[1:]
+                directive_first = True
+            if parts[-1] == "":
+                parts = parts[:-1]
+            directives = re.findall("%.", date_format)
+            cols = []
+            for i in range(min(len(parts), len(directives))):
+                directive_function = directive_to_function_map.get(directives[i][1:])
+                if not directive_function:
+                    raise ErrorMessage.not_implemented(
+                        f"Snowpark pandas 'Series.dt.strftime' method does not yet support the directive '%{directives[i][1:]}'"
+                    )
+
+                if directive_first:
+                    cols.append(directive_function(column))
+                    cols.append(pandas_lit(parts[i]))
+                else:
+                    cols.append(pandas_lit(parts[i]))
+                    cols.append(directive_function(column))
+
+            if len(parts) > len(directives):
+                cols.append(pandas_lit(parts[-1]))
+            if len(parts) < len(directives):
+                directive_function = directive_to_function_map.get(directives[-1][1:])
+                if not directive_function:
+                    raise ErrorMessage.not_implemented(
+                        f"Snowpark pandas 'Series.dt.strftime' method does not yet support the directive '%{directives[-1][1:]}'"
+                    )
+                cols.append(directive_function(column))
+
+            if len(cols) == 1:
+                return iff(column.is_null(), pandas_lit(None), cols[0])
+            else:
+                return iff(column.is_null(), pandas_lit(None), concat(*cols))
+
+        return SnowflakeQueryCompiler(
+            self._modin_frame.apply_snowpark_function_to_columns(
+                strftime_func,
+                include_index=False,
+            )
         )
 
     def topn(

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
@@ -2218,7 +2218,49 @@ def normalize():
         pass
 
     def strftime():
-        pass
+        """
+        Format the datetime values as strings using the specified date_format.
+
+        Return a Series of formatted strings specified by date_format. Snowpark pandas supports only a subset of the directives described in the python string format doc: ``%d``, ``%m``, ``%Y``, ``%H``, ``%M``, ``%S``, ``%f``, ``%j``, ``%X`` and ``%%``. Any other directive raises ``NotImplementedError``.
+
+        Parameters
+        ----------
+        date_format : str
+            Date format string (e.g. "%Y-%m-%d").
+
+        Returns
+        -------
+        Series
+            Series of formatted strings.
+
+        See also
+        --------
+        to_datetime
+            Convert the given argument to datetime.
+        DatetimeIndex.normalize
+            Return DatetimeIndex with times to midnight.
+        DatetimeIndex.round
+            Round the DatetimeIndex to the specified freq.
+        DatetimeIndex.floor
+            Floor the DatetimeIndex to the specified freq.
+        Timestamp.strftime
+            Format a single Timestamp.
+        Period.strftime
+            Format a single Period.
+
+        Examples
+        --------
+        >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"),
+        ...                     periods=3, freq='s')
+        >>> pd.Series(rng).dt.strftime('%d-%m-%Y %H:%M:%S')  # doctest: +SKIP
+        0    10-03-2018 09:00:00
+        1    10-03-2018 09:00:01
+        2    10-03-2018 09:00:02
+        dtype: object
+        """
 
     def round():
         """

diff --git a/tests/integ/modin/series/test_dt_accessor.py b/tests/integ/modin/series/test_dt_accessor.py
@@ -433,6 +433,74 @@ def test_days_in_month(property):
     )
 
 
+@sql_count_checker(query_count=1)
+@pytest.mark.parametrize(
+    "date_format",
+    [
+        "a%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%b",
+        "%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%b",
+        "a%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%",
+        "%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%",
+        "%%%M",
+        "%%M",
+        "abc%",
+    ],
+)
+def test_strftime(date_format):
+    datetime_index = native_pd.DatetimeIndex(
+        [
+            "2014-04-04 23:56:01.000000001",
+            "2014-07-18 21:24:02.000000002",
+            "2015-11-22 22:14:03.000000003",
+            "2015-11-23 20:12:04.1234567890",
+            pd.NaT,
+        ],
+    )
+    native_ser = native_pd.Series(datetime_index)
+    snow_ser = pd.Series(native_ser)
+    eval_snowpark_pandas_result(
+        snow_ser,
+        native_ser,
+        lambda s: s.dt.strftime(date_format=date_format),
+    )
+
+
+@sql_count_checker(query_count=0)
+@pytest.mark.parametrize(
+    "date_format",
+    [
+        "%a",
+        "%A",
+        "%w",
+        "%b",
+        "%B",
+        "%y",
+        "%I",
+        "%p",
+        "%z",
+        "%Z",
+        "%U",
+        "%W",
+        "%c",
+        "%x",
+    ],
+)
+def test_strftime_neg(date_format):
+    datetime_index = native_pd.DatetimeIndex(
+        [
+            "2014-04-04 23:56:01.000000001",
+            "2014-07-18 21:24:02.000000002",
+            "2015-11-22 22:14:03.000000003",
+            "2015-11-23 20:12:04.1234567890",
+            pd.NaT,
+        ],
+    )
+    native_ser = native_pd.Series(datetime_index)
+    snow_ser = pd.Series(native_ser)
+    with pytest.raises(NotImplementedError):
+        snow_ser.dt.strftime(date_format=date_format)
+
+
 @dt_properties
 @sql_count_checker(query_count=1)
 def test_dt_property_with_tz(property_name):

diff --git a/tests/integ/modin/series/test_dt_accessor_unsupported.py b/tests/integ/modin/series/test_dt_accessor_unsupported.py
@@ -26,42 +26,3 @@ def test_dt_namespace_accessor_datetime64(self, freq):
         msg = "Snowpark pandas doesn't yet support the property 'Series.dt.freq'"
         with pytest.raises(NotImplementedError, match=msg):
             ser.dt.freq
-
-    @pytest.mark.parametrize(
-        "date, format_string, expected",
-        [
-            (
-                native_pd.date_range("20130101", periods=5),
-                "%Y/%m/%d",
-                native_pd.Series(
-                    [
-                        "2013/01/01",
-                        "2013/01/02",
-                        "2013/01/03",
-                        "2013/01/04",
-                        "2013/01/05",
-                    ]
-                ),
-            ),
-            (
-                native_pd.date_range("2015-02-03 11:22:33.4567", periods=5),
-                "%Y/%m/%d %H-%M-%S",
-                native_pd.Series(
-                    [
-                        "2015/02/03 11-22-33",
-                        "2015/02/04 11-22-33",
-                        "2015/02/05 11-22-33",
-                        "2015/02/06 11-22-33",
-                        "2015/02/07 11-22-33",
-                    ]
-                ),
-            ),
-        ],
-    )
-    @sql_count_checker(query_count=0)
-    def test_strftime(self, date, format_string, expected):
-        # GH 10086
-        ser = pd.Series(date)
-        msg = "Snowpark pandas doesn't yet support the method 'Series.dt.strftime'"
-        with pytest.raises(NotImplementedError, match=msg):
-            ser.dt.strftime(format_string)
diff --git a/tests/unit/modin/test_series_dt.py b/tests/unit/modin/test_series_dt.py
@@ -35,7 +35,6 @@ def mock_query_compiler_for_dt_series() -> SnowflakeQueryCompiler:
     [
         (lambda s: s.dt.timetz, "timetz"),
         (lambda s: s.dt.to_period(), "to_period"),
-        (lambda s: s.dt.strftime(date_format="YY/MM/DD"), "strftime"),
         (lambda s: s.dt.qyear, "qyear"),
         (lambda s: s.dt.start_time, "start_time"),
         (lambda s: s.dt.end_time, "end_time"),