From d8091a994fac1f254fbdc385f1356195a9210283 Mon Sep 17 00:00:00 2001 From: Snowflake Provisioner <58576687+snowflake-provisioner@users.noreply.github.com> Date: Fri, 9 Jun 2023 10:28:32 -0700 Subject: [PATCH] Project import generated by Copybara. (#20) GitOrigin-RevId: 7cece61f8ed84deeaabc0bf1cd91fc803117f627 Co-authored-by: Snowflake Authors --- bazel/requirements/BUILD.bazel | 23 ++- bazel/requirements/templates/meta.tpl.yaml | 6 - ci/conda_recipe/meta.yaml | 7 +- ci/get_excluded_tests.sh | 2 +- .../ml/modeling/impute/simple_imputer.py | 19 +- snowflake/ml/modeling/metrics/BUILD.bazel | 1 + snowflake/ml/modeling/metrics/__init__.py | 2 + .../precision_recall_fscore_support.py | 10 +- snowflake/ml/modeling/metrics/roc_curve.py | 94 ++++++++++ .../snowflake/ml/modeling/framework/utils.py | 12 +- .../snowflake/ml/modeling/metrics/BUILD.bazel | 27 ++- .../modeling/metrics/test_accuracy_score.py | 153 +++++++-------- .../test_precision_recall_fscore_support.py | 176 ++++++++++++------ .../modeling/metrics/test_precision_score.py | 144 ++++++++------ .../ml/modeling/metrics/test_roc_curve.py | 121 ++++++++++++ 15 files changed, 554 insertions(+), 243 deletions(-) create mode 100644 snowflake/ml/modeling/metrics/roc_curve.py create mode 100644 tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py diff --git a/bazel/requirements/BUILD.bazel b/bazel/requirements/BUILD.bazel index 58d8003c..aca69d96 100644 --- a/bazel/requirements/BUILD.bazel +++ b/bazel/requirements/BUILD.bazel @@ -21,12 +21,14 @@ _GENERATE_TOOL = ":parse_and_generate_requirements" _GENERATE_COMMAND = "$(location " + _GENERATE_TOOL + ") $(location " + _SRC_REQUIREMENT_FILE + ") --schema $(location " + _SCHEMA_FILE + ") {options} > $@" -_TEMPLATE_FOLDER_PATH = "//bazel/requirements/templates" - _AUTOGEN_HEADERS = """# DO NOT EDIT! # Generated by running 'bazel run //bazel/requirements:sync_requirements' """ +# "---" is a document start marker, which is valid but optional (https://yaml.org/spec/1.1/#c-document-start). This +# is needed for conda meta.yaml to work around a bug on the conda side. +_YAML_START_DOCUMENT_MARKER = "---" + _GENERATED_REQUIREMENTS_FILES = { "requirements_txt": { "cmd": "--mode dev_version --format text", @@ -77,7 +79,7 @@ _GENERATED_REQUIREMENTS_FILES = { "{generated}.body".format(generated = value["generated"]), ], outs = [value["generated"]], - cmd = "(echo -e \""+ _AUTOGEN_HEADERS +"\" ; cat $(location :{generated}.body) ) > $@".format( + cmd = "(echo -e \"" + _AUTOGEN_HEADERS + "\" ; cat $(location :{generated}.body) ) > $@".format( generated = value["generated"], ), tools = [_GENERATE_TOOL], @@ -99,15 +101,24 @@ genrule( ) yq( - name = "gen_conda_meta", + name = "gen_conda_meta_body_format", srcs = [ ":meta.body.yaml", - "{template_folder}:meta.tpl.yaml".format(template_folder = _TEMPLATE_FOLDER_PATH), + "//bazel/requirements/templates:meta.tpl.yaml", ], - outs = ["meta.yaml"], + outs = ["meta.body.formatted.yaml"], expression = ". as $item ireduce ({}; . * $item ) | sort_keys(..)", ) +genrule( + name = "gen_conda_meta", + srcs = [ + ":meta.body.formatted.yaml", + ], + outs = ["meta.yaml"], + cmd = "(echo -e \"" + _AUTOGEN_HEADERS + "\" ; echo \"" + _YAML_START_DOCUMENT_MARKER + "\"; cat $(location :meta.body.formatted.yaml) ) > $@", +) + # Create a test target for each file that Bazel should # write to the source tree.
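For context, the new gen_conda_meta genrule simply prepends the autogen header and a YAML document-start marker to the yq-formatted body. A minimal Python sketch of what that shell command does, using the file names from the rule above (the function itself is illustrative, not part of the change):

_AUTOGEN_HEADERS = (
    "# DO NOT EDIT!\n"
    "# Generated by running 'bazel run //bazel/requirements:sync_requirements'\n"
)
_YAML_START_DOCUMENT_MARKER = "---"


def gen_conda_meta(body_path: str = "meta.body.formatted.yaml", out_path: str = "meta.yaml") -> None:
    # Mirrors: (echo -e "$HEADERS" ; echo "---" ; cat body) > meta.yaml
    with open(body_path) as body, open(out_path, "w") as out:
        out.write(_AUTOGEN_HEADERS + "\n")  # echo -e emits one extra newline, leaving a blank line
        out.write(_YAML_START_DOCUMENT_MARKER + "\n")
        out.write(body.read())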
diff --git a/bazel/requirements/templates/meta.tpl.yaml b/bazel/requirements/templates/meta.tpl.yaml index 260f3955..f6f5be6c 100644 --- a/bazel/requirements/templates/meta.tpl.yaml +++ b/bazel/requirements/templates/meta.tpl.yaml @@ -1,9 +1,3 @@ -# DO NOT EDIT! -# Generated by //bazel/requirements:gen_conda_meta -# To update, run: -# bazel run //bazel/requirements:sync_requirements -# - package: name: snowflake-ml-python

diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 607c0485..c9f3502d 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -1,8 +1,7 @@ # DO NOT EDIT! -# Generated by //bazel/requirements:gen_conda_meta -# To update, run: -# bazel run //bazel/requirements:sync_requirements -# +# Generated by running 'bazel run //bazel/requirements:sync_requirements' + +--- about: description: | Snowflake ML client Library is used for interacting with Snowflake to build machine learning solutions. diff --git a/ci/get_excluded_tests.sh b/ci/get_excluded_tests.sh index b59057c7..199db075 100755 --- a/ci/get_excluded_tests.sh +++ b/ci/get_excluded_tests.sh @@ -12,7 +12,7 @@ # The missing dependency could happen when a new operator is being developed, but not yet released. set -o pipefail -set -eu +set -u echo "Running "$0 diff --git a/snowflake/ml/modeling/impute/simple_imputer.py b/snowflake/ml/modeling/impute/simple_imputer.py index 1c579df1..e01c4231 100644 --- a/snowflake/ml/modeling/impute/simple_imputer.py +++ b/snowflake/ml/modeling/impute/simple_imputer.py @@ -15,6 +15,8 @@ from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils +_SUBPROJECT = "Impute" + STRATEGY_TO_STATE_DICT = { "constant": None, "mean": _utils.NumericStatistics.MEAN, @@ -194,10 +196,7 @@ def check_type_consistency(col_types: Dict[str, T.DataType]) -> None: return input_col_datatypes - @telemetry.send_api_usage_telemetry( - project=base.PROJECT, - subproject=base.SUBPROJECT, - ) + @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT) def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer": """ Compute values to impute for the dataset according to the strategy. @@ -214,7 +213,7 @@ def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer": input_col_datatypes = self._get_dataset_input_col_datatypes(dataset) self.statistics_: Dict[str, Any] = {} - statement_params = telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__) + statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__) if self.strategy == "constant": if self.fill_value is None: @@ -274,14 +273,8 @@ def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer": self._is_fitted = True return self - @telemetry.send_api_usage_telemetry( - project=base.PROJECT, - subproject=base.SUBPROJECT, - ) - @telemetry.add_stmt_params_to_df( - project=base.PROJECT, - subproject=base.SUBPROJECT, - ) + @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT) + @telemetry.add_stmt_params_to_df(project=base.PROJECT, subproject=_SUBPROJECT) def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]: """ Transform the input dataset by imputing the computed statistics in the input columns.
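The simple_imputer change above collapses the multi-line telemetry decorators into one-liners and swaps base.SUBPROJECT for a module-level _SUBPROJECT constant. A minimal sketch of the resulting pattern, with the decorator and helper names taken from the diff (the class and the project value are placeholders, since base.PROJECT's value is not shown here):

from snowflake.ml._internal import telemetry

_PROJECT = "SnowML"     # placeholder for base.PROJECT; illustrative only
_SUBPROJECT = "Impute"  # module-level constant introduced in this change


class ExampleTransformer:
    @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
    def fit(self, dataset):
        # Tag the queries issued during fit with the same project/subproject metadata.
        statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT, self.__class__.__name__)
        ...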
diff --git a/snowflake/ml/modeling/metrics/BUILD.bazel b/snowflake/ml/modeling/metrics/BUILD.bazel index d5bd8657..08b85709 100644 --- a/snowflake/ml/modeling/metrics/BUILD.bazel +++ b/snowflake/ml/modeling/metrics/BUILD.bazel @@ -13,6 +13,7 @@ py_library( "precision_recall_fscore_support.py", "precision_score.py", "regression.py", + "roc_curve.py", ], deps = [ ":init", diff --git a/snowflake/ml/modeling/metrics/__init__.py b/snowflake/ml/modeling/metrics/__init__.py index ed8be701..730ed70f 100644 --- a/snowflake/ml/modeling/metrics/__init__.py +++ b/snowflake/ml/modeling/metrics/__init__.py @@ -4,6 +4,7 @@ from .covariance import covariance from .precision_recall_fscore_support import precision_recall_fscore_support from .precision_score import precision_score +from .roc_curve import roc_curve __all__ = [ "accuracy_score", @@ -12,4 +13,5 @@ "covariance", "precision_recall_fscore_support", "precision_score", + "roc_curve", ] diff --git a/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py b/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py index ecd5504f..65d4894e 100644 --- a/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py +++ b/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py @@ -115,10 +115,18 @@ def precision_recall_fscore_support( session = df._session assert session is not None - query = df.queries["queries"][-1] sproc_name = f"precision_recall_fscore_support_{snowpark_utils.generate_random_alphanumeric()}" statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = [] + if isinstance(y_true_col_names, str): + cols = [y_true_col_names, y_pred_col_names] + elif isinstance(y_true_col_names, list): + cols = y_true_col_names + y_pred_col_names # type:ignore[assignment, operator] + if sample_weight_col_name: + cols.append(sample_weight_col_name) + query = df[cols].queries["queries"][-1] + @F.sproc( # type: ignore[misc] session=session, name=sproc_name, diff --git a/snowflake/ml/modeling/metrics/roc_curve.py b/snowflake/ml/modeling/metrics/roc_curve.py new file mode 100644 index 00000000..fae77f9a --- /dev/null +++ b/snowflake/ml/modeling/metrics/roc_curve.py @@ -0,0 +1,94 @@ +from typing import Optional, Tuple, Union + +import cloudpickle +import numpy.typing as npt +from sklearn import metrics + +from snowflake import snowpark +from snowflake.ml._internal import telemetry +from snowflake.snowpark import functions as F +from snowflake.snowpark._internal import utils as snowpark_utils + +_PROJECT = "ModelDevelopment" +_SUBPROJECT = "Metrics" + + +@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT) +def roc_curve( + *, + df: snowpark.DataFrame, + y_true_col_name: str, + y_score_col_name: str, + pos_label: Optional[Union[str, int]] = None, + sample_weight_col_name: Optional[str] = None, + drop_intermediate: bool = True, +) -> Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + """ + Compute Receiver operating characteristic (ROC). + + Note: this implementation is restricted to the binary classification task. + + Args: + df: Input dataframe. + y_true_col_name: Column name representing true binary labels. + If labels are not either {-1, 1} or {0, 1}, then pos_label should be + explicitly given. + y_score_col_name: Column name representing target scores, can either + be probability estimates of the positive class, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + pos_label: The label of the positive class. 
+ When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + sample_weight_col_name: Column name representing sample weights. + drop_intermediate: Whether to drop some suboptimal thresholds which would + not appear on a plotted ROC curve. This is useful in order to create + lighter ROC curves. + + Returns: + fpr: ndarray of shape (>2,) + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= `thresholds[i]`. + tpr: ndarray of shape (>2,) + Increasing true positive rates such that element `i` is the true + positive rate of predictions with score >= `thresholds[i]`. + thresholds: ndarray of shape (n_thresholds,) + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. + """ + session = df._session + assert session is not None + sproc_name = f"roc_curve_{snowpark_utils.generate_random_alphanumeric()}" + statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + + cols = [y_true_col_name, y_score_col_name] + if sample_weight_col_name: + cols.append(sample_weight_col_name) + query = df[cols].queries["queries"][-1] + + @F.sproc( # type: ignore[misc] + session=session, + name=sproc_name, + replace=True, + packages=["cloudpickle", "scikit-learn", "snowflake-snowpark-python"], + statement_params=statement_params, + ) + def roc_curve_sproc(session: snowpark.Session) -> bytes: + df = session.sql(query).to_pandas(statement_params=statement_params) + y_true = df[y_true_col_name] + y_score = df[y_score_col_name] + sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None + fpr, tpr, thresholds = metrics.roc_curve( + y_true, + y_score, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + + return cloudpickle.dumps((fpr, tpr, thresholds)) # type: ignore[no-any-return] + + loaded_data = cloudpickle.loads(session.call(sproc_name)) + res: Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike] = loaded_data + return res diff --git a/tests/integ/snowflake/ml/modeling/framework/utils.py b/tests/integ/snowflake/ml/modeling/framework/utils.py index 9c5fc250..60c1efbb 100644 --- a/tests/integ/snowflake/ml/modeling/framework/utils.py +++ b/tests/integ/snowflake/ml/modeling/framework/utils.py @@ -144,7 +144,7 @@ class DataType(Enum): def gen_fuzz_data( - rows: int, types: List[DataType], low: int = MIN_INT, high: int = MAX_INT + rows: int, types: List[DataType], low: Union[int, List[int]] = MIN_INT, high: Union[int, List[int]] = MAX_INT ) -> Tuple[List[Any], List[str]]: """ Generate random data based on input column types and row count.
@@ -153,8 +153,8 @@ def gen_fuzz_data( Args: rows: num of rows to generate types: type per column - low: lower bound of the output interval (inclusive) - high: upper bound of the output interval (exclusive) + low: lower bound(s) of the output interval (inclusive) + high: upper bound(s) of the output interval (exclusive) Returns: A tuple of generated data and column names @@ -166,10 +166,12 @@ def gen_fuzz_data( names = ["ID"] for idx, t in enumerate(types): + _low = low if isinstance(low, int) else low[idx] + _high = high if isinstance(high, int) else high[idx] if t == DataType.INTEGER: - data.append(np.random.randint(low, high, rows)) + data.append(np.random.randint(_low, _high, rows)) elif t == DataType.FLOAT: - data.append(np.random.uniform(low, high, rows)) + data.append(np.random.uniform(_low, _high, rows)) else: raise ValueError(f"Unsupported data type {t}") names.append(f"COL_{idx}") diff --git a/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel b/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel index d4befcbf..58ce132b 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel @@ -2,6 +2,9 @@ load("//bazel:py_rules.bzl", "py_test") package(default_visibility = ["//visibility:public"]) +SHARD_COUNT = 3 +TIMEOUT = "long" # 900s + py_test( name = "test_r2_score", srcs = ["test_r2_score.py"], @@ -23,7 +26,7 @@ py_test( py_test( name = "test_confusion_matrix", - timeout = "long", + timeout = TIMEOUT, srcs = ["test_confusion_matrix.py"], deps = [ "//snowflake/ml/modeling/metrics", @@ -34,7 +37,7 @@ py_test( py_test( name = "test_correlation", - timeout = "long", + timeout = TIMEOUT, srcs = ["test_correlation.py"], deps = [ "//snowflake/ml/modeling/metrics", @@ -44,7 +47,7 @@ py_test( py_test( name = "test_covariance", - timeout = "long", + timeout = TIMEOUT, srcs = ["test_covariance.py"], deps = [ "//snowflake/ml/modeling/metrics", @@ -54,8 +57,9 @@ py_test( py_test( name = "test_precision_recall_fscore_support", - timeout = "long", + timeout = TIMEOUT, srcs = ["test_precision_recall_fscore_support.py"], + shard_count = SHARD_COUNT, deps = [ "//snowflake/ml/modeling/metrics", "//snowflake/ml/utils:connection_params", @@ -65,8 +69,21 @@ py_test( py_test( name = "test_precision_score", - timeout = "long", + timeout = TIMEOUT, srcs = ["test_precision_score.py"], + shard_count = SHARD_COUNT, + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) + +py_test( + name = "test_roc_curve", + timeout = TIMEOUT, + srcs = ["test_roc_curve.py"], + shard_count = SHARD_COUNT, deps = [ "//snowflake/ml/modeling/metrics", "//snowflake/ml/utils:connection_params", diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py index 5f114412..573d0e6d 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py @@ -4,6 +4,7 @@ from typing import Any, Dict import numpy as np +import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -13,15 +14,17 @@ from snowflake.ml.utils import connection_params from tests.integ.snowflake.ml.modeling.framework import utils +_ROWS = 100 +_TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] _BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( 
- rows=100, - types=[utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT], + rows=_ROWS, + types=_TYPES, low=0, high=2, ) _MULTICLASS_DATA, _ = utils.gen_fuzz_data( - rows=100, - types=[utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT], + rows=_ROWS, + types=_TYPES, low=0, high=5, ) @@ -39,99 +42,75 @@ def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() - self._binary_input_df = self._session.create_dataframe(_BINARY_DATA, schema=_SCHEMA) - self._binary_pandas_df = self._binary_input_df.to_pandas() - self._multiclass_input_df = self._session.create_dataframe(_MULTICLASS_DATA, schema=_SCHEMA) - self._multiclass_pandas_df = self._multiclass_input_df.to_pandas() - def tearDown(self) -> None: self._session.close() @parameterized.parameters( # type: ignore[misc] - {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, - ) - def test_accuracy_score(self, params: Dict[str, Any]) -> None: - for i in range(len(params["y_true_col_names"])): - y_true_col_names = params["y_true_col_names"][i] - y_pred_col_names = params["y_pred_col_names"][i] - input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df - pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df - - score = snowml_metrics.accuracy_score( - df=input_df, y_true_col_names=y_true_col_names, y_pred_col_names=y_pred_col_names, normalize=False - ) - score_sklearn = sklearn_metrics.accuracy_score( - pandas_df[y_true_col_names], pandas_df[y_pred_col_names], normalize=False - ) - np.testing.assert_allclose(score, score_sklearn) - - @parameterized.parameters( # type: ignore[misc] - {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, + { + "params": { + "sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL], + "values": [ + {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": _Y_PRED_COLS}, + {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, + ], + } + }, ) def test_accuracy_score_sample_weight(self, params: Dict[str, Any]) -> None: - for i in range(len(params["y_true_col_names"])): - y_true_col_names = params["y_true_col_names"][i] - y_pred_col_names = params["y_pred_col_names"][i] - input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df - pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df - - score = snowml_metrics.accuracy_score( - df=input_df, - y_true_col_names=y_true_col_names, - y_pred_col_names=y_pred_col_names, - sample_weight_col_name=_SAMPLE_WEIGHT_COL, - normalize=False, - ) - score_sklearn = sklearn_metrics.accuracy_score( - pandas_df[y_true_col_names], - pandas_df[y_pred_col_names], - sample_weight=pandas_df[_SAMPLE_WEIGHT_COL].to_numpy(), - normalize=False, - ) - np.testing.assert_allclose(score, score_sklearn) + for values in params["values"]: + data = values["data"] + y_true = values["y_true"] + y_pred = values["y_pred"] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=y_true, + y_pred_col_names=y_pred, + sample_weight_col_name=sample_weight_col_name, + ) + 
sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[y_true], + pandas_df[y_pred], + sample_weight=sample_weight, + ) + np.testing.assert_allclose(actual_score, sklearn_score) @parameterized.parameters( # type: ignore[misc] - {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, + { + "params": { + "normalize": [True, False], + "values": [ + {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": _Y_PRED_COLS}, + {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, + ], + } + }, ) def test_accuracy_score_normalized(self, params: Dict[str, Any]) -> None: - for i in range(len(params["y_true_col_names"])): - y_true_col_names = params["y_true_col_names"][i] - y_pred_col_names = params["y_pred_col_names"][i] - input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df - pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df - - score = snowml_metrics.accuracy_score( - df=input_df, y_true_col_names=y_true_col_names, y_pred_col_names=y_pred_col_names, normalize=True - ) - score_sklearn = sklearn_metrics.accuracy_score( - pandas_df[y_true_col_names], pandas_df[y_pred_col_names], normalize=True - ) - np.testing.assert_allclose(score, score_sklearn) - - @parameterized.parameters( # type: ignore[misc] - {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, - ) - def test_accuracy_score_sample_weight_normalized(self, params: Dict[str, Any]) -> None: - for i in range(len(params["y_true_col_names"])): - y_true_col_names = params["y_true_col_names"][i] - y_pred_col_names = params["y_pred_col_names"][i] - input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df - pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df - - score = snowml_metrics.accuracy_score( - df=input_df, - y_true_col_names=y_true_col_names, - y_pred_col_names=y_pred_col_names, - sample_weight_col_name=_SAMPLE_WEIGHT_COL, - normalize=True, - ) - score_sklearn = sklearn_metrics.accuracy_score( - pandas_df[y_true_col_names], - pandas_df[y_pred_col_names], - sample_weight=pandas_df[_SAMPLE_WEIGHT_COL].to_numpy(), - normalize=True, - ) - np.testing.assert_allclose(score, score_sklearn) + for values in params["values"]: + data = values["data"] + y_true = values["y_true"] + y_pred = values["y_pred"] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for normalize in params["normalize"]: + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=y_true, + y_pred_col_names=y_pred, + normalize=normalize, + ) + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[y_true], + pandas_df[y_pred], + normalize=normalize, + ) + np.testing.assert_allclose(actual_score, sklearn_score) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py b/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py index 6272ea87..9a0365bd 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py @@ -16,7 +16,13 @@ _ROWS = 100 _TYPES = 
[utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=2, +) +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -39,11 +45,47 @@ def setUp(self) -> None: def tearDown(self) -> None: self._session.close() + @parameterized.parameters( # type: ignore[misc] + { + "params": { + "beta": [1.0, 0.5], + "values": [ + {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": _Y_PRED_COLS}, + {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, + ], + } + }, + ) + def test_precision_recall_fscore_support_beta(self, params: Dict[str, Any]) -> None: + for values in params["values"]: + data = values["data"] + y_true = values["y_true"] + y_pred = values["y_pred"] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for beta in params["beta"]: + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=y_true, + y_pred_col_names=y_pred, + beta=beta, + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[y_true], + pandas_df[y_pred], + beta=beta, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s)), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s)), + ) + @parameterized.parameters( # type: ignore[misc] {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_precision_recall_fscore_support_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) input_df = self._session.create_dataframe(pandas_df) for labels in params["labels"]: @@ -64,24 +106,23 @@ def test_precision_recall_fscore_support_labels(self, params: Dict[str, Any]) -> ) @parameterized.parameters( # type: ignore[misc] - {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + {"params": {"pos_label": [0, 2, 4]}}, ) - def test_precision_recall_fscore_support_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + def test_precision_recall_fscore_support_pos_label(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) input_df = self._session.create_dataframe(pandas_df) - for sample_weight_col_name in params["sample_weight_col_name"]: + for pos_label in params["pos_label"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( df=input_df, y_true_col_names=_Y_TRUE_COL, y_pred_col_names=_Y_PRED_COL, - sample_weight_col_name=sample_weight_col_name, + pos_label=pos_label, ) - sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( pandas_df[_Y_TRUE_COL], pandas_df[_Y_PRED_COL], - sample_weight=sample_weight, + pos_label=pos_label, ) np.testing.assert_allclose( np.array((actual_p, actual_r, actual_f, actual_s)), @@ -89,16 +130,50 @@ def test_precision_recall_fscore_support_sample_weight(self, params: Dict[str, A ) @parameterized.parameters( # type: ignore[misc] - {"params": {"average": [None, "binary", "micro", "macro", "samples", "weighted"]}}, + { + "params": { + "sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL], + "values": [ + {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": 
_Y_PRED_COLS}, + {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, + ], + } + }, + ) + def test_precision_recall_fscore_support_sample_weight(self, params: Dict[str, Any]) -> None: + for values in params["values"]: + data = values["data"] + y_true = values["y_true"] + y_pred = values["y_pred"] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=y_true, + y_pred_col_names=y_pred, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[y_true], + pandas_df[y_pred], + sample_weight=sample_weight, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s)), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s)), + ) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) - def test_precision_recall_fscore_support_average(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + def test_precision_recall_fscore_support_average_multiclass(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) input_df = self._session.create_dataframe(pandas_df) for average in params["average"]: - if average == "binary" or average == "samples": - continue - actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( df=input_df, y_true_col_names=_Y_TRUE_COL, @@ -115,48 +190,37 @@ def test_precision_recall_fscore_support_average(self, params: Dict[str, Any]) - np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), ) - data, _ = utils.gen_fuzz_data( - rows=_ROWS, - types=_TYPES, - low=0, - high=2, - ) - binary_pandas_df = pd.DataFrame(data, columns=_SCHEMA) - binary_input_df = self._session.create_dataframe(binary_pandas_df) - - # binary - actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( - df=binary_input_df, - y_true_col_names=_Y_TRUE_COL, - y_pred_col_names=_Y_PRED_COL, - average="binary", - ) - sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( - binary_pandas_df[_Y_TRUE_COL], - binary_pandas_df[_Y_PRED_COL], - average="binary", - ) - np.testing.assert_allclose( - np.array((actual_p, actual_r, actual_f, actual_s), dtype=np.float_), - np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), - ) + @parameterized.parameters( # type: ignore[misc] + { + "params": { + "average": ["binary", "samples"], + "y_true": [_Y_TRUE_COL, _Y_TRUE_COLS], + "y_pred": [_Y_PRED_COL, _Y_PRED_COLS], + } + }, + ) + def test_precision_recall_fscore_support_average_binary(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) - # samples - actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( - df=binary_input_df, - y_true_col_names=_Y_TRUE_COLS, - y_pred_col_names=_Y_PRED_COLS, - average="samples", - ) - sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( - binary_pandas_df[_Y_TRUE_COLS], - 
binary_pandas_df[_Y_PRED_COLS], - average="samples", - ) - np.testing.assert_allclose( - np.array((actual_p, actual_r, actual_f, actual_s), dtype=np.float_), - np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), - ) + for idx, average in enumerate(params["average"]): + y_true = params["y_true"][idx] + y_pred = params["y_pred"][idx] + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=y_true, + y_pred_col_names=y_pred, + average=average, + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[y_true], + pandas_df[y_pred], + average=average, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s), dtype=np.float_), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), + ) @parameterized.parameters( # type: ignore[misc] {"params": {"zero_division": ["warn", 0, 1]}}, diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py index e45cd16c..c3ae6636 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py @@ -16,12 +16,18 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=5, +) _Y_TRUE_COL = _SCHEMA[1] _Y_PRED_COL = _SCHEMA[2] _Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] @@ -43,13 +49,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_precision_score_labels(self, params: Dict[str, Any]) -> None: - data, _ = utils.gen_fuzz_data( - rows=_ROWS, - types=_TYPES, - low=0, - high=5, - ) - pandas_df = pd.DataFrame(data, columns=_SCHEMA) + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) input_df = self._session.create_dataframe(pandas_df) for labels in params["labels"]: @@ -69,87 +69,113 @@ def test_precision_score_labels(self, params: Dict[str, Any]) -> None: np.testing.assert_allclose(actual_p, sklearn_p) @parameterized.parameters( # type: ignore[misc] - {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + {"params": {"pos_label": [0, 2, 4]}}, ) - def test_precision_score_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + def test_precision_score_pos_label(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) input_df = self._session.create_dataframe(pandas_df) - for sample_weight_col_name in params["sample_weight_col_name"]: + for pos_label in params["pos_label"]: actual_p = snowml_metrics.precision_score( df=input_df, y_true_col_names=_Y_TRUE_COL, y_pred_col_names=_Y_PRED_COL, - sample_weight_col_name=sample_weight_col_name, + pos_label=pos_label, + average="micro", ) - sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None sklearn_p = sklearn_metrics.precision_score( pandas_df[_Y_TRUE_COL], pandas_df[_Y_PRED_COL], - sample_weight=sample_weight, + pos_label=pos_label, + average="micro", ) np.testing.assert_allclose(actual_p, sklearn_p) @parameterized.parameters( # type: ignore[misc] - {"params": {"average": [None, "binary", "micro", "macro", "samples", "weighted"]}}, + { + "params": 
{ + "sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL], + "values": [ + {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": _Y_PRED_COLS}, + {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, + ], + } + }, ) - def test_precision_score_average(self, params: Dict[str, Any]) -> None: - data, _ = utils.gen_fuzz_data( - rows=_ROWS, - types=_TYPES, - low=0, - high=5, - ) - multiclass_pandas_df = pd.DataFrame(data, columns=_SCHEMA) - multiclass_input_df = self._session.create_dataframe(multiclass_pandas_df) + def test_precision_score_sample_weight(self, params: Dict[str, Any]) -> None: + for values in params["values"]: + data = values["data"] + y_true = values["y_true"] + y_pred = values["y_pred"] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=y_true, + y_pred_col_names=y_pred, + sample_weight_col_name=sample_weight_col_name, + average="micro", + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_p = sklearn_metrics.precision_score( + pandas_df[y_true], + pandas_df[y_pred], + sample_weight=sample_weight, + average="micro", + ) + np.testing.assert_allclose(actual_p, sklearn_p) - for average in params["average"]: - if average == "binary" or average == "samples": - continue + @parameterized.parameters( # type: ignore[misc] + {"params": {"average": [None, "micro", "macro", "weighted"]}}, + ) + def test_precision_score_average_multiclass(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + for average in params["average"]: actual_p = snowml_metrics.precision_score( - df=multiclass_input_df, + df=input_df, y_true_col_names=_Y_TRUE_COL, y_pred_col_names=_Y_PRED_COL, average=average, ) sklearn_p = sklearn_metrics.precision_score( - multiclass_pandas_df[_Y_TRUE_COL], - multiclass_pandas_df[_Y_PRED_COL], + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], average=average, ) np.testing.assert_allclose(actual_p, sklearn_p) - pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + @parameterized.parameters( # type: ignore[misc] + { + "params": { + "average": ["binary", "samples"], + "y_true": [_Y_TRUE_COL, _Y_TRUE_COLS], + "y_pred": [_Y_PRED_COL, _Y_PRED_COLS], + } + }, + ) + def test_precision_score_average_binary(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) input_df = self._session.create_dataframe(pandas_df) - # binary - actual_p = snowml_metrics.precision_score( - df=input_df, - y_true_col_names=_Y_TRUE_COL, - y_pred_col_names=_Y_PRED_COL, - average="binary", - ) - sklearn_p = sklearn_metrics.precision_score( - pandas_df[_Y_TRUE_COL], - pandas_df[_Y_PRED_COL], - average="binary", - ) - np.testing.assert_allclose(actual_p, sklearn_p) - - # samples - actual_p = snowml_metrics.precision_score( - df=input_df, - y_true_col_names=_Y_TRUE_COLS, - y_pred_col_names=_Y_PRED_COLS, - average="samples", - ) - sklearn_p = sklearn_metrics.precision_score( - pandas_df[_Y_TRUE_COLS], - pandas_df[_Y_PRED_COLS], - average="samples", - ) - np.testing.assert_allclose(actual_p, sklearn_p) + for idx, average in enumerate(params["average"]): + y_true = params["y_true"][idx] + y_pred = params["y_pred"][idx] + actual_p = snowml_metrics.precision_score( + df=input_df, + 
y_true_col_names=y_true, + y_pred_col_names=y_pred, + average=average, + ) + sklearn_p = sklearn_metrics.precision_score( + pandas_df[y_true], + pandas_df[y_pred], + average=average, + ) + np.testing.assert_allclose(actual_p, sklearn_p) @parameterized.parameters( # type: ignore[misc] {"params": {"zero_division": ["warn", 0, 1]}}, diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py b/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py new file mode 100644 index 00000000..7ca91162 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py @@ -0,0 +1,121 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# +from typing import Any, Dict + +import numpy as np +import pandas as pd +from absl.testing import parameterized +from absl.testing.absltest import main +from sklearn import metrics as sklearn_metrics + +from snowflake import snowpark +from snowflake.ml.modeling import metrics as snowml_metrics +from snowflake.ml.utils import connection_params +from tests.integ.snowflake.ml.modeling.framework import utils + +_ROWS = 100 +_TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 2 +_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=[2, 1, 1], +) +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=[5, 1, 1], +) +_Y_TRUE_COL = _SCHEMA[1] +_Y_SCORE_COL = _SCHEMA[2] +_SAMPLE_WEIGHT_COL = _SCHEMA[3] + + +class RocCurveTest(parameterized.TestCase): + """Test ROC.""" + + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + def tearDown(self) -> None: + self._session.close() + + @parameterized.parameters( # type: ignore[misc] + {"params": {"pos_label": [0, 2, 4]}}, + ) + def test_roc_curve_pos_label(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for pos_label in params["pos_label"]: + actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_score_col_name=_Y_SCORE_COL, + pos_label=pos_label, + ) + sklearn_fpr, sklearn_tpr, sklearn_thresholds = sklearn_metrics.roc_curve( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_SCORE_COL], + pos_label=pos_label, + ) + np.testing.assert_allclose( + np.array((actual_fpr, actual_tpr, actual_thresholds)), + np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), + ) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + ) + def test_roc_curve_sample_weight(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_score_col_name=_Y_SCORE_COL, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_fpr, sklearn_tpr, sklearn_thresholds = sklearn_metrics.roc_curve( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_SCORE_COL], + sample_weight=sample_weight, + ) + np.testing.assert_allclose( + np.array((actual_fpr, actual_tpr, actual_thresholds)), + 
np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), + ) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"drop_intermediate": [True, False]}}, + ) + def test_roc_curve_drop_intermediate(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for drop_intermediate in params["drop_intermediate"]: + actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_score_col_name=_Y_SCORE_COL, + drop_intermediate=drop_intermediate, + ) + sklearn_fpr, sklearn_tpr, sklearn_thresholds = sklearn_metrics.roc_curve( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_SCORE_COL], + drop_intermediate=drop_intermediate, + ) + np.testing.assert_allclose( + np.array((actual_fpr, actual_tpr, actual_thresholds)), + np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), + ) + + +if __name__ == "__main__": + main()
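For reference, a minimal end-to-end usage sketch of the new metric. The keyword-only signature comes from roc_curve.py above and the connection setup mirrors the tests; the data and column names are made up:

import numpy as np

from snowflake import snowpark
from snowflake.ml.modeling import metrics as snowml_metrics
from snowflake.ml.utils import connection_params

session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create()
# Toy binary-classification data: (label, score) pairs.
df = session.create_dataframe(
    [[0, 0.1], [0, 0.4], [1, 0.35], [1, 0.8]], schema=["LABEL", "SCORE"]
)
# Executes server-side in a temporary stored procedure and unpickles the result.
fpr, tpr, thresholds = snowml_metrics.roc_curve(
    df=df, y_true_col_name="LABEL", y_score_col_name="SCORE"
)
print(np.column_stack((fpr, tpr, thresholds)))
session.close()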