Project import generated by Copybara. (#118)
GitOrigin-RevId: ac6dd60ea2f93da707c56f842e5afd9935987137

Co-authored-by: Snowflake Authors <[email protected]>
sfc-gh-anavalos and Snowflake Authors authored Oct 7, 2024
1 parent f50d041 commit 6186ce6
Showing 259 changed files with 8,301 additions and 22,148 deletions.
7 changes: 3 additions & 4 deletions .pre-commit-config.yaml
@@ -1,6 +1,5 @@
---
exclude: ^(.*egg.info.*|.*/parameters.py$|.*\.py_template|.*/experimental/.*|.*/fixtures/.*|docs/source/_themes/.*)
minimum_pre_commit_version: 3.4.0
exclude: ^(.*egg.info.*|.*/parameters.py$|.*\.py_template|.*/experimental/.*|.*/fixtures/.*|docs/source/_themes/.*|.*\.patch)
repos:
- repo: https://github.com/asottile/pyupgrade
rev: v2.31.1
@@ -65,7 +64,7 @@ repos:
- id: markdownlint-fix
language_version: 16.20.2
- repo: https://github.com/keith/pre-commit-buildifier
rev: 6.0.0
rev: 7.3.1
hooks:
- id: buildifier
args:
@@ -84,7 +83,7 @@ repos:
exclude_types:
- image
- repo: https://github.com/lyz-code/yamlfix
rev: 1.13.0
rev: 1.16.1
hooks:
- id: yamlfix
args:
25 changes: 20 additions & 5 deletions CHANGELOG.md
@@ -1,6 +1,25 @@
# Release History

## 1.6.2 (TBD)
## 1.6.3

- Model Registry (PrPr) has been removed.

### Bug Fixes

- Registry: Fix a bug where an unexpected normalization occurred when a package whose name does not follow
PEP-508 was provided while logging the model.
- Registry: Fix `not a valid remote uri` error when logging mlflow models.
- Registry: Fix a bug that occurred when `ModelVersion.run` was called in a nested way.
- Registry: Fix an issue that caused `log_model` to fail when a local package version contains parts other than
the base version.

### New Features

- Data: Improve `DataConnector.to_pandas()` performance when loading from Snowpark DataFrames.
- Model Registry: Allow users to set a model task while using `log_model`.
- Feature Store: FeatureView supports ON_CREATE or ON_SCHEDULE initialize mode.
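A minimal sketch of the model-task option named above, assuming an existing Snowpark `session`, a `task` keyword argument on `Registry.log_model`, and a `Task` enum under `snowflake.ml.model.type_hints` (the import path and enum value are assumptions, not confirmed by this changelog):

```python
# Hedged sketch: the `task` argument and Task enum location are assumed.
from sklearn.linear_model import LinearRegression

from snowflake.ml.model import type_hints  # Task enum location assumed
from snowflake.ml.registry import Registry

model = LinearRegression().fit([[0.0], [1.0], [2.0]], [0.0, 1.0, 2.0])

reg = Registry(session=session)  # `session` is an existing Snowpark session
model_version = reg.log_model(
    model,
    model_name="MY_MODEL",
    version_name="V1",
    task=type_hints.Task.TABULAR_REGRESSION,  # the task set explicitly at log time
)
```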

## 1.6.2 (2024-09-04)

### Bug Fixes

@@ -18,8 +37,6 @@
- Data: Add native batching support via `batch_size` and `drop_last_batch` arguments to `DataConnector.to_torch_dataset()`
- Feature Store: `update_feature_view()` supports taking a feature view object as an argument.
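A short sketch of the batched loading path described above, assuming an existing Snowpark DataFrame `df` and the `DataConnector.from_dataframe` constructor (the constructor name is an assumption; `batch_size` and `drop_last_batch` are the arguments named in this entry):

```python
# Hedged sketch: `from_dataframe` is assumed; the keyword arguments are
# the ones named in the changelog entry above.
from snowflake.ml.data import DataConnector

dc = DataConnector.from_dataframe(df)  # `df` is an existing Snowpark DataFrame

# Native batching: each yielded item is already a batch of 64 rows.
torch_ds = dc.to_torch_dataset(batch_size=64, drop_last_batch=True)

# Pandas path whose load performance 1.6.3 improves for Snowpark DataFrames.
pdf = dc.to_pandas()
```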

### Behavior Changes

## 1.6.1 (2024-08-12)

### Bug Fixes
@@ -42,8 +59,6 @@
- Registry: Option to `enable_explainability` set to True by default for XGBoost, LightGBM, and CatBoost as a PuPr feature.
- Registry: Option to `enable_explainability` when registering SHAP-supported sklearn models.

### Behavior Changes

## 1.6.0 (2024-07-29)

### Bug Fixes
4 changes: 1 addition & 3 deletions bazel/requirements/templates/meta.tpl.yaml
@@ -11,11 +11,9 @@ build:
requirements:
build:
- python
- bazel >=6.0.0
- bazel==6.3.0
run:
- python>=3.8,<3.12
run_constrained:
- openjpeg !=2.4.0=*_1 # [win]

about:
home: https://github.com/snowflakedb/snowflake-ml-python
32 changes: 27 additions & 5 deletions ci/build_and_run_tests.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# Usage
# build_and_run_tests.sh <workspace> [-b <bazel path>] [--env pip|conda] [--mode merge_gate|continuous_run] [--with-snowpark] [--report <report_path>]
# build_and_run_tests.sh <workspace> [-b <bazel path>] [--env pip|conda] [--mode merge_gate|continuous_run] [--with-snowpark] [--with-spcs-image] [--report <report_path>]
#
# Args
# workspace: path to the workspace, SnowML code should be in snowml directory.
@@ -14,6 +14,7 @@
# continuous_run (default): run all tests. (For nightly run. Alias: release)
# quarantined: run all quarantined tests.
# with-snowpark: Build and test with snowpark in snowpark-python directory in the workspace.
# with-spcs-image: Build and test with spcs-image in spcs-image directory in the workspace.
# snowflake-env: The Snowflake environment, used to determine the test quarantine list
# report: Path to xml test report
#
@@ -29,14 +30,15 @@ PROG=$0

help() {
local exit_code=$1
echo "Usage: ${PROG} <workspace> [-b <bazel path>] [--env pip|conda] [--mode merge_gate|continuous_run|quarantined] [--with-snowpark] [--snowflake-env <sf_env>] [--report <report_path>]"
echo "Usage: ${PROG} <workspace> [-b <bazel path>] [--env pip|conda] [--mode merge_gate|continuous_run|quarantined] [--with-snowpark] [--with-spcs-image] [--snowflake-env <sf_env>] [--report <report_path>]"
exit "${exit_code}"
}

WORKSPACE=$1 && shift || help 1
BAZEL="bazel"
ENV="pip"
WITH_SNOWPARK=false
WITH_SPCS_IMAGE=false
MODE="continuous_run"
PYTHON_VERSION=3.8
PYTHON_ENABLE_SCRIPT="bin/activate"
@@ -86,6 +88,9 @@ while (($#)); do
shift
PYTHON_VERSION=$1
;;
--with-spcs-image)
WITH_SPCS_IMAGE=true
;;
-h | --help)
help 0
;;
@@ -260,11 +265,18 @@ else
# Build SnowML
pushd ${SNOWML_DIR}
# Build conda package
conda build --prefix-length 50 --python=${PYTHON_VERSION} --croot "${WORKSPACE}/conda-bld" ci/conda_recipe
conda build -c conda-forge --override-channels --prefix-length 50 --python=${PYTHON_VERSION} --croot "${WORKSPACE}/conda-bld" ci/conda_recipe
conda build purge
popd
fi

if [[ "${WITH_SPCS_IMAGE}" = true ]]; then
pushd ${SNOWML_DIR}
# Build SPCS Image
source model_container_services_deployment/ci/build_and_push_images.sh
popd
fi

# Start testing
pushd "${TEMP_TEST_DIR}"

@@ -281,6 +293,11 @@ if [[ -n "${JUNIT_REPORT_PATH}" ]]; then
fi

if [ "${ENV}" = "pip" ]; then
if [ "${WITH_SPCS_IMAGE}" = true ]; then
COMMON_PYTEST_FLAG+=(-m "spcs_deployment_image and not pip_incompatible")
else
COMMON_PYTEST_FLAG+=(-m "not pip_incompatible")
fi
# Copy wheel package
cp "${WORKSPACE}/snowflake_ml_python-${VERSION}-py3-none-any.whl" "${TEMP_TEST_DIR}"

@@ -302,10 +319,15 @@ if [ "${ENV}" = "pip" ]; then

# Run the tests
set +e
TEST_SRCDIR="${TEMP_TEST_DIR}" python -m pytest "${COMMON_PYTEST_FLAG[@]}" -m "not pip_incompatible" tests/integ/
TEST_SRCDIR="${TEMP_TEST_DIR}" python -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/integ/
TEST_RETCODE=$?
set -e
else
if [ "${WITH_SPCS_IMAGE}" = true ]; then
COMMON_PYTEST_FLAG+=(-m "spcs_deployment_image and not conda_incompatible")
else
COMMON_PYTEST_FLAG+=(-m "not conda_incompatible")
fi
# Create local conda channel
conda index "${WORKSPACE}/conda-bld"

@@ -319,7 +341,7 @@ else

# Run integration tests
set +e
TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python -m pytest "${COMMON_PYTEST_FLAG[@]}" -m "not conda_incompatible" tests/integ/
TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/integ/
TEST_RETCODE=$?
set -e

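The pytest invocations above select tests with marker expressions such as `spcs_deployment_image and not pip_incompatible`. A hedged sketch of how tests would carry those markers, using standard `pytest.mark` custom markers; the marker names come from the script, but how the repository registers them is not shown here:

```python
# Hedged sketch: standard pytest custom markers with the names used in the
# script's -m expressions; marker registration in pytest config is assumed.
import pytest


@pytest.mark.spcs_deployment_image
def test_spcs_image_smoke() -> None:
    # Selected by: -m "spcs_deployment_image and not pip_incompatible"
    assert True


@pytest.mark.pip_incompatible
def test_conda_only_behavior() -> None:
    # Deselected whenever the expression includes "not pip_incompatible".
    assert True
```

With these markers, `python -m pytest -m "spcs_deployment_image and not pip_incompatible" tests/integ/` selects the first test and deselects the second, matching the branching the script adds for `--with-spcs-image`.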
2 changes: 1 addition & 1 deletion ci/conda_recipe/README.md
@@ -6,7 +6,7 @@ Conda's guide on building a conda package from a wheel:
To invoke conda build:

```sh
conda build --prefix-length=0 --python=[3.8|3.9|3.10|3.11] ci/conda_recipe
conda build -c conda-forge --override-channels --prefix-length=0 --python=[3.8|3.9|3.10|3.11] ci/conda_recipe
```

- `--prefix-length=0`: prevent the conda build environment from being created in
9 changes: 4 additions & 5 deletions ci/conda_recipe/meta.yaml
@@ -17,11 +17,11 @@ build:
noarch: python
package:
name: snowflake-ml-python
version: 1.6.2
version: 1.6.3
requirements:
build:
- python
- bazel >=6.0.0
- bazel==6.3.0
run:
- absl-py>=0.15,<2
- aiohttp!=4.0.0a0, !=4.0.0a1
@@ -39,7 +39,7 @@ requirements:
- requests
- retrying>=1.3.3,<2
- s3fs>=2022.11,<2024
- scikit-learn>=1.2.1,<1.4
- scikit-learn>=1.2.1,<1.6
- scipy>=1.9,<2
- snowflake-connector-python>=3.5.0,<4
- snowflake-snowpark-python>=1.17.0,<2
@@ -54,11 +54,10 @@ requirements:
- pytorch>=2.0.1,<2.3.0
- sentence-transformers>=2.2.2,<3
- sentencepiece>=0.1.95,<1
- shap==0.42.1
- shap>=0.42.0,<1
- tensorflow>=2.10,<3
- tokenizers>=0.10,<1
- torchdata>=0.4,<1
- transformers>=4.32.1,<5
- openjpeg !=2.4.0=*_1 # [win]
source:
path: ../../
2 changes: 0 additions & 2 deletions ci/targets/local_only.txt
@@ -1,2 +0,0 @@
//snowflake/ml/model/_deploy_client/image_builds/inference_server:gpu_test
//snowflake/ml/model/_deploy_client/image_builds/inference_server:main_vllm_test
10 changes: 6 additions & 4 deletions ci/targets/quarantine/prod3.txt
@@ -1,5 +1,7 @@
//tests/integ/snowflake/ml/model:deployment_to_snowservice_integ_test
//tests/integ/snowflake/ml/registry:model_registry_snowservice_integ_test
//tests/integ/snowflake/ml/model:spcs_llm_model_integ_test
//snowflake/ml/model/_packager/model_handlers_test:mlflow_test
//tests/integ/snowflake/ml/extra_tests:xgboost_external_memory_training_test
//tests/integ/snowflake/ml/registry:model_registry_snowservice_merge_gate_integ_test
//tests/integ/snowflake/ml/modeling/ensemble:isolation_forest_test
//tests/integ/snowflake/ml/modeling/linear_model:sgd_one_class_svm_test
//tests/integ/snowflake/ml/modeling/preprocessing:k_bins_discretizer_test
//tests/integ/snowflake/ml/registry/model:registry_mlflow_model_test
//tests/integ/snowflake/ml/registry/services/...
3 changes: 0 additions & 3 deletions ci/targets/slow.txt
@@ -1,3 +0,0 @@
//tests/integ/snowflake/ml/model:deployment_to_snowservice_integ_test
//tests/integ/snowflake/ml/registry:model_registry_snowservice_integ_test
//tests/integ/snowflake/ml/model:spcs_llm_model_integ_test
31 changes: 30 additions & 1 deletion codegen/sklearn_wrapper_generator.py
@@ -1058,12 +1058,41 @@ def generate(self) -> "SklearnWrapperGenerator":
]
self.test_estimator_input_args_list.append(f"dictionary={dictionary}")

        if WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "Isomap"):
            # Using higher n_neighbors for Isomap to balance accuracy and performance.
            self.test_estimator_input_args_list.append("n_neighbors=30")

        if WrapperGeneratorFactory._is_class_of_type(
            self.class_object[1], "KNeighborsClassifier"
        ) or WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "RadiusNeighborsClassifier"):
            # Use distance-based weighting to reduce ties and improve prediction accuracy.
            self.test_estimator_input_args_list.append("weights='distance'")

        if WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "Nystroem"):
            # Setting specific parameters for Nystroem to ensure a meaningful transformation.
            # - `gamma`: Controls the shape of the RBF kernel. By setting gamma to a lower value
            # like 0.1, you can help generate larger transformation values in the output, making the
            # transformation less sensitive to small variations in the input data. This value also
            # balances between underfitting and overfitting for most datasets.
            # - `n_components`: Specifies a larger number of components for the approximation,
            # which enhances the accuracy of the kernel approximation. This is especially useful
            # in higher-dimensional data or when a more precise transformation is needed.
            self.test_estimator_input_args_list.append("gamma=0.1")
            self.test_estimator_input_args_list.append("n_components=200")

        if WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "SelectKBest"):
            # Set the k of SelectKBest features transformer to half the number of columns in the dataset.
            self.test_estimator_input_args_list.append("k=int(len(cols)/2)")

        if "n_components" in self.original_init_signature.parameters.keys():
            if WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "SpectralBiclustering"):
            if self.original_class_name == "KernelPCA":
                # Explicitly set 'n_components' to the number of input columns (len(cols))
                # to ensure consistency between implementations. This is necessary because
                # the default behavior might differ, with 'n_components' otherwise defaulting
                # to the minimum of the number of features or samples, potentially leading to
                # discrepancies between the implementations.
                self.test_estimator_input_args_list.append("n_components=int(len(cols)/2)")
            elif WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "SpectralBiclustering"):
                # For spectral bi clustering, set number of singular vectors to consider to number of input cols and
                # num best vector to select to half the number of input cols.
                self.test_estimator_input_args_list.append("n_components=len(cols)")
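The constructor arguments appended above translate directly into scikit-learn calls. A small sketch of the resulting estimators on a toy dataset (the dataset and the `cols` list are illustrative stand-ins; the autogenerated tests build their own inputs):

```python
# Illustrative sketch: mirrors the arguments the generator appends, on toy data.
from sklearn.datasets import make_classification
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectKBest
from sklearn.kernel_approximation import Nystroem
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
cols = [f"c{i}" for i in range(X.shape[1])]  # stand-in for the generated column list

Isomap(n_neighbors=30).fit_transform(X)                      # higher n_neighbors for stability
KNeighborsClassifier(weights="distance").fit(X, y)           # distance weighting reduces ties
Nystroem(gamma=0.1, n_components=200).fit_transform(X)       # smoother, more accurate kernel map
SelectKBest(k=int(len(cols) / 2)).fit_transform(X, y)        # keep half of the columns
KernelPCA(n_components=int(len(cols) / 2)).fit_transform(X)  # pin n_components explicitly
```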
1 change: 1 addition & 0 deletions codegen/sklearn_wrapper_template.py_template
@@ -389,6 +389,7 @@ class {transform.original_class_name}(BaseTransformer):
"""
self._infer_input_output_cols(dataset)
super()._check_dataset_type(dataset)

model_trainer = ModelTrainerBuilder.build_fit_transform(
estimator=self._sklearn_object,
dataset=dataset,
2 changes: 1 addition & 1 deletion codegen/transformer_autogen_test_template.py_template
@@ -182,7 +182,7 @@ class {transform.test_class_name}(TestCase):
# TODO(snandamuri): HistGradientBoostingRegressor is returning different results in different envs.
# Needs further debugging.
if {transform._is_hist_gradient_boosting_regressor}:
num_diffs = (~np.isclose(actual_arr, sklearn_numpy_arr)).sum()
num_diffs = (~np.isclose(actual_arr, sklearn_numpy_arr, rtol=1.e-2, atol=1.e-2)).sum()
num_example = sklearn_numpy_arr.shape[0]
assert num_diffs < 0.1 * num_example
elif (not {transform._is_deterministic}) or (not {transform._is_deterministic_cross_platform} and platform.system() == 'Windows'):
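The loosened comparison above only widens the `np.isclose` tolerances and keeps the existing 10% mismatch budget. A standalone sketch of that check on synthetic arrays:

```python
# Standalone sketch of the tolerance check; synthetic arrays stand in for
# the estimator outputs compared in the generated test.
import numpy as np

rng = np.random.default_rng(0)
sklearn_numpy_arr = rng.normal(size=100)
actual_arr = sklearn_numpy_arr + rng.normal(scale=1e-3, size=100)  # small numeric drift

num_diffs = (~np.isclose(actual_arr, sklearn_numpy_arr, rtol=1.0e-2, atol=1.0e-2)).sum()
num_example = sklearn_numpy_arr.shape[0]
assert num_diffs < 0.1 * num_example  # tolerate mismatches on fewer than 10% of rows
```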