From 6f23e5978b082e5a39570549c7cd636f896576c6 Mon Sep 17 00:00:00 2001
From: Snowflake Provisioner <58576687+snowflake-provisioner@users.noreply.github.com>
Date: Tue, 5 Sep 2023 13:05:47 -0700
Subject: [PATCH] Project import generated by Copybara. (#42)

---
 CHANGELOG.md                                   | 14 +++-
 bazel/environments/conda-env-snowflake.yml     |  4 +-
 bazel/environments/conda-env.yml               |  4 +-
 ci/conda_recipe/meta.yaml                      |  7 +-
 codegen/sklearn_wrapper_template.py_template   |  4 +-
 requirements.yml                               |  9 ++-
 snowflake/ml/fileset/stage_fs_test.py          |  6 +-
 .../warehouse/infer_template.py                |  2 +-
 .../_internal/_grid_search_cv.py               |  5 +-
 .../_internal/_randomized_search_cv.py         |  5 +-
 snowflake/ml/requirements.bzl                  |  2 +-
 snowflake/ml/version.bzl                       |  2 +-
 .../warehouse_snowml_model_integ_test.py       |  6 +-
 .../ml/modeling/pipeline/test_pipeline.py      | 76 ++++++++++---------
 .../modeling/preprocessing/BUILD_NATIVE.bzl    |  2 +
 ...istry_integ_test_snowservice_merge_gate.py  | 58 +++++++-------
 16 files changed, 109 insertions(+), 97 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 945d6e4f..bf3f3be8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,18 @@
 # Release History
 
-## 1.0.6
+## 1.0.7
+
+### Behavior Changes
+
+
+### New Features
+
+
+### Bug Fixes
+
+- Model Development & Model Registry: Fix an error related to `pandas.io.json.json_normalize`.
+
+## 1.0.6 (2023-09-01)
 
 ### New Features
 - Model Registry: add `create_if_not_exists` parameter in constructor.
diff --git a/bazel/environments/conda-env-snowflake.yml b/bazel/environments/conda-env-snowflake.yml
index 277138dc..e49efc81 100644
--- a/bazel/environments/conda-env-snowflake.yml
+++ b/bazel/environments/conda-env-snowflake.yml
@@ -16,7 +16,7 @@ dependencies:
 - cryptography==39.0.1
 - flask-cors==3.0.10
 - flask==2.1.3
-- fsspec==2022.11.0
+- fsspec==2023.3.0
 - httpx==0.23.0
 - inflection==0.5.1
 - joblib==1.1.1
@@ -37,7 +37,7 @@ dependencies:
 - pyyaml==6.0
 - requests==2.29.0
 - ruamel.yaml==0.17.21
-- s3fs==2022.11.0
+- s3fs==2023.3.0
 - scikit-learn==1.3.0
 - scipy==1.9.3
 - snowflake-connector-python==3.0.3
diff --git a/bazel/environments/conda-env.yml b/bazel/environments/conda-env.yml
index c281fd27..1a159ef8 100644
--- a/bazel/environments/conda-env.yml
+++ b/bazel/environments/conda-env.yml
@@ -19,7 +19,7 @@ dependencies:
 - cryptography==39.0.1
 - flask-cors==3.0.10
 - flask==2.1.3
-- fsspec==2022.11.0
+- fsspec==2023.3.0
 - httpx==0.23.0
 - inflection==0.5.1
 - joblib==1.1.1
@@ -41,7 +41,7 @@ dependencies:
 - pyyaml==6.0
 - requests==2.29.0
 - ruamel.yaml==0.17.21
-- s3fs==2022.11.0
+- s3fs==2023.3.0
 - scikit-learn==1.3.0
 - scipy==1.9.3
 - snowflake-connector-python==3.0.3
diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml
index 6b8933c0..9951f294 100644
--- a/ci/conda_recipe/meta.yaml
+++ b/ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
   noarch: python
 package:
   name: snowflake-ml-python
-  version: 1.0.6
+  version: 1.0.7
 requirements:
   build:
     - python
@@ -27,13 +27,14 @@ requirements:
     - aiohttp!=4.0.0a0, !=4.0.0a1
     - anyio>=3.5.0,<4
     - cloudpickle
-    - fsspec>=2022.11,<=2023.1
+    - fsspec>=2022.11,<2024
    - numpy>=1.23,<2
     - packaging>=20.9,<24
     - pandas>=1.0.0,<2
-    - python
+    - python>=3.8.13, <3.11
     - pyyaml>=6.0,<7
     - requests
+    - s3fs>=2022.11,<2024
     - scikit-learn>=1.2.1,<1.4
     - scipy>=1.9,<2
     - snowflake-connector-python>=3.0.3,<4
diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template
index 8771d318..9b6b1237 100644
--- a/codegen/sklearn_wrapper_template.py_template
+++ b/codegen/sklearn_wrapper_template.py_template
@@ -476,9 +476,9 @@ class {transform.original_class_name}(BaseTransformer):
             import pandas as pd
             import numpy as np
 
-            input_df = pd.io.json.json_normalize(ds)
+            input_df = pd.json_normalize(ds)
 
-            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
+            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpark_df.to_pandas().
             # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
             # or quoted input column names saved in internal state if trained using pandas_df.
             # Model expects exact same columns names in the input df for predict call.
diff --git a/requirements.yml b/requirements.yml
index d3267a52..460596b1 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -101,8 +101,8 @@
   dev_version: "2.1.3"
 - name_pypi: fsspec[http]
   name_conda: fsspec
-  dev_version: "2022.11.0"
-  version_requirements: ">=2022.11,<=2023.1"
+  dev_version: "2023.3.0"
+  version_requirements: ">=2022.11,<2024"
 - name: httpx
   dev_version: "0.23.0"
 - name: inflection
@@ -158,7 +158,7 @@
   dev_version: "7.1.2"
 - name_conda: python
   dev_version_conda: "3.8.13"
-  version_requirements_conda: ""
+  version_requirements_conda: ">=3.8.13, <3.11"
 - name_pypi: torch
   name_conda: pytorch
   dev_version: "2.0.1"
@@ -175,7 +175,8 @@
 - name: ruamel.yaml
   dev_version: "0.17.21"
 - name: s3fs
-  dev_version: "2022.11.0"
+  dev_version: "2023.3.0"
+  version_requirements: ">=2022.11,<2024"
 - name: scikit-learn
   dev_version: "1.3.0"
   version_requirements: ">=1.2.1,<1.4"
diff --git a/snowflake/ml/fileset/stage_fs_test.py b/snowflake/ml/fileset/stage_fs_test.py
index 84da2ed3..03ce9d9d 100644
--- a/snowflake/ml/fileset/stage_fs_test.py
+++ b/snowflake/ml/fileset/stage_fs_test.py
@@ -2,11 +2,7 @@
 from typing import Dict, List
 
 import boto3
-
-# library `requests` has known stubs but is not installed.
-# TODO(zpeng): we may need to install as many mypy stubs as possible. However that
-# would require installing mypy when initializing the bazel conda environment.
-import requests  # type: ignore
+import requests
 import stage_fs
 from absl.testing import absltest
 from moto import server
diff --git a/snowflake/ml/model/_deploy_client/warehouse/infer_template.py b/snowflake/ml/model/_deploy_client/warehouse/infer_template.py
index 5486d4e5..ed4134a4 100644
--- a/snowflake/ml/model/_deploy_client/warehouse/infer_template.py
+++ b/snowflake/ml/model/_deploy_client/warehouse/infer_template.py
@@ -52,7 +52,7 @@ def __exit__(self, type, value, traceback):
 # TODO(halu): Avoid per batch async detection branching.
 @vectorized(input=pd.DataFrame, max_batch_size=10)
 def infer(df):
-    input_df = pd.io.json.json_normalize(df[0]).astype(dtype=dtype_map)
+    input_df = pd.json_normalize(df[0]).astype(dtype=dtype_map)
     if inspect.iscoroutinefunction(model.{target_method}):
         predictions_df = anyio.run(model.{target_method}, input_df[input_cols])
     else:
diff --git a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py
index 3dc4fddf..aaee48aa 100644
--- a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py
+++ b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py
@@ -493,7 +493,6 @@ def _fit_snowpark(self, dataset: DataFrame) -> None:
         ]
         target_locations = []
         for param_chunk in param_chunks:
-
             param_chunk_dist: Any = defaultdict(set)
             for d in param_chunk:
                 for k, v in d.items():
@@ -675,9 +674,9 @@ def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: ignore
             import numpy as np
             import pandas as pd
 
-            input_df = pd.io.json.json_normalize(ds)
+            input_df = pd.json_normalize(ds)
 
-            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
+            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpark_df.to_pandas().
             # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
             # or quoted input column names saved in internal state if trained using pandas_df.
             # Model expects exact same columns names in the input df for predict call.
diff --git a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
index 58abc02f..17ff9b3c 100644
--- a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
+++ b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
@@ -503,7 +503,6 @@ def _fit_snowpark(self, dataset: DataFrame) -> None:
         ]
         target_locations = []
         for param_chunk in param_chunks:
-
             param_chunk_dist: Any = defaultdict(set)
             for d in param_chunk:
                 for k, v in d.items():
@@ -684,9 +683,9 @@ def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: ignore
             import numpy as np
             import pandas as pd
 
-            input_df = pd.io.json.json_normalize(ds)
+            input_df = pd.json_normalize(ds)
 
-            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
+            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpark_df.to_pandas().
             # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
             # or quoted input column names saved in internal state if trained using pandas_df.
             # Model expects exact same columns names in the input df for predict call.
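[Editor's note, not part of the patch] The recurring hunk above is the whole bug fix named in the CHANGELOG: `pandas.io.json.json_normalize` was deprecated in pandas 1.0 in favor of the top-level `pandas.json_normalize` and was removed in later pandas releases, so the old path raises an error on newer pandas. A minimal sketch of the replacement call follows; the row data and column names are invented for illustration and are not taken from this patch:

    # Minimal sketch, assuming only pandas >= 1.0: the top-level API the
    # hunks above switch to. Example rows are illustrative, not from the patch.
    import pandas as pd

    # A batch of rows as dicts, similar in shape to what a vectorized UDF receives.
    rows = [
        {"SEPAL_LENGTH": 5.1, "SEPAL_WIDTH": 3.5},
        {"SEPAL_LENGTH": 4.9, "SEPAL_WIDTH": 3.0},
    ]

    input_df = pd.json_normalize(rows)  # replaces pd.io.json.json_normalize(rows)
    print(input_df.columns.tolist())    # ['SEPAL_LENGTH', 'SEPAL_WIDTH']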
diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl
index 86f96cc7..e8194f46 100755
--- a/snowflake/ml/requirements.bzl
+++ b/snowflake/ml/requirements.bzl
@@ -3,4 +3,4 @@
 
 EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'mlflow': ['mlflow>=2.1.0,<2.4'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'transformers': ['transformers>=4.29.2,<5'], 'all': ['lightgbm==3.3.5', 'mlflow>=2.1.0,<2.4', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1', 'transformers>=4.29.2,<5']}
 
-REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<1.4', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2']
+REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<2024', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 's3fs>=2022.11,<2024', 'scikit-learn>=1.2.1,<1.4', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2']
diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl
index a3b92e20..5852eaa4 100644
--- a/snowflake/ml/version.bzl
+++ b/snowflake/ml/version.bzl
@@ -1,2 +1,2 @@
 # This is parsed by regex in conda reciper meta file. Make sure not to break it.
-VERSION = "1.0.6"
+VERSION = "1.0.7"
diff --git a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py
index e59e2d5e..3c622167 100644
--- a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py
+++ b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py
@@ -77,7 +77,7 @@ def base_test_case(
             test_released_version=test_released_version,
         )
 
-    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"])  # type: ignore[misc]
+    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])  # type: ignore[misc]
     def test_snowml_model_deploy_snowml_sklearn(
         self,
         permanent_deploy: Optional[bool] = False,
@@ -110,7 +110,7 @@ def test_snowml_model_deploy_snowml_sklearn(
             test_released_version=test_released_version,
         )
 
-    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"])  # type: ignore[misc]
+    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])  # type: ignore[misc]
     def test_snowml_model_deploy_xgboost(
         self,
         permanent_deploy: Optional[bool] = False,
@@ -143,7 +143,7 @@ def test_snowml_model_deploy_xgboost(
             test_released_version=test_released_version,
         )
 
-    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"])  # type: ignore[misc]
+    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])  # type: ignore[misc]
     def test_snowml_model_deploy_lightgbm(
         self,
         permanent_deploy: Optional[bool] = False,
diff --git a/tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py b/tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py
index 9eb8a115..b1f1ee0c 100644
--- a/tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py
+++ b/tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py
@@ -124,45 +124,47 @@ def test_serde(self) -> None:
         mms = MinMaxScaler(input_cols=output_cols, output_cols=pipeline_output_cols)
         pipeline = snowml_pipeline.Pipeline([("ss", ss), ("mms", mms)])
         pipeline.fit(df1)
-        filepath = os.path.join(tempfile.gettempdir(), "test_pipeline.pkl")
-        self._to_be_deleted_files.append(filepath)
-        pipeline_dump_cloudpickle = cloudpickle.dumps(pipeline)
-        pipeline_dump_pickle = pickle.dumps(pipeline)
-        joblib.dump(pipeline, filepath)
+        with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file:
+            self._to_be_deleted_files.append(file.name)
+            pipeline_dump_cloudpickle = cloudpickle.dumps(pipeline)
+            pipeline_dump_pickle = pickle.dumps(pipeline)
+            joblib.dump(pipeline, file.name)
+
+            self._session.close()
+
+            # transform in session 2
+            self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
+            _, df2 = framework_utils.get_df(self._session, data, schema, np.nan)
+            input_cols_extended = input_cols.copy()
+            input_cols_extended.append(id_col)
+
+            importlib.reload(sys.modules["snowflake.ml.modeling.pipeline"])
+
+            # cloudpickle
+            pipeline_load_cloudpickle = cloudpickle.loads(pipeline_dump_cloudpickle)
+            transformed_df_cloudpickle = pipeline_load_cloudpickle.transform(df2[input_cols_extended])
+            actual_arr_cloudpickle = (
+                transformed_df_cloudpickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
+            )
 
-        self._session.close()
+            # pickle
+            pipeline_load_pickle = pickle.loads(pipeline_dump_pickle)
+            transformed_df_pickle = pipeline_load_pickle.transform(df2[input_cols_extended])
+            actual_arr_pickle = transformed_df_pickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
 
-        # transform in session 2
-        self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
-        _, df2 = framework_utils.get_df(self._session, data, schema, np.nan)
-        input_cols_extended = input_cols.copy()
-        input_cols_extended.append(id_col)
-
-        importlib.reload(sys.modules["snowflake.ml.modeling.pipeline"])
-
-        # cloudpickle
-        pipeline_load_cloudpickle = cloudpickle.loads(pipeline_dump_cloudpickle)
-        transformed_df_cloudpickle = pipeline_load_cloudpickle.transform(df2[input_cols_extended])
-        actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
-
-        # pickle
-        pipeline_load_pickle = pickle.loads(pipeline_dump_pickle)
-        transformed_df_pickle = pipeline_load_pickle.transform(df2[input_cols_extended])
-        actual_arr_pickle = transformed_df_pickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
-
-        # joblib
-        pipeline_load_joblib = joblib.load(filepath)
-        transformed_df_joblib = pipeline_load_joblib.transform(df2[input_cols_extended])
-        actual_arr_joblib = transformed_df_joblib.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
-
-        # sklearn
-        skpipeline = SkPipeline([("ss", SklearnStandardScaler()), ("mms", SklearnMinMaxScaler())])
-        skpipeline.fit(df_pandas[input_cols])
-        sklearn_arr = skpipeline.transform(df_pandas[input_cols])
-
-        assert np.allclose(actual_arr_cloudpickle, sklearn_arr)
-        assert np.allclose(actual_arr_pickle, sklearn_arr)
-        assert np.allclose(actual_arr_joblib, sklearn_arr)
+            # joblib
+            pipeline_load_joblib = joblib.load(file.name)
+            transformed_df_joblib = pipeline_load_joblib.transform(df2[input_cols_extended])
+            actual_arr_joblib = transformed_df_joblib.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
+
+            # sklearn
+            skpipeline = SkPipeline([("ss", SklearnStandardScaler()), ("mms", SklearnMinMaxScaler())])
+            skpipeline.fit(df_pandas[input_cols])
+            sklearn_arr = skpipeline.transform(df_pandas[input_cols])
+
+            np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr)
+            np.testing.assert_allclose(actual_arr_pickle, sklearn_arr)
+            np.testing.assert_allclose(actual_arr_joblib, sklearn_arr)
 
     def test_pipeline_with_regression_estimators(self) -> None:
         input_df_pandas = load_diabetes(as_frame=True).frame
diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl b/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl
index abc73fa7..81e07d89 100644
--- a/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl
+++ b/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl
@@ -122,6 +122,8 @@ def get_build_rules_for_native_impl():
     py_test(
         name = "test_drop_input_cols",
         srcs = ["test_drop_input_cols.py"],
+        shard_count = SHARD_COUNT,
+        timeout = TIMEOUT,
         deps = [
             "//snowflake/ml/modeling/impute:simple_imputer",
             "//snowflake/ml/modeling/pipeline:pipeline",
diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py
index c2c46e36..ce8b09d7 100644
--- a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py
+++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py
@@ -2,40 +2,40 @@
 # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved.
 #
 
-import uuid
+# import uuid
 
-import pandas as pd
-import pytest
+# import pandas as pd
+# import pytest
 from absl.testing import absltest
 
-from snowflake.ml.model import deploy_platforms
-from tests.integ.snowflake.ml.registry.model_registry_integ_test_snowservice_base import (
-    TestModelRegistryIntegSnowServiceBase,
-)
-from tests.integ.snowflake.ml.test_utils import model_factory
+# from snowflake.ml.model import deploy_platforms
+# from tests.integ.snowflake.ml.registry.model_registry_integ_test_snowservice_base import (
+#     TestModelRegistryIntegSnowServiceBase,
+# )
+# from tests.integ.snowflake.ml.test_utils import model_factory
 
 
-class TestModelRegistryIntegWithSnowServiceDeployment(TestModelRegistryIntegSnowServiceBase):
-    @pytest.mark.pip_incompatible
-    def test_snowml_model_deployment_xgboost(self) -> None:
-        self._test_snowservice_deployment(
-            model_name="xgboost_model",
-            model_version=uuid.uuid4().hex,
-            prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_xgb,
-            prediction_assert_fn=lambda local_prediction, remote_prediction: pd.testing.assert_frame_equal(
-                remote_prediction, local_prediction, check_dtype=False
-            ),
-            deployment_options={
-                "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
-                "target_method": "predict",
-                "options": {
-                    "compute_pool": self._TEST_CPU_COMPUTE_POOL,
-                    "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO),
-                    "enable_remote_image_build": True,
-                },
-            },
-            omit_target_method_when_deploy=True,
-        )
+# class TestModelRegistryIntegWithSnowServiceDeployment(TestModelRegistryIntegSnowServiceBase):
+#     @pytest.mark.pip_incompatible
+#     def test_snowml_model_deployment_xgboost(self) -> None:
+#         self._test_snowservice_deployment(
+#             model_name="xgboost_model",
+#             model_version=uuid.uuid4().hex,
+#             prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_xgb,
+#             prediction_assert_fn=lambda local_prediction, remote_prediction: pd.testing.assert_frame_equal(
+#                 remote_prediction, local_prediction, check_dtype=False
+#             ),
+#             deployment_options={
+#                 "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
+#                 "target_method": "predict",
+#                 "options": {
+#                     "compute_pool": self._TEST_CPU_COMPUTE_POOL,
+#                     "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO),
+#                     "enable_remote_image_build": True,
+#                 },
+#             },
+#             omit_target_method_when_deploy=True,
+#         )
 
 
 if __name__ == "__main__":
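[Editor's note, not part of the patch] On the test_serde refactor earlier in this patch: tempfile.NamedTemporaryFile(delete=False) hands out a real path that outlives the context manager, so the pipeline dumped with joblib can still be reloaded after the first Snowflake session is closed; the trade-off is that cleanup becomes the caller's job, which the test handles via self._to_be_deleted_files. A standalone sketch of the pattern, with a plain dict standing in for the fitted pipeline (this sketch assumes a POSIX filesystem, where a file can be written through its path while the handle is open):

    # Minimal sketch, assuming only the standard library plus joblib:
    # delete=False keeps the file on disk after the handle closes, so it
    # can be reloaded later; the caller must remove it explicitly.
    import os
    import tempfile

    import joblib

    obj = {"weights": [0.1, 0.2]}  # stand-in for the fitted pipeline

    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as f:
        joblib.dump(obj, f.name)  # write via the path, as the test does
        path = f.name

    restored = joblib.load(path)  # still on disk because delete=False
    assert restored == obj
    os.remove(path)  # explicit cleanup, mirroring _to_be_deleted_files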