From cd38c898ad65814dcd36886be7a9de3d6b5e180f Mon Sep 17 00:00:00 2001
From: Snowflake Provisioner <58576687+snowflake-provisioner@users.noreply.github.com>
Date: Tue, 23 May 2023 12:54:47 -0700
Subject: [PATCH] Project import generated by Copybara. (#16)

GitOrigin-RevId: 1c09d7ecb92720c6367448f920684dabf40d2813

Co-authored-by: Snowflake Authors
---
 ci/conda_recipe/meta.yaml                      |    7 +-
 codegen/sklearn_wrapper_generator.py           |    9 +-
 codegen/sklearn_wrapper_template.py_template   |   64 +-
 conda-env-snowflake.yml                        |    3 +-
 conda-env.yml                                  |    3 +-
 snowflake/ml/BUILD.bazel                       |    4 +-
 snowflake/ml/_internal/BUILD.bazel             |    1 +
 snowflake/ml/_internal/env_utils.py            |   65 +-
 snowflake/ml/_internal/env_utils_test.py       |  179 ++-
 snowflake/ml/model/BUILD.bazel                 |   28 +-
 snowflake/ml/model/_deployer.py                |   48 +-
 snowflake/ml/model/_handlers/BUILD.bazel       |   25 +-
 snowflake/ml/model/_handlers/snowmlmodel.py    |  187 +++
 snowflake/ml/model/_model_test.py              |   64 +
 snowflake/ml/model/_udf_util.py                |   14 +-
 snowflake/ml/model/_udf_util_test.py           |   34 +-
 snowflake/ml/model/model_signature.py          |  804 +++++++----
 snowflake/ml/model/model_signature_test.py     |  288 ++--
 snowflake/ml/model/type_hints.py               |   19 +-
 snowflake/ml/preprocessing/binarizer.py        |    4 +-
 .../ml/preprocessing/k_bins_discretizer.py     |    4 +-
 snowflake/ml/preprocessing/label_encoder.py    |    4 +-
 snowflake/ml/preprocessing/max_abs_scaler.py   |    2 +-
 snowflake/ml/preprocessing/min_max_scaler.py   |    2 +-
 snowflake/ml/preprocessing/normalizer.py       |    4 +-
 snowflake/ml/preprocessing/one_hot_encoder.py  |    4 +-
 snowflake/ml/preprocessing/ordinal_encoder.py  |    3 +-
 snowflake/ml/preprocessing/robust_scaler.py    |    2 +-
 snowflake/ml/preprocessing/simple_imputer.py   |    3 +-
 snowflake/ml/preprocessing/standard_scaler.py  |    2 +-
 snowflake/ml/registry/model_registry.py        |   10 +-
 .../notebooks/Model Packaging Example.ipynb    | 1176 +++++++++++++++++
 snowflake/ml/version.bzl                       |    2 +-
 .../integ/snowflake/ml/_internal/BUILD.bazel   |    1 +
 .../ml/_internal/env_utils_integ_test.py       |   16 +-
 tests/integ/snowflake/ml/model/BUILD.bazel     |   17 +-
 .../snowflake/ml/model/model_integ_test.py     |   36 +
 37 files changed, 2596 insertions(+), 542 deletions(-)
 create mode 100644 snowflake/ml/model/_handlers/snowmlmodel.py
 create mode 100644 snowflake/ml/registry/notebooks/Model Packaging Example.ipynb

diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml
index 0529aa6d..348d4bae 100644
--- a/ci/conda_recipe/meta.yaml
+++ b/ci/conda_recipe/meta.yaml
@@ -22,20 +22,25 @@ requirements:
     - python
     - absl-py>=0.15,<2
     - anyio>=3.5.0,<4
+    - cloudpickle
     - fsspec>=2022.11,<=2023.1
     - numpy>=1.23,<1.24
+    - packaging>=23.0,<24
     - pyyaml>=6.0,<7
     - scipy>=1.9,<2
     - snowflake-connector-python
     - snowflake-snowpark-python>=1.4.0,<=2
     - sqlparse>=0.4,<1
+    - typing-extensions>=4.1.0,<5
+
+    # conda-libmamba-solver is conda-specific requirement, and should not appear in wheel's dependency.
+    - conda-libmamba-solver>=23.1.0,<24

    # TODO(snandamuri): Versions of these packages must be exactly same between user's workspace and
    # snowpark sandbox. Generic definitions like scikit-learn>=1.1.0,<2 wont work because snowflake conda channel
    # only has a few allowlisted versions of scikit-learn available, so we must force users to use scikit-learn
    # versions that are available in the snowflake conda channel. Since there is no way to specify allow list of
    # versions in the requirements file, we are pinning the versions here.
-    - joblib>=1.0.0,<=1.1.1
     - scikit-learn>=1.2.1,<2
     - xgboost==1.7.3
 about:
diff --git a/codegen/sklearn_wrapper_generator.py b/codegen/sklearn_wrapper_generator.py
index 8979b9a3..ebb716d4 100644
--- a/codegen/sklearn_wrapper_generator.py
+++ b/codegen/sklearn_wrapper_generator.py
@@ -802,9 +802,10 @@ def generate(self) -> "SklearnWrapperGenerator":
         if self._is_hist_gradient_boosting_regressor:
             self.test_estimator_input_args_list.extend(["min_samples_leaf=1", "max_leaf_nodes=100"])

+        # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda.
         self.fit_sproc_deps = self.predict_udf_deps = (
             "f'numpy=={np.__version__}', f'pandas=={pd.__version__}', f'scikit-learn=={sklearn.__version__}', "
-            "f'xgboost=={xgboost.__version__}', f'joblib=={joblib.__version__}'"
+            "f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'"
         )
         self._construct_string_from_lists()
         return self
@@ -819,9 +820,10 @@ def generate(self) -> "XGBoostWrapperGenerator":
         self.estimator_imports_list.append("import xgboost")
         self.test_estimator_input_args_list.extend(["random_state=0", "subsample=1.0", "colsample_bynode=1.0"])
         self.fit_sproc_imports = "import xgboost"
+        # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda.
         self.fit_sproc_deps = self.predict_udf_deps = (
             "f'numpy=={np.__version__}', f'pandas=={pd.__version__}', f'xgboost=={xgboost.__version__}', "
-            "f'joblib=={joblib.__version__}'"
+            "f'cloudpickle=={cp.__version__}'"
         )
         self._construct_string_from_lists()
         return self
@@ -836,9 +838,10 @@ def generate(self) -> "LightGBMWrapperGenerator":
         self.estimator_imports_list.append("import lightgbm")
         self.test_estimator_input_args_list.extend(["random_state=0"])
         self.fit_sproc_imports = "import lightgbm"
+        # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda.
         self.fit_sproc_deps = self.predict_udf_deps = (
             "f'numpy=={np.__version__}', f'pandas=={pd.__version__}', f'lightgbm=={lightgbm.__version__}', "
-            "f'joblib=={joblib.__version__}'"
+            "f'cloudpickle=={cp.__version__}'"
         )
         self._construct_string_from_lists()
         return self
diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template
index e2f020f2..e45d58cd 100644
--- a/codegen/sklearn_wrapper_template.py_template
+++ b/codegen/sklearn_wrapper_template.py_template
@@ -6,7 +6,7 @@ import os
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable
 from uuid import uuid4

-import joblib
+import cloudpickle as cp
 import pandas as pd
 import numpy as np
 {transform.estimator_imports}
@@ -183,7 +183,8 @@ class {transform.original_class_name}(BaseTransformer):

         # Create a temp file and dump the transform to that file.
         local_transform_file_name = get_temp_file_path()
-        joblib.dump(self._sklearn_object, local_transform_file_name)
+        with open(local_transform_file_name, mode="w+b") as local_transform_file:
+            cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
         transform_stage_name = "SNOWML_TRANSFORM_{{safe_id}}".format(safe_id=self.id)
@@ -214,7 +215,13 @@ class {transform.original_class_name}(BaseTransformer):
             custom_tags=dict([("autogen", True)]),
         )
         # Put locally serialized transform on stage.
- session.file.put(local_transform_file_name, stage_transform_file_name, auto_compress=False, overwrite=True, statement_params=statement_params) + session.file.put( + local_transform_file_name, + stage_transform_file_name, + auto_compress=False, + overwrite=True, + statement_params=statement_params + ) @sproc( is_permanent=False, @@ -233,7 +240,7 @@ class {transform.original_class_name}(BaseTransformer): label_cols: List[str], sample_weight_col: Optional[str] ) -> str: - import joblib + import cloudpickle as cp import numpy as np import os import pandas @@ -251,7 +258,12 @@ class {transform.original_class_name}(BaseTransformer): session.file.get(stage_transform_file_name, local_transform_file_name, statement_params=statement_params) - estimator = joblib.load(os.path.join(local_transform_file_name, os.listdir(local_transform_file_name)[0])) + local_transform_file_path = os.path.join( + local_transform_file_name, + os.listdir(local_transform_file_name)[0] + ) + with open(local_transform_file_path, mode="r+b") as local_transform_file_obj: + estimator = cp.load(local_transform_file_obj) argspec = inspect.getfullargspec(estimator.fit) args = {{'X': df[input_cols]}} @@ -268,12 +280,20 @@ class {transform.original_class_name}(BaseTransformer): local_result_file_name = local_result_file.name local_result_file.close() - joblib_dump_files = joblib.dump(estimator, local_result_file_name) - session.file.put(local_result_file_name, stage_result_file_name, auto_compress = False, overwrite = True, statement_params=statement_params) + with open(local_result_file_name, mode="w+b") as local_result_file_obj: + cp.dump(estimator, local_result_file_obj) + + session.file.put( + local_result_file_name, + stage_result_file_name, + auto_compress = False, + overwrite = True, + statement_params=statement_params + ) # Note: you can add something like + "|" + str(df) to the return string # to pass debug information to the caller. - return str(os.path.basename(joblib_dump_files[0])) + return str(os.path.basename(local_result_file_name)) # Call fit sproc statement_params = telemetry.get_function_usage_statement_params( @@ -302,8 +322,13 @@ class {transform.original_class_name}(BaseTransformer): if len(fields) > 1: print("\n".join(fields[1:])) - session.file.get(os.path.join(stage_result_file_name, sproc_export_file_name), local_result_file_name, statement_params=statement_params) - self._sklearn_object = joblib.load(os.path.join(local_result_file_name, sproc_export_file_name)) + session.file.get( + os.path.join(stage_result_file_name, sproc_export_file_name), + local_result_file_name, + statement_params=statement_params + ) + with open(os.path.join(local_result_file_name, sproc_export_file_name),mode="r+b") as result_file_obj: + self._sklearn_object = cp.load(result_file_obj) cleanup_temp_files([local_transform_file_name, local_result_file_name]) @@ -843,7 +868,8 @@ class {transform.original_class_name}(BaseTransformer): # Create a temp file and dump the score to that file. local_score_file_name = get_temp_file_path() - joblib.dump(self._sklearn_object, local_score_file_name) + with open(local_score_file_name, mode="w+b") as local_score_file: + cp.dump(self._sklearn_object, local_score_file) # Create temp stage to run score. score_stage_name = "SNOWML_SCORE_{{safe_id}}".format(safe_id=self.id) @@ -872,7 +898,13 @@ class {transform.original_class_name}(BaseTransformer): custom_tags=dict([("autogen", True)]), ) # Put locally serialized score on stage. 
- session.file.put(local_score_file_name, stage_score_file_name, auto_compress=False, overwrite=True, statement_params=statement_params) + session.file.put( + local_score_file_name, + stage_score_file_name, + auto_compress=False, + overwrite=True, + statement_params=statement_params + ) @sproc( is_permanent=False, @@ -890,7 +922,7 @@ class {transform.original_class_name}(BaseTransformer): label_cols: List[str], sample_weight_col: Optional[str] ) -> float: - import joblib + import cloudpickle as cp import numpy as np import os import pandas @@ -905,7 +937,11 @@ class {transform.original_class_name}(BaseTransformer): local_score_file.close() session.file.get(stage_score_file_name, local_score_file_name, statement_params=statement_params) - estimator = joblib.load(os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0])) + + local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0]) + with open(local_score_file_name_path, mode="r+b") as local_score_file_obj: + estimator = cp.load(local_score_file_obj) + argspec = inspect.getfullargspec(estimator.score) if "X" in argspec.args: args = {{'X': df[input_cols]}} diff --git a/conda-env-snowflake.yml b/conda-env-snowflake.yml index 4b5c197f..95dd6a79 100644 --- a/conda-env-snowflake.yml +++ b/conda-env-snowflake.yml @@ -25,6 +25,7 @@ dependencies: - lightgbm==3.3.5 - networkx==2.8.4 - numpy==1.23.4 + - packaging==23.0 - pandas==1.4.4 - pytest==7.1.2 - python==3.8.13 @@ -35,6 +36,6 @@ dependencies: - scikit-learn==1.2.2 - snowflake-snowpark-python==1.4.0 - sqlparse==0.4.3 - - typing-extensions==4.3.0 + - typing-extensions==4.5.0 - xgboost==1.7.3 - mypy==0.981 # not a package dependency. diff --git a/conda-env.yml b/conda-env.yml index de9b753a..51f264e2 100644 --- a/conda-env.yml +++ b/conda-env.yml @@ -22,6 +22,7 @@ dependencies: - mypy==0.981 - networkx==2.8.4 - numpy==1.23.4 + - packaging==23.0 - pandas==1.4.4 - pytest==7.1.2 - python==3.8.13 @@ -36,5 +37,5 @@ dependencies: - torchdata==0.4.1 - transformers==4.27.1 - types-PyYAML==6.0.12 - - typing-extensions==4.3.0 + - typing-extensions==4.5.0 - xgboost==1.7.3 diff --git a/snowflake/ml/BUILD.bazel b/snowflake/ml/BUILD.bazel index 021c70ff..54bf7bb0 100644 --- a/snowflake/ml/BUILD.bazel +++ b/snowflake/ml/BUILD.bazel @@ -38,13 +38,16 @@ snowml_wheel( requires = [ "absl-py>=0.15,<2", "anyio>=3.5.0,<4", + "cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it. "fsspec[http]>=2022.11,<=2023.1", "numpy>=1.23,<1.24", + "packaging>=23.0,<24", "pyyaml>=6.0,<7", "scipy>=1.9,<2", "snowflake-connector-python[pandas]", "snowflake-snowpark-python>=1.4.0,<2", "sqlparse>=0.4,<1", + "typing-extensions>=4.1.0,<5", # TODO(snandamuri): Versions of these packages must be exactly same between user's workspace and # snowpark sandbox. Generic definitions like scikit-learn>=1.1.0,<2 wont work because snowflake conda channel @@ -53,7 +56,6 @@ snowml_wheel( # versions in the requirements file, we are pinning the versions here. "scikit-learn>=1.2.1,<2", "xgboost==1.7.3", - "joblib>=1.0.0,<=1.1.1", # All the release versions between 1.0.0 and 1.1.1 are available in SF Conda channel. 
], version = VERSION, deps = [ diff --git a/snowflake/ml/_internal/BUILD.bazel b/snowflake/ml/_internal/BUILD.bazel index aadb4015..d770f973 100644 --- a/snowflake/ml/_internal/BUILD.bazel +++ b/snowflake/ml/_internal/BUILD.bazel @@ -46,6 +46,7 @@ py_test( srcs = ["env_utils_test.py"], deps = [ ":env_utils", + ":env", "//snowflake/ml/test_utils:mock_data_frame", "//snowflake/ml/test_utils:mock_session", ], diff --git a/snowflake/ml/_internal/env_utils.py b/snowflake/ml/_internal/env_utils.py index 10251bf9..b25214e2 100644 --- a/snowflake/ml/_internal/env_utils.py +++ b/snowflake/ml/_internal/env_utils.py @@ -11,6 +11,7 @@ from snowflake.ml._internal.utils import query_result_checker from snowflake.snowpark import session +_INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION: Optional[bool] = None _SNOWFLAKE_CONDA_PACKAGE_CACHE: Dict[str, List[version.Version]] = {} @@ -219,13 +220,16 @@ def relax_requirement_version(req: requirements.Requirement) -> requirements.Req return new_req -def resolve_conda_environment(packages: List[requirements.Requirement], channels: List[str]) -> Optional[List[str]]: +def resolve_conda_environment( + packages: List[requirements.Requirement], channels: List[str], python_version: str +) -> Optional[List[str]]: """Use conda api to check if given packages are resolvable in given channels. Only work when conda is locally installed. Args: packages: Packages to be installed. channels: Anaconda channels (name or url) where conda should search into. + python_version: A string of python version where model is run. Returns: List of frozen dependencies represented in PEP 508 form if resolvable, None otherwise. @@ -234,7 +238,7 @@ def resolve_conda_environment(packages: List[requirements.Requirement], channels from conda_libmamba_solver import solver package_names = list(map(lambda x: x.name, packages)) - specs = list(map(str, packages)) + specs = list(map(str, packages)) + [f"python=={python_version}"] conda_solver = solver.LibMambaSolver("snow-env", channels=channels, specs_to_add=specs) try: @@ -252,18 +256,38 @@ def resolve_conda_environment(packages: List[requirements.Requirement], channels ) +def _check_runtime_version_column_existence(session: session.Session) -> bool: + sql = textwrap.dedent( + """ + SHOW COLUMNS + LIKE 'runtime_version' + IN TABLE information_schema.packages; + """ + ) + result = session.sql(sql).count() + return result == 1 + + def validate_requirements_in_snowflake_conda_channel( - session: session.Session, reqs: List[requirements.Requirement] + session: session.Session, reqs: List[requirements.Requirement], python_version: str ) -> Optional[List[str]]: """Search the snowflake anaconda channel for packages with version meet the specifier. Args: session: Snowflake connection session. reqs: List of requirement specifiers. + python_version: A string of python version where model is run. + + Raises: + ValueError: Raised when the specifier cannot be supported when creating UDF. Returns: A list of pinned latest version that available in Snowflake anaconda channel and meet the version specifier. 
""" + global _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION + + if _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION is None: + _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION = _check_runtime_version_column_existence(session) ret_list = [] reqs_to_request = [] for req in reqs: @@ -273,14 +297,26 @@ def validate_requirements_in_snowflake_conda_channel( pkg_names_str = " OR ".join( f"package_name = '{req_name}'" for req_name in sorted(req.name for req in reqs_to_request) ) - sql = textwrap.dedent( - f""" - SELECT PACKAGE_NAME, VERSION - FROM information_schema.packages - WHERE ({pkg_names_str}) - AND language = 'python'; - """ - ) + if _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION: + parsed_python_version = version.Version(python_version) + sql = textwrap.dedent( + f""" + SELECT PACKAGE_NAME, VERSION + FROM information_schema.packages + WHERE ({pkg_names_str}) + AND language = 'python' + AND runtime_version = '{parsed_python_version.major}.{parsed_python_version.minor}'; + """ + ) + else: + sql = textwrap.dedent( + f""" + SELECT PACKAGE_NAME, VERSION + FROM information_schema.packages + WHERE ({pkg_names_str}) + AND language = 'python'; + """ + ) try: result = ( @@ -301,10 +337,11 @@ def validate_requirements_in_snowflake_conda_channel( except snowflake.connector.DataError: return None for req in reqs: - available_versions = list(req.specifier.filter(_SNOWFLAKE_CONDA_PACKAGE_CACHE.get(req.name, []))) + if len(req.specifier) > 1 or any(spec.operator != "==" for spec in req.specifier): + raise ValueError("At most 1 version specifier using == operator is supported without local conda resolver.") + available_versions = list(req.specifier.filter(set(_SNOWFLAKE_CONDA_PACKAGE_CACHE.get(req.name, [])))) if not available_versions: return None else: - latest_version = max(available_versions) - ret_list.append(f"{req.name}=={latest_version}") + ret_list.append(str(req)) return sorted(ret_list) diff --git a/snowflake/ml/_internal/env_utils_test.py b/snowflake/ml/_internal/env_utils_test.py index b5a77192..2aacd2cd 100644 --- a/snowflake/ml/_internal/env_utils_test.py +++ b/snowflake/ml/_internal/env_utils_test.py @@ -1,4 +1,5 @@ import collections +import platform import textwrap from importlib import metadata as importlib_metadata from typing import DefaultDict, List, cast @@ -6,7 +7,7 @@ from absl.testing import absltest from packaging import requirements, specifiers -from snowflake.ml._internal import env_utils +from snowflake.ml._internal import env as snowml_env, env_utils from snowflake.ml.test_utils import mock_data_frame, mock_session from snowflake.snowpark import row, session @@ -270,22 +271,52 @@ def test_relax_requirement_version(self) -> None: def test_resolve_conda_environment(self) -> None: _SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake" rl = [requirements.Requirement("numpy")] - self.assertIsNotNone(env_utils.resolve_conda_environment(rl, [_SNOWFLAKE_CONDA_CHANNEL_URL])) + self.assertIsNotNone( + env_utils.resolve_conda_environment( + rl, [_SNOWFLAKE_CONDA_CHANNEL_URL], python_version=snowml_env.PYTHON_VERSION + ) + ) rl = [requirements.Requirement("numpy==1.22.4")] - self.assertIsNone(env_utils.resolve_conda_environment(rl, [_SNOWFLAKE_CONDA_CHANNEL_URL])) + self.assertIsNone( + env_utils.resolve_conda_environment( + rl, [_SNOWFLAKE_CONDA_CHANNEL_URL], python_version=snowml_env.PYTHON_VERSION + ) + ) rl = [requirements.Requirement(f"numpy=={importlib_metadata.version('numpy')}")] self.assertListEqual( - env_utils.resolve_conda_environment(rl, ["defaults"]), 
[f"numpy=={importlib_metadata.version('numpy')}"] + env_utils.resolve_conda_environment( + rl, + ["defaults"], + python_version=snowml_env.PYTHON_VERSION, + ), + [f"numpy=={importlib_metadata.version('numpy')}"], ) rl = [requirements.Requirement(f"numpy<={importlib_metadata.version('numpy')}")] self.assertListEqual( - env_utils.resolve_conda_environment(rl, ["defaults"]), [f"numpy=={importlib_metadata.version('numpy')}"] + env_utils.resolve_conda_environment( + rl, + ["defaults"], + python_version=snowml_env.PYTHON_VERSION, + ), + [f"numpy=={importlib_metadata.version('numpy')}"], ) def test_validate_requirements_in_snowflake_conda_channel(self) -> None: + m_session = mock_session.MockSession(conn=None, test_case=self) + m_session.add_mock_sql( + query=textwrap.dedent( + """ + SHOW COLUMNS + LIKE 'runtime_version' + IN TABLE information_schema.packages; + """ + ), + result=mock_data_frame.MockDataFrame(count_result=0), + ) + query = textwrap.dedent( """ SELECT PACKAGE_NAME, VERSION @@ -301,23 +332,26 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: row.Row(PACKAGE_NAME="pytorch", VERSION="1.12.1"), ] - m_session = mock_session.MockSession(conn=None, test_case=self) m_session.add_mock_sql(query=query, result=mock_data_frame.MockDataFrame(sql_result)) c_session = cast(session.Session, m_session) self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")] + session=c_session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, ), - sorted(["xgboost==1.7.3", "pytorch==1.12.1"]), + sorted(["xgboost", "pytorch"]), ) # Test cache self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")] + session=c_session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, ), - sorted(["xgboost==1.7.3", "pytorch==1.12.1"]), + sorted(["xgboost", "pytorch"]), ) # clear cache @@ -343,17 +377,21 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost")] + session=c_session, + reqs=[requirements.Requirement("xgboost")], + python_version=snowml_env.PYTHON_VERSION, ), - ["xgboost==1.7.3"], + ["xgboost"], ) # Test cache self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost")] + session=c_session, + reqs=[requirements.Requirement("xgboost")], + python_version=snowml_env.PYTHON_VERSION, ), - ["xgboost==1.7.3"], + ["xgboost"], ) query = textwrap.dedent( @@ -374,17 +412,21 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")] + session=c_session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, ), - sorted(["xgboost==1.7.3", "pytorch==1.12.1"]), + sorted(["xgboost", "pytorch"]), ) # Test cache self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - 
session=c_session, reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")] + session=c_session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, ), - sorted(["xgboost==1.7.3", "pytorch==1.12.1"]), + sorted(["xgboost", "pytorch"]), ) # clear cache @@ -399,8 +441,8 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: """ ) sql_result = [ - row.Row(PACKAGE_NAME="xgboost", VERSION="1.3.3"), - row.Row(PACKAGE_NAME="xgboost", VERSION="1.5.1"), + row.Row(PACKAGE_NAME="xgboost", VERSION="1.7.0"), + row.Row(PACKAGE_NAME="xgboost", VERSION="1.7.1"), row.Row(PACKAGE_NAME="xgboost", VERSION="1.7.3"), ] @@ -409,7 +451,9 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost==1.7.3")] + session=c_session, + reqs=[requirements.Requirement("xgboost==1.7.3")], + python_version=snowml_env.PYTHON_VERSION, ), ["xgboost==1.7.3"], ) @@ -417,11 +461,27 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: # Test cache self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost==1.7.3")] + session=c_session, + reqs=[requirements.Requirement("xgboost==1.7.3")], + python_version=snowml_env.PYTHON_VERSION, ), ["xgboost==1.7.3"], ) + with self.assertRaises(ValueError): + env_utils.validate_requirements_in_snowflake_conda_channel( + session=c_session, + reqs=[requirements.Requirement("xgboost<1.7")], + python_version=snowml_env.PYTHON_VERSION, + ) + + with self.assertRaises(ValueError): + env_utils.validate_requirements_in_snowflake_conda_channel( + session=c_session, + reqs=[requirements.Requirement("xgboost==1.7.1, ==1.7.3")], + python_version=snowml_env.PYTHON_VERSION, + ) + # clear cache env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -430,17 +490,21 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost<1.7")] + session=c_session, + reqs=[requirements.Requirement("xgboost==1.7.*")], + python_version=snowml_env.PYTHON_VERSION, ), - ["xgboost==1.5.1"], + ["xgboost==1.7.*"], ) # Test cache self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost<1.7")] + session=c_session, + reqs=[requirements.Requirement("xgboost==1.7.*")], + python_version=snowml_env.PYTHON_VERSION, ), - ["xgboost==1.5.1"], + ["xgboost==1.7.*"], ) # clear cache @@ -451,14 +515,18 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: self.assertIsNone( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost<1.3")] + session=c_session, + reqs=[requirements.Requirement("xgboost==1.3.*")], + python_version=snowml_env.PYTHON_VERSION, ) ) # Test cache self.assertIsNone( env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("xgboost<1.3")] + session=c_session, + reqs=[requirements.Requirement("xgboost==1.3.*")], + python_version=snowml_env.PYTHON_VERSION, ) ) @@ -481,10 +549,63 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: self.assertIsNone( 
env_utils.validate_requirements_in_snowflake_conda_channel( - session=c_session, reqs=[requirements.Requirement("python-package")] + session=c_session, + reqs=[requirements.Requirement("python-package")], + python_version=snowml_env.PYTHON_VERSION, ) ) + env_utils._INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION = None + m_session = mock_session.MockSession(conn=None, test_case=self) + m_session.add_mock_sql( + query=textwrap.dedent( + """ + SHOW COLUMNS + LIKE 'runtime_version' + IN TABLE information_schema.packages; + """ + ), + result=mock_data_frame.MockDataFrame(count_result=1), + ) + + query = textwrap.dedent( + f""" + SELECT PACKAGE_NAME, VERSION + FROM information_schema.packages + WHERE (package_name = 'pytorch' OR package_name = 'xgboost') + AND language = 'python' + AND runtime_version = '{platform.python_version_tuple()[0]}.{platform.python_version_tuple()[1]}'; + """ + ) + sql_result = [ + row.Row(PACKAGE_NAME="xgboost", VERSION="1.3.3"), + row.Row(PACKAGE_NAME="xgboost", VERSION="1.5.1"), + row.Row(PACKAGE_NAME="xgboost", VERSION="1.7.3"), + row.Row(PACKAGE_NAME="pytorch", VERSION="1.12.1"), + ] + + m_session.add_mock_sql(query=query, result=mock_data_frame.MockDataFrame(sql_result)) + c_session = cast(session.Session, m_session) + + self.assertListEqual( + env_utils.validate_requirements_in_snowflake_conda_channel( + session=c_session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, + ), + sorted(["xgboost", "pytorch"]), + ) + + # Test cache + self.assertListEqual( + env_utils.validate_requirements_in_snowflake_conda_channel( + session=c_session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, + ), + sorted(["xgboost", "pytorch"]), + ) + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index 8755ab98..85dc9cbc 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -12,6 +12,7 @@ py_library( srcs = ["model_signature.py"], deps = [ ":type_hints", + "//snowflake/ml/_internal/utils:identifier", ], ) @@ -28,7 +29,7 @@ py_library( name = "custom_model", srcs = ["custom_model.py"], deps = [ - ":type_hints" + ":type_hints", ], ) @@ -48,10 +49,10 @@ py_library( name = "_udf_util", srcs = ["_udf_util.py"], deps = [ + ":_env", + ":_model", ":_model_meta", ":type_hints", - ":_model", - ":_env", "//snowflake/ml/_internal:env_utils", ], ) @@ -61,9 +62,7 @@ py_library( srcs = ["_deployer.py"], deps = [ ":_udf_util", - ":model_signature", - ":type_hints", - "//snowflake/ml/_internal/utils:identifier" + "//snowflake/ml/_internal/utils:identifier", ], ) @@ -80,14 +79,15 @@ py_library( name = "_model", srcs = ["_model.py"], deps = [ - ":custom_model", ":_model_handler", ":_model_meta", + ":custom_model", ":model_signature", ":type_hints", "//snowflake/ml/model/_handlers:custom", "//snowflake/ml/model/_handlers:sklearn", - "//snowflake/ml/model/_handlers:xgboost" + "//snowflake/ml/model/_handlers:snowmlmodel", + "//snowflake/ml/model/_handlers:xgboost", ], ) @@ -103,8 +103,8 @@ py_test( name = "_env_test", srcs = ["_env_test.py"], deps = [ - "//snowflake/ml/_internal:env", ":_env", + "//snowflake/ml/_internal:env", ], ) @@ -112,8 +112,8 @@ py_test( name = "_udf_util_test", srcs = ["_udf_util_test.py"], deps = [ - ":_udf_util", ":_model_meta", + ":_udf_util", ":model_signature", "//snowflake/ml/_internal:env_utils", 
"//snowflake/ml/test_utils:mock_data_frame", @@ -126,6 +126,7 @@ py_test( srcs = ["model_signature_test.py"], deps = [ ":model_signature", + "//snowflake/ml/utils:connection_params", ], ) @@ -134,21 +135,22 @@ py_test( srcs = ["_model_meta_test.py"], deps = [ ":_model_meta", - ":model_signature" + ":model_signature", ], ) py_test( name = "_model_test", + timeout = "long", srcs = ["_model_test.py"], shard_count = 5, - timeout = "long", deps = [ ":_model", - ":custom_model", ":_model_handler", ":_model_meta", + ":custom_model", ":model_signature", ":type_hints", + "//snowflake/ml/modeling/linear_model:linear_regression", ], ) diff --git a/snowflake/ml/model/_deployer.py b/snowflake/ml/model/_deployer.py index d7bf5a8b..112b74aa 100644 --- a/snowflake/ml/model/_deployer.py +++ b/snowflake/ml/model/_deployer.py @@ -1,4 +1,3 @@ -import json import os from abc import ABC, abstractmethod from enum import Enum @@ -9,7 +8,7 @@ from snowflake.ml._internal.utils import identifier from snowflake.ml.model import _udf_util, model_signature, type_hints as model_types -from snowflake.snowpark import DataFrame, Session, functions as F +from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session, functions as F from snowflake.snowpark._internal import type_utils @@ -250,7 +249,7 @@ def delete_deployment(self, name: str) -> None: self._manager.delete(name) @overload - def predict(self, name: str, X: model_types.SupportedDataType) -> pd.DataFrame: + def predict(self, name: str, X: model_types.SupportedLocalDataType) -> pd.DataFrame: """Execute batch inference of a model remotely on local data. Can be any supported data type. Return a local Pandas Dataframe. @@ -261,7 +260,7 @@ def predict(self, name: str, X: model_types.SupportedDataType) -> pd.DataFrame: ... @overload - def predict(self, name: str, X: DataFrame) -> DataFrame: + def predict(self, name: str, X: SnowparkDataFrame) -> SnowparkDataFrame: """Execute batch inference of a model remotely on a Snowpark DataFrame. Return a Snowpark DataFrame. Args: @@ -270,7 +269,9 @@ def predict(self, name: str, X: DataFrame) -> DataFrame: """ - def predict(self, name: str, X: Union[model_types.SupportedDataType, DataFrame]) -> Union[pd.DataFrame, DataFrame]: + def predict( + self, name: str, X: Union[model_types.SupportedDataType, SnowparkDataFrame] + ) -> Union[pd.DataFrame, SnowparkDataFrame]: """Execute batch inference of a model remotely. Args: @@ -280,7 +281,6 @@ def predict(self, name: str, X: Union[model_types.SupportedDataType, DataFrame]) Raises: ValueError: Raised when the deployment does not exist. ValueError: Raised when the input is too large to use keep_order option. - NotImplementedError: Raised when confronting unsupported feature group. Returns: The output dataframe. 
@@ -290,10 +290,11 @@ def predict(self, name: str, X: Union[model_types.SupportedDataType, DataFrame]) raise ValueError(f"Deployment {name} does not exist.") sig = d["signature"] keep_order = d["options"].get("keep_order", True) - if not isinstance(X, DataFrame): - df = model_signature._validate_data_with_features_and_convert_to_df(sig.inputs, X) + if not isinstance(X, SnowparkDataFrame): + df = model_signature._convert_and_validate_local_data(X, sig.inputs) s_df = self._session.create_dataframe(df) else: + model_signature._validate_snowpark_data(X, sig.inputs) s_df = X if keep_order: @@ -311,18 +312,16 @@ def predict(self, name: str, X: Union[model_types.SupportedDataType, DataFrame]) type_utils.ColumnOrName(F.col(col_name)), ] ) - output_col_names = [feature.name for feature in sig.outputs] + output_cols = [] - for output_col_name in output_col_names: + for output_feature in sig.outputs: # To avoid automatic upper-case convert, we quoted the result name. - output_cols.append(F.col("tmp_result")[output_col_name].alias(f'"{output_col_name}"')) - - dtype_map = {} - for feature in sig.outputs: - if isinstance(feature, model_signature.FeatureSpec): - dtype_map[feature.name] = feature._dtype._value - else: - raise NotImplementedError("FeatureGroup is not supported yet.") + output_cols.append( + F.parse_json(type_utils.ColumnOrName(F.col("tmp_result")[output_feature.name])) + .astype(output_feature.as_snowpark_type()) + .alias(f'"{output_feature.name}"') + ) + df_res = s_df.select( F.call_udf(name, type_utils.ColumnOrLiteral(F.object_construct(*cols))).alias("tmp_result") ) @@ -332,11 +331,16 @@ def predict(self, name: str, X: Union[model_types.SupportedDataType, DataFrame]) df_res = df_res.select(*output_cols) - if not isinstance(X, DataFrame): + if not isinstance(X, SnowparkDataFrame): + dtype_map = {} + for feature in sig.outputs: + if isinstance(feature, model_signature.FeatureSpec): + dtype_map[feature.name] = feature._dtype._value + elif isinstance(feature, model_signature.FeatureGroupSpec): + for ft in feature._specs: + dtype_map[ft.name] = ft._dtype._value df_local = df_res.to_pandas() - df_local = ( - df_local.applymap(json.loads).rename(columns=identifier.remove_quote_if_quoted).astype(dtype=dtype_map) - ) + df_local = df_local.rename(columns=identifier.remove_quote_if_quoted).astype(dtype=dtype_map) return pd.DataFrame(df_local) else: return df_res diff --git a/snowflake/ml/model/_handlers/BUILD.bazel b/snowflake/ml/model/_handlers/BUILD.bazel index 804d83bf..4e518737 100644 --- a/snowflake/ml/model/_handlers/BUILD.bazel +++ b/snowflake/ml/model/_handlers/BUILD.bazel @@ -11,18 +11,17 @@ py_library( ], ) - py_library( name = "custom", srcs = ["custom.py"], deps = [ ":_base", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/model:_model_handler", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", - "//snowflake/ml/model:_model_handler", - "//snowflake/ml/model:type_hints", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", ], ) @@ -31,11 +30,25 @@ py_library( srcs = ["sklearn.py"], deps = [ ":_base", + "//snowflake/ml/_internal:type_utils", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", - "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + ], +) + +py_library( + name = "snowmlmodel", + srcs = ["snowmlmodel.py"], + deps = [ + ":_base", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/model:_model_meta", + 
"//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/framework", ], ) @@ -44,10 +57,10 @@ py_library( srcs = ["xgboost.py"], deps = [ ":_base", + "//snowflake/ml/_internal:type_utils", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", - "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:type_hints", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", ], ) diff --git a/snowflake/ml/model/_handlers/snowmlmodel.py b/snowflake/ml/model/_handlers/snowmlmodel.py new file mode 100644 index 00000000..a61454b3 --- /dev/null +++ b/snowflake/ml/model/_handlers/snowmlmodel.py @@ -0,0 +1,187 @@ +import os +from typing import TYPE_CHECKING, Callable, Optional, Sequence, Type, cast + +import cloudpickle +import numpy as np +import pandas as pd +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml._internal import type_utils +from snowflake.ml.model import ( + _model_meta as model_meta_api, + custom_model, + model_signature, + type_hints as model_types, +) +from snowflake.ml.model._handlers import _base + +if TYPE_CHECKING: + from snowflake.ml.framework.base import BaseEstimator + + +class _SnowMLModelHandler(_base._ModelHandler["BaseEstimator"]): + """Handler for SnowML based model. + + Currently snowflake.ml.framework.base.BaseEstimator + and snowflake.ml.framework.pipeline.Pipeline based classes are supported. + """ + + handler_type = "snowml" + DEFAULT_TARGET_METHODS = ["predict", "transform", "predict_proba", "predict_log_proba", "decision_function"] + + @staticmethod + def can_handle( + model: model_types.SupportedModelType, + ) -> TypeGuard["BaseEstimator"]: + return ( + type_utils.LazyType("snowflake.ml.framework.base.BaseEstimator").isinstance(model) + # Pipeline is inherited from BaseEstimator, so no need to add one more check + ) and any( + (hasattr(model, method) and callable(getattr(model, method, None))) + for method in _SnowMLModelHandler.DEFAULT_TARGET_METHODS + ) + + @staticmethod + def cast_model( + model: model_types.SupportedModelType, + ) -> "BaseEstimator": + from snowflake.ml.framework.base import BaseEstimator + + assert isinstance(model, BaseEstimator) + # Pipeline is inherited from BaseEstimator, so no need to add one more check + + return cast("BaseEstimator", model) + + @staticmethod + def _save_model( + name: str, + model: "BaseEstimator", + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.SNOWModelSaveOptions], + ) -> None: + from snowflake.ml.framework.base import BaseEstimator + + assert isinstance(model, BaseEstimator) + # Pipeline is inherited from BaseEstimator, so no need to add one more check + + if not is_sub_model: + # TODO(xjiang): get model signature from modeling. + if model_meta._signatures is None: + # In this case sample_input should be available, because of the check in save_model. 
+ assert sample_input is not None + target_methods = kwargs.pop("target_methods", None) + if target_methods is None: + target_methods = [ + method + for method in _SnowMLModelHandler.DEFAULT_TARGET_METHODS + if hasattr(model, method) and callable(getattr(model, method, None)) + ] + else: + for method_name in target_methods: + if not callable(getattr(model, method_name, None)): + raise ValueError(f"Target method {method_name} is not callable.") + if method_name not in _SnowMLModelHandler.DEFAULT_TARGET_METHODS: + raise ValueError(f"Target method {method_name} is not supported.") + + model_meta._signatures = {} + for method_name in target_methods: + target_method = getattr(model, method_name) + sig = model_signature.infer_signature(sample_input, target_method(sample_input)) + model_meta._signatures[method_name] = sig + else: + for method_name in model_meta._signatures.keys(): + if not callable(getattr(model, method_name, None)): + raise ValueError(f"Target method {method_name} is not callable.") + if method_name not in _SnowMLModelHandler.DEFAULT_TARGET_METHODS: + raise ValueError(f"Target method {method_name} is not supported.") + + model_blob_path = os.path.join(model_blobs_dir_path, name) + os.makedirs(model_blob_path, exist_ok=True) + with open(os.path.join(model_blob_path, _SnowMLModelHandler.MODEL_BLOB_FILE), "wb") as f: + cloudpickle.dump(model, f) + base_meta = model_meta_api._ModelBlobMetadata( + name=name, model_type=_SnowMLModelHandler.handler_type, path=_SnowMLModelHandler.MODEL_BLOB_FILE + ) + model_meta.models[name] = base_meta + model_meta._include_if_absent( + [("scikit-learn", "scikit-learn"), ("xgboost", "xgboost"), ("lightgbm", "lightgbm"), ("joblib", "joblib")] + ) + + @staticmethod + def _load_model(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str) -> "BaseEstimator": + model_blob_path = os.path.join(model_blobs_dir_path, name) + if not hasattr(model_meta, "models"): + raise ValueError("Ill model metadata found.") + model_blobs_metadata = model_meta.models + if name not in model_blobs_metadata: + raise ValueError(f"Blob of model {name} does not exist.") + model_blob_metadata = model_blobs_metadata[name] + model_blob_filename = model_blob_metadata.path + with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: + m = cloudpickle.load(f) + + from snowflake.ml.framework.base import BaseEstimator + + assert isinstance(m, BaseEstimator) + return m + + @staticmethod + def _load_as_custom_model( + name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + ) -> custom_model.CustomModel: + """Create a custom model class wrap for unified interface when being deployed. The predict method will be + re-targeted based on target_method metadata. + + Args: + name: Name of the model. + model_meta: The model metadata. + model_blobs_dir_path: Directory path to the whole model. + + Returns: + The model object as a custom model. 
+ """ + from snowflake.ml.model import custom_model + + def _create_custom_model( + raw_model: "BaseEstimator", + model_meta: model_meta_api.ModelMetadata, + ) -> Type[custom_model.CustomModel]: + def fn_factory( + raw_model: "BaseEstimator", + output_col_names: Sequence[str], + target_method: str, + ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]: + @custom_model.inference_api + def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: + res = getattr(raw_model, target_method)(X) + + if isinstance(res, list) and len(res) > 0 and isinstance(res[0], np.ndarray): + # In case of multi-output estimators, predict_proba(), decision_function(), etc., functions + # return a list of ndarrays. We need to concatenate them. + res = np.concatenate(res, axis=1) + return pd.DataFrame(res, columns=output_col_names) + + return fn + + type_method_dict = {} + for target_method_name, sig in model_meta.signatures.items(): + type_method_dict[target_method_name] = fn_factory( + raw_model, [spec.name for spec in sig.outputs], target_method_name + ) + + _SnowMLModel = type( + "_SnowMLModel", + (custom_model.CustomModel,), + type_method_dict, + ) + + return _SnowMLModel + + raw_model = _SnowMLModelHandler._load_model(name, model_meta, model_blobs_dir_path) + _SnowMLModel = _create_custom_model(raw_model, model_meta) + snowml_model = _SnowMLModel(custom_model.ModelContext()) + + return snowml_model diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py index c11fa80f..959dfee0 100644 --- a/snowflake/ml/model/_model_test.py +++ b/snowflake/ml/model/_model_test.py @@ -15,6 +15,7 @@ model_signature, type_hints as model_types, ) +from snowflake.ml.modeling.linear_model import LinearRegression class DemoModelWithManyArtifacts(custom_model.CustomModel): @@ -458,6 +459,69 @@ def test_xgb(self) -> None: assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), y_pred_proba) + def test_snowml(self) -> None: + iris = datasets.load_iris() + + df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) + df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = LinearRegression(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + regr.fit(df) + + predictions = regr.predict(df[:1])[[OUTPUT_COLUMNS]] + + with tempfile.TemporaryDirectory() as tmpdir: + s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} + with self.assertRaises(ValueError): + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=regr, + signatures={**s, "another_predict": s["predict"]}, + metadata={"author": "halu", "version": "1"}, + ) + + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=regr, + signatures=s, + metadata={"author": "halu", "version": "1"}, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + m: LinearRegression + m, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + 
np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) + + model_api.save_model( + name="model1_no_sig", + model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + model=regr, + sample_input=df, + metadata={"author": "halu", "version": "1"}, + ) + + m, meta = model_api.load_model(os.path.join(tmpdir, "model1_no_sig")) + np.testing.assert_allclose(np.array([[-0.08254936]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) + # TODO: After model_signatures() function is updated in codegen, next line should be changed to + # s = regr.model_signatures() + # self.assertEqual(s["predict"], meta.signatures["predict"]) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/model/_udf_util.py b/snowflake/ml/model/_udf_util.py index ac7f710c..1f9540f4 100644 --- a/snowflake/ml/model/_udf_util.py +++ b/snowflake/ml/model/_udf_util.py @@ -215,12 +215,13 @@ def _get_model_final_packages( raise RuntimeError("PIP requirements and dependencies from non-Snowflake anaconda channel is not supported.") try: final_packages = env_utils.resolve_conda_environment( - meta._conda_dependencies[""], [model_env._SNOWFLAKE_CONDA_CHANNEL_URL] + meta._conda_dependencies[""], [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], python_version=meta.python_version ) if final_packages is None and relax_version: final_packages = env_utils.resolve_conda_environment( list(map(env_utils.relax_requirement_version, meta._conda_dependencies[""])), [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], + python_version=meta.python_version, ) except ImportError: warnings.warn( @@ -228,10 +229,17 @@ def _get_model_final_packages( category=RuntimeWarning, ) final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( - session=session, reqs=meta._conda_dependencies[""] + session=session, + reqs=meta._conda_dependencies[""], + python_version=meta.python_version, ) if final_packages is None and relax_version: - final_packages = list(map(str, map(env_utils.relax_requirement_version, meta._conda_dependencies[""]))) + final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( + session=session, + reqs=list(map(env_utils.relax_requirement_version, meta._conda_dependencies[""])), + python_version=meta.python_version, + ) + finally: if final_packages is None: raise RuntimeError( diff --git a/snowflake/ml/model/_udf_util_test.py b/snowflake/ml/model/_udf_util_test.py index 85f900ac..53d5ed47 100644 --- a/snowflake/ml/model/_udf_util_test.py +++ b/snowflake/ml/model/_udf_util_test.py @@ -25,20 +25,34 @@ class TestFinalPackagesWithoutConda(absltest.TestCase): - def setUp(self) -> None: - self._temp_conda = None + @classmethod + def setUpClass(cls) -> None: + cls._temp_conda = None if sys.modules.get("conda"): - self._temp_conda = sys.modules["conda"] + cls._temp_conda = sys.modules["conda"] sys.modules["conda"] = None # type: ignore[assignment] - self.m_session = mock_session.MockSession(conn=None, test_case=self) + cls.m_session = mock_session.MockSession(conn=None, test_case=None) + cls.m_session.add_mock_sql( + query=textwrap.dedent( + """ + SHOW COLUMNS + LIKE 'runtime_version' + IN TABLE information_schema.packages; + """ + ), + result=mock_data_frame.MockDataFrame(count_result=0), + ) + + def setUp(self) -> None: self.add_packages( 
{basic_dep: [importlib_metadata.version(basic_dep)] for basic_dep in _model_meta._BASIC_DEPENDENCIES} ) - def tearDown(self) -> None: - if self._temp_conda: - sys.modules["conda"] = self._temp_conda + @classmethod + def tearDownClass(cls) -> None: + if cls._temp_conda: + sys.modules["conda"] = cls._temp_conda else: del sys.modules["conda"] @@ -73,7 +87,7 @@ def test_get_model_final_packages(self) -> None: def test_get_model_final_packages_no_relax(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas<1"] + name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] ) c_session = cast(session.Session, self.m_session) with self.assertWarns(RuntimeWarning): @@ -83,12 +97,12 @@ def test_get_model_final_packages_no_relax(self) -> None: def test_get_model_final_packages_relax(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas<1"] + name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] ) c_session = cast(session.Session, self.m_session) with self.assertWarns(RuntimeWarning): final_packages = _udf_util._get_model_final_packages(meta, c_session, relax_version=True) - self.assertListEqual(final_packages, _model_meta._BASIC_DEPENDENCIES) + self.assertListEqual(final_packages, sorted(_model_meta._BASIC_DEPENDENCIES)) def test_get_model_final_packages_with_pip(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} diff --git a/snowflake/ml/model/model_signature.py b/snowflake/ml/model/model_signature.py index 485603a8..6986ad27 100644 --- a/snowflake/ml/model/model_signature.py +++ b/snowflake/ml/model/model_signature.py @@ -2,39 +2,56 @@ import warnings from abc import ABC, abstractmethod from enum import Enum -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast +from typing import ( + Any, + Callable, + Dict, + Final, + Generic, + List, + Literal, + Optional, + Sequence, + Tuple, + Type, + Union, + final, +) import numpy as np import numpy.typing as npt import pandas as pd +from typing_extensions import TypeGuard +import snowflake.snowpark import snowflake.snowpark.types as spt +from snowflake.ml._internal.utils import identifier from snowflake.ml.model import type_hints as model_types class DataType(Enum): - def __init__(self, value: str, sql_type: str, snowpark_type: spt.DataType, numpy_type: npt.DTypeLike) -> None: + def __init__(self, value: str, sql_type: str, snowpark_type: Type[spt.DataType], numpy_type: npt.DTypeLike) -> None: self._value = value self._sql_type = sql_type self._snowpark_type = snowpark_type self._numpy_type = numpy_type - INT8 = ("int8", "INTEGER", spt.IntegerType(), np.int8) - INT16 = ("int16", "INTEGER", spt.IntegerType(), np.int16) - INT32 = ("int32", "INTEGER", spt.IntegerType(), np.int32) - INT64 = ("int64", "INTEGER", spt.IntegerType(), np.int64) + INT8 = ("int8", "INTEGER", spt.IntegerType, np.int8) + INT16 = ("int16", "INTEGER", spt.IntegerType, np.int16) + INT32 = ("int32", "INTEGER", spt.IntegerType, np.int32) + INT64 = ("int64", "INTEGER", spt.IntegerType, np.int64) - FLOAT = ("float", "FLOAT", spt.FloatType(), np.float32) - DOUBLE = ("double", "DOUBLE", spt.DoubleType(), np.float64) + FLOAT = ("float", "FLOAT", spt.FloatType, np.float32) + DOUBLE = ("double", "DOUBLE", spt.DoubleType, 
np.float64) - UINT8 = ("uint8", "INTEGER", spt.IntegerType(), np.uint8) - UINT16 = ("uint16", "INTEGER", spt.IntegerType(), np.uint16) - UINT32 = ("uint32", "INTEGER", spt.IntegerType(), np.uint32) - UINT64 = ("uint64", "INTEGER", spt.IntegerType(), np.uint64) + UINT8 = ("uint8", "INTEGER", spt.IntegerType, np.uint8) + UINT16 = ("uint16", "INTEGER", spt.IntegerType, np.uint16) + UINT32 = ("uint32", "INTEGER", spt.IntegerType, np.uint32) + UINT64 = ("uint64", "INTEGER", spt.IntegerType, np.uint64) - BOOL = ("bool", "BOOLEAN", spt.BooleanType(), np.bool8) - STRING = ("string", "VARCHAR", spt.StringType(), np.str0) - BYTES = ("bytes", "VARBINARY", spt.BinaryType(), np.bytes0) + BOOL = ("bool", "BOOLEAN", spt.BooleanType, np.bool8) + STRING = ("string", "VARCHAR", spt.StringType, np.str0) + BYTES = ("bytes", "VARBINARY", spt.BinaryType, np.bytes0) def as_sql_type(self) -> str: """Convert to corresponding Snowflake Logic Type. @@ -50,7 +67,7 @@ def as_snowpark_type(self) -> spt.DataType: Returns: A Snowpark type. """ - return self._snowpark_type + return self._snowpark_type() def __repr__(self) -> str: return f"DataType.{self.name}" @@ -75,6 +92,44 @@ def from_numpy_type(cls, np_type: npt.DTypeLike) -> "DataType": return np_to_snowml_type_mapping[potential_type] raise NotImplementedError(f"Type {np_type} is not supported as a DataType.") + @classmethod + def from_snowpark_type(cls, snowpark_type: spt.DataType) -> "DataType": + """Translate snowpark type to DataType for signature definition. + + Args: + snowpark_type: The snowpark type. + + Raises: + NotImplementedError: Raised when the given numpy type is not supported. + + Returns: + Corresponding DataType. + """ + snowpark_to_snowml_type_mapping: Dict[Type[spt.DataType], DataType] = { + spt._IntegralType: DataType.INT64, + **{i._snowpark_type: i for i in DataType if i._snowpark_type != spt.IntegerType}, + } + for potential_type in snowpark_to_snowml_type_mapping.keys(): + if isinstance(snowpark_type, potential_type): + return snowpark_to_snowml_type_mapping[potential_type] + raise NotImplementedError(f"Type {snowpark_type} is not supported as a DataType.") + + def is_same_snowpark_type(self, incoming_snowpark_type: spt.DataType) -> bool: + """Check if provided snowpark type is the same as Data Type. + Since for Snowflake all integer types are same, thus when datatype is a integer type, the incoming snowpark + type can be any type inherit from _IntegralType. + + Args: + incoming_snowpark_type: The snowpark type. + + Returns: + If the provided snowpark type is the same as the DataType. + """ + if self._snowpark_type == spt.IntegerType: + return isinstance(incoming_snowpark_type, spt._IntegralType) + else: + return isinstance(incoming_snowpark_type, self._snowpark_type) + class BaseFeatureSpec(ABC): """Abstract Class for specification of a feature.""" @@ -82,6 +137,7 @@ class BaseFeatureSpec(ABC): def __init__(self, name: str) -> None: self._name = name + @final @property def name(self) -> str: """Name of the feature.""" @@ -345,11 +401,277 @@ def __repr__(self) -> str: ) +class _BaseDataHandler(ABC, Generic[model_types._DataType]): + FEATURE_PREFIX: Final[str] = "feature" + INPUT_PREFIX: Final[str] = "input" + OUTPUT_PREFIX: Final[str] = "output" + + @staticmethod + @abstractmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._DataType]: + ... + + @staticmethod + @abstractmethod + def validate(data: model_types._DataType) -> None: + ... 
+ + @staticmethod + @abstractmethod + def infer_signature(data: model_types._DataType, role: Literal["input", "output"]) -> Sequence[FeatureSpec]: + ... + + @staticmethod + @abstractmethod + def convert_to_df(data: model_types._DataType) -> Union[pd.DataFrame, snowflake.snowpark.DataFrame]: + ... + + +class _PandasDataFrameHandler(_BaseDataHandler[pd.DataFrame]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[pd.DataFrame]: + return isinstance(data, pd.DataFrame) + + @staticmethod + def validate(data: pd.DataFrame) -> None: + df_cols = data.columns + if not all(hasattr(data[col], "dtype") for col in data.columns): + raise ValueError(f"Data Validation Error: Unknown column confronted in {data}.") + + if len(df_cols) == 0: + raise ValueError("Data Validation Error: Empty data is found.") + + if df_cols.has_duplicates: # Rule out categorical index with duplicates + raise ValueError("Data Validation Error: Duplicate column index is found.") + + if df_cols.dtype not in [np.int64, np.uint64, np.float64, np.object0]: + raise ValueError("Data Validation Error: Unsupported column index type is found.") + + df_col_dtypes = [data[col].dtype for col in data.columns] + for df_col, df_col_dtype in zip(df_cols, df_col_dtypes): + if df_col_dtype == np.dtype("O"): + # Check if all objects have the same type + if not all(isinstance(data_row, type(data[df_col][0])) for data_row in data[df_col]): + raise ValueError( + f"Data Validation Error: Inconsistent type of object found in column data {data[df_col]}." + ) + + if isinstance(data[df_col][0], list): + arr = _convert_list_to_ndarray(data[df_col][0]) + arr_dtype = DataType.from_numpy_type(arr.dtype) + + converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data[df_col]] + + if not all( + DataType.from_numpy_type(converted_data.dtype) == arr_dtype + for converted_data in converted_data_list + ): + raise ValueError( + "Data Validation Error: " + + f"Inconsistent type of object found in column data {data[df_col]}." + ) + + elif isinstance(data[df_col][0], np.ndarray): + arr_dtype = DataType.from_numpy_type(data[df_col][0].dtype) + + if not all(DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]): + raise ValueError( + "Data Validation Error: " + + f"Inconsistent type of object found in column data {data[df_col]}." 
+ ) + elif not isinstance(data[df_col][0], (str, bytes)): + raise ValueError(f"Data Validation Error: Unsupported type confronted in {data[df_col]}") + + @staticmethod + def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[FeatureSpec]: + feature_prefix = f"{_PandasDataFrameHandler.FEATURE_PREFIX}_" + df_cols = data.columns + if df_cols.dtype in [np.int64, np.uint64, np.float64]: + ft_names = [f"{feature_prefix}{i}" for i in df_cols] + else: + ft_names = list(map(str, data.columns.to_list())) + + df_col_dtypes = [data[col].dtype for col in data.columns] + + specs = [] + for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names): + if df_col_dtype == np.dtype("O"): + if isinstance(data[df_col][0], list): + arr = _convert_list_to_ndarray(data[df_col][0]) + arr_dtype = DataType.from_numpy_type(arr.dtype) + ft_shape = np.shape(data[df_col][0]) + + converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data[df_col]] + + if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list): + ft_shape = (-1,) + + specs.append(FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) + elif isinstance(data[df_col][0], np.ndarray): + arr_dtype = DataType.from_numpy_type(data[df_col][0].dtype) + ft_shape = np.shape(data[df_col][0]) + + if not all(np.shape(data_row) == ft_shape for data_row in data[df_col]): + ft_shape = (-1,) + + specs.append(FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) + elif isinstance(data[df_col][0], str): + specs.append(FeatureSpec(dtype=DataType.STRING, name=ft_name)) + elif isinstance(data[df_col][0], bytes): + specs.append(FeatureSpec(dtype=DataType.BYTES, name=ft_name)) + else: + specs.append(FeatureSpec(dtype=DataType.from_numpy_type(df_col_dtype), name=ft_name)) + return specs + + @staticmethod + @abstractmethod + def convert_to_df(data: pd.DataFrame) -> pd.DataFrame: + return data + + +class _NumpyArrayHandler(_BaseDataHandler[model_types._SupportedNumpyArray]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedNumpyArray]: + return isinstance(data, np.ndarray) + + @staticmethod + def validate(data: model_types._SupportedNumpyArray) -> None: + if data.shape == (0,): + # Empty array + raise ValueError("Data Validation Error: Empty data is found.") + + if data.shape == (): + # scalar + raise ValueError("Data Validation Error: Scalar data is found.") + + @staticmethod + def infer_signature( + data: model_types._SupportedNumpyArray, role: Literal["input", "output"] + ) -> Sequence[FeatureSpec]: + feature_prefix = f"{_PandasDataFrameHandler.FEATURE_PREFIX}_" + dtype = DataType.from_numpy_type(data.dtype) + if len(data.shape) == 1: + return [FeatureSpec(dtype=dtype, name=f"{feature_prefix}0")] + else: + # For high-dimension array, 0-axis is for batch, 1-axis is for column, further more is details of columns. 
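# Example of the resulting specs for a 3-D input (mirrors the unit test added
# later in this patch): the 0-axis is treated as the batch, the 1-axis as the
# columns, and the remaining axes become each feature's shape, e.g.
#   np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])   # shape (2, 2, 2)
# infers to
#   [FeatureSpec("feature_0", DataType.INT64, shape=(2,)),
#    FeatureSpec("feature_1", DataType.INT64, shape=(2,))]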
+ features = [] + n_cols = data.shape[1] + ft_names = [f"{feature_prefix}{i}" for i in range(n_cols)] + for col_data, ft_name in zip(data[0], ft_names): + if isinstance(col_data, np.ndarray): + ft_shape = np.shape(col_data) + features.append(FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) + else: + features.append(FeatureSpec(dtype=dtype, name=ft_name)) + return features + + @staticmethod + def convert_to_df(data: model_types._SupportedNumpyArray) -> pd.DataFrame: + if len(data.shape) == 1: + data = np.expand_dims(data, axis=1) + return pd.DataFrame(data) + + +class _ListOfNumpyArrayHandler(_BaseDataHandler[List[model_types._SupportedNumpyArray]]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[List[model_types._SupportedNumpyArray]]: + return ( + isinstance(data, list) + and len(data) > 0 + and all(_NumpyArrayHandler.can_handle(data_col) for data_col in data) + ) + + @staticmethod + def validate(data: List[model_types._SupportedNumpyArray]) -> None: + for data_col in data: + _NumpyArrayHandler.validate(data_col) + + @staticmethod + def infer_signature( + data: List[model_types._SupportedNumpyArray], role: Literal["input", "output"] + ) -> Sequence[FeatureSpec]: + features: List[FeatureSpec] = [] + + for i, data_col in enumerate(data): + inferred_res = _NumpyArrayHandler.infer_signature(data_col, role) + for ft in inferred_res: + additional_prefix = ( + _ListOfNumpyArrayHandler.OUTPUT_PREFIX + if role == "output" + else _ListOfNumpyArrayHandler.INPUT_PREFIX + ) + ft._name = f"{additional_prefix}_{i}_{ft._name}" + features.extend(inferred_res) + return features + + @staticmethod + def convert_to_df(data: List[model_types._SupportedNumpyArray]) -> pd.DataFrame: + arr = np.concatenate(data, axis=1) + return pd.DataFrame(arr) + + +class _ListOfBuiltinHandler(_BaseDataHandler[model_types._SupportedBuiltinsList]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedBuiltinsList]: + return ( + isinstance(data, list) + and len(data) > 0 + and all(isinstance(data_col, (int, float, bool, str, bytes, list)) for data_col in data) + ) + + @staticmethod + def validate(data: model_types._SupportedBuiltinsList) -> None: + if not all(isinstance(data_row, type(data[0])) for data_row in data): + raise ValueError(f"Data Validation Error: Inconsistent type of object found in data {data}.") + df = pd.DataFrame(data) + if df.isnull().values.any(): + raise ValueError(f"Data Validation Error: Ill-shaped list data {data} confronted.") + + @staticmethod + def infer_signature( + data: model_types._SupportedBuiltinsList, role: Literal["input", "output"] + ) -> Sequence[FeatureSpec]: + return _PandasDataFrameHandler.infer_signature(pd.DataFrame(data), role) + + @staticmethod + def convert_to_df(data: model_types._SupportedBuiltinsList) -> pd.DataFrame: + return pd.DataFrame(data) + + +class _SnowparkDataFrameHandler(_BaseDataHandler[snowflake.snowpark.DataFrame]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[snowflake.snowpark.DataFrame]: + return isinstance(data, snowflake.snowpark.DataFrame) + + @staticmethod + def validate(data: snowflake.snowpark.DataFrame) -> None: + schema = data.schema + for field in schema.fields: + if not any(type.is_same_snowpark_type(field.datatype) for type in DataType): + raise ValueError( + f"Data Validation Error: Unsupported data type {field.datatype} in column {field.name}." 
+ ) + + @staticmethod + def infer_signature(data: snowflake.snowpark.DataFrame, role: Literal["input", "output"]) -> Sequence[FeatureSpec]: + features: List[FeatureSpec] = [] + schema = data.schema + for field in schema.fields: + name = identifier.remove_quote_if_quoted(field.name) + features.append(FeatureSpec(name=name, dtype=DataType.from_snowpark_type(field.datatype))) + return features + + @staticmethod + def convert_to_df(data: snowflake.snowpark.DataFrame) -> snowflake.snowpark.DataFrame: + return data + + +_LOCAL_DATA_HANDLERS = [_PandasDataFrameHandler, _NumpyArrayHandler, _ListOfNumpyArrayHandler, _ListOfBuiltinHandler] +_ALL_DATA_HANDLERS = _LOCAL_DATA_HANDLERS + [_SnowparkDataFrameHandler] + + def _infer_signature( - data: model_types.SupportedDataType, - feature_prefix: str = "feature_", - output_prefix: str = "output_", - is_output: bool = False, + data: model_types.SupportedLocalDataType, role: Literal["input", "output"] ) -> Sequence[FeatureSpec]: """Infer the inputs/outputs signature given a data that could be dataframe, numpy array or list. Dispatching is used to separate logic for different types. @@ -357,11 +679,7 @@ def _infer_signature( Args: data: The data that we want to infer signature from. - feature_prefix: a prefix string to added before the column name to distinguish them as a fallback. - Defaults to "feature_". - output_prefix: a prefix string to added in multi-output case before the column name to distinguish them as a - fallback. Defaults to "output_". - is_output: a flag indicating that if this is to infer an output feature. + role: a flag indicating that if this is to infer an input or output feature. Raises: NotImplementedError: Raised when an unsupported data type is provided. @@ -369,18 +687,10 @@ def _infer_signature( Returns: A sequence of feature specifications and feature group specifications. """ - if isinstance(data, pd.DataFrame): - return _infer_signature_pd_DataFrame(data, feature_prefix=feature_prefix) - if isinstance(data, np.ndarray): - return _infer_signature_np_ndarray(data, feature_prefix=feature_prefix) - if isinstance(data, list) and len(data) > 0: - if is_output and all(isinstance(data_col, np.ndarray) for data_col in data): - # Added because mypy still claiming that data has a wider type than - # Sequence[model_types._SupportedNumpyArray] since we don't have pandas stubs. - data = cast(Sequence[model_types._SupportedNumpyArray], data) - return _infer_signature_list_multioutput(data, feature_prefix=feature_prefix, output_prefix=output_prefix) - else: - return _infer_signature_list_builtins(data, feature_prefix=feature_prefix) + for handler in _ALL_DATA_HANDLERS: + if handler.can_handle(data): + handler.validate(data) + return handler.infer_signature(data, role) raise NotImplementedError( f"Unable to infer model signature: Un-supported type provided {type(data)} for X type inference." ) @@ -416,243 +726,193 @@ def _convert_list_to_ndarray(data: List[Any]) -> npt.NDArray[Any]: return arr -def _infer_signature_pd_DataFrame(data: pd.DataFrame, feature_prefix: str = "feature_") -> Sequence[FeatureSpec]: - """If a dataframe is provided, its index will be used to name these features. Children features specifications are - are created according to dataframe type. If it is simple type, then scalar feature specification would be created. - If it is Python list, then a feature specification with shape are created correspondingly. 
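# Illustrative sketch (not part of the patch): the handler registry above replaces
# the per-type _infer_signature_* helpers removed below. The public infer_signature
# API dispatches to the first handler whose can_handle() accepts the data. Module
# path and expected results follow this diff and its tests; treat it as a sketch.
import numpy as np
import pandas as pd
from snowflake.ml.model import model_signature

sig = model_signature.infer_signature(
    input_data=pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}),  # named columns are kept
    output_data=np.array([0.1, 0.9]),                         # arrays get "feature_<i>" names
)
# Expected inference:
#   inputs  -> [FeatureSpec("a", DataType.INT64), FeatureSpec("b", DataType.STRING)]
#   outputs -> [FeatureSpec("feature_0", DataType.DOUBLE)]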
+def _rename_features( + features: Sequence[FeatureSpec], feature_names: Optional[List[str]] = None +) -> Sequence[FeatureSpec]: + """It renames the feature in features provided optional feature names. Args: - data: The data that we want to infer signature from. - feature_prefix: a prefix string to added before the column name to distinguish them as a fallback. - Defaults to "feature_". + features: A sequence of feature specifications and feature group specifications. + feature_names: A list of names to assign to features and feature groups. Defaults to None. Raises: - NotImplementedError: Raised when an unsupported column is provided. - ValueError: Raised when an empty data is provided. - NotImplementedError: Raised when an unsupported column index type is provided. - - ValueError: Raised when an object column have different Python object types. - - ValueError: Raised when an column of list have different element types. - NotImplementedError: Raised when an column of list have different variant shapes. - - ValueError: Raised when an column of array have different element types. - NotImplementedError: Raised when an column of array have different variant shapes. - - NotImplementedError: Raised when an unsupported data type is provided. + ValueError: Raised when provided feature_names does not match the data shape. Returns: - A sequence of feature specifications and feature group specifications. + A sequence of feature specifications and feature group specifications being renamed if names provided. """ - df_cols = data.columns - if not all(hasattr(data[col], "dtype") for col in data.columns): - raise NotImplementedError(f"Unable to infer signature: Unsupported column confronted in {data}.") - - if len(df_cols) == 0: - raise ValueError("Unable to construct signature: Empty data is found.") - - df_col_dtypes = [data[col].to_numpy().dtype for col in data.columns] - if df_cols.dtype == np.dtype("O"): # List of String index - ft_names = list(map(str, data.columns.to_list())) - elif isinstance(df_cols, pd.RangeIndex): - ft_names = [f"{feature_prefix}{i}" for i in df_cols] - elif isinstance(df_cols, pd.CategoricalIndex): - raise NotImplementedError( - f"Unable to infer model signature: Unsupported column index type confronted in {df_cols}." - ) - else: - ft_names = [str(x) for x in df_cols.to_list()] - - specs = [] - for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names): - if df_col_dtype == np.dtype("O"): - # Check if all objects have the same type - if not all(isinstance(data_row, type(data[df_col][0])) for data_row in data[df_col]): - raise ValueError( - "Unable to construct model signature: " - + f"Inconsistent type of object found in column data {data[df_col]}." - ) - - if isinstance(data[df_col][0], list): - arr = _convert_list_to_ndarray(data[df_col][0]) - arr_dtype = DataType.from_numpy_type(arr.dtype) - ft_shape = np.shape(data[df_col][0]) - - converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data[df_col]] - - if not all( - DataType.from_numpy_type(converted_data.dtype) == arr_dtype - for converted_data in converted_data_list - ): - raise ValueError( - "Unable to construct model signature: " - + f"Inconsistent type of object found in column data {data[df_col]}." - ) - - if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list): - raise NotImplementedError( - "Unable to infer model signature: " - + f"Inconsistent shape of element found in column data {data[df_col]}. 
" - + "Model signature infer for variant length feature is not currently supported. " - + "Consider specify the model signature manually." - ) - - specs.append(FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) - elif isinstance(data[df_col][0], np.ndarray): - arr_dtype = DataType.from_numpy_type(data[df_col][0].dtype) - ft_shape = np.shape(data[df_col][0]) - - if not all(DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]): - raise ValueError( - "Unable to construct model signature: " - + f"Inconsistent type of object found in column data {data[df_col]}." - ) - - if not all(np.shape(data_row) == ft_shape for data_row in data[df_col]): - raise NotImplementedError( - "Unable to infer model signature: " - + f"Inconsistent shape of element found in column data {data[df_col]}. " - + "Model signature infer for variant length feature is not currently supported. " - + "Consider specify the model signature manually." - ) - - specs.append(FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) - elif isinstance(data[df_col][0], str): - specs.append(FeatureSpec(dtype=DataType.STRING, name=ft_name)) - elif isinstance(data[df_col][0], bytes): - specs.append(FeatureSpec(dtype=DataType.BYTES, name=ft_name)) - else: - raise NotImplementedError(f"Unsupported type confronted in {data[df_col]}") + if feature_names: + if len(feature_names) == len(features): + for ft, ft_name in zip(features, feature_names): + ft._name = ft_name else: - specs.append(FeatureSpec(dtype=DataType.from_numpy_type(df_col_dtype), name=ft_name)) - return specs + raise ValueError( + f"{len(feature_names)} feature names are provided, while there are {len(features)} features." + ) + return features -def _infer_signature_np_ndarray( - data: model_types._SupportedNumpyArray, feature_prefix: str = "feature_" -) -> Sequence[FeatureSpec]: - """If a numpy array is provided, `feature_name` if provided is used to name features, otherwise, name like - `feature_0` `feature_1` will be generated and assigned. +def _rename_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) -> pd.DataFrame: + """It renames pandas dataframe that has non-object column index with provided features. Args: - data: The data that we want to infer signature from. - feature_prefix: a prefix string to added before the column name to distinguish them as a fallback. - Defaults to "feature_". + data: A pandas dataframe to be renamed. + features: A sequence of feature specifications and feature group specifications to rename the dataframe. Raises: - ValueError: Raised when an empty data is provided. - ValueError: Raised when a scalar data is provided. + ValueError: Raised when the data does not have the same number of features as signature. Returns: - A sequence of feature specifications and feature group specifications. + A pandas dataframe with columns renamed. """ - if data.shape == (0,): - # Empty array - raise ValueError("Unable to construct signature: Empty data is found. Unable to infer signature.") - - if data.shape == (): - # scalar - raise ValueError("Unable to construct signature: Scalar data is found. Unable to infer signature.") - dtype = DataType.from_numpy_type(data.dtype) - if len(data.shape) == 1: - return [FeatureSpec(dtype=dtype, name=f"{feature_prefix}0")] - else: - # For high-dimension array, 0-axis is for batch, 1-axis is for column, further more is details of columns. 
- features = [] - n_cols = data.shape[1] - ft_names = [f"{feature_prefix}{i}" for i in range(n_cols)] - for col_data, ft_name in zip(data[0], ft_names): - if isinstance(col_data, np.ndarray): - ft_shape = np.shape(col_data) - features.append(FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) - else: - features.append(FeatureSpec(dtype=dtype, name=ft_name)) - return features + df_cols = data.columns + if df_cols.dtype in [np.int64, np.uint64, np.float64]: + if len(features) != len(data.columns): + raise ValueError( + "Data does not have the same number of features as signature. " + + f"Signature requires {len(features)} features, but have {len(data.columns)} in input data." + ) + data.columns = pd.Index([feature.name for feature in features]) + return data -def _infer_signature_list_builtins( - data: Union[Sequence[model_types._SupportedNumpyArray], model_types._SupportedBuiltinsList], - feature_prefix: str = "feature_", -) -> Sequence[FeatureSpec]: - """If a list or a nested list of built-in types are provided, we treat them as a pd.DataFrame. - Before that we check if all elements have the same type. - After converting to dataframe, if the original data has ill shape, there would be nan or None. +def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) -> None: + """It validates pandas dataframe with provided features. Args: - data: The data that we want to infer signature from. - feature_prefix: a prefix string to added before the column name to distinguish them as a fallback. - Defaults to "feature_". + data: A pandas dataframe to be validated. + features: A sequence of feature specifications and feature group specifications, where the dataframe should fit. Raises: - ValueError: Raised when the list have different Python object types. - ValueError: Raised when converted dataframe has nan or None, meaning that the original data is ill-shaped. - - Returns: - A sequence of feature specifications and feature group specifications. + ValueError: Raised when a feature cannot be found. + ValueError: Raised when feature is scalar but confront list element. + ValueError: Raised when feature type is not aligned in list element. + ValueError: Raised when feature shape is not aligned in list element. + ValueError: Raised when feature is scalar but confront array element. + ValueError: Raised when feature type is not aligned in numpy array element. + ValueError: Raised when feature shape is not aligned in numpy array element. + ValueError: Raised when feature type is not aligned in string element. + ValueError: Raised when feature type is not aligned in bytes element. + ValueError: Raised when feature type is not met. 
""" - if not all(isinstance(data_row, type(data[0])) for data_row in data): - raise ValueError(f"Unable to construct model signature: Inconsistent type of object found in data {data}.") - df = pd.DataFrame(data) - if df.isnull().values.any(): - raise ValueError(f"Unable to construct model signature: Ill-shaped list data {data} confronted.") - return _infer_signature_pd_DataFrame(df, feature_prefix=feature_prefix) - + _features: List[FeatureSpec] = [] + for feature in features: + if isinstance(feature, FeatureSpec): + _features.append(feature) + elif isinstance(feature, FeatureGroupSpec): + _features.extend(feature._specs) + for feature in _features: + ft_name = feature.name + try: + data_col = data[ft_name] + except KeyError: + raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") + + df_col_dtype = data_col.dtype + ft_type = feature._dtype + if df_col_dtype == np.dtype("O"): + if isinstance(data_col[0], list): + ft_shape = feature._shape + if not ft_shape: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a scalar feature while list data is provided." + ) -def _infer_signature_list_multioutput( - data: Sequence[model_types._SupportedNumpyArray], feature_prefix: str = "feature_", output_prefix: str = "output_" -) -> Sequence[FeatureSpec]: - """If a Python list is provided, which will happen if user packs a multi-output model. In this case, - _infer_signature is called for every element of the list, and a output prefix like `output_0_` `output_1_` would be - added to the name. All children feature specifications would be flatten. + converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data_col] - Args: - data: The data that we want to infer signature from. - feature_prefix: a prefix string to added before the column name to distinguish them as a fallback. - Defaults to "feature_". - output_prefix: a prefix string to added in multi-output case before the column name to distinguish them as a - fallback. Defaults to "output_". + if not all( + DataType.from_numpy_type(converted_data.dtype) == ft_type for converted_data in converted_data_list + ): + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ) - Returns: - A sequence of feature specifications and feature group specifications. - """ - # If the estimator is a multi-output estimator, output will be a list of ndarrays. - features: List[FeatureSpec] = [] - - for i, d in enumerate(data): - inferred_res = _infer_signature(d, feature_prefix=feature_prefix, output_prefix=output_prefix) - for ft in inferred_res: - ft._name = f"{output_prefix}{i}_{ft._name}" - features.extend(inferred_res) - return features + if ft_shape and ft_shape != (-1,): + if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list): + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature shape {ft_shape} is not met by all elements in {data_col}." + ) + elif isinstance(data_col[0], np.ndarray): + ft_shape = feature._shape + if not ft_shape: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a scalar feature while array data is provided." + ) + if not all(DataType.from_numpy_type(data_row.dtype) == ft_type for data_row in data_col): + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." 
+ ) -def _rename_features( - features: Sequence[FeatureSpec], feature_names: Optional[List[str]] = None -) -> Sequence[FeatureSpec]: - """It renames the feature in features provided optional feature names. + ft_shape = feature._shape + if ft_shape and ft_shape != (-1,): + if not all(np.shape(data_row) == ft_shape for data_row in data_col): + ft_shape = (-1,) + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature shape {ft_shape} is not met by all elements in {data_col}." + ) + elif isinstance(data_col[0], str): + if ft_type != DataType.STRING: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ) + elif isinstance(data_col[0], bytes): + if ft_type != DataType.BYTES: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ) + else: + if ft_type != DataType.from_numpy_type(df_col_dtype): + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ) - Args: - features: A sequence of feature specifications and feature group specifications. - feature_names: A list of names to assign to features and feature groups. Defaults to None. - Raises: - ValueError: Raised when provided feature_names does not match the data shape. +def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequence[BaseFeatureSpec]) -> None: + _features: List[FeatureSpec] = [] + for feature in features: + if isinstance(feature, FeatureSpec): + _features.append(feature) + elif isinstance(feature, FeatureGroupSpec): + _features.extend(feature._specs) + schema = data.schema + for feature in _features: + ft_name = feature.name + found = False + for field in schema.fields: + name = identifier.remove_quote_if_quoted(field.name) + if name == ft_name: + found = True + if field.nullable: + warnings.warn( + f"Warn in feature {ft_name}: Nullable column {field.name} provided," + + " inference might fail if there is null value.", + category=RuntimeWarning, + ) - Returns: - A sequence of feature specifications and feature group specifications being renamed if names provided. - """ - if feature_names: - if len(feature_names) == len(features): - for ft, ft_name in zip(features, feature_names): - ft._name = ft_name - else: - raise ValueError( - f"{len(feature_names)} feature names are provided, while there are {len(features)} features." - ) - return features + ft_type = feature._dtype + if not ft_type.is_same_snowpark_type(field.datatype): + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by column {field.name}." + ) + break + if not found: + raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") -def _validate_data_with_features_and_convert_to_df(features: Sequence[BaseFeatureSpec], data: Any) -> pd.DataFrame: +def _convert_and_validate_local_data( + data: model_types.SupportedDataType, features: Sequence[BaseFeatureSpec] +) -> pd.DataFrame: """Validate the data with features in model signature and convert to DataFrame Args: @@ -660,62 +920,28 @@ def _validate_data_with_features_and_convert_to_df(features: Sequence[BaseFeatur data: The provided data. Raises: - ValueError: Raised when input data is empty dataframe. - ValueError: Raised when input data is empty array. - ValueError: Raised when input data is scalar. 
- ValueError: Raised when input data is list with different types. - ValueError: Raised when input data is ill-shaped list. - NotImplementedError: Raised when input data has unsupported types. - ValueError: Raised when input data has different number of features as the features required. + ValueError: Raised when data cannot be handled by any data handler. Returns: The converted dataframe with renamed column index. """ - keep_columns = False - if isinstance(data, pd.DataFrame): - df_cols = data.columns - - if len(df_cols) == 0: - raise ValueError("Empty dataframe is invalid input data.") - if df_cols.dtype == np.dtype("O"): - # List of String index, users should take care about names - keep_columns = True - df = data - elif isinstance(data, np.ndarray): - if data.shape == (0,): - # Empty array - raise ValueError("Empty array is invalid input data.") - - if data.shape == (): - # scalar - raise ValueError("Scalar is invalid input data.") + df = None + for handler in _LOCAL_DATA_HANDLERS: + if handler.can_handle(data): + handler.validate(data) + df = handler.convert_to_df(data) + if df is None: + raise ValueError(f"Data Validation Error: Un-supported type {type(data)} provided.") + assert isinstance(df, pd.DataFrame) + df = _rename_pandas_df(df, features) + _validate_pandas_df(df, features) - if len(data.shape) == 1: - data = np.expand_dims(data, axis=1) - df = pd.DataFrame(data) - elif isinstance(data, list) and len(data) > 0: - if not all(isinstance(data_row, type(data[0])) for data_row in data): - raise ValueError("List of data with different types is invalid input data.") - df = pd.DataFrame(data) - if df.isnull().values.any(): - raise ValueError("Ill-shaped list is invalid input data.") - else: - raise NotImplementedError(f"Unable to validate data: Un-supported type provided {type(data)} as X.") - - # Rename if that data may have name inferred if provided to infer signature - if not keep_columns: - if len(features) != len(df.columns): - raise ValueError( - "Input data does not have the same number of features as signature. " - + f"Signature requires {len(features)} features, but have {len(df.columns)} in input data." - ) - df.columns = pd.Index([feature.name for feature in features]) return df def infer_signature( - input_data: model_types.SupportedDataType, - output_data: model_types.SupportedDataType, + input_data: model_types.SupportedLocalDataType, + output_data: model_types.SupportedLocalDataType, input_feature_names: Optional[List[str]] = None, output_feature_names: Optional[List[str]] = None, ) -> ModelSignature: @@ -746,8 +972,8 @@ def infer_signature( Returns: A model signature. 
""" - inputs = _infer_signature(input_data) + inputs = _infer_signature(input_data, role="input") inputs = _rename_features(inputs, input_feature_names) - outputs = _infer_signature(output_data, is_output=True) + outputs = _infer_signature(output_data, role="output") outputs = _rename_features(outputs, output_feature_names) return ModelSignature(inputs, outputs) diff --git a/snowflake/ml/model/model_signature_test.py b/snowflake/ml/model/model_signature_test.py index bdf1dbcc..904cfa22 100644 --- a/snowflake/ml/model/model_signature_test.py +++ b/snowflake/ml/model/model_signature_test.py @@ -4,6 +4,8 @@ import snowflake.snowpark.types as spt from snowflake.ml.model import model_signature +from snowflake.ml.utils import connection_params +from snowflake.snowpark import Session class DataTypeTest(absltest.TestCase): @@ -145,38 +147,61 @@ def test_2(self) -> None: self.assertEqual(s, eval(repr(s), model_signature.__dict__)) self.assertEqual(s, model_signature.ModelSignature.from_dict(s.to_dict())) - def test_infer_signature_pd_DataFrame(self) -> None: + +class PandasDataFrameHandlerTest(absltest.TestCase): + def test_validate_pd_DataFrame(self) -> None: df = pd.DataFrame([]) with self.assertRaises(ValueError): - self.assertEmpty(model_signature._infer_signature_pd_DataFrame(df)) + model_signature._PandasDataFrameHandler.validate(df) + + sub_df = pd.DataFrame([2.5, 6.8]) + df = pd.DataFrame([[1, sub_df], [2, sub_df]], columns=["a", "b"]) + with self.assertRaises(ValueError): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame( + [[1, 2.0, 1, 2.0, 1, 2.0], [2, 4.0, 2, 4.0, 2, 4.0]], + columns=pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), + ) + with self.assertRaises(ValueError): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaises(ValueError): + model_signature._PandasDataFrameHandler.validate(df) + df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2, 6])]], columns=["a", "b"]) + with self.assertRaises(ValueError): + model_signature._PandasDataFrameHandler.validate(df) + + def test_infer_signature_pd_DataFrame(self) -> None: df = pd.DataFrame([1, 2, 3, 4]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], ) df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [model_signature.FeatureSpec("a", model_signature.DataType.INT64)], ) df = pd.DataFrame(["a", "b", "c", "d"], columns=["a"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [model_signature.FeatureSpec("a", model_signature.DataType.STRING)], ) df = pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [model_signature.FeatureSpec("a", model_signature.DataType.BYTES)], ) df = pd.DataFrame([[1, 2.0], [2, 4.0]]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("feature_0", 
model_signature.DataType.INT64), model_signature.FeatureSpec("feature_1", model_signature.DataType.DOUBLE), @@ -185,24 +210,25 @@ def test_infer_signature_pd_DataFrame(self) -> None: df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("a", model_signature.DataType.INT64), model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2,)), ], ) - df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaises(ValueError): - model_signature._infer_signature_pd_DataFrame(df) - df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5]]], columns=["a", "b"]) - with self.assertRaises(NotImplementedError): - model_signature._infer_signature_pd_DataFrame(df) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), + [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(-1,)), + ], + ) df = pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("a", model_signature.DataType.INT64), model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2, 1)), @@ -212,39 +238,35 @@ def test_infer_signature_pd_DataFrame(self) -> None: a = np.array([2.5, 6.8]) df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("a", model_signature.DataType.INT64), model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2,)), ], ) - df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2, 6])]], columns=["a", "b"]) - with self.assertRaises(ValueError): - model_signature._infer_signature_pd_DataFrame(df) - df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5])]], columns=["a", "b"]) - with self.assertRaises(NotImplementedError): - model_signature._infer_signature_pd_DataFrame(df) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), + [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(-1,)), + ], + ) a = np.array([[2, 5], [6, 8]]) df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("a", model_signature.DataType.INT64), model_signature.FeatureSpec("b", model_signature.DataType.INT64, shape=(2, 2)), ], ) - sub_df = pd.DataFrame([2.5, 6.8]) - df = pd.DataFrame([[1, sub_df], [2, sub_df]], columns=["a", "b"]) - with self.assertRaises(NotImplementedError): - model_signature._infer_signature_pd_DataFrame(df) - df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3])) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("2000Q1", 
model_signature.DataType.INT64), model_signature.FeatureSpec("2002Q3", model_signature.DataType.DOUBLE), @@ -253,7 +275,7 @@ def test_infer_signature_pd_DataFrame(self) -> None: df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.date_range("2020-01-06", "2020-03-03", freq="MS")) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("2020-02-01 00:00:00", model_signature.DataType.INT64), model_signature.FeatureSpec("2020-03-01 00:00:00", model_signature.DataType.DOUBLE), @@ -264,7 +286,7 @@ def test_infer_signature_pd_DataFrame(self) -> None: [[1, 2.0], [2, 4.0]], columns=pd.TimedeltaIndex(data=["1 days 02:00:00", "1 days 06:05:01.000030"]) ) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("1 days 02:00:00", model_signature.DataType.INT64), model_signature.FeatureSpec("1 days 06:05:01.000030", model_signature.DataType.DOUBLE), @@ -273,44 +295,44 @@ def test_infer_signature_pd_DataFrame(self) -> None: df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.interval_range(start=0, end=2)) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("(0, 1]", model_signature.DataType.INT64), model_signature.FeatureSpec("(1, 2]", model_signature.DataType.DOUBLE), ], ) - df = pd.DataFrame( - [[1, 2.0, 1, 2.0, 1, 2.0], [2, 4.0, 2, 4.0, 2, 4.0]], - columns=pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), - ) - with self.assertRaises(NotImplementedError): - model_signature._infer_signature_pd_DataFrame(df) - arrays = [[1, 2], ["red", "blue"]] df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.MultiIndex.from_arrays(arrays, names=("number", "color"))) self.assertListEqual( - model_signature._infer_signature_pd_DataFrame(df), + model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ model_signature.FeatureSpec("(1, 'red')", model_signature.DataType.INT64), model_signature.FeatureSpec("(2, 'blue')", model_signature.DataType.DOUBLE), ], ) - def test_infer_signature_np_ndarray(self) -> None: + +class NumpyArrayHandlerTest(absltest.TestCase): + def test_validate_np_ndarray(self) -> None: arr = np.array([]) with self.assertRaises(ValueError): - self.assertEmpty(model_signature._infer_signature_np_ndarray(arr)) + model_signature._NumpyArrayHandler.validate(arr) + + arr = np.array(1) + with self.assertRaises(ValueError): + model_signature._NumpyArrayHandler.validate(arr) + def test_infer_schema_np_ndarray(self) -> None: arr = np.array([1, 2, 3, 4]) self.assertListEqual( - model_signature._infer_signature_np_ndarray(arr), + model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], ) arr = np.array([[1, 2], [3, 4]]) self.assertListEqual( - model_signature._infer_signature_np_ndarray(arr), + model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), [ model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), @@ -319,28 +341,34 @@ def test_infer_signature_np_ndarray(self) -> None: arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) self.assertListEqual( - model_signature._infer_signature_np_ndarray(arr), + 
model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), [ model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64, shape=(2,)), model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64, shape=(2,)), ], ) - def test_infer_signature_list_of(self) -> None: + +class ListOfNumpyArrayHandlerTest(absltest.TestCase): + def test_validate_list_of_numpy_array(self) -> None: + lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] + self.assertFalse(model_signature._ListOfNumpyArrayHandler.can_handle(lt8)) + + def test_infer_signature_list_of_numpy_array(self) -> None: arr = np.array([1, 2, 3, 4]) lt = [arr, arr] self.assertListEqual( - model_signature._infer_signature_list_multioutput(lt), + model_signature._ListOfNumpyArrayHandler.infer_signature(lt, role="input"), [ - model_signature.FeatureSpec("output_0_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("output_1_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_0_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_1_feature_0", model_signature.DataType.INT64), ], ) arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) lt = [arr, arr] self.assertListEqual( - model_signature._infer_signature_list_multioutput(lt), + model_signature._ListOfNumpyArrayHandler.infer_signature(lt, role="output"), [ model_signature.FeatureSpec("output_0_feature_0", model_signature.DataType.INT64, shape=(2,)), model_signature.FeatureSpec("output_0_feature_1", model_signature.DataType.INT64, shape=(2,)), @@ -349,28 +377,38 @@ def test_infer_signature_list_of(self) -> None: ], ) + +class ListOfBuiltinsHandlerTest(absltest.TestCase): + def test_validate_list_builtins(self) -> None: + lt7 = [[1], [2, 3]] + with self.assertRaises(ValueError): + model_signature._ListOfBuiltinHandler.validate(lt7) + + lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] + self.assertFalse(model_signature._ListOfBuiltinHandler.can_handle(lt8)) + def test_infer_signature_list_builtins(self) -> None: lt1 = [1, 2, 3, 4] self.assertListEqual( - model_signature._infer_signature_list_builtins(lt1), + model_signature._ListOfBuiltinHandler.infer_signature(lt1, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], ) lt2 = ["a", "b", "c", "d"] self.assertListEqual( - model_signature._infer_signature_list_builtins(lt2), + model_signature._ListOfBuiltinHandler.infer_signature(lt2, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.STRING)], ) lt3 = [ele.encode() for ele in lt2] self.assertListEqual( - model_signature._infer_signature_list_builtins(lt3), + model_signature._ListOfBuiltinHandler.infer_signature(lt3, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.BYTES)], ) lt4 = [[1, 2], [3, 4]] self.assertListEqual( - model_signature._infer_signature_list_builtins(lt4), + model_signature._ListOfBuiltinHandler.infer_signature(lt4, role="input"), [ model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), @@ -379,7 +417,7 @@ def test_infer_signature_list_builtins(self) -> None: lt5 = [[1, 2.0], [3, 4]] # This is not encouraged and will have type error, but we support it. 
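# pandas builds per-column dtypes here, so the mixed column [2.0, 4] is upcast to
# float64 while [1, 3] stays int64; the inferred specs below are therefore INT64
# and DOUBLE. The `# type: ignore` on the call only silences the static type
# mismatch; the runtime inference still succeeds.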
self.assertListEqual( - model_signature._infer_signature_list_builtins(lt5), # type:ignore[arg-type] + model_signature._ListOfBuiltinHandler.infer_signature(lt5, role="input"), # type:ignore[arg-type] [ model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), model_signature.FeatureSpec("feature_1", model_signature.DataType.DOUBLE), @@ -388,21 +426,73 @@ def test_infer_signature_list_builtins(self) -> None: lt6 = [[[1, 1], [2, 2]], [[3, 3], [4, 4]]] self.assertListEqual( - model_signature._infer_signature_list_builtins(lt6), + model_signature._ListOfBuiltinHandler.infer_signature(lt6, role="input"), [ model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64, shape=(2,)), model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64, shape=(2,)), ], ) - lt7 = [[1], [2, 3]] + +class SnowParkDataFrameHandlerTest(absltest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + @classmethod + def tearDownClass(cls) -> None: + cls._session.close() + + def test_validate_snowpark_df(self) -> None: + schema = spt.StructType([spt.StructField('"a"', spt.VariantType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) with self.assertRaises(ValueError): - model_signature._infer_signature_list_builtins(lt7) + model_signature._SnowparkDataFrameHandler.validate(df) - lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] - with self.assertRaises(NotImplementedError): - model_signature._infer_signature_list_builtins(lt8) + def test_infer_schema_snowpark_df(self) -> None: + schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + self.assertListEqual( + model_signature._SnowparkDataFrameHandler.infer_signature(df, role="input"), + [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.STRING), + ], + ) + def test_validate_data_with_features(self) -> None: + fts = [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.INT64), + ] + df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) + with self.assertWarns(RuntimeWarning): + model_signature._validate_snowpark_data(df, fts) + + fts = [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.STRING), + ] + schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + model_signature._validate_snowpark_data(df, fts) + + schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.IntegerType())]) + df = self._session.create_dataframe([[1, 3], [3, 9]], schema) + with self.assertRaises(ValueError): + model_signature._validate_snowpark_data(df, fts) + + schema = spt.StructType([spt.StructField('"a1"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + with self.assertRaises(ValueError): + model_signature._validate_snowpark_data(df, fts) + + df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) + with self.assertRaises(ValueError): + 
model_signature._validate_snowpark_data(df, fts) + + +class ModelSignatureMiscTest(absltest.TestCase): def test_rename_features(self) -> None: model_signature._rename_features([]) @@ -428,25 +518,25 @@ def test_rename_features(self) -> None: def test_infer_signature(self) -> None: df = pd.DataFrame([1, 2, 3, 4]) self.assertListEqual( - model_signature._infer_signature(df), + model_signature._infer_signature(df, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], ) arr = np.array([1, 2, 3, 4]) self.assertListEqual( - model_signature._infer_signature(arr), + model_signature._infer_signature(arr, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], ) lt1 = [1, 2, 3, 4] self.assertListEqual( - model_signature._infer_signature(lt1), + model_signature._infer_signature(lt1, role="input"), [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], ) lt2 = [[1, 2], [3, 4]] self.assertListEqual( - model_signature._infer_signature(lt2), + model_signature._infer_signature(lt2, role="input"), [ model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), @@ -455,7 +545,7 @@ def test_infer_signature(self) -> None: lt = [arr, arr] self.assertListEqual( - model_signature._infer_signature(lt, is_output=True), + model_signature._infer_signature(lt, role="output"), [ model_signature.FeatureSpec("output_0_feature_0", model_signature.DataType.INT64), model_signature.FeatureSpec("output_1_feature_0", model_signature.DataType.INT64), @@ -463,28 +553,26 @@ def test_infer_signature(self) -> None: ) self.assertListEqual( - model_signature._infer_signature(lt), + model_signature._infer_signature(lt, role="input"), [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_2", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_3", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_0_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_1_feature_0", model_signature.DataType.INT64), ], ) df = pd.DataFrame([1, 2, 3, 4]) lt = [df, arr] - with self.assertRaises(ValueError): - model_signature._infer_signature(lt) + with self.assertRaises(NotImplementedError): + model_signature._infer_signature(lt, role="input") with self.assertRaises(ValueError): - model_signature._infer_signature([True, 1]) + model_signature._infer_signature([True, 1], role="input") with self.assertRaises(NotImplementedError): - model_signature._infer_signature(1) + model_signature._infer_signature(1, role="input") with self.assertRaises(NotImplementedError): - model_signature._infer_signature([]) + model_signature._infer_signature([], role="input") def test_validate_data_with_features(self) -> None: fts = [ @@ -493,47 +581,63 @@ def test_validate_data_with_features(self) -> None: ] with self.assertRaises(ValueError): - model_signature._validate_data_with_features_and_convert_to_df(fts, np.array([])) + model_signature._convert_and_validate_local_data(np.array([]), fts) with self.assertRaises(ValueError): - model_signature._validate_data_with_features_and_convert_to_df(fts, np.array(5)) + model_signature._convert_and_validate_local_data(np.array(5), fts) - with self.assertRaises(NotImplementedError): - model_signature._validate_data_with_features_and_convert_to_df(fts, []) + with 
self.assertRaises(ValueError): + model_signature._convert_and_validate_local_data(np.array([[2.5, 5], [6.8, 8]]), fts) + + with self.assertRaises(ValueError): + model_signature._convert_and_validate_local_data([], fts) + + with self.assertRaises(ValueError): + model_signature._convert_and_validate_local_data([1, [1, 1]], fts) + + with self.assertRaises(ValueError): + model_signature._convert_and_validate_local_data([[1], [1, 1]], fts) + + with self.assertRaises(ValueError): + model_signature._convert_and_validate_local_data([[2.1, 5.0], [6.8, 8.0]], fts) with self.assertRaises(ValueError): - model_signature._validate_data_with_features_and_convert_to_df(fts, [1, [1, 1]]) + model_signature._convert_and_validate_local_data(pd.DataFrame([[2.5, 5], [6.8, 8]]), fts) with self.assertRaises(ValueError): - model_signature._validate_data_with_features_and_convert_to_df(fts, [[1], [1, 1]]) + model_signature._convert_and_validate_local_data(pd.DataFrame([5, 6]), fts) with self.assertRaises(ValueError): - model_signature._validate_data_with_features_and_convert_to_df(fts, pd.DataFrame([5, 6])) + model_signature._convert_and_validate_local_data(np.array([5, 6]), fts) with self.assertRaises(ValueError): - model_signature._validate_data_with_features_and_convert_to_df(fts, np.array([5, 6])) + model_signature._convert_and_validate_local_data(pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), fts) - df = model_signature._validate_data_with_features_and_convert_to_df(fts[:1], np.array([5, 6])) + df = model_signature._convert_and_validate_local_data(np.array([5, 6]), fts[:1]) self.assertListEqual(df.columns.to_list(), ["feature_0"]) - df = model_signature._validate_data_with_features_and_convert_to_df(fts[:1], pd.DataFrame([5, 6])) + df = model_signature._convert_and_validate_local_data(pd.DataFrame([5, 6]), fts[:1]) self.assertListEqual(df.columns.to_list(), ["feature_0"]) - df = model_signature._validate_data_with_features_and_convert_to_df(fts[:1], [5, 6]) + df = model_signature._convert_and_validate_local_data([5, 6], fts[:1]) self.assertListEqual(df.columns.to_list(), ["feature_0"]) - df = model_signature._validate_data_with_features_and_convert_to_df(fts, np.array([[2, 5], [6, 8]])) + df = model_signature._convert_and_validate_local_data(np.array([[2, 5], [6, 8]]), fts) self.assertListEqual(df.columns.to_list(), ["feature_0", "feature_1"]) - df = model_signature._validate_data_with_features_and_convert_to_df(fts, pd.DataFrame([[2, 5], [6, 8]])) + df = model_signature._convert_and_validate_local_data(pd.DataFrame([[2, 5], [6, 8]]), fts) self.assertListEqual(df.columns.to_list(), ["feature_0", "feature_1"]) - df = model_signature._validate_data_with_features_and_convert_to_df( - fts, pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]) + df = model_signature._convert_and_validate_local_data( + pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), + [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.INT64), + ], ) self.assertListEqual(df.columns.to_list(), ["a", "b"]) - df = model_signature._validate_data_with_features_and_convert_to_df(fts, [[2, 5], [6, 8]]) + df = model_signature._convert_and_validate_local_data([[2, 5], [6, 8]], fts) self.assertListEqual(df.columns.to_list(), ["feature_0", "feature_1"]) diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index 5cbe143a..1dc75d8f 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -11,16 +11,20 @@ import 
sklearn.pipeline import xgboost + import snowflake.ml.framework.base.BaseEstimator import snowflake.ml.model.custom_model + import snowflake.snowpark _SupportedBuiltins = Union[int, float, bool, str, bytes, "_SupportedBuiltinsList"] _SupportedNumpyDtype = Union[ + "np.int8", "np.int16", "np.int32", "np.int64", "np.float32", "np.float64", + "np.uint8", "np.uint16", "np.uint32", "np.uint64", @@ -31,7 +35,13 @@ _SupportedNumpyArray = npt.NDArray[_SupportedNumpyDtype] _SupportedBuiltinsList = Sequence[_SupportedBuiltins] -SupportedDataType = Union["pd.DataFrame", _SupportedNumpyArray, Sequence[_SupportedNumpyArray], _SupportedBuiltinsList] +SupportedLocalDataType = Union[ + "pd.DataFrame", _SupportedNumpyArray, Sequence[_SupportedNumpyArray], _SupportedBuiltinsList +] + +SupportedDataType = Union[SupportedLocalDataType, "snowflake.snowpark.DataFrame"] + +_DataType = TypeVar("_DataType", bound=SupportedDataType) CustomModelType = TypeVar("CustomModelType", bound="snowflake.ml.model.custom_model.CustomModel") @@ -41,6 +51,7 @@ "sklearn.pipeline.Pipeline", "xgboost.XGBModel", "xgboost.Booster", + "snowflake.ml.framework.base.BaseEstimator", ] """This is defined as the type that Snowflake native model packaging could accept. Here is all acceptable types of Snowflake native model packaging and its handler file in _handlers/ folder. @@ -52,6 +63,8 @@ | sklearn.pipeline.Pipeline | sklearn.py | _SKLModelHandler | | xgboost.XGBModel | xgboost.py | _XGBModelHandler | | xgboost.Booster | xgboost.py | _XGBModelHandler | +| snowflake.ml.framework.base.BaseEstimator | snowmlmodel.py | _SnowMLModelHandler | +| snowflake.ml.framework.pipeline.Pipeline | snowmlmodel.py | _SnowMLModelHandler | """ @@ -91,3 +104,7 @@ class SKLModelSaveOptions(ModelSaveOption): class XGBModelSaveOptions(ModelSaveOption): target_methods: NotRequired[Sequence[str]] + + +class SNOWModelSaveOptions(ModelSaveOption): + target_methods: NotRequired[Sequence[str]] diff --git a/snowflake/ml/preprocessing/binarizer.py b/snowflake/ml/preprocessing/binarizer.py index 45ad8017..e254eb85 100644 --- a/snowflake/ml/preprocessing/binarizer.py +++ b/snowflake/ml/preprocessing/binarizer.py @@ -36,10 +36,8 @@ def __init__( output_cols: Single or multiple output columns. drop_input_cols: Remove input columns from output if set True. False by default. """ + super().__init__(drop_input_cols=drop_input_cols) self.threshold = threshold - - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/k_bins_discretizer.py b/snowflake/ml/preprocessing/k_bins_discretizer.py index 4c3ad941..df000e35 100644 --- a/snowflake/ml/preprocessing/k_bins_discretizer.py +++ b/snowflake/ml/preprocessing/k_bins_discretizer.py @@ -98,12 +98,10 @@ def __init__( output_cols: Optional[Union[str, Iterable[str]]] = None, drop_input_cols: Optional[bool] = False, ) -> None: + super().__init__(drop_input_cols=drop_input_cols) self.n_bins = n_bins self.encode = encode self.strategy = strategy - - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/label_encoder.py b/snowflake/ml/preprocessing/label_encoder.py index fc6356ae..f11cd2c3 100644 --- a/snowflake/ml/preprocessing/label_encoder.py +++ b/snowflake/ml/preprocessing/label_encoder.py @@ -32,11 +32,9 @@ def __init__( classes_: A np.ndarray that holds the label for each class. 
""" + super().__init__(drop_input_cols=drop_input_cols) self._ordinal_encoder: Optional[ordinal_encoder.OrdinalEncoder] = None self.classes_: Optional[type_utils.LiteralNDArrayType] = None - - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/max_abs_scaler.py b/snowflake/ml/preprocessing/max_abs_scaler.py index f6f82a8f..c1b42260 100644 --- a/snowflake/ml/preprocessing/max_abs_scaler.py +++ b/snowflake/ml/preprocessing/max_abs_scaler.py @@ -48,7 +48,7 @@ def __init__( "SQL>>>max(abs({col_name}))", ] - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols, custom_states=self.custom_states) + super().__init__(drop_input_cols=drop_input_cols, custom_states=self.custom_states) self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/min_max_scaler.py b/snowflake/ml/preprocessing/min_max_scaler.py index 73fdc3b3..55b8cfdc 100644 --- a/snowflake/ml/preprocessing/min_max_scaler.py +++ b/snowflake/ml/preprocessing/min_max_scaler.py @@ -53,7 +53,7 @@ def __init__( self.custom_states: List[str] = [_utils.NumericStatistics.MIN, _utils.NumericStatistics.MAX] - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols, custom_states=self.custom_states) + super().__init__(drop_input_cols=drop_input_cols, custom_states=self.custom_states) self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/normalizer.py b/snowflake/ml/preprocessing/normalizer.py index f04385c3..04f2fd39 100644 --- a/snowflake/ml/preprocessing/normalizer.py +++ b/snowflake/ml/preprocessing/normalizer.py @@ -39,11 +39,9 @@ def __init__( output_cols: Single or multiple output columns. drop_input_cols: Remove input columns from output if set True. False by default. """ + super().__init__(drop_input_cols=drop_input_cols) self.norm = norm self._is_fitted = False - - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/one_hot_encoder.py b/snowflake/ml/preprocessing/one_hot_encoder.py index 96e2e6e0..85815f2c 100644 --- a/snowflake/ml/preprocessing/one_hot_encoder.py +++ b/snowflake/ml/preprocessing/one_hot_encoder.py @@ -163,7 +163,7 @@ def __init__( drop_input_cols: Optional[bool] = False, ) -> None: """See class-level docstring.""" - # Object parameters + super().__init__(drop_input_cols=drop_input_cols) self.categories = categories self.drop = drop self.sparse = sparse @@ -184,8 +184,6 @@ def __init__( str, List[str] ] = {} # transform state when output columns are unset before fitting - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/ordinal_encoder.py b/snowflake/ml/preprocessing/ordinal_encoder.py index 7936fa6b..8b606040 100644 --- a/snowflake/ml/preprocessing/ordinal_encoder.py +++ b/snowflake/ml/preprocessing/ordinal_encoder.py @@ -81,6 +81,7 @@ def __init__( Attributes: categories_: The categories of each feature determined during fitting. 
""" + super().__init__(drop_input_cols=drop_input_cols) self.categories = categories self.handle_unknown = handle_unknown self.unknown_value = unknown_value @@ -91,8 +92,6 @@ def __init__( self._missing_indices: Dict[int, int] = {} self._vocab_table_name = "snowml_preprocessing_ordinal_encoder_temp_table_" + uuid.uuid4().hex - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/robust_scaler.py b/snowflake/ml/preprocessing/robust_scaler.py index 744eeff1..2d6c24ad 100644 --- a/snowflake/ml/preprocessing/robust_scaler.py +++ b/snowflake/ml/preprocessing/robust_scaler.py @@ -68,7 +68,7 @@ def __init__( "SQL>>>percentile_cont(" + str(r_range) + ") within group (order by {col_name})", ] - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols, custom_states=self.custom_states) + super().__init__(drop_input_cols=drop_input_cols, custom_states=self.custom_states) self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/simple_imputer.py b/snowflake/ml/preprocessing/simple_imputer.py index b5cc0eb6..2cec9b8c 100644 --- a/snowflake/ml/preprocessing/simple_imputer.py +++ b/snowflake/ml/preprocessing/simple_imputer.py @@ -124,6 +124,7 @@ def __init__( Raises: ValueError: If strategy is invalid, or if fill value is specified for strategy that isn't "constant". """ + super().__init__(drop_input_cols=drop_input_cols) if strategy in STRATEGY_TO_STATE_DICT: self.strategy = strategy else: @@ -139,8 +140,6 @@ def __init__( # Add back when `keep_empty_features` is supported. # self.keep_empty_features = keep_empty_features - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols) - self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/preprocessing/standard_scaler.py b/snowflake/ml/preprocessing/standard_scaler.py index 70dedfed..c16ec99e 100644 --- a/snowflake/ml/preprocessing/standard_scaler.py +++ b/snowflake/ml/preprocessing/standard_scaler.py @@ -66,7 +66,7 @@ def __init__( self.custom_states.append(_utils.NumericStatistics.VAR_POP) self.custom_states.append(_utils.NumericStatistics.STDDEV_POP) - base.BaseTransformer.__init__(self, drop_input_cols=drop_input_cols, custom_states=self.custom_states) + super().__init__(drop_input_cols=drop_input_cols, custom_states=self.custom_states) self.set_input_cols(input_cols) self.set_output_cols(output_cols) diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index 30733675..1bbcb767 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast from uuid import uuid1 -import joblib +import cloudpickle as cp from absl import logging from snowflake import connector, snowpark @@ -1170,7 +1170,7 @@ def log_model( if not is_native_model_format: with tempfile.NamedTemporaryFile(delete=True) as local_model_file: - joblib.dump(model, local_model_file) + cp.dump(model, local_model_file) local_model_file.flush() id = self.log_model_path( @@ -1367,9 +1367,9 @@ def load_model(self, model_name: str, model_version: str) -> Any: except TypeError: pass if not is_native_model_format: - restored_model = joblib.load( - os.path.join(local_model_directory, os.path.basename(os.path.basename(remote_model_path))) - ) + file_path = os.path.join(local_model_directory, 
os.path.basename(os.path.basename(remote_model_path))) + with open(file_path, mode="r+b") as model_file: + restored_model = cp.load(model_file) return restored_model diff --git a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb new file mode 100644 index 00000000..69c31e59 --- /dev/null +++ b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb @@ -0,0 +1,1176 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5de3eb26", + "metadata": {}, + "source": [ + "# Model Packaging Example" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "197efd00", + "metadata": {}, + "source": [ + "## Before Everything" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6ce97b36", + "metadata": {}, + "source": [ + "### Install `snowflake-ml-python` locally" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1117c596", + "metadata": {}, + "source": [ + "Until `snowflake-ml-python` is publicly available, you have to install it from a wheel file. Once it is released, you will be able to install it like any other package via pip or conda." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da314158", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install snowflake_ml_python-0.3.2-py3-none-any.whl" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "285c1b29", + "metadata": {}, + "source": [ + "Note: it is recommended to try this in a pure-pip environment or an empty conda environment. If you need to install snowML into a conda environment that already contains packages, install all of its requirements first and then install `snowflake-ml-python` with the `--no-deps` flag." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b1b950fe", + "metadata": {}, + "source": [ + "If you are about to go over the **Use with a custom model** part of this notebook, you will need tensorflow and transformers, which can be installed with the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47c9fa8f", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install snowflake_ml_python-0.3.2-py3-none-any.whl[tensorflow] transformers==4.24.0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "99e58d8c", + "metadata": {}, + "source": [ + "### Setup Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afd16ff5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d609ff44", + "metadata": {}, + "outputs": [], + "source": [ + "# Scale cell width with the browser window to accommodate .show() commands for wider tables.\n", + "from IPython.display import display, HTML\n", + "\n", + "display(HTML(\"\"))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1ac32c6f", + "metadata": {}, + "source": [ + "### Start Snowpark Session\n", + "\n", + "To avoid exposing credentials in GitHub, we use a small utility `SnowflakeLoginOptions`. It allows you to store your default credentials in `~/.snowsql/config` in the following format:\n", + "```\n", + "[connections]\n", + "accountname = # Account identifier to connect to Snowflake.\n", + "username = # User name in the account. Optional.\n", + "password = # User password. Optional.\n", + "dbname = # Default database. Optional.\n", + "schemaname = # Default schema. 
Optional.\n", + "warehousename = # Default warehouse. Optional.\n", + "#rolename = # Default role. Optional.\n", + "#authenticator = # Authenticator: 'snowflake', 'externalbrowser', etc\n", + "```\n", + "Please follow [this](https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2efc0a8", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", + "from snowflake.snowpark import Session\n", + "\n", + "session = Session.builder.configs(SnowflakeLoginOptions()).create()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e2fcbe4a", + "metadata": {}, + "source": [ + "### Make `snowflake-ml-python` available to your deployed models" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "671a7710", + "metadata": {}, + "source": [ + "Unfortunately, since `snowflake-ml-python` is not available in the Anaconda channel yet, we have to import it manually so that it can be used when the model gets deployed to Snowflake. To avoid uploading it again and again, we can set up a temporary stage and upload the wheel file there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eae711f", + "metadata": {}, + "outputs": [], + "source": [ + "SNOW_ML_WHEEL_LOCAL_PATH = \"~/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-0.3.2-py3-none-any.whl\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fcececa", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from typing import Optional\n", + "\n", + "def upload_snowml_to_tmp_stage(session: Session, wheel_path: str, stage_name: Optional[str] = None) -> str:\n", + " \"\"\"Upload the SnowML wheel file to a stage.\n", + "\n", + " Args:\n", + " session: Snowpark session.\n", + " wheel_path: Path to the local SnowML wheel file.\n", + "\n", + " Returns:\n", + " The stage path of the uploaded wheel file.\n", + " \"\"\"\n", + " if stage_name is None:\n", + " stage_name = session.get_session_stage()\n", + " _ = session.file.put(wheel_path, stage_name, auto_compress=False, overwrite=True)\n", + " whl_filename = os.path.basename(wheel_path)\n", + " return f\"{stage_name}/{whl_filename}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90ea99cc", + "metadata": {}, + "outputs": [], + "source": [ + "SNOW_ML_WHEEL_STAGE_PATH = upload_snowml_to_tmp_stage(session, SNOW_ML_WHEEL_LOCAL_PATH)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "dfa9ab88", + "metadata": {}, + "source": [ + "### Open/Create Model Registry" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b0a0c8a8", + "metadata": {}, + "source": [ + "A model registry needs to be created before it can be used. The creation will create a new database in the current account, so the active role needs permission to create a database. After the first creation, the model registry can be opened without the need to create it again."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a95e3431", + "metadata": {}, + "outputs": [], + "source": [ + "REGISTRY_DATABASE_NAME = \"TEMP\"\n", + "REGISTRY_SCHEMA_NAME = \"WZHAO\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fff21bc", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "model_registry.create_model_registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)\n", + "registry = model_registry.ModelRegistry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d76e14a1", + "metadata": {}, + "source": [ + "## Use with scikit-learn model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c592d46c", + "metadata": {}, + "source": [ + "### Train A Small Scikit-learn Model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "378eb3ba", + "metadata": {}, + "source": [ + "The cell below trains a small model for demonstration purposes. The nature of the model does not matter, it is purely used to demonstrate the usage of the Model Packaging and Registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cf44218", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import svm\n", + "from sklearn.datasets import load_digits\n", + "\n", + "digits = load_digits()\n", + "target_digit = 6\n", + "num_training_examples = 10\n", + "svc_gamma = 0.001\n", + "svc_C = 10.0\n", + "\n", + "clf = svm.SVC(gamma=svc_gamma, C=svc_C, probability=True)\n", + "\n", + "\n", + "def one_vs_all(dataset, digit):\n", + " return [x == digit for x in dataset]\n", + "\n", + "\n", + "# Train a classifier using num_training_examples and use the last 100 examples for test.\n", + "train_features = digits.data[:num_training_examples]\n", + "train_labels = one_vs_all(digits.target[:num_training_examples], target_digit)\n", + "clf.fit(train_features, train_labels)\n", + "\n", + "test_features = digits.data[-100:]\n", + "test_labels = one_vs_all(digits.target[-100:], target_digit)\n", + "prediction = clf.predict(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c25bd0d4", + "metadata": {}, + "outputs": [], + "source": [ + "print(prediction[:10])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "dda57d0b", + "metadata": {}, + "source": [ + "SVC has multiple method, for example, `predict_proba`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd7ee333", + "metadata": {}, + "outputs": [], + "source": [ + "prediction_proba = clf.predict_proba(test_features)\n", + "print(prediction_proba[:10])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "317e7843", + "metadata": {}, + "source": [ + "### Register Model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3b482561", + "metadata": {}, + "source": [ + "The call to `log_model` executes a few steps:\n", + "1. The given model object is serialized and uploaded to a stage.\n", + "1. An entry in the Model Registry is created for the model, referencing the model stage location.\n", + "1. 
Additional metadata is updated for the model as provided in the call.\n", + "\n", + "For the serialization to work, the model object needs to be serializable in Python.\n", + "\n", + "Also, you have to provide sample input data so that we can infer the model signature for you, or you can specify the model signature manually." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d8ad06e", + "metadata": {}, + "outputs": [], + "source": [ + "# A name and model tags can be added to the model at registration time.\n", + "model_id = registry.log_model(\n", + " model_name=\"SIMPLE_SVC_MODEL\",\n", + " model_version=\"1\",\n", + " model=clf,\n", + " tags={\"stage\": \"testing\", \"classifier_type\": \"svm.SVC\", \"svc_gamma\": svc_gamma, \"svc_C\": svc_C},\n", + " sample_input_data=test_features[:10],\n", + ")\n", + "\n", + "# The object API can be used to reference a model after creation.\n", + "model = model_registry.ModelReference(registry=registry, model_name=\"SIMPLE_SVC_MODEL\", model_version=\"1\")\n", + "print(\"Registered new model:\", model_id)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "735f0ac3", + "metadata": {}, + "source": [ + "### Load Model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f778a9ad", + "metadata": {}, + "source": [ + "We can also restore the model we saved to the registry and load it back into the local context to make predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2796f2e0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=\"SIMPLE_SVC_MODEL\", model_version=\"1\")\n", + "restored_clf = model.load_model()\n", + "\n", + "restored_prediction = restored_clf.predict(test_features)\n", + "\n", + "print(\"Original prediction:\", prediction[:10])\n", + "print(\"Restored prediction:\", restored_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3717853f", + "metadata": {}, + "outputs": [], + "source": [ + "restored_prediction_proba = restored_clf.predict_proba(test_features)\n", + "\n", + "print(\"Original prediction:\", prediction_proba[:10])\n", + "print(\"Restored prediction:\", restored_prediction_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction_proba, restored_prediction_proba))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "45c75e28", + "metadata": {}, + "source": [ + "### Deploy Model and Batch Inference" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a8d496db", + "metadata": {}, + "source": [ + "We can also deploy the model we saved to the registry to the warehouse and run predictions in the warehouse.\n", + "\n", + "Although the model may contain multiple methods, every deployment can only have one target method, and you need to specify that when you deploy the model." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c52611ac", + "metadata": {}, + "source": [ + "Also, since `snowflake-ml-python` is not available in the Anaconda channel yet, we have to import it manually via the deployment options; this will no longer be required once our package is in the Snowflake Anaconda channel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ecab97c", + "metadata": {}, + "outputs": [], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=\"SIMPLE_SVC_MODEL\", model_version=\"1\")\n", + "model.deploy(\n", + " deployment_name=\"svc_model_predict\",\n", + " target_method=\"predict\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e150421", + "metadata": {}, + "outputs": [], + "source": [ + "remote_prediction = registry.predict(deployment_name=\"svc_model_predict\", data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, remote_prediction[\"feature_0\"].values))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6c1f3c07", + "metadata": {}, + "source": [ + "We can also deploy another method to the warehouse." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c6f189c", + "metadata": {}, + "outputs": [], + "source": [ + "model.deploy(\n", + " deployment_name=\"svc_model_predict_proba\",\n", + " target_method=\"predict_proba\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36a00e1e", + "metadata": {}, + "outputs": [], + "source": [ + "remote_prediction_proba = registry.predict(deployment_name=\"svc_model_predict_proba\", data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction_proba, remote_prediction_proba.values))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "dc2e2f5e", + "metadata": {}, + "source": [ + "## Use with a custom model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f2224cc7", + "metadata": {}, + "source": [ + "With a custom model, you can also do much more than what is shown above." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9bc58b66", + "metadata": {}, + "source": [ + "### Download a GPT-2 model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce2cca3", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "model_name = \"gpt2-medium\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = AutoModelForCausalLM.from_pretrained(model_name)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "03454cba", + "metadata": {}, + "source": [ + "### Store GPT-2 Model components locally" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05a0e170", + "metadata": {}, + "outputs": [], + "source": [ + "ARTIFACTS_DIR = \"/tmp/gpt-2/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f60d49c4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.makedirs(os.path.join(ARTIFACTS_DIR, \"model\"), exist_ok=True)\n", + "os.makedirs(os.path.join(ARTIFACTS_DIR, \"tokenizer\"), exist_ok=True)\n", + "\n", + "model.save_pretrained(os.path.join(ARTIFACTS_DIR, \"model\"))\n", + "tokenizer.save_pretrained(os.path.join(ARTIFACTS_DIR, \"tokenizer\"))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "333118b7", + "metadata": {}, + "source": [ + "### Create a custom model using GPT-2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49c27920", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model import custom_model\n", + "import pandas as pd\n", + "\n", + "\n", + "class GPT2Model(custom_model.CustomModel):\n", + " def __init__(self, context: custom_model.ModelContext) -> None:\n", + " super().__init__(context)\n", + "\n", + " self.model = AutoModelForCausalLM.from_pretrained(self.context.path(\"model\"))\n", + " self.tokenizer = AutoTokenizer.from_pretrained(self.context.path(\"tokenizer\"))\n", + "\n", + " @custom_model.inference_api\n", + " def predict(self, X: pd.DataFrame) -> pd.DataFrame:\n", + " def _generate(input_text: str) -> str:\n", + " input_ids = self.tokenizer.encode(input_text, return_tensors=\"pt\")\n", + "\n", + " output = self.model.generate(input_ids, max_length=50, do_sample=True, top_p=0.95, top_k=60)\n", + " generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)\n", + "\n", + " return generated_text\n", + "\n", + " res_df = pd.DataFrame({\"output\": pd.Series.apply(X[\"input\"], _generate)})\n", + " return res_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36438fd5", + "metadata": {}, + "outputs": [], + "source": [ + "gpt_model = GPT2Model(custom_model.ModelContext(models={}, artifacts={\n", + " \"model\":os.path.join(ARTIFACTS_DIR, \"model\"),\n", + " \"tokenizer\":os.path.join(ARTIFACTS_DIR, \"tokenizer\")\n", + "}))\n", + "\n", + "gpt_model.predict(pd.DataFrame({\"input\":[\"Hello, are you GPT?\"]}))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e111b527", + "metadata": {}, + "source": [ + "### Register the custom model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c27ed16a", + "metadata": {}, + "source": [ + "Here, how to specify dependencies and model signature manually is shown." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a913530", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model import model_signature\n", + "\n", + "model_id_gpt = registry.log_model(\n", + " model_name=\"GPT2_MODEL\",\n", + " model_version=\"1\",\n", + " model=gpt_model,\n", + " conda_dependencies=[\"tensorflow\", \"transformers\"],\n", + " signatures={\n", + " \"predict\": model_signature.ModelSignature(\n", + " inputs=[model_signature.FeatureSpec(name=\"input\", dtype=model_signature.DataType.STRING)],\n", + " outputs=[model_signature.FeatureSpec(name=\"output\", dtype=model_signature.DataType.STRING)],\n", + " )\n", + " },\n", + ")\n", + "\n", + "gpt_model = model_registry.ModelReference(registry=registry, model_name=\"GPT2_MODEL\", model_version=\"1\")\n", + "print(\"Registered new model:\", model_id_gpt)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e634f4c1", + "metadata": {}, + "source": [ + "### Deploy the model and predict" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fc0f289d", + "metadata": {}, + "source": [ + "Relax version is an option that allows the deployer to try to relax the version specifications when the initial attempt to\n", + "resolve the dependencies in the Snowflake Anaconda channel fails." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6d64cb0", + "metadata": {}, + "outputs": [], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "gpt_model = model_registry.ModelReference(\n", + " registry=registry,\n", + " model_name=\"GPT2_MODEL\",\n", + " model_version=\"1\",\n", + ")\n", + "gpt_model.deploy(\n", + " deployment_name=\"gpt_model_predict\",\n", + " target_method=\"predict\",\n", + " options={\"relax_version\": True, \"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24702087", + "metadata": {}, + "outputs": [], + "source": [ + "res = registry.predict(deployment_name=\"gpt_model_predict\", data=pd.DataFrame({\"input\":[\"Hello, are you GPT?\"]}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e479c77d", + "metadata": {}, + "outputs": [], + "source": [ + "print(res)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b44a55b7", + "metadata": {}, + "source": [ + "## Use with XGBoost Model, Snowpark DataFrame and permanent deployment" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "91c61e80", + "metadata": {}, + "source": [ + "### Prepare a stage for permanent UDF deployment" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d420ccdd", + "metadata": {}, + "source": [ + "A non-temporary, Snowflake-internal stage is required to permanently deploy a model as a UDF. We have to create it manually for now, but it will eventually be managed by the model registry. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92ccb750", + "metadata": {}, + "outputs": [], + "source": [ + "PERMANENT_UDF_STAGE_NAME = \"SNOWML_MODEL_UDF_DEPLOYMENT\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b407434d", + "metadata": {}, + "outputs": [], + "source": [ + "session.sql(f\"CREATE OR REPLACE STAGE {PERMANENT_UDF_STAGE_NAME}\").collect()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3203f803", + "metadata": {}, + "source": [ + "To make the deployment permanent, any dependency must be put into the a permanent stage as well. Of course, this will no longer be necessary after `snowflake-ml-python` gets available in Snowflake Anaconda channel." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25b641a", + "metadata": {}, + "outputs": [], + "source": [ + "SNOW_ML_WHEEL_STAGE_PATH = upload_snowml_to_tmp_stage(session, SNOW_ML_WHEEL_LOCAL_PATH, f\"@{PERMANENT_UDF_STAGE_NAME}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "05e45630", + "metadata": {}, + "source": [ + "### Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16debd21", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_kddcup99\n", + "\n", + "DATA_TABLE_NAME = \"KDDCUP99_DATASET\"\n", + "\n", + "kddcup99_data = fetch_kddcup99(as_frame=True)\n", + "kddcup99_sp_df = session.create_dataframe(kddcup99_data.frame)\n", + "kddcup99_sp_df.write.mode(\"overwrite\").save_as_table(DATA_TABLE_NAME)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "771cad94", + "metadata": {}, + "source": [ + "### Preprocessing Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04b976c8", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.preprocessing import one_hot_encoder, ordinal_encoder, standard_scaler\n", + "import snowflake.snowpark.functions as F\n", + "\n", + "quote_fn = lambda x: f'\"{x}\"'\n", + "\n", + "ONE_HOT_ENCODE_COL_NAMES = [\"protocol_type\", \"service\", \"flag\"]\n", + "ORDINAL_ENCODE_COL_NAMES = [\"labels\"]\n", + "STANDARD_SCALER_COL_NAMES = [\n", + " \"duration\",\n", + " \"src_bytes\",\n", + " \"dst_bytes\",\n", + " \"wrong_fragment\",\n", + " \"urgent\",\n", + " \"hot\",\n", + " \"num_failed_logins\",\n", + " \"num_compromised\",\n", + " \"num_root\",\n", + " \"num_file_creations\",\n", + " \"num_shells\",\n", + " \"num_access_files\",\n", + " \"num_outbound_cmds\",\n", + " \"count\",\n", + " \"srv_count\",\n", + " \"dst_host_count\",\n", + " \"dst_host_srv_count\",\n", + "]\n", + "\n", + "TRAIN_SIZE_K = 0.2\n", + "kddcup99_data = session.table(DATA_TABLE_NAME)\n", + "kddcup99_data = kddcup99_data.with_columns(\n", + " list(map(quote_fn, ONE_HOT_ENCODE_COL_NAMES + ORDINAL_ENCODE_COL_NAMES)),\n", + " [\n", + " F.to_char(col_name, \"utf-8\")\n", + " for col_name in list(map(quote_fn, ONE_HOT_ENCODE_COL_NAMES + ORDINAL_ENCODE_COL_NAMES))\n", + " ],\n", + ")\n", + "kddcup99_sp_df_train, kddcup99_sp_df_test = tuple(\n", + " kddcup99_data.random_split([TRAIN_SIZE_K, 1 - TRAIN_SIZE_K], seed=2568)\n", + ")\n", + "\n", + "ft_one_hot_encoder = one_hot_encoder.OneHotEncoder(\n", + " handle_unknown=\"ignore\",\n", + " input_cols=list(map(quote_fn, ONE_HOT_ENCODE_COL_NAMES)),\n", + " output_cols=ONE_HOT_ENCODE_COL_NAMES,\n", + " drop_input_cols=True,\n", + ")\n", + "ft_one_hot_encoder = ft_one_hot_encoder.fit(kddcup99_sp_df_train)\n", + "kddcup99_sp_df_train = 
ft_one_hot_encoder.transform(kddcup99_sp_df_train)\n", + "kddcup99_sp_df_test = ft_one_hot_encoder.transform(kddcup99_sp_df_test)\n", + "\n", + "ft_ordinal_encoder = ordinal_encoder.OrdinalEncoder(\n", + " input_cols=list(map(quote_fn, ORDINAL_ENCODE_COL_NAMES)),\n", + " output_cols=list(map(quote_fn, ORDINAL_ENCODE_COL_NAMES)),\n", + " drop_input_cols=True,\n", + ")\n", + "ft_ordinal_encoder = ft_ordinal_encoder.fit(kddcup99_sp_df_train)\n", + "kddcup99_sp_df_train = ft_ordinal_encoder.transform(kddcup99_sp_df_train)\n", + "kddcup99_sp_df_test = ft_ordinal_encoder.transform(kddcup99_sp_df_test)\n", + "\n", + "ft_standard_scaler = standard_scaler.StandardScaler(\n", + " input_cols=list(map(quote_fn, STANDARD_SCALER_COL_NAMES)),\n", + " output_cols=list(map(quote_fn, STANDARD_SCALER_COL_NAMES)),\n", + " drop_input_cols=True,\n", + ")\n", + "ft_standard_scaler = ft_standard_scaler.fit(kddcup99_sp_df_train)\n", + "kddcup99_sp_df_train = ft_standard_scaler.transform(kddcup99_sp_df_train)\n", + "kddcup99_sp_df_test = ft_standard_scaler.transform(kddcup99_sp_df_test)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d4d25ee7", + "metadata": {}, + "source": [ + "### Train an XGBoost model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68bb0f77", + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost\n", + "\n", + "regressor = xgboost.XGBClassifier(objective=\"multi:softprob\", n_estimators=500, reg_lambda=1, gamma=0, max_depth=5)\n", + "kddcup99_pd_df_train = kddcup99_sp_df_train.to_pandas()\n", + "regressor.fit(\n", + " kddcup99_pd_df_train.drop(\n", + " columns=[\n", + " col_name for col_name in kddcup99_pd_df_train.columns if col_name.startswith(\"labels\")\n", + " ] # Since there is a bug in OrdinalEncoder's output\n", + " ),\n", + " kddcup99_pd_df_train[\"labels\"],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2e9446fc", + "metadata": {}, + "source": [ + "### Log the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf06733", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model import model_signature\n", + "\n", + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "# A name and model tags can be added to the model at registration time.\n", + "model_id_xgb = registry.log_model(\n", + " model_name=\"XGB_MODEL_KDDCUP99\",\n", + " model_version=\"1\",\n", + " model=regressor,\n", + " sample_input_data=kddcup99_sp_df_train.limit(10).drop(\n", + " *[col_name for col_name in kddcup99_sp_df_train.columns if col_name.startswith('\"labels')]\n", + " ),\n", + ")\n", + "\n", + "# The object API can be used to reference a model after creation.\n", + "xgb_model = model_registry.ModelReference(registry=registry, model_name=\"XGB_MODEL_KDDCUP99\", model_version=\"1\")\n", + "print(\"Registered new model:\", model_id_xgb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5948b7c8", + "metadata": {}, + "source": [ + "### Deploy the model permanently" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1f4cc21", + "metadata": {}, + "outputs": [], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "xgb_model = model_registry.ModelReference(\n", + " registry=registry,\n", + " 
model_name=\"XGB_MODEL_KDDCUP99\",\n", + " model_version=\"1\",\n", + ")\n", + "xgb_model.deploy(\n", + " deployment_name=\"xgb_model_predict\",\n", + " target_method=\"predict\",\n", + " options={\n", + " \"relax_version\": True,\n", + " \"permanent_udf_stage_location\": f\"@{PERMANENT_UDF_STAGE_NAME}\",\n", + " \"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH,\n", + " },\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e560bd8d", + "metadata": {}, + "source": [ + "### Predict with Snowpark DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9578a89b", + "metadata": {}, + "outputs": [], + "source": [ + "sp_res = registry.predict(deployment_name=\"xgb_model_predict\", data=kddcup99_sp_df_test)\n", + "sp_res.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "08614b16", + "metadata": {}, + "source": [ + "### Prepare another SQL connection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "421ff7e1", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", + "from snowflake.snowpark import Session\n", + "\n", + "another_session = Session.builder.configs(SnowflakeLoginOptions()).create()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d1e99456", + "metadata": {}, + "source": [ + "### Call the deployed permanent UDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8523b768", + "metadata": {}, + "outputs": [], + "source": [ + "registry._session = another_session # Since permanent deployment managing has not been finished in registry.\n", + "sp_res = registry.predict(\n", + " deployment_name=\"xgb_model_predict\", data=another_session.create_dataframe(kddcup99_sp_df_test.to_pandas())\n", + ")\n", + "sp_res.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6b4eabe1", + "metadata": {}, + "source": [ + "### Remove the deployed UDF" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "be5ecdb5", + "metadata": {}, + "source": [ + "This would be done by calling delete_deployment in the registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8eceb2a", + "metadata": {}, + "outputs": [], + "source": [ + "session.sql(f\"DROP FUNCTION xgb_model_predict(object)\").collect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "vscode": { + "interpreter": { + "hash": "fb0a62cbfaa59af7646af5a6672c5c3e72ec75fbadf6ff0336b6769523f221a5" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 0b6d8963..5cc482e4 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. 
-VERSION = "0.3.1" +VERSION = "0.3.2" diff --git a/tests/integ/snowflake/ml/_internal/BUILD.bazel b/tests/integ/snowflake/ml/_internal/BUILD.bazel index afb10ccb..45eee1ef 100644 --- a/tests/integ/snowflake/ml/_internal/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/BUILD.bazel @@ -6,6 +6,7 @@ py_test( timeout = "long", deps = [ "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:env", "//snowflake/ml/utils:connection_params" ], tags = ["skip_merge_gates"], diff --git a/tests/integ/snowflake/ml/_internal/env_utils_integ_test.py b/tests/integ/snowflake/ml/_internal/env_utils_integ_test.py index 51d04179..6a6a4383 100644 --- a/tests/integ/snowflake/ml/_internal/env_utils_integ_test.py +++ b/tests/integ/snowflake/ml/_internal/env_utils_integ_test.py @@ -6,7 +6,7 @@ from absl.testing import absltest from packaging import requirements -from snowflake.ml._internal import env_utils +from snowflake.ml._internal import env as snowml_env, env_utils from snowflake.ml.utils import connection_params from snowflake.snowpark import Session @@ -20,24 +20,30 @@ def tearDown(self) -> None: def test_validate_requirement_in_snowflake_conda_channel(self) -> None: res = env_utils.validate_requirements_in_snowflake_conda_channel( - session=self._session, reqs=[requirements.Requirement("xgboost")] + session=self._session, reqs=[requirements.Requirement("xgboost")], python_version=snowml_env.PYTHON_VERSION ) self.assertNotEmpty(res) res = env_utils.validate_requirements_in_snowflake_conda_channel( - session=self._session, reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")] + session=self._session, + reqs=[requirements.Requirement("xgboost"), requirements.Requirement("pytorch")], + python_version=snowml_env.PYTHON_VERSION, ) self.assertNotEmpty(res) self.assertIsNone( env_utils.validate_requirements_in_snowflake_conda_channel( - session=self._session, reqs=[requirements.Requirement("xgboost<1.3")] + session=self._session, + reqs=[requirements.Requirement("xgboost==1.0.*")], + python_version=snowml_env.PYTHON_VERSION, ) ) self.assertIsNone( env_utils.validate_requirements_in_snowflake_conda_channel( - session=self._session, reqs=[requirements.Requirement("python-package")] + session=self._session, + reqs=[requirements.Requirement("python-package")], + python_version=snowml_env.PYTHON_VERSION, ) ) diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index 0bc2b31d..0ffa1ae2 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -2,16 +2,17 @@ load("//bazel:py_rules.bzl", "py_test") py_test( name = "model_integ_test", - srcs = ["model_integ_test.py"], timeout = "long", - shard_count=6, + srcs = ["model_integ_test.py"], + data = [ + "//snowflake/ml:wheel", + ], + shard_count = 6, + tags = ["skip_merge_gates"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:_deployer", - "//snowflake/ml/utils:connection_params" + "//snowflake/ml/model:_model", + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/utils:connection_params", ], - tags = ["skip_merge_gates"], - data = [ - "//snowflake/ml:wheel" - ] ) diff --git a/tests/integ/snowflake/ml/model/model_integ_test.py b/tests/integ/snowflake/ml/model/model_integ_test.py index eaa42ca5..fda3937f 100644 --- a/tests/integ/snowflake/ml/model/model_integ_test.py +++ b/tests/integ/snowflake/ml/model/model_integ_test.py @@ -22,6 +22,7 @@ custom_model, type_hints as model_types, ) +from 
snowflake.ml.modeling.linear_model import LinearRegression from snowflake.ml.utils import connection_params from snowflake.snowpark import Session @@ -433,6 +434,41 @@ def test_xgb(self) -> None: res = dc.predict(di_predict["name"], cal_X_test) np.testing.assert_allclose(res.values, np.expand_dims(regressor.predict(cal_X_test), axis=1)) + def test_snowml_model_deploy(self) -> None: + iris = datasets.load_iris() + df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) + df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = LinearRegression(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + test_features = df[:10] + regr.fit(test_features) + + with tempfile.TemporaryDirectory() as tmpdir: + model_api.save_model( + name="snowml_model", + model_dir_path=os.path.join(tmpdir, "snowml_model"), + model=regr, + sample_input=test_features, + metadata={"author": "xjiang", "version": "1"}, + ) + dc = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + di = dc.create_deployment( + name=f"snowml_model{self.run_id}", + model_dir_path=os.path.join(tmpdir, "snowml_model"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + + assert di is not None + res = dc.predict(di["name"], test_features) + np.testing.assert_allclose(res[OUTPUT_COLUMNS].values, regr.predict(test_features)[OUTPUT_COLUMNS].values) + if __name__ == "__main__": absltest.main()