diff --git a/.bazelrc b/.bazelrc
index 62e8ede3..01f7ddac 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -14,6 +14,7 @@ coverage --instrumentation_filter="-//tests[/:]"
 build:_build --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=build
 build:_sf_only --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=sf_only
 build:_extended --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=extended
+build:_extended_oss --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=extended_oss

 # Public definitions

@@ -35,6 +36,7 @@ run:pre_build --config=_build --config=py3.8

 # Config to run type check
 build:typecheck --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended --config=py3.8
+build:typecheck_oss --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended_oss --config=py3.8

 # Config to build the doc
 build:docs --config=_sf_only --config=py3.8
@@ -44,3 +46,6 @@ build:docs --config=_sf_only --config=py3.8
 test:extended --config=_extended
 run:extended --config=_extended
 cquery:extended --config=_extended
+test:extended_oss --config=_extended_oss
+run:extended_oss --config=_extended_oss
+cquery:extended_oss --config=_extended_oss
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a8fe80bd..6b8d8869 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,23 @@
 # Release History

+## 1.0.11
+
+### New Features
+
+- Model Registry: Add log_artifact() public method.
+- Model Development: Add support for `kneighbors`.
+
+### Behavior Changes
+
+- Model Registry: Change log_model() argument from TrainingDataset to List of Artifact.
+- Model Registry: Change get_training_dataset() to get_artifact().
+
+### Bug Fixes
+
+- Model Development: Fix support for XGBoost and LightGBM models using SKLearn Grid Search and Randomized Search model selectors.
+- Model Development: DecimalType is now supported as a DataType.
+- Model Development: Fix metrics compatibility with Snowpark DataFrames that use Snowflake identifiers.
+
 ## 1.0.10

 ### Behavior Changes
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6c2560e4..727b8eb1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -347,6 +347,57 @@ When you add a new test file, you should always ensure the existence of a `if __
 the test file will not be instructed by bazel. We have a test wrapper [here](./bazel/test_wrapper.sh) to ensure that
 the test will fail if you forget that part.

+## Integration test
+
+### Test in Stored Procedure
+
+To test whether your code works in a stored procedure, you can build on `CommonTestBase` in
+`tests/integ/snowflake/ml/test_utils/common_test_base.py`. An example of such a test can be found in
+`tests/integ/snowflake/ml/_internal/file_utils_integ_test.py`.
+
+To write such a test, you need to
+
+1. Let your test case inherit from `common_test_base.CommonTestBase`.
+1. Remove all Snowpark Session creation in your test, and use `self.session` to access the session if needed.
+1. If you write your own `setUp` and `tearDown` methods, remember to call `super().setUp()` or `super().tearDown()`.
+1. Decorate your test method with `common_test_base.CommonTestBase.sproc_test()`. If you want your test to run
+only in a stored procedure rather than both locally and in a stored procedure, set `local=False`. If you don't want
+to test with caller's rights, set `test_callers_rights=False`. (Owner's rights stored procedures are always tested.)
+A minimal sketch follows this list.
+
+    **Attention**: Depending on your configuration, 1-3 sub-tests will be run in your test method. Sub-tests mean
+    that `setUp` and `tearDown` won't run for every sub-test; they run only once, before and after the whole test
+    method. So it is important to make your test case self-contained.
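+
+    A minimal sketch of such a test (test and module names are illustrative; only `CommonTestBase` and
+    `sproc_test` come from the test utility above):
+
+    ```python
+    from absl.testing import absltest
+
+    from tests.integ.snowflake.ml.test_utils import common_test_base
+
+
+    class MyFeatureIntegTest(common_test_base.CommonTestBase):
+        @common_test_base.CommonTestBase.sproc_test()
+        def test_my_feature(self) -> None:
+            # Use the session provided by the base class instead of creating one.
+            result = self.session.sql("SELECT 1").collect()
+            self.assertEqual(result[0][0], 1)
+
+
+    if __name__ == "__main__":
+        absltest.main()
+    ```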
+
+### Compatibility Test
+
+To test whether your code is compatible with previous versions, you can build on `CommonTestBase` in
+`tests/integ/snowflake/ml/test_utils/common_test_base.py`. An example of such a test can be found in
+`tests/integ/snowflake/ml/registry/model_registry_compat_test.py`.
+
+To write such a test, you need to
+
+1. Let your test case inherit from `common_test_base.CommonTestBase`.
+1. Remove all Snowpark Session creation in your test, and use `self.session` to access the session if needed.
+1. If you write your own `setUp` and `tearDown` methods, remember to call `super().setUp()` or `super().tearDown()`.
+1. Write a factory method in your test class that returns a function and its parameters as a tuple. The
+function will be run as a stored procedure in an environment with a previous version of the library.
+
+    **Note**: Since the function will be created as a stored procedure, its first argument must be a Snowpark
+    Session. The arguments tuple you provide via the factory method does not need to include the session object.
+
+    **Note**: To prevent objects from the current environment from affecting the result, the function is written
+    out as a Python file and registered as a stored procedure instead of being pickled with `cloudpickle`. This
+    means you cannot use any object defined outside of the function, and anything you need must be imported
+    inside the function definition. It therefore helps to keep your prepare function as simple as possible.
+
+1. Decorate your test method with `common_test_base.CommonTestBase.compatibility_test`, providing the factory
+method you created in the step above, an optional version range to test against, and any additional package
+requirements. A sketch follows this list.
+
+    **Attention**: For every version available on the server and within the version range, a sub-test will be run
+    that executes the prepare function in the stored procedure and then runs the test method. Sub-tests mean that
+    `setUp` and `tearDown` won't run for every sub-test; they run only once, before and after the whole test
+    method. So it is important to make your test case self-contained.
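+
+    A sketch of such a test (the decorator's argument names below are illustrative, not the exact API; check
+    `common_test_base.py` for the real signature):
+
+    ```python
+    from typing import Any, Callable, Tuple
+
+    from snowflake.snowpark import Session
+
+    from tests.integ.snowflake.ml.test_utils import common_test_base
+
+
+    class MyCompatTest(common_test_base.CommonTestBase):
+        def _prepare_registry_factory(self) -> Tuple[Callable[[Session, str], None], Tuple[Any, ...]]:
+            def prepare_registry(session: Session, db_name: str) -> None:
+                # Imports must live inside the function: it is written out as a
+                # Python file and run with a previous library version, so nothing
+                # from the enclosing module is available here.
+                from snowflake.ml.registry import model_registry
+
+                model_registry.create_model_registry(session=session, database_name=db_name)
+
+            # The session is injected by the stored procedure; only db_name is supplied here.
+            return prepare_registry, ("MY_COMPAT_TEST_DB",)
+
+        @common_test_base.CommonTestBase.compatibility_test(
+            prepare_fn_factory=_prepare_registry_factory,  # hypothetical parameter name
+            version_range=">=1.0.6,<=1.0.10",  # hypothetical parameter name
+        )
+        def test_open_old_registry(self) -> None:
+            ...
+    ```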

 ## `pre-commit`

 Pull requests against the main branch are subject to `pre-commit` checks. Those checks enforce the code style.
diff --git a/README.md b/README.md
index 7ebcbaac..d4acf9c2 100644
--- a/README.md
+++ b/README.md
@@ -2,31 +2,32 @@

 Snowpark ML is a set of tools including SDKs and underlying infrastructure to build and deploy machine learning models.
 With Snowpark ML, you can pre-process data, train, manage and deploy ML models all within Snowflake, using a single SDK,
- and benefit from Snowflake’s proven performance, scalability, stability and governance at every stage of the Machine
- Learning workflow.
+and benefit from Snowflake’s proven performance, scalability, stability and governance at every stage of the Machine
+Learning workflow.

 ## Key Components of Snowpark ML

 The Snowpark ML Python SDK provides a number of APIs to support each stage of an end-to-end Machine Learning development
- and deployment process, and includes two key components.
+and deployment process, and includes two key components.

 ### Snowpark ML Development [Public Preview]

-A collection of python APIs to enable efficient model development directly in Snowflake:
+[Snowpark ML Development](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#snowpark-ml-development)
+provides a collection of Python APIs enabling efficient ML model development directly in Snowflake:

-1. Modeling API (snowflake.ml.modeling) for data preprocessing, feature engineering and model training in Snowflake.
-This includes snowflake.ml.modeling.preprocessing for scalable data transformations on large data sets utilizing the
-compute resources of underlying Snowpark Optimized High Memory Warehouses, and a large collection of ML model
-development classes based on sklearn, xgboost, and lightgbm. See the private preview limited access docs (Preprocessing,
- Modeling for more details on these.
+1. Modeling API (`snowflake.ml.modeling`) for data preprocessing, feature engineering and model training in Snowflake.
+This includes the `snowflake.ml.modeling.preprocessing` module for scalable data transformations on large data sets
+utilizing the compute resources of underlying Snowpark Optimized High Memory Warehouses, and a large collection of ML
+model development classes based on sklearn, xgboost, and lightgbm.

 1. Framework Connectors: Optimized, secure and performant data provisioning for Pytorch and Tensorflow frameworks in
 their native data loader formats.

 ### Snowpark ML Ops [Private Preview]

-Snowpark MLOps complements the Snowpark ML Development API, and provides model management capabilities along with
-integrated deployment into Snowflake. Currently, the API consists of
+[Snowpark MLOps](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#snowpark-ml-ops) complements the
+Snowpark ML Development API, and provides model management capabilities along with integrated deployment into Snowflake.
+Currently, the API consists of:

 1. FileSet API: FileSet provides a Python fsspec-compliant API for materializing data into a Snowflake internal stage
 from a query or Snowpark Dataframe along with a number of convenience APIs.

@@ -37,26 +38,48 @@
 Snowflake Warehouses as vectorized UDFs.

 During PrPr, we are iterating on API without backward compatibility guarantees. It is better to recreate your registry
 everytime you update the package. This means, at this time, you cannot use the registry for production use.

-- [Documentation](https://docs.snowflake.com/developer-guide/snowpark-ml)
-
 ## Getting started

 ### Have your Snowflake account ready

 If you don't have a Snowflake account yet, you can [sign up for a 30-day free trial account](https://signup.snowflake.com/).

-### Create a Python virtual environment
+### Installation
+
+Follow the [installation instructions](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#installing-snowpark-ml)
+in the Snowflake documentation.

-Python version 3.8, 3.9 & 3.10 are supported. You can use [miniconda](https://docs.conda.io/en/latest/miniconda.html),
-[anaconda](https://www.anaconda.com/), or [virtualenv](https://docs.python.org/3/tutorial/venv.html) to create a virtual
- environment.
+Python versions 3.8, 3.9 & 3.10 are supported.
+You can use [miniconda](https://docs.conda.io/en/latest/miniconda.html) or
+[anaconda](https://www.anaconda.com/) to create a Conda environment (recommended),
+or [virtualenv](https://docs.python.org/3/tutorial/venv.html) to create a virtual environment.

-To have the best experience when using this library, [creating a local conda environment with the Snowflake channel](
- https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#local-development-and-testing)
-is recommended.
+### Conda channels

-### Install the library to the Python virtual environment
+The [Snowflake Conda Channel](https://repo.anaconda.com/pkgs/snowflake/) contains the official Snowpark ML package releases.
+The recommended approach is to install `snowflake-ml-python` from this conda channel:

 ```sh
-pip install snowflake-ml-python
+conda install \
+  -c https://repo.anaconda.com/pkgs/snowflake \
+  --override-channels \
+  snowflake-ml-python
+```
+
+See [the developer guide](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index) for installation instructions.
+
+The latest version of the `snowflake-ml-python` package is also published in a conda channel in this repository. Package versions
+in this channel may not yet be present in the official Snowflake conda channel.
+
+Install `snowflake-ml-python` from this channel with the following (being sure to replace `<version>` with the
+desired version, e.g. `1.0.10`):
+
+```bash
+conda install \
+  -c https://raw.githubusercontent.com/snowflakedb/snowflake-ml-python/conda/releases/ \
+  -c https://repo.anaconda.com/pkgs/snowflake \
+  --override-channels \
+  snowflake-ml-python==<version>
 ```
+
+Note that until a `snowflake-ml-python` package version is available in the official Snowflake conda channel, there may
+be compatibility issues. Server-side functionality that `snowflake-ml-python` depends on may not yet be released.
diff --git a/bazel/environments/conda-env.yml b/bazel/environments/conda-env.yml
index 288a6497..9277ffd3 100644
--- a/bazel/environments/conda-env.yml
+++ b/bazel/environments/conda-env.yml
@@ -58,3 +58,7 @@ dependencies:
   - types-requests==2.30.0.0
   - typing-extensions==4.5.0
   - xgboost==1.7.3
+  - pip
+  - pip:
+    - --extra-index-url https://pypi.org/simple
+    - peft==0.5.0
diff --git a/bazel/environments/fetch_conda_env_config.bzl b/bazel/environments/fetch_conda_env_config.bzl
index 0bba6623..edd1cb0c 100644
--- a/bazel/environments/fetch_conda_env_config.bzl
+++ b/bazel/environments/fetch_conda_env_config.bzl
@@ -16,6 +16,12 @@ def _fetch_conda_env_config_impl(rctx):
             "compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"],
             "environment": "@//bazel/environments:conda-env.yml",
         },
+        # `extended_oss` is the extended env for the OSS repo, which is a strict subset of `extended`.
+        # It's intended for development without the dev VPN.
+ "extended_oss": { + "compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"], + "environment": "@//bazel/environments:conda-env.yml", + }, "sf_only": { "compatible_target": ["@SnowML//bazel/platforms:snowflake_conda_channel"], "environment": "@//bazel/environments:conda-env-snowflake.yml", diff --git a/bazel/requirements/parse_and_generate_requirements.py b/bazel/requirements/parse_and_generate_requirements.py index 5b4e0965..fca25bd7 100644 --- a/bazel/requirements/parse_and_generate_requirements.py +++ b/bazel/requirements/parse_and_generate_requirements.py @@ -1,6 +1,7 @@ import argparse import collections import contextlib +import copy import functools import itertools import json @@ -146,6 +147,9 @@ def generate_dev_pinned_string( version = req_info.get("dev_version_conda", req_info.get("dev_version", None)) if version is None: raise ValueError("No pinned version exists.") + if env == "conda-only": + if "dev_version_conda" in req_info or "dev_version" in req_info: + return None from_channel = req_info.get("from_channel", None) if version == "": version_str = "" @@ -158,6 +162,9 @@ def generate_dev_pinned_string( version = req_info.get("dev_version_pypi", req_info.get("dev_version", None)) if version is None: raise ValueError("No pinned version exists.") + if env == "pip-only": + if "dev_version_conda" in req_info or "dev_version" in req_info: + return None if version == "": version_str = "" else: @@ -341,9 +348,15 @@ def generate_requirements( sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "conda"), requirements))) ) - extended_env: List[Union[str, MutableMapping[str, Sequence[str]]]] = extended_env_conda # type: ignore[assignment] + extended_env: List[Union[str, MutableMapping[str, Sequence[str]]]] = copy.deepcopy( + extended_env_conda # type: ignore[arg-type] + ) + # Relative order needs to be maintained here without sorting. + # For external pip-only packages, we want to it able to access pypi.org index, + # while for internal pip-only packages, nexus is the only viable index. + # Relative order is here to prevent nexus index overriding public index. pip_only_reqs = list( - sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "pip-only"), requirements))) + filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "pip-only"), requirements)) ) if pip_only_reqs: extended_env.extend(["pip", {"pip": pip_only_reqs}]) diff --git a/ci/RunBazelAction.sh b/ci/RunBazelAction.sh index 5a9bdc0f..435b1844 100755 --- a/ci/RunBazelAction.sh +++ b/ci/RunBazelAction.sh @@ -158,6 +158,7 @@ elif [[ "${action}" = "coverage" ]]; then "${cache_test_results}" \ --combined_report=lcov \ "${coverage_tag_filter}" \ + --experimental_collect_code_coverage_for_generated_files \ --target_pattern_file "${sf_only_test_targets_file}" sf_only_bazel_exit_code=$? @@ -170,6 +171,7 @@ elif [[ "${action}" = "coverage" ]]; then "${cache_test_results}" \ --combined_report=lcov \ "${coverage_tag_filter}" \ + --experimental_collect_code_coverage_for_generated_files \ --target_pattern_file "${extended_test_targets_file}" extended_bazel_exit_code=$? 
diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index e0e272fc..0da5ff39 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -17,7 +17,7 @@ build: noarch: python package: name: snowflake-ml-python - version: 1.0.10 + version: 1.0.11 requirements: build: - python @@ -49,7 +49,7 @@ requirements: - mlflow>=2.1.0,<2.4 - sentencepiece>=0.1.95,<0.2 - shap==0.42.1 - - tensorflow>=2.9,<3 + - tensorflow>=2.9,<3,!=2.12.0 - tokenizers>=0.10,<1 - torchdata>=0.4,<1 - transformers>=4.29.2,<5 diff --git a/codegen/sklearn_wrapper_generator.py b/codegen/sklearn_wrapper_generator.py index 001a059c..f3b7d681 100644 --- a/codegen/sklearn_wrapper_generator.py +++ b/codegen/sklearn_wrapper_generator.py @@ -21,18 +21,18 @@ input_cols: Optional[Union[str, List[str]]] A string or list of strings representing column names that contain features. If this parameter is not specified, all columns in the input DataFrame except - the columns specified by label_cols and sample-weight_col parameters are + the columns specified by label_cols and sample_weight_col parameters are considered input columns. label_cols: Optional[Union[str, List[str]]] A string or list of strings representing column names that contain labels. This is a required param for estimators, as there is no way to infer these columns. If this parameter is not specified, then object is fitted without - labels(Like a transformer). + labels (like a transformer). output_cols: Optional[Union[str, List[str]]] A string or list of strings representing column names that will store the - output of predict and transform operations. The length of output_cols mus + output of predict and transform operations. The length of output_cols must match the expected number of output columns from the specific estimator or transformer class used. If this parameter is not specified, output column names are derived by @@ -41,7 +41,7 @@ be set explicitly for transformers. sample_weight_col: Optional[str] - A string representing the column name containing the examples’ weights. + A string representing the column name containing the sample weights. This argument is only required when working with weighted datasets. drop_input_cols: Optional[bool], default=False @@ -60,8 +60,8 @@ class WrapperGeneratorFactory: """ - Reads a estimator class descriptor and generates a WrapperGenerator object which will have - aprropriate fields to fill a template string. + Reads an estimator class descriptor and generates a WrapperGenerator object which will have + appropriate fields to fill a template string. Example ------- @@ -187,6 +187,18 @@ def _is_multioutput_estimator_obj(class_object: Tuple[str, type]) -> bool: """ return WrapperGeneratorFactory._is_class_of_type(class_object[1], "_MultiOutputEstimator") + @staticmethod + def _is_k_neighbors_obj(class_object: Tuple[str, type]) -> bool: + """Check if the given estimator is a k-neighbors estimator. + + Args: + class_object: Meta class object which needs to be checked. + + Returns: + True if the class inherits from KNeighborsMixin, otherwise False. + """ + return WrapperGeneratorFactory._is_class_of_type(class_object[1], "KNeighborsMixin") + @staticmethod def _is_xgboost(module_name: str) -> bool: """Checks if the given module belongs to XGBoost package. 
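The `_is_k_neighbors_obj` check above gates the `kneighbors` support generated from the template later in this
diff. For orientation, a hedged sketch of how a generated estimator's `kneighbors` is meant to be called
(assuming the `snowflake.ml.modeling.neighbors.NearestNeighbors` wrapper produced by this codegen; data and
column names are illustrative):

```python
import pandas as pd

from snowflake.ml.modeling.neighbors import NearestNeighbors

df = pd.DataFrame({"X1": [0.0, 1.0, 2.0, 3.0], "X2": [0.0, 0.5, 2.0, 2.5]})

nn = NearestNeighbors(n_neighbors=2, input_cols=["X1", "X2"])
nn.fit(df)

# Per the template below, this returns the dataset plus array columns derived
# from output_cols_prefix ("kneighbors_"): one for neigh_dist (since
# return_distance=True) and one for neigh_ind.
result = nn.kneighbors(dataset=df, return_distance=True)
```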
@@ -505,9 +517,9 @@ def __init__(self, module_name: str, class_object: Tuple[str, type]) -> None: self.predict_docstring = "" self.predict_proba_docstring = "" self.score_docstring = "" - self.predict_proba_docstring = "" self.predict_log_proba_docstring = "" self.decision_function_docstring = "" + self.kneighbors_docstring = "" # Import strings self.estimator_imports = "" @@ -568,6 +580,7 @@ def _populate_flags(self) -> None: self._is_transformer = WrapperGeneratorFactory._is_transformer_obj(self.class_object) self._is_multioutput = WrapperGeneratorFactory._is_multioutput_obj(self.class_object) self._is_multioutput_estimator = WrapperGeneratorFactory._is_multioutput_estimator_obj(self.class_object) + self._is_k_neighbors = WrapperGeneratorFactory._is_k_neighbors_obj(self.class_object) self._is_heterogeneous_ensemble = WrapperGeneratorFactory._is_heterogeneous_ensemble_obj(self.class_object) self._is_stacking_ensemble = WrapperGeneratorFactory._is_stacking_ensemble_obj(self.class_object) self._is_voting_ensemble = WrapperGeneratorFactory._is_voting_ensemble_obj(self.class_object) @@ -629,7 +642,16 @@ def _populate_class_doc_fields(self) -> None: self.estimator_class_docstring = class_docstring def _populate_function_doc_fields(self) -> None: - _METHODS = ["fit", "predict", "predict_log_proba", "predict_proba", "decision_function", "transform", "score"] + _METHODS = [ + "fit", + "predict", + "predict_log_proba", + "predict_proba", + "decision_function", + "transform", + "score", + "kneighbors", + ] _CLASS_FUNC = {name: func for name, func in inspect.getmembers(self.class_object[1])} for _each_method in _METHODS: if _each_method in _CLASS_FUNC.keys(): @@ -660,6 +682,7 @@ def _populate_function_doc_fields(self) -> None: self.predict_log_proba_docstring = self.estimator_function_docstring["predict_log_proba"] self.decision_function_docstring = self.estimator_function_docstring["decision_function"] self.score_docstring = self.estimator_function_docstring["score"] + self.kneighbors_docstring = self.estimator_function_docstring["kneighbors"] def _populate_class_names(self) -> None: self.original_class_name = self.class_object[0] @@ -827,9 +850,13 @@ def generate(self) -> "SklearnWrapperGenerator": # Populate all the common values super().generate() + is_model_selector = WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "BaseSearchCV") + # Populate SKLearn specific values self.estimator_imports_list.extend(["import sklearn", f"import {self.root_module_name}"]) - self.wrapper_provider_class = "SklearnWrapperProvider" + self.wrapper_provider_class = ( + "SklearnModelSelectionWrapperProvider" if is_model_selector else "SklearnWrapperProvider" + ) self.score_sproc_imports = ["sklearn"] if "random_state" in self.original_init_signature.parameters.keys(): @@ -926,10 +953,6 @@ def generate(self) -> "SklearnWrapperGenerator": if self._is_hist_gradient_boosting_regressor: self.test_estimator_input_args_list.extend(["min_samples_leaf=1", "max_leaf_nodes=100"]) - # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. 
- self.deps = ( - "f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'" - ) self.supported_export_method = "to_sklearn" self.unsupported_export_methods = ["to_xgboost", "to_lightgbm"] self._construct_string_from_lists() @@ -962,7 +985,6 @@ def generate(self) -> "XGBoostWrapperGenerator": # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. self.supported_export_method = "to_xgboost" self.unsupported_export_methods = ["to_sklearn", "to_lightgbm"] - self.deps = "f'numpy=={np.__version__}', f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'" self._construct_string_from_lists() return self @@ -990,8 +1012,6 @@ def generate(self) -> "LightGBMWrapperGenerator": self.score_sproc_imports = ["lightgbm"] self.wrapper_provider_class = "LightGBMWrapperProvider" - # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. - self.deps = "f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'" self.supported_export_method = "to_lightgbm" self.unsupported_export_methods = ["to_sklearn", "to_xgboost"] self._construct_string_from_lists() diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template index e123a3ca..280d7cfe 100644 --- a/codegen/sklearn_wrapper_template.py_template +++ b/codegen/sklearn_wrapper_template.py_template @@ -53,7 +53,9 @@ class {transform.original_class_name}(BaseTransformer): {transform.estimator_init_signature} ) -> None: super().__init__() - deps: Set[str] = set([{transform.deps}]) + + {transform.estimator_init_member_args} + deps = set({transform.wrapper_provider_class}().dependencies) {transform.estimator_args_gathering_calls} self._deps = list(deps) {transform.estimator_args_transform_calls} @@ -66,7 +68,6 @@ class {transform.original_class_name}(BaseTransformer): {transform.sklearn_init_arguments} ) self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None - {transform.estimator_init_member_args} # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols self._snowpark_cols: Optional[List[str]] = self.input_cols self._handlers: FitPredictHandlers = HandlersImpl(class_name={transform.original_class_name}.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True, wrapper_provider={transform.wrapper_provider_class}()) @@ -193,6 +194,8 @@ class {transform.original_class_name}(BaseTransformer): inference_method: str, expected_output_cols_list: List[str], expected_output_cols_type: str = "", + *args: Any, + **kwargs: Any, ) -> DataFrame: """Util method to create UDF and run batch inference. 
""" @@ -225,7 +228,9 @@ class {transform.original_class_name}(BaseTransformer): self.input_cols, self._get_pass_through_columns(dataset), expected_output_cols_list, - expected_output_cols_type + expected_output_cols_type, + *args, + **kwargs, ) @@ -233,7 +238,9 @@ class {transform.original_class_name}(BaseTransformer): self, dataset: pd.DataFrame, inference_method: str, - expected_output_cols_list: List[str] + expected_output_cols_list: List[str], + *args: Any, + **kwargs: Any, ) -> pd.DataFrame: output_cols = expected_output_cols_list.copy() @@ -278,34 +285,41 @@ class {transform.original_class_name}(BaseTransformer): input_df = dataset[columns_to_select] input_df.columns = features_required_by_estimator - transformed_numpy_array = getattr(estimator, inference_method)( - input_df - ) + inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs) if ( - isinstance(transformed_numpy_array, list) - and len(transformed_numpy_array) > 0 - and isinstance(transformed_numpy_array[0], np.ndarray) + isinstance(inference_res, list) + and len(inference_res) > 0 + and isinstance(inference_res[0], np.ndarray) ): - # In case of multioutput estimators, predict_proba(), decision_function(), etc., functions return - # a list of ndarrays. We need to concatenate them. + # In case of multioutput estimators, predict_proba, decision_function etc., functions return a list of + # ndarrays. We need to concatenate them. # First compute output column names - if len(output_cols) == len(transformed_numpy_array): + if len(output_cols) == len(inference_res): actual_output_cols = [] - for idx, np_arr in enumerate(transformed_numpy_array): + for idx, np_arr in enumerate(inference_res): for i in range(1 if len(np_arr.shape) <= 1 else np_arr.shape[1]): actual_output_cols.append(f"{{output_cols[idx]}}_{{i}}") output_cols = actual_output_cols # Concatenate np arrays - transformed_numpy_array = np.concatenate(transformed_numpy_array, axis=1) + transformed_numpy_array = np.concatenate(inference_res, axis=1) + elif ( + isinstance(inference_res, tuple) + and len(inference_res) > 0 + and isinstance(inference_res[0], np.ndarray) + ): + # In case of kneighbors, functions return a tuple of ndarrays. + transformed_numpy_array = np.stack(inference_res, axis=1) + else: + transformed_numpy_array = inference_res - if len(transformed_numpy_array.shape) == 3: + if (len(transformed_numpy_array.shape) == 3) and inference_method != "kneighbors": # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes) # when voting = "soft" and flatten_transform = False. We can't handle unflatten transforms, # so we ignore flatten_transform flag and flatten the results. 
- transformed_numpy_array = np.hstack(transformed_numpy_array) + transformed_numpy_array = np.hstack(transformed_numpy_array) # type: ignore[call-overload] if len(transformed_numpy_array.shape) == 1: transformed_numpy_array = np.reshape(transformed_numpy_array, (-1, 1)) @@ -325,11 +339,28 @@ class {transform.original_class_name}(BaseTransformer): actual_output_cols.append(f"{{output_cols[0]}}_{{i}}") output_cols = actual_output_cols - if self._drop_input_cols: - dataset = pd.DataFrame(data=transformed_numpy_array, columns=output_cols) + if inference_method == "kneighbors": + if (len(transformed_numpy_array.shape) == 3): # return_distance=True + shape = transformed_numpy_array.shape + data = [transformed_numpy_array[:, i, :].tolist() for i in range(shape[1])] + kneighbors_df = pd.DataFrame({{output_cols[i]: data[i] for i in range(shape[1])}}) + else: # return_distance=False + kneighbors_df = pd.DataFrame( + {{output_cols[0]: [ + transformed_numpy_array[i, :].tolist() for i in range(transformed_numpy_array.shape[0]) + ]}} + ) + + if self._drop_input_cols: + dataset = kneighbors_df + else: + dataset = pd.concat([dataset, kneighbors_df], axis=1) else: - dataset = dataset.copy() - dataset[output_cols] = transformed_numpy_array + if self._drop_input_cols: + dataset = pd.DataFrame(data=transformed_numpy_array, columns=output_cols) + else: + dataset = dataset.copy() + dataset[output_cols] = transformed_numpy_array return dataset @available_if(original_estimator_has_callable("predict")) # type: ignore[misc] @@ -423,11 +454,14 @@ class {transform.original_class_name}(BaseTransformer): return output_df - def _get_output_column_names(self, output_cols_prefix: str) -> List[str]: + def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]: """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions. Returns a list with output_cols_prefix as the only element if the estimator is not a classifier. """ output_cols_prefix = identifier.resolve_identifier(output_cols_prefix) + if output_cols: + return [f"{{output_cols_prefix}}{{identifier.resolve_identifier(c)}}" for c in output_cols] + if getattr(self._sklearn_object, "classes_", None) is None: return [output_cols_prefix] @@ -442,10 +476,10 @@ class {transform.original_class_name}(BaseTransformer): # For binary classification, there is only one output column for each class # ndarray as the two classes are complementary. 
if len(cl) == 2: - output_cols.append(f'{{output_cols_prefix}}_{{i}}_{{cl[0]}}') + output_cols.append(f'{{output_cols_prefix}}{{i}}_{{cl[0]}}') else: output_cols.extend([ - f'{{output_cols_prefix}}_{{i}}_{{c}}' for c in cl.tolist() + f'{{output_cols_prefix}}{{i}}_{{c}}' for c in cl.tolist() ]) return output_cols return [] @@ -612,6 +646,56 @@ class {transform.original_class_name}(BaseTransformer): return score + @available_if(original_estimator_has_callable("kneighbors")) # type: ignore[misc] + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def kneighbors( + self, + dataset: Union[DataFrame, pd.DataFrame], + n_neighbors: Optional[int] = None, + return_distance: bool = True, + output_cols_prefix: str = "kneighbors_", + ) -> Union[DataFrame, pd.DataFrame]: + """{transform.kneighbors_docstring} + output_cols_prefix: str + Prefix for the response columns + + Returns: + Output dataset with results of the K-neighbors for the samples in input dataset. + """ + super()._check_dataset_type(dataset) + output_cols = ["neigh_ind"] + if return_distance: + output_cols.insert(0, "neigh_dist") + if isinstance(dataset, DataFrame): + # TODO: Solve inconsistent neigh_ind with sklearn due to different precisions in case of close distances. + output_df = self._batch_inference( + dataset=dataset, + inference_method="kneighbors", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix, output_cols), + expected_output_cols_type="array", + n_neighbors=n_neighbors, + return_distance=return_distance, + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="kneighbors", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix, output_cols), + n_neighbors=n_neighbors, + return_distance=return_distance, + ) + + return output_df + def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None: self._model_signature_dict = dict() diff --git a/codegen/transformer_autogen_test_template.py_template b/codegen/transformer_autogen_test_template.py_template index fffedd41..9025ff57 100644 --- a/codegen/transformer_autogen_test_template.py_template +++ b/codegen/transformer_autogen_test_template.py_template @@ -181,7 +181,7 @@ class {transform.test_class_name}(TestCase): else: np.testing.assert_allclose(actual_arr, sklearn_numpy_arr, rtol=1.e-1, atol=1.e-2) - expected_methods = ["predict_proba", "predict_log_proba", "decision_function"] + expected_methods = ["predict_proba", "predict_log_proba", "decision_function", "kneighbors"] for m in expected_methods: assert not ( callable(getattr(sklearn_reg, m, None)) @@ -196,20 +196,46 @@ class {transform.test_class_name}(TestCase): actual_inference_result = getattr(reg, m)(dataset=input_df_pandas, output_cols_prefix="OUTPUT_") actual_output_cols = [c for c in actual_inference_result.columns if c.find("OUTPUT_") >= 0] - actual_inference_result = actual_inference_result[actual_output_cols].to_numpy() + if {transform._is_k_neighbors} and m == "kneighbors": + if inference_with_udf: + actual_inference_result[actual_output_cols] = actual_inference_result[ + actual_output_cols + ].applymap(lambda x: json.loads(x)) + actual_inference_result = actual_inference_result[actual_output_cols].to_numpy() + if actual_inference_result.shape[1] > 1: # return_distance=True + 
                        actual_inference_result = np.array(actual_inference_result.tolist())
+                    else:  # return_distance=False
+                        actual_inference_result = np.vstack([np.array(res[0]) for res in actual_inference_result])
+                else:
+                    actual_inference_result = actual_inference_result[actual_output_cols].to_numpy()

                 sklearn_inference_result = getattr(sklearn_reg, m)(input_df_pandas[input_cols])
                 if isinstance(sklearn_inference_result, list):
-                    # Incase of multioutput estimators predict_proba, decision_function, etc., returns a list of
+                    # In case of multioutput estimators, predict_proba, decision_function etc. return a list of
                     # ndarrays as output. We need to concatenate them to compare with snowflake output.
                     sklearn_inference_result = np.concatenate(sklearn_inference_result, axis=1)
+                elif isinstance(sklearn_inference_result, tuple):
+                    # In case of kneighbors, a tuple of ndarrays is returned as output.
+                    sklearn_inference_result = np.stack(sklearn_inference_result, axis=1)
                 elif len(sklearn_inference_result.shape) == 1:
                     # Some times sklearn retuns results as 1D array of shape (n_samples,), but snowfkale always retunrs
                     # response as 2D array of shape (n_samples, 1). Flatten the snowflake response to compare results.
                     actual_inference_result = actual_inference_result.flatten()

-                np.testing.assert_allclose(
-                    actual_inference_result, sklearn_inference_result, rtol=1.e-1, atol=1.e-2)
+                if (
+                    {transform._is_k_neighbors}
+                    and m == "kneighbors"
+                    and len(actual_inference_result.shape) == 3
+                ):  # return_distance=True
+                    # Only compare neigh_dist, as different precisions cause neigh_ind to differ in case of close
+                    # distances.
+                    np.testing.assert_allclose(
+                        actual_inference_result[:, 0, :], sklearn_inference_result[:, 0, :], rtol=1.e-1, atol=1.e-2
+                    )
+                else:
+                    np.testing.assert_allclose(
+                        actual_inference_result, sklearn_inference_result, rtol=1.e-1, atol=1.e-2
+                    )

         if callable(getattr(sklearn_reg, "score", None)) and callable(getattr(reg, "score", None)):
             score_argspec = inspect.getfullargspec(sklearn_reg.score)
diff --git a/mypy.ini b/mypy.ini
index b520c715..a634503b 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -9,24 +9,38 @@ explicit_package_bases = True
 # This is default but vscode plugin may be old
 namespace_packages = True

-# Enables the following checks. These are meant to be a subset of the checks enabled by --strict. Notably,
-# --disallow_subclassing_any is not enabled.
-warn_unused_configs = True
+# Enables the following checks. These are meant to be a subset of the checks enabled by --strict.
+check_untyped_defs = True
+
 disallow_any_generics = True
-disallow_untyped_calls = True
-disallow_untyped_defs = True
 disallow_incomplete_defs = True
-check_untyped_defs = True
+# We inherit from classes like fsspec.AbstractFileSystem and absltest.TestCase, which are considered to be of type Any.
+#disallow_subclassing_any = True
+disallow_untyped_calls = True
 disallow_untyped_decorators = True
+disallow_untyped_defs = True
+
+warn_no_return = True
 warn_redundant_casts = True
-warn_unused_ignores = True
 warn_return_any = True
-no_implicit_reexport = True
+# It seems we still have lots of these today.
+#warn_unreachable = True
+warn_unused_configs = True
+warn_unused_ignores = True
+
+implicit_reexport = False

 strict_equality = True
 extra_checks = True

 enable_incomplete_feature = Unpack

+pretty = True
+show_absolute_path = True
+show_column_numbers = True
+show_error_codes = True
+show_error_context = True
+verbosity = 0
+
 exclude = (?x)(
     (^.*\/experimental\/.*)|(^bazel-.*)  # ignore everything in the `/experimental/` directory
   )
diff --git a/requirements.txt b/requirements.txt
index b9c04935..e005708e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 # DO NOT EDIT!
 # Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements'

+--extra-index-url https://pypi.org/simple
 absl-py==1.3.0
 accelerate==0.22.0
 anyio==3.5.0
@@ -24,6 +25,7 @@ networkx==2.8.4
 numpy==1.24.3
 packaging==23.0
 pandas==1.5.3
+peft==0.5.0
 protobuf==3.20.3
 pytest==7.4.0
 pytimeparse==1.1.8
diff --git a/requirements.yml b/requirements.yml
index 1747fa73..f34af4af 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -217,7 +217,7 @@
 - name: tensorflow
   dev_version_conda: 2.10.0
   dev_version_pypi: 2.13.1
-  version_requirements: '>=2.9,<3'
+  version_requirements: '>=2.9,<3,!=2.12.0'
   requirements_extra_tags:
     - tensorflow
 - name: tokenizers
@@ -261,3 +261,12 @@
 - name: pytimeparse
   dev_version: 1.1.8
   version_requirements: '>=1.1.8,<2'
+
+# Below are pip-only external packages
+- name_pypi: --extra-index-url https://pypi.org/simple
+  dev_version_pypi: ''
+- name_pypi: peft
+  dev_version_pypi: 0.5.0
+  version_requirements_pypi: '>=0.5.0,<1'
+  requirements_extra_tags:
+    - llm
diff --git a/snowflake/ml/_internal/utils/snowpark_dataframe_utils.py b/snowflake/ml/_internal/utils/snowpark_dataframe_utils.py
index 59c8f8ac..9362a97a 100644
--- a/snowflake/ml/_internal/utils/snowpark_dataframe_utils.py
+++ b/snowflake/ml/_internal/utils/snowpark_dataframe_utils.py
@@ -1,4 +1,5 @@
 import logging
+import warnings

 from snowflake import snowpark
 from snowflake.snowpark import functions, types
@@ -58,3 +59,54 @@ def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
             selected_cols.append(functions.col(src))
     df = df.select(selected_cols)
     return df
+
+
+def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.DataFrame:
+    """Cast columns in the dataframe to types that are compatible with pandas DataFrame.
+
+    It assists the modeling API (fit, predict, ...) in performing implicit data casting.
+    The casting is needed because the Snowpark dataframe is converted to a pandas dataframe
+    for computation within the sproc.
+
+    Args:
+        df: A snowpark dataframe.
+
+    Returns:
+        A snowpark dataframe whose data types have been cast.
+    """
+    fields = df.schema.fields
+    selected_cols = []
+    for field in fields:
+        src = field.column_identifier.quoted_name
+        # Handle DecimalType: numbers up to 38 digits, with an optional precision and scale.
+        # By default, precision is 38 and scale is 0 (i.e. NUMBER(38, 0)).
+        if isinstance(field.datatype, types.DecimalType):
+            # If the datatype has a scale, convert into a float/double type.
+            # In Snowflake, DOUBLE is the same as FLOAT and provides precision up to 18 digits.
+            if field.datatype.scale:
+                dest_dtype: types.DataType = types.DoubleType()
+                warnings.warn(
+                    f"Warning: The Decimal({field.datatype.precision}, {field.datatype.scale}) data type"
+                    " is being automatically converted to DoubleType in the Snowpark DataFrame. "
+                    "This automatic conversion may lead to potential precision loss and rounding errors. "
+                    "If you wish to prevent this conversion, you should manually perform "
+                    "the necessary data type conversion."
+                )
+            else:
+                # IntegerType defaults to NUMBER(38, 0), but a
+                # snowpark dataframe automatically transforms it to LongType in the function `convert_sf_to_sp_type`.
+                # To align with snowpark, cast all decimals without a scale to LongType.
+                dest_dtype = types.LongType()
+                warnings.warn(
+                    f"Warning: The Decimal({field.datatype.precision}, 0) data type"
+                    " is being automatically converted to LongType in the Snowpark DataFrame. "
+                    "This automatic conversion may lead to potential precision loss and rounding errors. "
+                    "If you wish to prevent this conversion, you should manually perform "
+                    "the necessary data type conversion."
+                )
+            selected_cols.append(functions.cast(functions.col(src), dest_dtype).alias(src))
+        # TODO: add more type handling or error message
+        else:
+            selected_cols.append(functions.col(src))
+    df = df.select(selected_cols)
+    return df
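A hedged usage sketch of the helper above (assumes an existing Snowpark `session`; the query is illustrative):

```python
from snowflake.ml._internal.utils import snowpark_dataframe_utils

# `session` is an existing snowflake.snowpark.Session (assumed).
df = session.sql(
    "SELECT CAST(1.25 AS NUMBER(10, 2)) AS PRICE, CAST(7 AS NUMBER(38, 0)) AS QTY"
)
casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(df)
# PRICE is cast to DoubleType (scale > 0) and QTY to LongType (scale == 0),
# each emitting the precision-loss warning described above.
```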
" + "If you wish to prevent this conversion, you should manually perform " + "the necessary data type conversion." + ) + else: + # IntegerType default as NUMBER(38, 0), but + # snowpark dataframe would automatically transform to LongType in function `convert_sf_to_sp_type` + # To align with snowpark, set all the decimal without scale as LongType + dest_dtype = types.LongType() + warnings.warn( + f"Warning: The Decimal({field.datatype.precision}, 0) data type" + " is being automatically converted to LongType in the Snowpark DataFrame. " + "This automatic conversion may lead to potential precision loss and rounding errors. " + "If you wish to prevent this conversion, you should manually perform " + "the necessary data type conversion." + ) + selected_cols.append(functions.cast(functions.col(src), dest_dtype).alias(src)) + # TODO: add more type handling or error message + else: + selected_cols.append(functions.col(src)) + df = df.select(selected_cols) + return df diff --git a/snowflake/ml/dataset/BUILD.bazel b/snowflake/ml/dataset/BUILD.bazel index c2bd2e7a..6dc0b288 100644 --- a/snowflake/ml/dataset/BUILD.bazel +++ b/snowflake/ml/dataset/BUILD.bazel @@ -9,5 +9,6 @@ py_library( ], deps = [ "//snowflake/ml/_internal/utils:query_result_checker", + "//snowflake/ml/registry:artifact_manager", ], ) diff --git a/snowflake/ml/dataset/dataset.py b/snowflake/ml/dataset/dataset.py index 4ccb6f51..c683b523 100644 --- a/snowflake/ml/dataset/dataset.py +++ b/snowflake/ml/dataset/dataset.py @@ -2,8 +2,8 @@ import time from dataclasses import dataclass from typing import Any, Dict, List, Optional -from uuid import uuid4 +from snowflake.ml.registry.artifact import Artifact, ArtifactType from snowflake.snowpark import DataFrame, Session @@ -42,9 +42,9 @@ def to_json(self) -> str: # but we retrieve it as an object. Snowpark serialization is inconsistent with # our deserialization. A fix is let artifact table stores string and callers # handles both serialization and deserialization. - "spine_query": _wrap_embedded_str(self.spine_query), - "connection_params": _wrap_embedded_str(json.dumps(self.connection_params)), - "features": _wrap_embedded_str(json.dumps(self.features)), + "spine_query": self.spine_query, + "connection_params": json.dumps(self.connection_params), + "features": json.dumps(self.features), } return json.dumps(state_dict) @@ -58,7 +58,7 @@ def from_json(cls, json_str: str) -> "FeatureStoreMetadata": ) -class Dataset: +class Dataset(Artifact): """Metadata of dataset.""" def __init__( @@ -95,19 +95,10 @@ def __init__( self.label_cols = label_cols self.feature_store_metadata = feature_store_metadata self.desc = desc - - self.id = uuid4().hex.upper() self.owner = session.sql("SELECT CURRENT_USER()").collect()[0]["CURRENT_USER()"] - self.version = DATASET_SCHEMA_VERSION - - @property - def name(self) -> str: - """Get name of this dataset. It returns snapshot table name if it exists. Otherwise returns empty string. + self.schema_version = DATASET_SCHEMA_VERSION - Returns: - A string name. 
- """ - return self.snapshot_table if self.snapshot_table is not None else "" + super().__init__(type=ArtifactType.DATASET, spec=self.to_json()) def load_features(self) -> Optional[List[str]]: if self.feature_store_metadata is not None: @@ -115,6 +106,14 @@ def load_features(self) -> Optional[List[str]]: else: return None + def features_df(self) -> DataFrame: + result = self.df + if self.timestamp_col is not None: + result = result.drop(self.timestamp_col) + if self.label_cols is not None: + result = result.drop(self.label_cols) + return result + def to_json(self) -> str: if len(self.df.queries["queries"]) != 1: raise ValueError( @@ -124,18 +123,17 @@ def to_json(self) -> str: ) state_dict = { - "df_query": self.df.queries["queries"][0], - "id": self.id, + "df_query": _wrap_embedded_str(self.df.queries["queries"][0]), "generation_timestamp": self.generation_timestamp, "owner": self.owner, - "materialized_table": _get_val_or_null(self.materialized_table), - "snapshot_table": _get_val_or_null(self.snapshot_table), - "timestamp_col": _get_val_or_null(self.timestamp_col), + "materialized_table": _wrap_embedded_str(_get_val_or_null(self.materialized_table)), + "snapshot_table": _wrap_embedded_str(_get_val_or_null(self.snapshot_table)), + "timestamp_col": _wrap_embedded_str(_get_val_or_null(self.timestamp_col)), "label_cols": _get_val_or_null(self.label_cols), - "feature_store_metadata": self.feature_store_metadata.to_json() + "feature_store_metadata": _wrap_embedded_str(self.feature_store_metadata.to_json()) if self.feature_store_metadata is not None else "null", - "version": self.version, + "schema_version": self.schema_version, "desc": self.desc, } return json.dumps(state_dict) @@ -150,13 +148,11 @@ def from_json(cls, json_str: str, session: Session) -> "Dataset": FeatureStoreMetadata.from_json(fs_meta_json) if fs_meta_json != "null" else None ) - uid = json_dict.pop("id") - version = json_dict.pop("version") + schema_version = json_dict.pop("schema_version") owner = json_dict.pop("owner") result = cls(session, **json_dict) - result.id = uid - result.version = version + result.schema_version = schema_version result.owner = owner return result diff --git a/snowflake/ml/feature_store/feature_store.py b/snowflake/ml/feature_store/feature_store.py index a2beb080..3916abea 100644 --- a/snowflake/ml/feature_store/feature_store.py +++ b/snowflake/ml/feature_store/feature_store.py @@ -140,24 +140,25 @@ def __init__( "SCHEMAS": (f"DATABASE {self._config.database}", "SCHEMA"), "TAGS": (self._config.full_schema_path, None), "TASKS": (self._config.full_schema_path, "TASK"), + "WAREHOUSES": (None, None), } - try: - self._session.sql(f"DESC WAREHOUSE {self._config.default_warehouse}").collect() - except Exception as e: + # DESC WAREHOUSE requires MONITOR privilege on the warehouse which is a high privilege + # some users not usually have. 
+        warehouse_result = self._find_object("WAREHOUSES", self._config.default_warehouse)
+        if len(warehouse_result) == 0:
             raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.SNOWML_NOT_FOUND,
-                original_exception=ValueError(f"Cannot find warehouse {default_warehouse}: {e}"),
-            ) from e
+                error_code=error_codes.NOT_FOUND,
+                original_exception=ValueError(f"Cannot find warehouse {self._config.default_warehouse}"),
+            )

         if creation_mode == CreationMode.FAIL_IF_NOT_EXIST:
-            try:
-                self._session.sql(f"DESC SCHEMA {self._config.full_schema_path}").collect()
-            except Exception as e:
+            schema_result = self._find_object("SCHEMAS", self._config.schema)
+            if len(schema_result) == 0:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.NOT_FOUND,
                     original_exception=ValueError(f"Feature store {name} does not exist."),
-                ) from e
+                )
         else:
             try:
                 self._session.sql(f"CREATE DATABASE IF NOT EXISTS {self._config.database}").collect(
@@ -1454,12 +1455,16 @@ def _fetch_column_descs(self, obj_type: str, obj_name: str) -> Dict[str, str]:
         return descs

     def _find_object(self, object_type: str, object_name_pattern: str) -> List[Row]:
-        """Try to find an object in a given place with respect to case-sensitivity.
-        If object name is not quoted, then it's case insensitive. Otherwise it's case sensitive.
+        """Try to find an object by the given type and name pattern.

         Args:
-            object_type: Type of the object. Could be TABLE, TAG etc.
-            object_name_pattern: Name match pattern of object. Could be either quoted or not.
+            object_type: Type of the object. Could be TABLES, TAGS etc.
+            object_name_pattern: Name match pattern of the object. It obeys Snowflake identifier requirements
+                and can be used with the SQL wildcard character '%'.
+                Examples:
+                    1. object_name_pattern="bar" will return objects with the lowercase name: bar.
+                    2. object_name_pattern=BAR will return objects with the case-insensitive name: bar.
+                    3. object_name_pattern=BAR% will return objects whose names start with the case-insensitive
+                       prefix: bar.

         Raises:
             SnowflakeMLException: [RuntimeError] Failed to find resource.
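The name-pattern rules in the docstring above, restated as illustrative (hypothetical) calls against a feature
store `fs`:

```python
fs._find_object("SCHEMAS", '"bar"')  # case-sensitive: matches only the lowercase schema bar
fs._find_object("SCHEMAS", "BAR")    # case-insensitive: matches bar, BAR, Bar, ...
fs._find_object("SCHEMAS", "BAR%")   # case-insensitive prefix match; double quotes are
                                     # not allowed with the wildcard (see the assert below)
```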
@@ -1474,6 +1479,7 @@ def _find_object(self, object_type: str, object_name_pattern: str) -> List[Row]:
             unesc_object_name = object_name_pattern
             object_name = ""
         elif object_name_pattern[-1] == "%":
+            assert '"' not in object_name_pattern, "wildcard search doesn't support double quotes"
             unesc_object_name = object_name_pattern
             object_name = unesc_object_name[:-1]
         else:
@@ -1485,7 +1491,8 @@ def _find_object(self, object_type: str, object_name_pattern: str) -> List[Row]:
         fs_objects = []
         tag_free_object_types = ["TAGS", "SCHEMAS"]
         try:
-            all_rows = self._session.sql(f"SHOW {object_type} LIKE '{unesc_object_name}' IN {search_space}").collect(
+            search_scope = f"IN {search_space}" if search_space is not None else ""
+            all_rows = self._session.sql(f"SHOW {object_type} LIKE '{unesc_object_name}' {search_scope}").collect(
                 statement_params=self._telemetry_stmp
             )
             if object_name_pattern == "%" and object_type not in tag_free_object_types and len(all_rows) > 0:
diff --git a/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.ipynb b/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.ipynb
index 7f6af1b0..a2447da8 100644
--- a/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.ipynb
+++ b/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.ipynb
@@ -5,9 +5,9 @@
    "id": "0bb54abc",
    "metadata": {},
    "source": [
-    "Version: 0.1.1\n",
+    "Version: 0.1.2\n",
     "\n",
-    "Updated date: 10/11/2023"
+    "Updated date: 10/18/2023"
    ]
   },
  {
@@ -73,7 +73,7 @@
    "id": "494e1503",
    "metadata": {},
    "source": [
-    "Following sections will use below database and schema name to store test data and feature store objects. You can rename with your own name if needed. "
+    "The cell below creates a temporary database, schema and warehouse for this notebook. All temporary resources will be deleted at the end of this notebook. You can rename them if needed. "
" ] }, { @@ -91,13 +91,16 @@ "FS_DEMO_SCHEMA = \"AWESOME_FS_BASIC_FEATURES\"\n", "# Model registry database name.\n", "MR_DEMO_DB = f\"FEATURE_STORE_BASIC_FEATURE_NOTEBOOK_MR_DEMO\"\n", + "# warehouse name used in this notebook.\n", + "FS_DEMO_WH = \"FEATURE_STORE_BASIC_FEATURE_NOTEBOOK_DEMO\"\n", "\n", "session.sql(f\"DROP DATABASE IF EXISTS {FS_DEMO_DB}\").collect()\n", "session.sql(f\"DROP DATABASE IF EXISTS {MR_DEMO_DB}\").collect()\n", "session.sql(f\"CREATE DATABASE IF NOT EXISTS {FS_DEMO_DB}\").collect()\n", "session.sql(f\"\"\"\n", " CREATE SCHEMA IF NOT EXISTS \n", - " {FS_DEMO_DB}.{TEST_DATASET_SCHEMA}\"\"\").collect()" + " {FS_DEMO_DB}.{TEST_DATASET_SCHEMA}\"\"\").collect()\n", + "session.sql(f\"CREATE WAREHOUSE IF NOT EXISTS {FS_DEMO_WH}\").collect()" ] }, { @@ -168,15 +171,11 @@ "metadata": {}, "outputs": [], "source": [ - "session.sql(f\"\"\"CREATE OR REPLACE WAREHOUSE PUBLIC WITH \n", - " WAREHOUSE_SIZE='XSMALL'\n", - " \"\"\").collect()\n", - "\n", "fs = FeatureStore(\n", " session=session, \n", " database=FS_DEMO_DB, \n", " name=FS_DEMO_SCHEMA, \n", - " default_warehouse=\"PUBLIC\",\n", + " default_warehouse=FS_DEMO_WH,\n", " creation_mode=CreationMode.CREATE_IF_NOT_EXIST,\n", ")" ] @@ -202,7 +201,7 @@ "source": [ "entity = Entity(name=\"WINE\", join_keys=[\"WINE_ID\"])\n", "fs.register_entity(entity)\n", - "fs.list_entities().to_pandas()" + "fs.list_entities().show()" ] }, { @@ -227,7 +226,7 @@ "outputs": [], "source": [ "source_df = session.table(f\"{FS_DEMO_DB}.{TEST_DATASET_SCHEMA}.WINE_DATA\")\n", - "source_df.to_pandas()" + "source_df.show()" ] }, { @@ -265,7 +264,7 @@ " 'MY_NEW_FEATURE',\n", " ]\n", ")\n", - "feature_df.to_pandas()" + "feature_df.show()" ] }, { @@ -325,7 +324,7 @@ "outputs": [], "source": [ "# Examine the FeatureView content\n", - "fs.read_feature_view(fv).to_pandas()" + "fs.read_feature_view(fv).show()" ] }, { @@ -425,7 +424,7 @@ "spine_df = session.table(f\"{FS_DEMO_DB}.{TEST_DATASET_SCHEMA}.WINE_DATA\")\n", "spine_df = addIdColumn(source_df, \"WINE_ID\")\n", "spine_df = spine_df.select(\"WINE_ID\", \"QUALITY\")\n", - "spine_df.to_pandas()" + "spine_df.show()" ] }, { @@ -473,7 +472,7 @@ "source": [ "## Train a model\n", "\n", - "Now let's training a simple random forest model with snowflake.ml library, and evaluate the prediction accuracy." + "Now let's training a simple random forest model with sklearn library, and evaluate the prediction accuracy." ] }, { @@ -580,6 +579,14 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "83fb2531", + "metadata": {}, + "source": [ + "Register the dataset into model registry with `log_artifact`. Artifact is a generalized concept of ML pipeline outputs that are needed for subsequent execution. Refer to https://docs.snowflake.com/LIMITEDACCESS/snowflake-ml-model-registry for more details about the API." + ] + }, { "cell_type": "code", "execution_count": null, @@ -595,6 +602,14 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "2dd12070", + "metadata": {}, + "source": [ + "Now you can log the model together with the registered artifact (which is a dataset here)." 
+   ]
+  },
 {
  "cell_type": "code",
  "execution_count": null,
@@ -636,7 +651,7 @@
    "registered_artifact = registry.get_artifact(\n",
    "    artifact_ref.name, \n",
    "    artifact_ref.version)\n",
-   "registered_dataset = Dataset.from_json(registered_artifact.spec, session)\n",
+   "registered_dataset = Dataset.from_json(registered_artifact._spec, session)\n",
    "test_df = spine_df.limit(3).select(\"WINE_ID\")\n",
    "\n",
    "enriched_df = fs.retrieve_feature_values(\n",
@@ -679,7 +694,8 @@
    "outputs": [],
    "source": [
     "session.sql(f\"DROP DATABASE IF EXISTS {FS_DEMO_DB}\").collect()\n",
-    "session.sql(f\"DROP DATABASE IF EXISTS {MR_DEMO_DB}\").collect()"
+    "session.sql(f\"DROP DATABASE IF EXISTS {MR_DEMO_DB}\").collect()\n",
+    "session.sql(f\"DROP WAREHOUSE IF EXISTS {FS_DEMO_WH}\").collect()"
   ]
  }
 ],
diff --git a/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.pdf b/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.pdf
index 00a95424..59ed9703 100644
Binary files a/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.pdf and b/snowflake/ml/feature_store/notebooks/customer_demo/Basic_Feature_Demo.pdf differ
diff --git a/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.ipynb b/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.ipynb
index 138c202c..83e2edfd 100644
--- a/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.ipynb
+++ b/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.ipynb
@@ -5,9 +5,9 @@
    "id": "4f029c96",
    "metadata": {},
    "source": [
-    "Notebook version: 0.1.1\n",
+    "Notebook version: 0.1.2\n",
     "\n",
-    "Updated date: 10/11/2023"
+    "Updated date: 10/18/2023"
   ]
  },
 {
@@ -77,7 +77,7 @@
    "id": "9affb013",
    "metadata": {},
    "source": [
-    "Following sections will use below database and schema name to store test data and feature store objects. You can rename with your own name if needed. "
+    "The cell below creates a temporary database, schema and warehouse for this notebook. All temporary resources will be deleted at the end of this notebook. You can rename them if needed. "
" ] }, { @@ -99,13 +99,16 @@ "FS_DEMO_STAGE = \"FEATURE_STORE_TIME_SERIES_FEATURE_NOTEBOOK_STAGE_DEMO\"\n", "FS_DEMO_STAGE_FULL_PATH = \\\n", " f\"{FS_DEMO_DB}.{TEST_DATASET_SCHEMA}.{FS_DEMO_STAGE}\"\n", + "# warehouse name used in this notebook.\n", + "FS_DEMO_WH = \"FEATURE_STORE_TIME_SERIES_FEATURE_NOTEBOOK_DEMO\"\n", "\n", "session.sql(f\"DROP DATABASE IF EXISTS {FS_DEMO_DB}\").collect()\n", "session.sql(f\"DROP DATABASE IF EXISTS {MR_DEMO_DB}\").collect()\n", "session.sql(f\"CREATE DATABASE IF NOT EXISTS {FS_DEMO_DB}\").collect()\n", "session.sql(f\"\"\"CREATE SCHEMA IF NOT EXISTS \n", " {FS_DEMO_DB}.{TEST_DATASET_SCHEMA}\"\"\").collect()\n", - "session.sql(f\"CREATE OR REPLACE STAGE {FS_DEMO_STAGE_FULL_PATH}\").collect()" + "session.sql(f\"CREATE OR REPLACE STAGE {FS_DEMO_STAGE_FULL_PATH}\").collect()\n", + "session.sql(f\"CREATE WAREHOUSE IF NOT EXISTS {FS_DEMO_WH}\").collect()" ] }, { @@ -205,15 +208,11 @@ "metadata": {}, "outputs": [], "source": [ - "session.sql(f\"\"\"CREATE OR REPLACE WAREHOUSE PUBLIC WITH\n", - " WAREHOUSE_SIZE='XSMALL'\n", - " \"\"\").collect()\n", - "\n", "fs = FeatureStore(\n", " session=session, \n", " database=FS_DEMO_DB, \n", " name=FS_DEMO_SCHEMA, \n", - " default_warehouse=\"PUBLIC\",\n", + " default_warehouse=FS_DEMO_WH,\n", " creation_mode=CreationMode.CREATE_IF_NOT_EXIST,\n", ")" ] @@ -238,7 +237,7 @@ "trip_dropoff = Entity(name=\"TRIP_DROPOFF\", join_keys=[\"DOLOCATIONID\"])\n", "fs.register_entity(trip_pickup)\n", "fs.register_entity(trip_dropoff)\n", - "fs.list_entities().to_pandas()" + "fs.list_entities().show()" ] }, { @@ -625,6 +624,14 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "84daec8a", + "metadata": {}, + "source": [ + "Register the dataset into model registry with `log_artifact`. Artifact is a generalized concept of ML pipeline outputs that are needed for subsequent execution. Refer to https://docs.snowflake.com/LIMITEDACCESS/snowflake-ml-model-registry for more details about the API." + ] + }, { "cell_type": "code", "execution_count": null, @@ -636,10 +643,18 @@ " artifact_type=artifact.ArtifactType.DATASET,\n", " artifact_name=\"MY_COOL_DATASET\",\n", " artifact_spec=training_data.to_json(),\n", - " artifact_version=\"v5\",\n", + " artifact_version=\"V5\",\n", ")" ] }, + { + "cell_type": "markdown", + "id": "abbf6948", + "metadata": {}, + "source": [ + "Now you can log the model together with the registered artifact (which is a dataset here)." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -679,7 +694,7 @@ "registered_artifact = registry.get_artifact(\n", " artifact_ref.name, \n", " artifact_ref.version)\n", - "registered_dataset = Dataset.from_json(registered_artifact.spec, session)\n", + "registered_dataset = Dataset.from_json(registered_artifact._spec, session)\n", "\n", "enriched_df = fs.retrieve_feature_values(\n", " spine_df=pred_df, \n", @@ -723,7 +738,8 @@ "outputs": [], "source": [ "session.sql(f\"DROP DATABASE IF EXISTS {FS_DEMO_DB}\").collect()\n", - "session.sql(f\"DROP DATABASE IF EXISTS {MR_DEMO_DB}\").collect()" + "session.sql(f\"DROP DATABASE IF EXISTS {MR_DEMO_DB}\").collect()\n", + "session.sql(f\"DROP WAREHOUSE IF EXISTS {FS_DEMO_WH}\").collect()" ] } ], diff --git a/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.pdf b/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.pdf index 207838e9..35e6e63d 100644 Binary files a/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.pdf and b/snowflake/ml/feature_store/notebooks/customer_demo/Time_Series_Feature_Demo.pdf differ diff --git a/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py b/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py index 13134a84..723fec56 100644 --- a/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py +++ b/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py @@ -53,7 +53,7 @@ def setUpClass(cls) -> None: def tearDownClass(cls) -> None: for fs in cls._active_fs: fs.clear() - cls._session.sql(f"DROP SCHEMA IF EXISTS {fs._config.schema}").collect() + cls._session.sql(f"DROP SCHEMA IF EXISTS {fs._config.full_schema_path}").collect() cls._session.sql(f"DROP TABLE IF EXISTS {cls._mock_table}").collect() cls._session.close() @@ -108,7 +108,7 @@ def test_feature_store_location(self, database: str, schema: str) -> None: @parameterized.parameters(WAREHOUSE_NAMES) # type: ignore[misc] def test_warehouse_names(self, warehouse: str) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_WAREHOUSE_NAMES") self._session.sql(f"CREATE WAREHOUSE IF NOT EXISTS {warehouse} WITH WAREHOUSE_SIZE='XSMALL'").collect() @@ -190,7 +190,7 @@ def generate_unique_name(names: List[str]) -> List[str]: # 5. delete_entity @parameterized.parameters(TEST_NAMES) # type: ignore[misc] def test_entity_names(self, equi_names: List[str], diff_names: List[str]) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_ENTITY_NAMES") fs = FeatureStore( self._session, FS_INTEG_TEST_DB, @@ -235,7 +235,7 @@ def test_entity_names(self, equi_names: List[str], diff_names: List[str]) -> Non # 5. 
FeatureView.timestamp_col @parameterized.parameters(TEST_NAMES) # type: ignore[misc] def test_join_keys_and_ts_col(self, equi_names: List[str], diff_names: List[str]) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_JOIN_KEYS_AND_TS_COL") fs = FeatureStore( self._session, FS_INTEG_TEST_DB, @@ -292,7 +292,7 @@ def test_feature_view_names_and_versions_combination( equi_full_names: List[Tuple[str, str]], diff_full_names: List[Tuple[str, str]], ) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_FEATURE_VIEW_NAMES") fs = FeatureStore( self._session, FS_INTEG_TEST_DB, @@ -363,6 +363,25 @@ def test_feature_view_names_and_versions_combination( version = diff_name[1] fs.get_feature_view(fv_name, version) + @parameterized.parameters(TEST_NAMES) # type: ignore[misc] + def test_find_objects(self, equi_names: List[str], diff_names: List[str]) -> None: + current_schema = create_random_schema(self._session, "TEST_FIND_OBJECTS") + fs = FeatureStore( + self._session, + FS_INTEG_TEST_DB, + current_schema, + FS_INTEG_TEST_DEFAULT_WAREHOUSE, + creation_mode=CreationMode.CREATE_IF_NOT_EXIST, + ) + self._active_fs.append(fs) + + self._session.sql(f"CREATE SCHEMA IF NOT EXISTS {FS_INTEG_TEST_DB}.{equi_names[0]}").collect() + for name in equi_names: + self.assertEqual(len(fs._find_object("SCHEMAS", name)), 1) + for name in diff_names: + self.assertEqual(len(fs._find_object("SCHEMAS", name)), 0) + self._session.sql(f"DROP SCHEMA IF EXISTS {FS_INTEG_TEST_DB}.{equi_names[0]}").collect() + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/feature_store/tests/feature_store_large_scale_test.py b/snowflake/ml/feature_store/tests/feature_store_large_scale_test.py index 8047e36d..1f351f27 100644 --- a/snowflake/ml/feature_store/tests/feature_store_large_scale_test.py +++ b/snowflake/ml/feature_store/tests/feature_store_large_scale_test.py @@ -37,11 +37,11 @@ def setUpClass(self) -> None: def tearDownClass(self) -> None: for fs in self._active_feature_store: fs.clear() - self._session.sql(f"DROP SCHEMA IF EXISTS {fs._config.schema}").collect() + self._session.sql(f"DROP SCHEMA IF EXISTS {fs._config.full_schema_path}").collect() self._session.close() def _create_feature_store(self, name: Optional[str] = None) -> FeatureStore: - current_schema = create_random_schema(self._session, "TEST_SHEMA") if name is None else name + current_schema = create_random_schema(self._session, "FS_LARGE_SCALE_TEST") if name is None else name fs = FeatureStore( self._session, FS_INTEG_TEST_DB, @@ -91,7 +91,7 @@ def addIdColumn(df: DataFrame, id_column_name: str) -> DataFrame: self._session.sql(f"DROP TABLE {cloned_wine_data}").collect() def test_external_table(self) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_EXTERNAL_TABLE") fs = self._create_feature_store(current_schema) e_loc = Entity("LOCATION", ["PULOCATIONID"]) diff --git a/snowflake/ml/feature_store/tests/feature_store_test.py b/snowflake/ml/feature_store/tests/feature_store_test.py index 6080c297..0161d0bc 100644 --- a/snowflake/ml/feature_store/tests/feature_store_test.py +++ b/snowflake/ml/feature_store/tests/feature_store_test.py @@ -85,7 +85,7 @@ def _create_mock_table(self, name: str) -> str: return table_full_path def _create_feature_store(self, name: Optional[str] = None) -> FeatureStore: - 
current_schema = create_random_schema(self._session, "TEST_SHEMA") if name is None else name + current_schema = create_random_schema(self._session, "FS_TEST") if name is None else name fs = FeatureStore( self._session, FS_INTEG_TEST_DB, @@ -123,7 +123,7 @@ def test_invalid_warehouse(self) -> None: FeatureStore( session=self._session, database=FS_INTEG_TEST_DB, - name=create_random_schema(self._session, "TEST_SHEMA"), + name=create_random_schema(self._session, "TEST_INVALID_WAREHOUSE"), default_warehouse=schema_name, creation_mode=CreationMode.CREATE_IF_NOT_EXIST, ) @@ -150,7 +150,7 @@ def test_create_if_not_exist_failure(self) -> None: # Schema still exist even feature store creation failed. res = self._session.sql(f"SHOW SCHEMAS LIKE '{schema_name}' in DATABASE {FS_INTEG_TEST_DB}").collect() self.assertEqual(len(res), 1) - self._session.sql(f"DROP SCHEMA IF EXISTS {schema_name}").collect() + self._session.sql(f"DROP SCHEMA IF EXISTS {FS_INTEG_TEST_DB}.{schema_name}").collect() def test_create_if_not_exist_system_error(self) -> None: mock_session = create_mock_session( @@ -959,7 +959,7 @@ def test_list_feature_views_system_error(self) -> None: fs.list_feature_views(entity_name="foo") def test_create_and_cleanup_tags(self) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_CREATE_AND_CLEANUP_TAGS") fs = FeatureStore( self._session, FS_INTEG_TEST_DB, @@ -974,7 +974,7 @@ def test_create_and_cleanup_tags(self) -> None: ).collect() self.assertEqual(len(res), 1) - self._session.sql(f"DROP SCHEMA IF EXISTS {current_schema}").collect() + self._session.sql(f"DROP SCHEMA IF EXISTS {FS_INTEG_TEST_DB}.{current_schema}").collect() row_list = self._session.sql( f"SHOW TAGS LIKE '{FEATURE_VIEW_ENTITY_TAG}' IN DATABASE {fs._config.database}" @@ -1154,7 +1154,7 @@ def test_generate_dataset(self) -> None: ) def test_clear_feature_store_in_existing_schema(self) -> None: - current_schema = create_random_schema(self._session, "TEST_SHEMA") + current_schema = create_random_schema(self._session, "TEST_CLEAR_FEATURE_STORE_IN_EXISTING_SCHEMA") # create some objects outside of feature store domain, later will check if they still exists after fs.clear() full_schema_path = f"{FS_INTEG_TEST_DB}.{current_schema}" diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index b060efde..600c0004 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -131,6 +131,7 @@ py_library( "//snowflake/ml/_internal/exceptions", "//snowflake/ml/model/_handlers:custom", "//snowflake/ml/model/_handlers:huggingface_pipeline", + "//snowflake/ml/model/_handlers:llm", "//snowflake/ml/model/_handlers:mlflow", "//snowflake/ml/model/_handlers:pytorch", "//snowflake/ml/model/_handlers:sklearn", diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py index bbd5d562..3e9c2924 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -91,8 +91,10 @@ def _generate_docker_file(self) -> None: assert len(get_res_list) == 1, f"Single zip file should be returned, but got {len(get_res_list)} files." 
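Just below, the context generator builds a paired COPY and ENV statement when the model is embedded in the image; here is a hedged sketch of how those strings flow through the `string.Template` substitution (the template text is simplified, not the real template file in this diff):

```python
# Simplified sketch of the Dockerfile template substitution used below.
# The real template lives at templates/dockerfile_template in this change.
import string

template = string.Template(
    "${copy_model_statement}\n"
    "${extra_env_statement}\n"
)

# Model baked into the image: both statements are emitted, matching the
# dockerfile_test_fixture_with_model fixture in this diff.
with_model = template.substitute(
    copy_model_statement="COPY model.zip /model_repo/model.zip",
    extra_env_statement="ENV MODEL_ZIP_STAGE_PATH=/model_repo/model.zip",
)

# Model mounted from a stage instead: both substitutions collapse to empty lines.
without_model = template.substitute(copy_model_statement="", extra_env_statement="")
```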
local_zip_file_path = os.path.basename(get_res_list[0].file) copy_model_statement = f"COPY {local_zip_file_path} {absolute_path}" + extra_env_statement = f"ENV MODEL_ZIP_STAGE_PATH={absolute_path}" else: copy_model_statement = "" + extra_env_statement = "" with open(docker_file_path, "w", encoding="utf-8") as dockerfile, open( docker_file_template, encoding="utf-8" @@ -113,6 +115,7 @@ def _generate_docker_file(self) -> None: # https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html "cuda_override_env": cuda_version_str, "copy_model_statement": copy_model_statement, + "extra_env_statement": extra_env_statement, } ) dockerfile.write(dockerfile_content) diff --git a/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py index 21ab330f..74a028b7 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +++ b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py @@ -201,9 +201,9 @@ def _construct_and_upload_job_spec(self, base_image: str, kaniko_shell_script_st ) def _launch_kaniko_job(self, spec_stage_location: str) -> None: - logger.debug("Submitting job for building docker image with kaniko") + logger.info("Submitting SPCS job for building docker image.") job_id = self.client.create_job(compute_pool=self.compute_pool, spec_stage_location=spec_stage_location) - logger.debug(f"Kaniko job id is {job_id}") + logger.info(f"Server image building SPCS job id is {job_id}.") # Given image build can take a while, we set a generous timeout to be 1 hour. self.client.block_until_resource_is_ready( resource_name=job_id, diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template index 070d02ab..a448e284 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template @@ -4,8 +4,6 @@ FROM ${base_image} as build COPY ${model_env_folder}/conda.yaml conda.yaml COPY ${model_env_folder}/requirements.txt requirements.txt -${copy_model_statement} - # Set MAMBA_DOCKERFILE_ACTIVATE=1 to activate the conda environment during build time. 
ARG MAMBA_DOCKERFILE_ACTIVATE=1 @@ -17,11 +15,13 @@ RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="${cuda_overri python -m pip install -r requirements.txt && \ micromamba clean -afy -# Bitsandbytes uses this ENVVAR to determine CUDA library location -ENV CONDA_PREFIX=/opt/conda - COPY ${inference_server_dir} ./${inference_server_dir} COPY ${entrypoint_script} ./${entrypoint_script} +${copy_model_statement} + +# Bitsandbytes uses this ENVVAR to determine CUDA library location +ENV CONDA_PREFIX=/opt/conda +${extra_env_statement} USER root RUN if id mambauser >/dev/null 2>&1; then \ diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture index a03cd460..2130d55b 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture @@ -9,10 +9,11 @@ RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="" \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy -ENV CONDA_PREFIX=/opt/conda COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh +ENV CONDA_PREFIX=/opt/conda + USER root RUN if id mambauser >/dev/null 2>&1; then \ diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA index b393b172..29c67a59 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA @@ -9,10 +9,11 @@ RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="11.7" \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy -ENV CONDA_PREFIX=/opt/conda COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh +ENV CONDA_PREFIX=/opt/conda + USER root RUN if id mambauser >/dev/null 2>&1; then \ diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model index adff8f90..f0719f73 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model @@ -3,18 +3,18 @@ FROM mambaorg/micromamba:1.4.3 as build COPY env/conda.yaml conda.yaml COPY env/requirements.txt requirements.txt - -COPY model.zip /model_repo/model.zip ARG MAMBA_DOCKERFILE_ACTIVATE=1 RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="11.7" \ micromamba install -y -n base -f conda.yaml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy -ENV CONDA_PREFIX=/opt/conda COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh +COPY model.zip /model_repo/model.zip +ENV CONDA_PREFIX=/opt/conda +ENV MODEL_ZIP_STAGE_PATH=/model_repo/model.zip USER root RUN if id mambauser >/dev/null 2>&1; then \ diff --git 
a/snowflake/ml/model/_deploy_client/snowservice/deploy.py b/snowflake/ml/model/_deploy_client/snowservice/deploy.py index f096bd38..919c8e7d 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy.py @@ -461,6 +461,7 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> str: } if self.options.model_in_image: del substitutes["model_stage"] + del substitutes["model_zip_stage_path"] content = string.Template(template.read()).substitute(substitutes) content_dict = yaml.safe_load(content) if self.options.use_gpu: @@ -526,11 +527,9 @@ def _deploy_workflow(self, image: str) -> Tuple[str, str]: if self.options.use_gpu: for model_blob_meta in self.model_meta.models.values(): if model_blob_meta.model_type == "huggingface_pipeline": - batch_size = int(model_blob_meta.options.get("batch_size", 1)) - if max_batch_rows is None: - max_batch_rows = batch_size - else: - max_batch_rows = min(batch_size, max_batch_rows) + max_batch_rows = int(model_blob_meta.options.get("batch_size", 1)) + if model_blob_meta.model_type == "llm": + max_batch_rows = int(model_blob_meta.options.get("batch_size", 1)) service_function_sql = client.create_or_replace_service_function( service_func_name=self.service_func_name, diff --git a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model index cd4671cb..1035d80a 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +++ b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model @@ -3,7 +3,6 @@ spec: - name: ${inference_server_container_name} image: ${image} env: - MODEL_ZIP_STAGE_PATH: ${model_zip_stage_path} TARGET_METHOD: ${target_method} NUM_WORKERS: ${num_workers} SNOWML_USE_GPU: ${use_gpu} diff --git a/snowflake/ml/model/_handlers/BUILD.bazel b/snowflake/ml/model/_handlers/BUILD.bazel index f0ef7019..158867fb 100644 --- a/snowflake/ml/model/_handlers/BUILD.bazel +++ b/snowflake/ml/model/_handlers/BUILD.bazel @@ -148,3 +148,19 @@ py_library( "//snowflake/ml/model/models:huggingface_pipeline", ], ) + +py_library( + name = "llm", + srcs = ["llm.py"], + deps = [ + ":_base", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model:_model_meta", + "//snowflake/ml/model:custom_model", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:core", + "//snowflake/ml/model/models:llm_model", + ], +) diff --git a/snowflake/ml/model/_handlers/llm.py b/snowflake/ml/model/_handlers/llm.py new file mode 100644 index 00000000..94a3fa06 --- /dev/null +++ b/snowflake/ml/model/_handlers/llm.py @@ -0,0 +1,178 @@ +import os +from typing import Optional, cast + +import cloudpickle +import pandas as pd +from packaging import requirements +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml._internal import env_utils, file_utils +from snowflake.ml.model import ( + _model_meta as model_meta_api, + custom_model, + type_hints as model_types, +) +from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import core +from snowflake.ml.model.models import llm + + +class _LLMHandler(_base._ModelHandler[llm.LLM]): + handler_type = "llm" + MODEL_BLOB_DIR = "model" + LLM_META = "llm_meta" + is_auto_signature = True + + @staticmethod + def can_handle( + model: 
model_types.SupportedModelType, + ) -> TypeGuard[llm.LLM]: + return isinstance(model, llm.LLM) + + @staticmethod + def cast_model( + model: model_types.SupportedModelType, + ) -> llm.LLM: + assert isinstance(model, llm.LLM) + return cast(llm.LLM, model) + + @staticmethod + def _save_model( + name: str, + model: llm.LLM, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.BaseModelSaveOption], + ) -> None: + assert not is_sub_model, "LLM cannot be a sub-model." + model_blob_path = os.path.join(model_blobs_dir_path, name) + os.makedirs(model_blob_path, exist_ok=True) + model_blob_dir_path = os.path.join(model_blob_path, _LLMHandler.MODEL_BLOB_DIR) + model_meta.cuda_version = model_meta_api._DEFAULT_CUDA_VERSION + sig = core.ModelSignature( + inputs=[ + core.FeatureSpec(name="input", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="generated_text", dtype=core.DataType.STRING), + ], + ) + model_meta._signatures = {"infer": sig} + assert os.path.isdir(model.model_id_or_path), "Only model dir is supported for now." + file_utils.copytree(model.model_id_or_path, model_blob_dir_path) + with open( + os.path.join(model_blob_dir_path, _LLMHandler.LLM_META), + "wb", + ) as f: + cloudpickle.dump(model, f) + + base_meta = model_meta_api._ModelBlobMetadata( + name=name, + model_type=_LLMHandler.handler_type, + path=_LLMHandler.MODEL_BLOB_DIR, + options={ + "batch_size": str(model.max_batch_size), + }, + ) + model_meta.models[name] = base_meta + pkgs_requirements = [ + model_meta_api.Dependency(conda_name="transformers", pip_req="transformers"), + model_meta_api.Dependency(conda_name="pytorch", pip_req="torch==2.0.1"), + ] + if model.model_type == llm.SupportedLLMType.LLAMA_MODEL_TYPE: + pkgs_requirements = [ + model_meta_api.Dependency(conda_name="sentencepiece", pip_req="sentencepiece"), + model_meta_api.Dependency(conda_name="protobuf", pip_req="protobuf"), + *pkgs_requirements, + ] + model_meta._include_if_absent(pkgs_requirements) + # Recent peft versions are only available on PyPI. + env_utils.append_requirement_list( + model_meta._pip_requirements, + requirements.Requirement("peft==0.5.0"), + ) + + @staticmethod + def _load_model( + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> llm.LLM: + model_blob_path = os.path.join(model_blobs_dir_path, name) + if not hasattr(model_meta, "models"): + raise ValueError("Invalid model metadata found.") + model_blobs_metadata = model_meta.models + if name not in model_blobs_metadata: + raise ValueError(f"Blob of model {name} does not exist.") + model_blob_metadata = model_blobs_metadata[name] + model_blob_filename = model_blob_metadata.path + model_blob_dir_path = os.path.join(model_blob_path, model_blob_filename) + assert model_blob_dir_path, "It must be a directory." 
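For orientation, a hedged sketch of the object this handler persists; it mirrors `llm.py` and `llm_test.py` later in this diff, and the checkpoint path is a placeholder:

```python
# Sketch: constructing the LLM wrapper that _save_model()/_load_model() handle.
# The checkpoint path is a placeholder; it must be a local directory containing
# a PEFT LoRA adapter (adapter_config.json).
from snowflake.ml.model.models import llm

model = llm.LLM(
    model_id_or_path="/local/path/to/lora_checkpoint",
    options=llm.LLMOptions(max_batch_size=4),  # recorded as the "batch_size" blob option above
)
# _save_model() copies that directory into the model blob and fixes the auto
# signature to {"infer": input STRING -> generated_text STRING}.
```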
+ with open(os.path.join(model_blob_dir_path, _LLMHandler.LLM_META), "rb") as f: + m = cloudpickle.load(f) + assert isinstance(m, llm.LLM) + # Switch to local path + m.model_id_or_path = model_blob_dir_path + return m + + @staticmethod + def _load_as_custom_model( + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> custom_model.CustomModel: + raw_model = _LLMHandler._load_model( + name, + model_meta, + model_blobs_dir_path, + **kwargs, + ) + import peft + import transformers + + hub_kwargs = { + "revision": raw_model.revision, + "token": raw_model.token, + } + model_dir_path = raw_model.model_id_or_path + hf_model = peft.AutoPeftModelForCausalLM.from_pretrained( # type: ignore[attr-defined] + model_dir_path, + device_map="auto", + torch_dtype="auto", + **hub_kwargs, + ) + peft_config = peft.PeftConfig.from_pretrained(model_dir_path) # type: ignore[attr-defined] + base_model_path = peft_config.base_model_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained( + base_model_path, + padding_side="right", + use_fast=False, + **hub_kwargs, + ) + hf_model.eval() + + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + # TODO(lhw): migrate away from hf pipeline + pipe = transformers.pipeline( + task="text-generation", + model=hf_model, + tokenizer=tokenizer, + batch_size=raw_model.max_batch_size, + ) + + class _LLMCustomModel(custom_model.CustomModel): + @custom_model.inference_api + def infer(self, X: pd.DataFrame) -> pd.DataFrame: + input_data = X.to_dict("list")["input"] + res = pipe(input_data, return_full_text=False) + # TODO(lhw): Assume single beam only. + return pd.DataFrame({"generated_text": [output[0]["generated_text"] for output in res]}) + + llm_custom = _LLMCustomModel(custom_model.ModelContext()) + + return llm_custom diff --git a/snowflake/ml/model/_signatures/core.py b/snowflake/ml/model/_signatures/core.py index 2f5287d0..1df982fc 100644 --- a/snowflake/ml/model/_signatures/core.py +++ b/snowflake/ml/model/_signatures/core.py @@ -1,4 +1,5 @@ import textwrap +import warnings from abc import ABC, abstractmethod from enum import Enum from typing import ( @@ -140,7 +141,23 @@ def from_snowpark_type(cls, snowpark_type: spt.DataType) -> "DataType": # Fallback for decimal type. if isinstance(snowpark_type, spt.DecimalType): if snowpark_type.scale == 0: + warnings.warn( + f"Warning: Type {snowpark_type}" + " is being automatically converted to INT64 in the Snowpark DataFrame. " + "This automatic conversion may lead to potential precision loss and rounding errors. " + "If you wish to prevent this conversion, you should manually perform " + "the necessary data type conversion." + ) return DataType.INT64 + else: + warnings.warn( + f"Warning: Type {snowpark_type}" + " is being automatically converted to DOUBLE in the Snowpark DataFrame. " + "This automatic conversion may lead to potential precision loss and rounding errors. " + "If you wish to prevent this conversion, you should manually perform " + "the necessary data type conversion." 
+ ) + return DataType.DOUBLE raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.NOT_IMPLEMENTED, original_exception=NotImplementedError(f"Type {snowpark_type} is not supported as a DataType."), diff --git a/snowflake/ml/model/_signatures/core_test.py b/snowflake/ml/model/_signatures/core_test.py index f62329e5..897ac23f 100644 --- a/snowflake/ml/model/_signatures/core_test.py +++ b/snowflake/ml/model/_signatures/core_test.py @@ -25,13 +25,7 @@ def test_snowpark_type(self) -> None: self.assertEqual(core.DataType.FLOAT, core.DataType.from_snowpark_type(spt.FloatType())) self.assertEqual(core.DataType.DOUBLE, core.DataType.from_snowpark_type(spt.DoubleType())) - with exception_utils.assert_snowml_exceptions( - self, - expected_original_error_type=NotImplementedError, - expected_regex="Type .+ is not supported as a DataType.", - ): - core.DataType.from_snowpark_type(spt.DecimalType(38, 6)) - + self.assertEqual(core.DataType.DOUBLE, core.DataType.from_snowpark_type(spt.DecimalType(38, 6))) self.assertEqual(core.DataType.BOOL, core.DataType.from_snowpark_type(spt.BooleanType())) self.assertEqual(core.DataType.STRING, core.DataType.from_snowpark_type(spt.StringType())) self.assertEqual(core.DataType.BYTES, core.DataType.from_snowpark_type(spt.BinaryType())) diff --git a/snowflake/ml/model/models/BUILD.bazel b/snowflake/ml/model/models/BUILD.bazel index e452497b..bbe59851 100644 --- a/snowflake/ml/model/models/BUILD.bazel +++ b/snowflake/ml/model/models/BUILD.bazel @@ -7,8 +7,20 @@ py_library( srcs = ["huggingface_pipeline.py"], ) +py_library( + name = "llm_model", + srcs = ["llm.py"], +) + py_test( name = "huggingface_pipeline_test", srcs = ["huggingface_pipeline_test.py"], deps = [":huggingface_pipeline"], ) + +py_test( + name = "llm_test", + srcs = ["llm_test.py"], + compatible_with_snowpark = False, + deps = [":llm_model"], +) diff --git a/snowflake/ml/model/models/llm.py b/snowflake/ml/model/models/llm.py new file mode 100644 index 00000000..52488852 --- /dev/null +++ b/snowflake/ml/model/models/llm.py @@ -0,0 +1,75 @@ +import os +from dataclasses import dataclass, field +from enum import Enum +from typing import Optional, Set + +_PEFT_CONFIG_NAME = "adapter_config.json" + + +class SupportedLLMType(Enum): + LLAMA_MODEL_TYPE = "llama" + OPT_MODEL_TYPE = "opt" + + @classmethod + def valid_values(cls) -> Set[str]: + return {member.value for member in cls} + + +@dataclass(frozen=True) +class LLMOptions: + """ + This is the option class for LLM. + + Args: + revision: Revision of the HF model. Defaults to None. + token: The token to use as HTTP bearer authorization for remote files. Defaults to None. + max_batch_size: Maximum batch size allowed for a single inference. Defaults to 1. + """ + + revision: Optional[str] = field(default=None) + token: Optional[str] = field(default=None) + max_batch_size: int = field(default=1) + + +class LLM: + def __init__( + self, + model_id_or_path: str, + *, + options: Optional[LLMOptions] = None, + ) -> None: + """ + + Args: + model_id_or_path: Local directory containing the PEFT weights. + options: Options for the LLM. Defaults to None. + + Raises: + ValueError: When the PEFT config or the base model type is not supported. 
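Earlier in this hunk, `from_snowpark_type` gained warnings advising users to cast DECIMAL columns manually if they want to avoid the implicit INT64/DOUBLE fallback; a short sketch of that cast (table and column names are illustrative):

```python
# Sketch of the manual cast the new warnings recommend. Table and column
# names are illustrative.
from snowflake.snowpark import Session, functions as F, types as T

def cast_decimals_explicitly(session: Session) -> None:
    df = session.table("WINE_DATA")
    # DECIMAL(38, 0) would otherwise fall back to INT64, DECIMAL(38, 6) to DOUBLE.
    df = df.with_column("PRICE", F.col("PRICE").cast(T.DoubleType()))
    df = df.with_column("UNITS", F.col("UNITS").cast(T.LongType()))
```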
+ """ + if not (os.path.isdir(model_id_or_path) and os.path.isfile(os.path.join(model_id_or_path, _PEFT_CONFIG_NAME))): + raise ValueError("Peft config is not found.") + import peft + import transformers + + if not options: + options = LLMOptions() + + hub_kwargs = { + "revision": options.revision, + "token": options.token, + } + peft_config = peft.PeftConfig.from_pretrained(model_id_or_path, **hub_kwargs) # type: ignore[attr-defined] + if peft_config.peft_type != peft.PeftType.LORA: # type: ignore[attr-defined] + raise ValueError("Only LORA is supported.") + if peft_config.task_type != peft.TaskType.CAUSAL_LM: # type: ignore[attr-defined] + raise ValueError("Only CAUSAL_LM is supported.") + base_model = peft_config.base_model_name_or_path + base_config = transformers.AutoConfig.from_pretrained(base_model, **hub_kwargs) + assert base_config.model_type in SupportedLLMType.valid_values(), f"{base_config.model_type} is not supported." + + self.model_id_or_path = model_id_or_path + self.token = options.token + self.revision = options.revision + self.max_batch_size = options.max_batch_size + self.model_type = base_config.model_type diff --git a/snowflake/ml/model/models/llm_test.py b/snowflake/ml/model/models/llm_test.py new file mode 100644 index 00000000..ae453518 --- /dev/null +++ b/snowflake/ml/model/models/llm_test.py @@ -0,0 +1,37 @@ +import os +import tempfile + +from absl.testing import absltest + +from snowflake.ml.model.models import llm + + +class LLMTest(absltest.TestCase): + @classmethod + def setUpClass(self) -> None: + self.cache_dir = tempfile.TemporaryDirectory() + self._original_hf_home = os.getenv("HF_HOME", None) + os.environ["HF_HOME"] = self.cache_dir.name + + @classmethod + def tearDownClass(self) -> None: + if self._original_hf_home: + os.environ["HF_HOME"] = self._original_hf_home + else: + del os.environ["HF_HOME"] + self.cache_dir.cleanup() + + def test_llm(self) -> None: + import peft + + ft_model = peft.AutoPeftModelForCausalLM.from_pretrained( # type: ignore[attr-defined] + "peft-internal-testing/tiny-OPTForCausalLM-lora", + device_map="auto", + ) + tmp_dir = self.create_tempdir().full_path + ft_model.save_pretrained(tmp_dir) + llm.LLM(model_id_or_path=tmp_dir) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index ba747b11..9c58e344 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -20,6 +20,7 @@ import snowflake.ml.model.custom_model import snowflake.ml.model.models.huggingface_pipeline + import snowflake.ml.model.models.llm import snowflake.snowpark from snowflake.ml.modeling.framework import base # noqa: F401 @@ -70,6 +71,7 @@ "mlflow.pyfunc.PyFuncModel", "transformers.Pipeline", "snowflake.ml.model.models.huggingface_pipeline.HuggingFacePipelineModel", + "snowflake.ml.model.models.llm.LLM", ] SupportedModelType = Union[ diff --git a/snowflake/ml/modeling/_internal/BUILD.bazel b/snowflake/ml/modeling/_internal/BUILD.bazel index 3d28e22e..7e663da9 100644 --- a/snowflake/ml/modeling/_internal/BUILD.bazel +++ b/snowflake/ml/modeling/_internal/BUILD.bazel @@ -46,3 +46,11 @@ py_test( ":estimator_utils", ], ) + +py_test( + name = "snowpark_handlers_test", + srcs = ["snowpark_handlers_test.py"], + deps = [ + ":estimator_utils", + ], +) diff --git a/snowflake/ml/modeling/_internal/estimator_protocols.py b/snowflake/ml/modeling/_internal/estimator_protocols.py index a1acde86..d6cf59a9 100644 --- a/snowflake/ml/modeling/_internal/estimator_protocols.py 
+++ b/snowflake/ml/modeling/_internal/estimator_protocols.py @@ -1,6 +1,7 @@ -from typing import List, Optional, Protocol +from typing import Any, Dict, List, Optional, Protocol, Union import pandas as pd +from sklearn import model_selection from snowflake.snowpark import DataFrame, Session @@ -115,3 +116,17 @@ def score_snowpark( sample_weight_col: Optional[str], ) -> float: raise NotImplementedError + + def fit_search_snowpark( + self, + param_list: Union[Dict[str, Any], List[Dict[str, Any]]], + dataset: DataFrame, + session: Session, + estimator: Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV], + dependencies: List[str], + udf_imports: List[str], + input_cols: List[str], + label_cols: List[str], + sample_weight_col: Optional[str], + ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]: + raise NotImplementedError diff --git a/snowflake/ml/modeling/_internal/estimator_utils.py b/snowflake/ml/modeling/_internal/estimator_utils.py index 5b345535..dbe6b567 100644 --- a/snowflake/ml/modeling/_internal/estimator_utils.py +++ b/snowflake/ml/modeling/_internal/estimator_utils.py @@ -7,6 +7,7 @@ from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework._utils import to_native_format from snowflake.ml.modeling.framework.base import BaseTransformer +from snowflake.snowpark import Session def validate_sklearn_args(args: Dict[str, Tuple[Any, Any, bool]], klass: type) -> Dict[str, Any]: @@ -107,3 +108,27 @@ def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]: return callable(getattr(self._sklearn_object, attr, None)) return check + + +def if_single_node(session: Session) -> bool: + """Retrieve the current session's warehouse type and size, and use that information + to determine whether the warehouse is a single node. + + Args: + session (Session): The session currently used by the user. + + Returns: + bool: True if the warehouse is a single node, False otherwise. 
+ """ + warehouse_name = session.get_current_warehouse() + if warehouse_name: + warehouse_name = warehouse_name.replace('"', "") + df = session.sql(f"SHOW WAREHOUSES like '{warehouse_name}';")['"type"', '"size"'].collect()[0] + # filter out the conditions when it is single node + single_node: bool = (df[0] == "SNOWPARK-OPTIMIZED" and df[1] == "Medium") or ( + df[0] == "STANDARD" and df[1] == "X-Small" + ) + return single_node + # If current session cannot retrieve the warehouse name back, + # Default as True; Let HPO fall back to stored procedure implementation + return True diff --git a/snowflake/ml/modeling/_internal/snowpark_handlers.py b/snowflake/ml/modeling/_internal/snowpark_handlers.py index add12188..b78aded3 100644 --- a/snowflake/ml/modeling/_internal/snowpark_handlers.py +++ b/snowflake/ml/modeling/_internal/snowpark_handlers.py @@ -5,12 +5,13 @@ import os import posixpath import sys -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from uuid import uuid4 import cloudpickle as cp import numpy as np import pandas as pd +import sklearn from scipy.stats import rankdata from sklearn import model_selection @@ -21,7 +22,7 @@ exceptions, modeling_error_messages, ) -from snowflake.ml._internal.utils import identifier +from snowflake.ml._internal.utils import identifier, snowpark_dataframe_utils from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator from snowflake.ml._internal.utils.temp_file_utils import ( cleanup_temp_files, @@ -37,7 +38,7 @@ TempObjectType, random_name_for_temp_object, ) -from snowflake.snowpark.functions import col, pandas_udf, sproc, udtf +from snowflake.snowpark.functions import col, pandas_udf, sproc from snowflake.snowpark.stored_procedure import StoredProcedure from snowflake.snowpark.types import ( FloatType, @@ -53,7 +54,9 @@ class WrapperProvider: - imports: List[str] = [] + def __init__(self) -> None: + self.imports: List[str] = [] + self.dependencies: List[str] = [] def get_fit_wrapper_function( self, @@ -134,15 +137,63 @@ def fit_wrapper_function( class SklearnWrapperProvider(WrapperProvider): - imports: List[str] = ["sklearn"] + def __init__(self) -> None: + import sklearn + + self.imports: List[str] = ["sklearn"] + + # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. 
+ self.dependencies: List[str] = [ + f"numpy=={np.__version__}", + f"scikit-learn=={sklearn.__version__}", + f"cloudpickle=={cp.__version__}", + ] class XGBoostWrapperProvider(WrapperProvider): - imports: List[str] = ["xgboost"] + def __init__(self) -> None: + import xgboost + + self.imports: List[str] = ["xgboost"] + self.dependencies = [ + f"numpy=={np.__version__}", + f"xgboost=={xgboost.__version__}", + f"cloudpickle=={cp.__version__}", + ] class LightGBMWrapperProvider(WrapperProvider): - imports: List[str] = ["lightgbm"] + def __init__(self) -> None: + import lightgbm + + self.imports: List[str] = ["lightgbm"] + self.dependencies = [ + f"numpy=={np.__version__}", + f"lightgbm=={lightgbm.__version__}", + f"cloudpickle=={cp.__version__}", + ] + + +class SklearnModelSelectionWrapperProvider(WrapperProvider): + def __init__(self) -> None: + import xgboost + + self.imports: List[str] = ["sklearn", "xgboost"] + self.dependencies = [ + f"numpy=={np.__version__}", + f"scikit-learn=={sklearn.__version__}", + f"cloudpickle=={cp.__version__}", + f"xgboost=={xgboost.__version__}", + ] + + # Only include lightgbm in the dependencies if it is installed. + try: + import lightgbm + except ModuleNotFoundError: + pass + else: + self.imports.append("lightgbm") + self.dependencies.append(f"lightgbm=={lightgbm.__version__}") def _get_rand_id() -> str: @@ -223,6 +274,8 @@ def fit_snowpark( label_cols: List[str], sample_weight_col: Optional[str], ) -> Any: + dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset) + # If we are already in a stored procedure, no need to kick off another one. if SNOWML_SPROC_ENV in os.environ: statement_params = telemetry.get_function_usage_statement_params( @@ -330,10 +383,13 @@ def batch_inference( pass_through_columns: List[str], expected_output_cols_list: List[str], expected_output_cols_type: str = "", + *args: Any, + **kwargs: Any, ) -> DataFrame: # Register vectorized UDF for batch inference batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION) snowpark_cols = dataset.select(input_cols).columns + dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset) statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, @@ -379,36 +435,43 @@ def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]: # type: igno else: # Just rename the column names to unquoted identifiers. input_df.columns = snowpark_cols # Replace the quoted columns identifier with unquoted column ids. - transformed_numpy_array = getattr(estimator, inference_method)(input_df) - if ( - isinstance(transformed_numpy_array, list) - and len(transformed_numpy_array) > 0 - and isinstance(transformed_numpy_array[0], np.ndarray) + inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs) + if isinstance(inference_res, list) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray): + # In case of multioutput estimators, predict_proba, decision_function etc., functions return a list of + # ndarrays. We need to concatenate them. + transformed_numpy_array = np.concatenate(inference_res, axis=1) + elif ( + isinstance(inference_res, tuple) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray) ): - # In case of multioutput estimators, predict_proba(), decision_function(), etc., functions return - # a list of ndarrays. We need to concatenate them. 
- transformed_numpy_array = np.concatenate(transformed_numpy_array, axis=1) + # In case of kneighbors, functions return a tuple of ndarrays. + transformed_numpy_array = np.stack(inference_res, axis=1) + else: + transformed_numpy_array = inference_res - if len(transformed_numpy_array.shape) == 3: + if (len(transformed_numpy_array.shape) == 3) and inference_method != "kneighbors": # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes) # when voting = "soft" and flatten_transform = False. We can't handle unflatten transforms, # so we ignore flatten_transform flag and flatten the results. - transformed_numpy_array = np.hstack(transformed_numpy_array) - - if len(transformed_numpy_array.shape) > 1 and transformed_numpy_array.shape[1] != len( - expected_output_cols_list - ): - # HeterogeneousEnsemble's transform method produce results with variying shapes - # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes). - # It is hard to predict the response shape without using fragile introspection logic. - # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with - # each element being a list. - if len(expected_output_cols_list) != 1: - raise TypeError( - "expected_output_cols_list must be same length as transformed array or " "should be of length 1" + transformed_numpy_array = np.hstack(transformed_numpy_array) # type: ignore[call-overload] + + if len(transformed_numpy_array.shape) > 1: + if transformed_numpy_array.shape[1] != len(expected_output_cols_list): + # HeterogeneousEnsemble's transform method produces results with varying shapes + # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes). + # It is hard to predict the response shape without using fragile introspection logic. + # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with + # each element being a list. 
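A tiny NumPy illustration of the two normalization branches added in this hunk (list output from multioutput `predict_proba` vs. the tuple returned by `kneighbors`):

```python
# Tiny illustration of the two normalization branches above.
import numpy as np

# Multioutput predict_proba: a list of (n_samples, n_classes) arrays,
# concatenated along columns.
proba = [np.ones((4, 2)), np.zeros((4, 3))]
assert np.concatenate(proba, axis=1).shape == (4, 5)

# kneighbors: a (distances, indices) tuple of (n_samples, n_neighbors)
# arrays, stacked into (n_samples, 2, n_neighbors).
dist, ind = np.ones((4, 3)), np.zeros((4, 3))
assert np.stack((dist, ind), axis=1).shape == (4, 2, 3)
```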
+ if len(expected_output_cols_list) != 1: + raise TypeError( + "expected_output_cols_list must be same length as transformed array or " + "should be of length 1" + ) + series = pd.Series(transformed_numpy_array.tolist()) + transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols_list) + else: + transformed_pandas_df = pd.DataFrame( + transformed_numpy_array.tolist(), columns=expected_output_cols_list ) - series = pd.Series(transformed_numpy_array.tolist()) - transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols_list) else: transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols_list) @@ -502,6 +565,7 @@ def score_snowpark( label_cols: List[str], sample_weight_col: Optional[str], ) -> float: + dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset) if SNOWML_SPROC_ENV in os.environ: statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, @@ -636,7 +700,7 @@ def score_wrapper_sproc( return score - def _fit_search_snowpark( + def fit_search_snowpark( self, param_list: Union[Dict[str, Any], List[Dict[str, Any]]], dataset: DataFrame, @@ -647,33 +711,33 @@ def _fit_search_snowpark( input_cols: List[str], label_cols: List[str], sample_weight_col: Optional[str], - ) -> Dict[str, Union[float, Dict[str, Any]]]: + ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]: from itertools import product import cachetools from sklearn.base import is_classifier from sklearn.calibration import check_cv - from snowflake.ml._internal.utils.snowpark_dataframe_utils import ( - cast_snowpark_dataframe, - ) - # Create one stage for data and for estimators. temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE) temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};" session.sql(temp_stage_creation_query).collect() # Stage data. - dataset = cast_snowpark_dataframe(dataset) + dataset = snowpark_dataframe_utils.cast_snowpark_dataframe(dataset) remote_file_path = f"{temp_stage_name}/{temp_stage_name}.parquet" dataset.write.copy_into_location( # type:ignore[call-overload] remote_file_path, file_format_type="parquet", header=True, overwrite=True ) imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}").collect()] + # Store GridSearchCV's refit variable. If user set it as False, we don't need to refit it again + refit_bool = estimator.refit # Create a temp file and dump the score to that file. 
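The hunk that follows ships the search object with `refit` disabled and refits only the winning configuration once inside the stored procedure; a local scikit-learn sketch of that idea:

```python
# Local sketch of the refit deferral used below: run the (distributed) search
# with refit disabled, then fit the best configuration exactly once.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=100, random_state=0)
search = GridSearchCV(LogisticRegression(max_iter=200), {"C": [0.1, 1.0]}, refit=False)
search.fit(X, y)  # single-metric search still exposes best_params_ without refitting

best = LogisticRegression(max_iter=200, **search.best_params_).fit(X, y)  # one refit on full data
```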
estimator_file_name = get_temp_file_path() with open(estimator_file_name, mode="w+b") as local_estimator_file_obj: + # Set GridSearchCV refit as False and fit it again after retrieving the best param + estimator.refit = False cp.dump(estimator, local_estimator_file_obj) stage_estimator_file_name = posixpath.join(temp_stage_name, os.path.basename(estimator_file_name)) statement_params = telemetry.get_function_usage_statement_params( @@ -697,11 +761,13 @@ def _fit_search_snowpark( estimator_location = put_result[0].target imports.append(f"@{temp_stage_name}/{estimator_location}") - cv_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) + search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) + random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION) + random_table_name = random_name_for_temp_object(TempObjectType.TABLE) @sproc( # type: ignore[misc] is_permanent=False, - name=cv_sproc_name, + name=search_sproc_name, packages=dependencies + ["snowflake-snowpark-python", "pyarrow", "fastparquet"], # type: ignore[arg-type] replace=True, session=session, @@ -709,7 +775,7 @@ def _fit_search_snowpark( imports=imports, # type: ignore[arg-type] statement_params=statement_params, ) - def preprocess_cv_ind( + def _distributed_search( session: Session, imports: List[str], stage_estimator_file_name: str, @@ -717,8 +783,11 @@ def preprocess_cv_ind( label_cols: List[str], statement_params: Dict[str, str], ) -> str: + import copy import os import tempfile + import time + from typing import Iterator, List import cloudpickle as cp import pandas as pd @@ -754,216 +823,261 @@ def preprocess_cv_ind( estimator = cp.load(local_estimator_file_obj) cv_orig = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator)) - indices = [[train, test] for train, test in cv_orig.split(X, y)] + indices = [test for _, test in cv_orig.split(X, y)] + indices_df = pd.DataFrame({"TEST": indices}) + indices_df = session.create_dataframe(indices_df) + + remote_file_path = f"{temp_stage_name}/indices.parquet" + indices_df.write.copy_into_location( + remote_file_path, file_format_type="parquet", header=True, overwrite=True + ) + imports.extend([f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}/indices").collect()]) + + indices_len = len(indices) + + assert estimator is not None + params_to_evaluate = [] + for param_to_eval in list(param_list): + for k, v in param_to_eval.items(): # type: ignore[attr-defined] + param_to_eval[k] = [v] # type: ignore[index] + params_to_evaluate.append([param_to_eval]) + + @cachetools.cached(cache={}) + def _load_data_into_udf() -> Tuple[ + Dict[str, pd.DataFrame], + Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV], + pd.DataFrame, + int, + ]: + import pyarrow.parquet as pq + + data_files = [ + filename + for filename in os.listdir(sys._xoptions["snowflake_import_directory"]) + if filename.startswith(temp_stage_name) + ] + partial_df = [ + pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas() + for file_name in data_files + ] + df = pd.concat(partial_df, ignore_index=True) + + # load estimator + local_estimator_file_path = os.path.join( + sys._xoptions["snowflake_import_directory"], f"{estimator_location}" + ) + with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj: + estimator = cp.load(local_estimator_file_obj) + + # load indices + indices_files = [ + filename + for filename in os.listdir(sys._xoptions["snowflake_import_directory"]) + if 
filename.startswith("indices") + ] + indices_partial_df = [ + pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas() + for file_name in indices_files + ] + indices = pd.concat(indices_partial_df, ignore_index=True) + + argspec = inspect.getfullargspec(estimator.fit) + args = {"X": df[input_cols]} + + if label_cols: + label_arg_name = "Y" if "Y" in argspec.args else "y" + args[label_arg_name] = df[label_cols].squeeze() + + if sample_weight_col is not None and "sample_weight" in argspec.args: + args["sample_weight"] = df[sample_weight_col].squeeze() + return args, estimator, indices, len(df) + + class SearchCV: + def __init__(self) -> None: + args, estimator, indices, data_length = _load_data_into_udf() + self.args = args + self.estimator = estimator + self.indices = indices + self.data_length = data_length + + def process( + self, params: List[dict], idx: int # type:ignore[type-arg] + ) -> Iterator[Tuple[float, str, str]]: + if hasattr(estimator, "param_grid"): + self.estimator.param_grid = params + else: + self.estimator.param_distributions = params + full_indices = np.array([i for i in range(self.data_length)]) + test_indice = json.loads(self.indices["TEST"][idx]) + train_indice = np.setdiff1d(full_indices, test_indice) + self.estimator.cv = [(train_indice, test_indice)] + self.estimator.fit(**self.args) + binary_cv_results = None + with io.BytesIO() as f: + cp.dump(self.estimator.cv_results_, f) + f.seek(0) + binary_cv_results = f.getvalue().hex() + yield (self.estimator.best_score_, json.dumps(self.estimator.best_params_), binary_cv_results) + + def end_partition(self) -> None: + ... + + session.udtf.register( + SearchCV, + output_schema=StructType( + [ + StructField("BEST_SCORE", FloatType()), + StructField("BEST_PARAMS", StringType()), + StructField("CV_RESULTS", StringType()), + ] + ), + input_types=[VariantType(), IntegerType()], + name=random_udtf_name, + packages=dependencies + ["pyarrow", "fastparquet"], # type: ignore[arg-type] + replace=True, + is_permanent=False, + imports=imports, # type: ignore[arg-type] + statement_params=statement_params, + ) - local_cv_file = tempfile.NamedTemporaryFile(delete=True) - local_cv_file_name = local_cv_file.name - local_cv_file.close() - with open(local_cv_file_name, mode="w+b") as local_cv_file_obj: - cp.dump(indices, local_cv_file_obj) + HP_TUNING = F.table_function(random_udtf_name) + + idx_length = int(indices_len) + params_length = len(params_to_evaluate) + idxs = [i for i in range(idx_length)] + params, param_indices = [], [] + for param, param_idx in product(params_to_evaluate, idxs): + params.append(param) + param_indices.append(param_idx) + + pd_df = pd.DataFrame( + { + "PARAMS": params, + "TRAIN_IND": param_indices, + "PARAM_INDEX": [i for i in range(idx_length * params_length)], + } + ) + df = session.create_dataframe(pd_df) + results = df.select( + F.cast(df["PARAM_INDEX"], IntegerType()).as_("PARAM_INDEX"), + (HP_TUNING(df["PARAMS"], df["TRAIN_IND"]).over(partition_by=df["PARAM_INDEX"])), + ) - put_result = session.file.put( - local_cv_file_name, + results.write.saveAsTable(random_table_name, mode="overwrite", table_type="temporary") + table_result = session.table(random_table_name).sort(col("PARAM_INDEX")) + + # cv_result maintains the original order + cv_results_ = dict() + for i, val in enumerate(table_result.select("CV_RESULTS").collect()): + # retrieved string had one more double quote in the front and end of the string. 
+ # use [1:-1] to remove the extra double quotes + hex_str = bytes.fromhex(val[0]) + with io.BytesIO(hex_str) as f_reload: + each_cv_result = cp.load(f_reload) + for k, v in each_cv_result.items(): + cur_cv = i % idx_length + key = k + if k == "split0_test_score": + key = f"split{cur_cv}_test_score" + elif k.startswith("param"): + if cur_cv != 0: + key = False + if key: + if key not in cv_results_: + cv_results_[key] = v + else: + cv_results_[key] = np.concatenate([cv_results_[key], v]) + + # Use numpy to re-calculate all the information in cv_results_ again + # Generally speaking, reshape all the results into the (3, idx_length, params_length) shape, + # and average them by the idx_length; + # idx_length is the number of cv folds; params_length is the number of parameter combinations + fit_score_test_matrix = np.stack( + ( + np.reshape(cv_results_["mean_fit_time"], (idx_length, -1)), # idx_length x params_length + np.reshape(cv_results_["mean_score_time"], (idx_length, -1)), + np.reshape( + np.concatenate([cv_results_[f"split{cur_cv}_test_score"] for cur_cv in range(idx_length)]), + (idx_length, -1), + ), + ) + ) + mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1) + std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1) + cv_results_["std_fit_time"] = std_fit_score_test_matrix[0] + cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0] + cv_results_["std_score_time"] = std_fit_score_test_matrix[1] + cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1] + cv_results_["std_test_score"] = std_fit_score_test_matrix[2] + cv_results_["mean_test_score"] = mean_fit_score_test_matrix[2] + # re-compute the ranking again with mean_test_score + cv_results_["rank_test_score"] = rankdata(-cv_results_["mean_test_score"], method="min") + # best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared + best_param_index = np.where(cv_results_["rank_test_score"] == 1)[0][0] + + estimator.best_params_ = cv_results_["params"][best_param_index] + estimator.best_score_ = cv_results_["mean_test_score"][best_param_index] + estimator.cv_results_ = cv_results_ + + if refit_bool: + estimator.best_estimator_ = copy.deepcopy( + copy.deepcopy(estimator.estimator).set_params(**estimator.best_params_) + ) + # Let the sproc use all cores to refit. 
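The per-fold aggregation above ends with a standard SciPy ranking rule; a tiny sketch of how the best parameter index is picked:

```python
# Sketch of the ranking rule above: rank 1 goes to the highest mean test
# score, and the first index with rank 1 wins ties.
import numpy as np
from scipy.stats import rankdata

mean_test_score = np.array([0.81, 0.93, 0.93, 0.75])
rank_test_score = rankdata(-mean_test_score, method="min")  # array([3, 1, 1, 4])
best_param_index = np.where(rank_test_score == 1)[0][0]     # 1
```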
+ estimator.n_jobs = -1 if not estimator.n_jobs else estimator.n_jobs + + # process the input as args + argspec = inspect.getfullargspec(estimator.fit) + args = {"X": X} + if label_cols: + label_arg_name = "Y" if "Y" in argspec.args else "y" + args[label_arg_name] = y + if sample_weight_col is not None and "sample_weight" in argspec.args: + args["sample_weight"] = df[sample_weight_col].squeeze() + estimator.refit = True + refit_start_time = time.time() + estimator.best_estimator_.fit(**args) + refit_end_time = time.time() + estimator.refit_time_ = refit_end_time - refit_start_time + + local_result_file = tempfile.NamedTemporaryFile(delete=True) + local_result_file_name = local_result_file.name + local_result_file.close() + + with open(local_result_file_name, mode="w+b") as local_result_file_obj: + cp.dump(estimator, local_result_file_obj) + + session.file.put( + local_result_file_name, temp_stage_name, auto_compress=False, overwrite=True, + statement_params=statement_params, ) - ind_location = put_result[0].target - return ind_location + "|" + str(len(indices)) - ind_location, indices_len = preprocess_cv_ind( + # Note: you can add something like + "|" + str(df) to the return string + # to pass debug information to the caller. + return str(os.path.basename(local_result_file_name)) + + sproc_export_file_name = _distributed_search( session, imports, stage_estimator_file_name, input_cols, label_cols, statement_params, - ).split("|") - imports.append(f"@{temp_stage_name}/{ind_location}") - - # Create estimators with subset of param grid. - # TODO: Decide how to choose parallelization factor. - assert estimator is not None - params_to_evaluate = [] - for param_to_eval in list(param_list): - for k, v in param_to_eval.items(): # type: ignore[attr-defined] - param_to_eval[k] = [v] # type: ignore[index] - params_to_evaluate.append([param_to_eval]) - - @cachetools.cached(cache={}) - def _load_data_into_udf() -> Tuple[ - Dict[str, pd.DataFrame], - Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV], - List[List[int]], - ]: - import pyarrow.parquet as pq - - data_files = [ - filename - for filename in os.listdir(sys._xoptions["snowflake_import_directory"]) - if filename.startswith(temp_stage_name) - ] - partial_df = [ - pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas() - for file_name in data_files - ] - df = pd.concat(partial_df, ignore_index=True) - - # load estimator - local_estimator_file_path = os.path.join( - sys._xoptions["snowflake_import_directory"], f"{estimator_location}" - ) - with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj: - estimator = cp.load(local_estimator_file_obj) - - # load index file - local_ind_file_path = os.path.join(sys._xoptions["snowflake_import_directory"], f"{ind_location}") - with open(local_ind_file_path, mode="rb") as local_ind_file_obj: - indices = cp.load(local_ind_file_obj) - - argspec = inspect.getfullargspec(estimator.fit) - args = {"X": df[input_cols]} - - if label_cols: - label_arg_name = "Y" if "Y" in argspec.args else "y" - args[label_arg_name] = df[label_cols].squeeze() - - if sample_weight_col is not None and "sample_weight" in argspec.args: - args["sample_weight"] = df[sample_weight_col].squeeze() - return args, estimator, indices - - random_udtf_name = random_name_for_temp_object(TempObjectType.TABLE_FUNCTION) - statement_params = telemetry.get_function_usage_statement_params( - project=_PROJECT, - subproject=self._subproject, - 
function_name=telemetry.get_statement_params_full_func_name( - inspect.currentframe(), self.__class__.__name__ - ), - api_calls=[udtf], ) - @udtf( # type: ignore[arg-type] - output_schema=StructType( - [ - StructField("BEST_SCORE", FloatType()), - StructField("BEST_PARAMS", StringType()), - StructField("CV_RESULTS", StringType()), - ] - ), - input_types=[VariantType(), IntegerType()], - name=random_udtf_name, - packages=dependencies + ["pyarrow", "fastparquet"], # type: ignore[arg-type] - replace=True, - is_permanent=False, - imports=imports, # type: ignore[arg-type] + local_estimator_path = get_temp_file_path() + session.file.get( + posixpath.join(temp_stage_name, sproc_export_file_name), + local_estimator_path, statement_params=statement_params, - session=session, - ) - class SearchCV: - def __init__(self) -> None: - args, estimator, indices = _load_data_into_udf() - self.args = args - self.estimator = estimator - self.indices = indices - - def process( - self, params: List[dict], idx: int # type:ignore[type-arg] - ) -> Iterator[Tuple[float, str, str]]: - if hasattr(estimator, "param_grid"): - self.estimator.param_grid = params - else: - self.estimator.param_distributions = params - - self.estimator.cv = [(self.indices[idx][0], self.indices[idx][1])] - self.estimator.fit(**self.args) - # TODO: handle the case of estimator size > maximum column size or to just serialize and return score. - binary_cv_results = None - with io.BytesIO() as f: - cp.dump(self.estimator.cv_results_, f) - f.seek(0) - binary_cv_results = f.getvalue().hex() - yield (self.estimator.best_score_, json.dumps(self.estimator.best_params_), binary_cv_results) - - def end_partition(self) -> None: - ... - - HP_TUNING = F.table_function(random_udtf_name) - - idx_length = int(indices_len) - params_length = len(params_to_evaluate) - idxs = [i for i in range(idx_length)] - params, param_indices = [], [] - for param, param_idx in product(params_to_evaluate, idxs): - params.append(param) - param_indices.append(param_idx) - - pd_df = pd.DataFrame( - { - "PARAMS": params, - "TRAIN_IND": param_indices, - "PARAM_INDEX": [i for i in range(idx_length * params_length)], - } - ) - df = session.create_dataframe(pd_df) - results = df.select( - F.cast(df["PARAM_INDEX"], IntegerType()).as_("PARAM_INDEX"), - (HP_TUNING(df["PARAMS"], df["TRAIN_IND"]).over(partition_by=df["PARAM_INDEX"])), ) - random_table_name = random_name_for_temp_object(TempObjectType.TABLE) - results.write.saveAsTable(random_table_name, mode="overwrite", table_type="temporary") - table_result = session.table(random_table_name).sort(col("PARAM_INDEX")) - - # cv_result maintains the original order - cv_results_ = dict() - for i, val in enumerate(table_result.select("CV_RESULTS").collect()): - # retrieved string had one more double quote in the front and end of the string. 
- # use [1:-1] to remove the extra double quotes - hex_str = bytes.fromhex(val[0]) - with io.BytesIO(hex_str) as f_reload: - each_cv_result = cp.load(f_reload) - for k, v in each_cv_result.items(): - cur_cv = i % idx_length - key = k - if k == "split0_test_score": - key = f"split{cur_cv}_test_score" - elif k.startswith("param"): - if cur_cv != 0: - key = False - if key: - if key not in cv_results_: - cv_results_[key] = v - else: - cv_results_[key] = np.concatenate([cv_results_[key], v]) - - # Use numpy to re-calculate all the information in cv_results_ again - # Generally speaking, reshape all the results into the (3, idx_length, params_length) shape, - # and average them by the idx_length; - # idx_length is the number of cv folds; params_length is the number of parameter combinations - fit_score_test_matrix = np.stack( - ( - np.reshape(cv_results_["mean_fit_time"], (idx_length, -1)), # idx_length x params_length - np.reshape(cv_results_["mean_score_time"], (idx_length, -1)), - np.reshape( - np.concatenate([cv_results_[f"split{cur_cv}_test_score"] for cur_cv in range(idx_length)]), - (idx_length, -1), - ), - ) - ) - mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1) - std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1) - cv_results_["std_fit_time"] = std_fit_score_test_matrix[0] - cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0] - cv_results_["std_score_time"] = std_fit_score_test_matrix[1] - cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1] - cv_results_["std_test_score"] = std_fit_score_test_matrix[2] - cv_results_["mean_test_score"] = mean_fit_score_test_matrix[2] - # re-compute the ranking again with mean_test_score - cv_results_["rank_test_score"] = rankdata(-cv_results_["mean_test_score"], method="min") - # best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared - best_param_index = np.where(cv_results_["rank_test_score"] == 1)[0][0] - - # assign the parameter to a dict - best_param = cv_results_["params"][best_param_index] - best_score = cv_results_["mean_test_score"][best_param_index] - return {"best_param": best_param, "best_score": best_score, "cv_results": cv_results_} + with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj: + fit_estimator = cp.load(result_file_obj) + + cleanup_temp_files([local_estimator_path]) + + return fit_estimator diff --git a/snowflake/ml/modeling/_internal/snowpark_handlers_test.py b/snowflake/ml/modeling/_internal/snowpark_handlers_test.py new file mode 100644 index 00000000..e17803b7 --- /dev/null +++ b/snowflake/ml/modeling/_internal/snowpark_handlers_test.py @@ -0,0 +1,67 @@ +from typing import Any +from unittest import mock + +from absl.testing import absltest, parameterized + +from snowflake.ml.modeling._internal.snowpark_handlers import ( + LightGBMWrapperProvider, + SklearnModelSelectionWrapperProvider, + SklearnWrapperProvider, + XGBoostWrapperProvider, +) + + +class SnowparkHandlersUnitTest(parameterized.TestCase): + def test_sklearn_model_selection_wrapper_provider_lightgbm_installed(self) -> None: + orig_import = __import__ + + def import_mock(name: str, *args: Any, **kwargs: Any) -> Any: + if name == "lightgbm": + lightgbm_mock = mock.MagicMock() + lightgbm_mock.__version__ = "1" + return lightgbm_mock + return orig_import(name, *args, **kwargs) + + with mock.patch("builtins.__import__", side_effect=import_mock): + provider = SklearnModelSelectionWrapperProvider() + + 
self.assertEqual(provider.imports, ["sklearn", "xgboost", "lightgbm"])
+
+ def test_sklearn_model_selection_wrapper_provider_lightgbm_not_installed(self) -> None:
+ orig_import = __import__
+
+ def import_mock(name: str, *args: Any, **kwargs: Any) -> Any:
+ if name == "lightgbm":
+ raise ModuleNotFoundError
+ return orig_import(name, *args, **kwargs)
+
+ with mock.patch("builtins.__import__", side_effect=import_mock):
+ provider = SklearnModelSelectionWrapperProvider()
+
+ self.assertEqual(provider.imports, ["sklearn", "xgboost"])
+
+ def test_xgboost_wrapper_provider(self) -> None:
+ provider = XGBoostWrapperProvider()
+ self.assertEqual(provider.imports, ["xgboost"])
+
+ def test_sklearn_wrapper_provider(self) -> None:
+ provider = SklearnWrapperProvider()
+ self.assertEqual(provider.imports, ["sklearn"])
+
+ def test_lightgbm_wrapper_provider(self) -> None:
+ orig_import = __import__
+
+ def import_mock(name: str, *args: Any, **kwargs: Any) -> Any:
+ if name == "lightgbm":
+ lightgbm_mock = mock.MagicMock()
+ lightgbm_mock.__version__ = "1"
+ return lightgbm_mock
+ return orig_import(name, *args, **kwargs)
+
+ with mock.patch("builtins.__import__", side_effect=import_mock):
+ provider = LightGBMWrapperProvider()
+ self.assertEqual(provider.imports, ["lightgbm"])
+
+
+if __name__ == "__main__":
+ absltest.main()
diff --git a/snowflake/ml/modeling/impute/simple_imputer.py b/snowflake/ml/modeling/impute/simple_imputer.py
index d9fac1a3..1b05255d 100644
--- a/snowflake/ml/modeling/impute/simple_imputer.py
+++ b/snowflake/ml/modeling/impute/simple_imputer.py
@@ -73,7 +73,55 @@
 ]
+# TODO(thoyt): Implement logic for `add_indicator` parameter and `indicator_` attribute. Requires
+# `snowflake.ml.impute.MissingIndicator` to be implemented.
 class SimpleImputer(base.BaseTransformer):
+ """
+ Univariate imputer for completing missing values with simple strategies.
+ Note that the `add_indicator` parameter is not implemented. For more details on this class, see
+ [sklearn.impute.SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html).
+
+ Args:
+ missing_values: int, float, str, np.nan or None, default=np.nan.
+ The values to treat as missing and impute during transform.
+
+ strategy: str, default="mean".
+ The imputation strategy.
+
+ * If "mean", replace missing values using the mean along each column.
+ Can only be used with numeric data.
+ * If "median", replace missing values using the median along each column.
+ Can only be used with numeric data.
+ * If "most_frequent", replace missing using the most frequent value along each column.
+ Can be used with strings or numeric data.
+ If there is more than one such value, only the smallest is returned.
+ * If "constant", replace the missing values with `fill_value`. Can be used with strings or numeric data.
+
+ fill_value: Optional[str]
+ When `strategy == "constant"`, `fill_value` is used to replace all occurrences of `missing_values`.
+ For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when
+ imputing numerical data and `missing_value` for strings and object data types.
+ input_cols: Optional[Union[str, List[str]]]
+ Columns to use as inputs during fit and transform.
+ output_cols: Optional[Union[str, List[str]]]
+ A string or list of strings representing column names that will store the output of the transform operation.
+ The length of `output_cols` must equal the length of `input_cols`.
+ drop_input_cols: bool, default=False
+ Remove input columns from output if set to `True`.
+
+ Attributes:
+ statistics_: dict {input_col: stats_value}
+ Dict containing the imputation fill value for each feature. Computing statistics can result in `np.nan`
+ values. During `transform`, features corresponding to `np.nan` statistics will be discarded.
+ n_features_in_: int
+ Number of features seen during `fit`.
+ feature_names_in_: ndarray of shape (n_features_in,)
+ Names of features seen during `fit`.
+
+ Raises:
+ SnowflakeMLException: If strategy is invalid, or if fill value is specified for strategy that isn't "constant".
+ """
+
 def __init__(
 self,
 *,
@@ -84,47 +132,6 @@ def __init__(
 output_cols: Optional[Union[str, Iterable[str]]] = None,
 drop_input_cols: Optional[bool] = False,
 ) -> None:
- """
- Univariate imputer for completing missing values with simple strategies.
- Note that the `add_indicator` param/functionality is not implemented.
-
- Args:
- missing_values: The values to treat as missing and impute during transform.
- strategy: The imputation strategy.
- * If "mean", replace missing values using the mean along each column.
- Can only be used with numeric data.
- * If "median", replace missing values using the median along each column.
- Can only be used with numeric data.
- * If "most_frequent", replace missing using the most frequent value along each column.
- Can be used with strings or numeric data.
- If there is more than one such value, only the smallest is returned.
- * If "constant", replace the missing values with `fill_value`. Can be used with strings or numeric data.
- fill_value:
- When `strategy == "constant"`, `fill_value` is used to replace all occurrences of `missing_values`.
- For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when
- imputing numerical data and `missing_value` for strings and object data types.
- input_cols:
- Columns to use as inputs during fit or transform.
- output_cols:
- New column labels for the columns that will contain the output of a transform.
- drop_input_cols: Remove input columns from output if set True. False by default.
-
- Attributes:
- statistics_: dict {input_col: stats_value}
- Dict containing the imputation fill value for each feature. Computing statistics can result in `np.nan`
- values. During `transform`, features corresponding to `np.nan` statistics will be discarded.
- n_features_in_: int
- Number of features seen during `fit`.
- feature_names_in_: ndarray of shape (n_features_in,)
- Names of features seen during `fit`.
-
- TODO(thoyt): Implement logic for `add_indicator` parameter and `indicator_` attribute. Requires
- `snowflake.ml.impute.MissingIndicator` to be implemented.
-
- Raises:
- SnowflakeMLException: If strategy is invalid, or if fill value is specified for strategy that isn't
- "constant"
- """ super().__init__(drop_input_cols=drop_input_cols) if strategy in STRATEGY_TO_STATE_DICT: self.strategy = strategy diff --git a/snowflake/ml/modeling/metrics/ranking.py b/snowflake/ml/modeling/metrics/ranking.py index 1b5f6b77..4abb3e4c 100644 --- a/snowflake/ml/modeling/metrics/ranking.py +++ b/snowflake/ml/modeling/metrics/ranking.py @@ -79,8 +79,10 @@ def precision_recall_curve( sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = metrics_utils.flatten_cols([y_true_col_name, probas_pred_col_name, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] @@ -99,7 +101,10 @@ def precision_recall_curve( def precision_recall_curve_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) - df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) + sp_df = session.sql(queries[-1]) + df = sp_df.to_pandas(statement_params=statement_params) + df.columns = sp_df.columns + y_true = df[y_true_col_name] probas_pred = df[probas_pred_col_name] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -215,8 +220,10 @@ class scores must correspond to the order of ``labels``, sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = metrics_utils.flatten_cols([y_true_col_names, y_score_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] @@ -235,7 +242,10 @@ class scores must correspond to the order of ``labels``, def roc_auc_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) - df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) + sp_df = session.sql(queries[-1]) + df = sp_df.to_pandas(statement_params=statement_params) + df.columns = sp_df.columns + y_true = df[y_true_col_names] y_score = df[y_score_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -306,8 +316,10 @@ def roc_curve( sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = metrics_utils.flatten_cols([y_true_col_name, y_score_col_name, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] @@ -326,7 +338,10 @@ def roc_curve( def roc_curve_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) - df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) + sp_df = session.sql(queries[-1]) + df = sp_df.to_pandas(statement_params=statement_params) + df.columns = sp_df.columns + y_true = df[y_true_col_name] y_score = df[y_score_col_name] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None diff --git 
a/snowflake/ml/modeling/metrics/regression.py b/snowflake/ml/modeling/metrics/regression.py index 517fa520..c71459c4 100644 --- a/snowflake/ml/modeling/metrics/regression.py +++ b/snowflake/ml/modeling/metrics/regression.py @@ -65,8 +65,10 @@ def d2_absolute_error_score( sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_snowflake_result = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] @@ -85,7 +87,10 @@ def d2_absolute_error_score( def d2_absolute_error_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) - df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) + sp_df = session.sql(queries[-1]) + df = sp_df.to_pandas(statement_params=statement_params) + df.columns = sp_df.columns + y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -151,8 +156,10 @@ def d2_pinball_score( sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] @@ -171,7 +178,10 @@ def d2_pinball_score( def d2_pinball_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) - df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) + sp_df = session.sql(queries[-1]) + df = sp_df.to_pandas(statement_params=statement_params) + df.columns = sp_df.columns + y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -255,8 +265,10 @@ def explained_variance_score( sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] @@ -275,7 +287,10 @@ def explained_variance_score( def explained_variance_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) - df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) + sp_df = session.sql(queries[-1]) + df = sp_df.to_pandas(statement_params=statement_params) + df.columns = sp_df.columns + y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None diff --git a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py index 
6d252c38..00a1283d 100644
--- a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py
+++ b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py
@@ -1,4 +1,7 @@
-import copy
+#
+# This code is auto-generated using the sklearn_wrapper_template.py_template template.
+# Do not modify the auto-generated code (except automatic reformatting by pre-commit hooks).
+#
 from typing import Any, Dict, Iterable, List, Optional, Set, Union
 from uuid import uuid4
@@ -25,6 +28,7 @@
 from snowflake.ml.modeling._internal.estimator_protocols import CVHandlers
 from snowflake.ml.modeling._internal.estimator_utils import (
 gather_dependencies,
+ if_single_node,
 original_estimator_has_callable,
 transform_snowml_obj_to_sklearn_obj,
 validate_sklearn_args,
@@ -42,6 +46,7 @@
 # and converting module name from underscore to CamelCase
 # e.g. sklearn.linear_model -> LinearModel.
 _SUBPROJECT = "ModelSelection"
+DEFAULT_UDTF_NJOBS = 3
 class GridSearchCV(BaseTransformer):
@@ -344,37 +349,41 @@ def _fit_snowpark(self, dataset: DataFrame) -> None:
 dataset = dataset.select(selected_cols)
 assert self._sklearn_object is not None
- # Set GridSearchCV refit as False and fit it again after retrieving the best param
- self._sklearn_object.refit = False
- result_dict = self._handlers._fit_search_snowpark(
- param_list=ParameterGrid(self._sklearn_object.param_grid),
- dataset=dataset,
- session=session,
- estimator=self._sklearn_object,
- dependencies=self._get_dependencies(),
- udf_imports=["sklearn"],
- input_cols=self.input_cols,
- label_cols=self.label_cols,
- sample_weight_col=self.sample_weight_col,
- )
-
- self._sklearn_object.best_params_ = result_dict["best_param"]
- self._sklearn_object.best_score_ = result_dict["best_score"]
- self._sklearn_object.cv_results_ = result_dict["cv_results"]
-
- self._sklearn_object.best_estimator_ = copy.deepcopy(
- copy.deepcopy(self._sklearn_object.estimator).set_params(**self._sklearn_object.best_params_)
- )
- self._sklearn_object.refit = True
- self._sklearn_object.best_estimator_ = self._handlers.fit_snowpark(
- dataset=dataset,
- session=session,
- estimator=self._sklearn_object.best_estimator_,
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
- input_cols=self.input_cols,
- label_cols=self.label_cols,
- sample_weight_col=self.sample_weight_col,
- )
+ single_node = if_single_node(session)
+ if not single_node:
+ # Set the default value of the `n_jobs` attribute for the estimator.
+ # If it is set to -1, the UDTF will not honor it, so we reset -1 to the default value as well.
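+ # DEFAULT_UDTF_NJOBS is a small fixed bound; several UDTF partitions may be fitting concurrently on the same node, so per-fit parallelism is capped rather than set to -1.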
+ if hasattr(self._sklearn_object.estimator, "n_jobs") and self._sklearn_object.estimator.n_jobs in [
+ None,
+ -1,
+ ]:
+ self._sklearn_object.estimator.n_jobs = DEFAULT_UDTF_NJOBS
+ self._sklearn_object = self._handlers.fit_search_snowpark(
+ param_list=ParameterGrid(self._sklearn_object.param_grid),
+ dataset=dataset,
+ session=session,
+ estimator=self._sklearn_object,
+ dependencies=self._get_dependencies(),
+ udf_imports=["sklearn"],
+ input_cols=self.input_cols,
+ label_cols=self.label_cols,
+ sample_weight_col=self.sample_weight_col,
+ )
+ else:
+ # Fall back to the stored procedure implementation.
+ # Default the parallel factor to -1 to use all cores on the single node.
+ if self._sklearn_object.n_jobs is None:
+ self._sklearn_object.n_jobs = -1
+
+ self._sklearn_object = self._handlers.fit_snowpark(
+ dataset,
+ session,
+ self._sklearn_object,
+ ["snowflake-snowpark-python"] + self._get_dependencies(),
+ self.input_cols,
+ self.label_cols,
+ self.sample_weight_col,
+ )
 def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
 if self._drop_input_cols:
diff --git a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
index deb6da0f..1849c881 100644
--- a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
+++ b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
@@ -1,4 +1,3 @@
-import copy
 from typing import Any, Dict, Iterable, List, Optional, Set, Union
 from uuid import uuid4
@@ -26,6 +25,7 @@
 from snowflake.ml.modeling._internal.estimator_protocols import CVHandlers
 from snowflake.ml.modeling._internal.estimator_utils import (
 gather_dependencies,
+ if_single_node,
 original_estimator_has_callable,
 transform_snowml_obj_to_sklearn_obj,
 validate_sklearn_args,
@@ -43,6 +43,7 @@
 # and converting module name from underscore to CamelCase
 # e.g. sklearn.linear_model -> LinearModel.
_SUBPROJECT = "ModelSelection" +DEFAULT_UDTF_NJOBS = 3 class RandomizedSearchCV(BaseTransformer): @@ -360,40 +361,45 @@ def _fit_snowpark(self, dataset: DataFrame) -> None: dataset = dataset.select(selected_cols) assert self._sklearn_object is not None - self._sklearn_object.refit = False - result_dict = self._handlers._fit_search_snowpark( - param_list=ParameterSampler( - self._sklearn_object.param_distributions, - n_iter=self._sklearn_object.n_iter, - random_state=self._sklearn_object.random_state, - ), - dataset=dataset, - session=session, - estimator=self._sklearn_object, - dependencies=self._get_dependencies(), - udf_imports=["sklearn"], - input_cols=self.input_cols, - label_cols=self.label_cols, - sample_weight_col=self.sample_weight_col, - ) - - self._sklearn_object.best_params_ = result_dict["best_param"] - self._sklearn_object.best_score_ = result_dict["best_score"] - self._sklearn_object.cv_results_ = result_dict["cv_results"] - - self._sklearn_object.best_estimator_ = copy.deepcopy( - copy.deepcopy(self._sklearn_object.estimator).set_params(**self._sklearn_object.best_params_) - ) - self._sklearn_object.refit = True - self._sklearn_object.best_estimator_ = self._handlers.fit_snowpark( - dataset=dataset, - session=session, - estimator=self._sklearn_object.best_estimator_, - dependencies=["snowflake-snowpark-python"] + self._get_dependencies(), - input_cols=self.input_cols, - label_cols=self.label_cols, - sample_weight_col=self.sample_weight_col, - ) + single_node = if_single_node(session) + if not single_node: + # Set the default value of the `n_jobs` attribute for the estimator. + # If minus one is set, it will not be abided by in the UDTF, so we set that to the default value as well. + if hasattr(self._sklearn_object.estimator, "n_jobs") and self._sklearn_object.estimator.n_jobs in [ + None, + -1, + ]: + self._sklearn_object.estimator.n_jobs = DEFAULT_UDTF_NJOBS + self._sklearn_object = self._handlers.fit_search_snowpark( + param_list=ParameterSampler( + self._sklearn_object.param_distributions, + n_iter=self._sklearn_object.n_iter, + random_state=self._sklearn_object.random_state, + ), + dataset=dataset, + session=session, + estimator=self._sklearn_object, + dependencies=self._get_dependencies(), + udf_imports=["sklearn"], + input_cols=self.input_cols, + label_cols=self.label_cols, + sample_weight_col=self.sample_weight_col, + ) + else: + # Fall back with stored procedure implementation + # set the parallel factor to default to minus one, to fully accelerate the cores in single node + if self._sklearn_object.n_jobs is None: + self._sklearn_object.n_jobs = -1 + + self._sklearn_object = self._handlers.fit_snowpark( + dataset, + session, + self._sklearn_object, + ["snowflake-snowpark-python"] + self._get_dependencies(), + self.input_cols, + self.label_cols, + self.sample_weight_col, + ) def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]: if self._drop_input_cols: diff --git a/snowflake/ml/modeling/pipeline/pipeline.py b/snowflake/ml/modeling/pipeline/pipeline.py index 3d7c38b3..e7327a5b 100644 --- a/snowflake/ml/modeling/pipeline/pipeline.py +++ b/snowflake/ml/modeling/pipeline/pipeline.py @@ -12,6 +12,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry from snowflake.ml._internal.exceptions import error_codes, exceptions +from snowflake.ml._internal.utils import snowpark_dataframe_utils from snowflake.ml.model.model_signature import ModelSignature, _infer_signature from snowflake.ml.modeling.framework import _utils, base @@ -237,6 
+238,11 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "Pipeline":
 """
 self._validate_steps()
+ dataset = (
+ snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+ if isinstance(dataset, snowpark.DataFrame)
+ else dataset
+ )
 transformed_dataset = self._fit_transform_dataset(dataset)
 estimator = self._get_estimator()
@@ -268,6 +274,11 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s
 Transformed data. Output datatype will be same as input datatype.
 """
 self._enforce_fit()
+ dataset = (
+ snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+ if isinstance(dataset, snowpark.DataFrame)
+ else dataset
+ )
 transformed_dataset = self._transform_dataset(dataset=dataset)
 estimator = self._get_estimator()
@@ -301,6 +312,11 @@ def fit_transform(
 """
 self._validate_steps()
+ dataset = (
+ snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+ if isinstance(dataset, snowpark.DataFrame)
+ else dataset
+ )
 transformed_dataset = self._fit_transform_dataset(dataset=dataset)
@@ -340,6 +356,11 @@ def fit_predict(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union
 """
 self._validate_steps()
+ dataset = (
+ snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+ if isinstance(dataset, snowpark.DataFrame)
+ else dataset
+ )
 transformed_dataset = self._fit_transform_dataset(dataset=dataset)
diff --git a/snowflake/ml/modeling/preprocessing/normalizer.py b/snowflake/ml/modeling/preprocessing/normalizer.py
index ecd00ba9..d2c89f43 100644
--- a/snowflake/ml/modeling/preprocessing/normalizer.py
+++ b/snowflake/ml/modeling/preprocessing/normalizer.py
@@ -14,6 +14,30 @@
 class Normalizer(base.BaseTransformer):
+ """
+ Normalize samples individually to unit norm.
+
+ Each sample (i.e. each row of the data matrix) with at least one
+ non-zero component is rescaled independently of other samples so
+ that its norm (l1, l2 or inf) equals one.
+
+ Args:
+ norm: str, default="l2"
+ The norm to use to normalize each non-zero sample. If norm='max'
+ is used, values will be rescaled by the maximum of the absolute
+ values. It must be one of 'l1', 'l2', or 'max'.
+
+ input_cols: Optional[Union[str, List[str]]]
+ Columns to use as inputs during transform.
+
+ output_cols: Optional[Union[str, List[str]]]
+ A string or list of strings representing column names that will store the output of the transform operation.
+ The length of `output_cols` must equal the length of `input_cols`.
+
+ drop_input_cols: bool, default=False
+ Remove input columns from output if set to `True`.
+ """
+
 def __init__(
 self,
 *,
@@ -22,21 +46,6 @@ def __init__(
 output_cols: Optional[Union[str, Iterable[str]]] = None,
 drop_input_cols: Optional[bool] = False,
 ) -> None:
- """
- Normalize samples individually to each row's unit norm.
-
- Each sample (i.e. each row of the data matrix) with at least one
- nonzero component is rescaled independently of other samples so
- that its norm (l1, l2 or inf) equals one.
-
- Args:
- norm: The norm to use to normalize each non zero sample. If norm='max'
- is used, values will be rescaled by the maximum of the absolute
- values. It must be one of 'l1', 'l2', or 'max'.
- input_cols: Single or multiple input columns.
- output_cols: Single or multiple output columns.
- drop_input_cols: Remove input columns from output if set True. False by default.
- """ super().__init__(drop_input_cols=drop_input_cols) self.norm = norm self._is_fitted = False @@ -79,7 +88,7 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "Normalizer": ) def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]: """ - Scale each nonzero row of the input dataset to the unit norm. + Scale each non-zero row of the input dataset to the unit norm. Args: dataset: Input dataset. diff --git a/snowflake/ml/registry/BUILD.bazel b/snowflake/ml/registry/BUILD.bazel index fdf918ba..479176b3 100644 --- a/snowflake/ml/registry/BUILD.bazel +++ b/snowflake/ml/registry/BUILD.bazel @@ -4,9 +4,12 @@ package(default_visibility = ["//visibility:public"]) py_library( name = "model_registry", - srcs = ["model_registry.py"], + srcs = [ + "artifact.py", + "model_registry.py", + ], deps = [ - ":_ml_artifact", + ":artifact_manager", ":schema", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal/utils:formatting", @@ -49,8 +52,11 @@ py_library( ) py_library( - name = "_ml_artifact", - srcs = ["_ml_artifact.py"], + name = "artifact_manager", + srcs = [ + "_artifact_manager.py", + "artifact.py", + ], deps = [ ":schema", "//snowflake/ml/_internal/utils:formatting", @@ -59,10 +65,10 @@ py_library( ) py_test( - name = "_ml_artifact_test", - srcs = ["_ml_artifact_test.py"], + name = "_artifact_test", + srcs = ["_artifact_test.py"], deps = [ - ":_ml_artifact", + ":artifact_manager", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/test_utils:mock_data_frame", "//snowflake/ml/test_utils:mock_session", diff --git a/snowflake/ml/registry/_artifact_manager.py b/snowflake/ml/registry/_artifact_manager.py new file mode 100644 index 00000000..4927e0aa --- /dev/null +++ b/snowflake/ml/registry/_artifact_manager.py @@ -0,0 +1,156 @@ +from typing import Optional, cast + +from snowflake import connector, snowpark +from snowflake.ml._internal.utils import formatting, table_manager +from snowflake.ml.registry import _initial_schema, artifact + + +class ArtifactManager: + """It manages artifacts in model registry.""" + + def __init__( + self, + session: snowpark.Session, + database_name: str, + schema_name: str, + ) -> None: + """Initializer of artifact manager. + + Args: + session: Session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + """ + self._session = session + self._database_name = database_name + self._schema_name = schema_name + self._fully_qualified_table_name = table_manager.get_fully_qualified_table_name( + self._database_name, self._schema_name, _initial_schema._ARTIFACT_TABLE_NAME + ) + + def exists( + self, + artifact_name: str, + artifact_version: Optional[str] = None, + ) -> bool: + """Validate if an artifact exists. + + Args: + artifact_name: Name of artifact. + artifact_version: Version of artifact. + + Returns: + bool: True if the artifact exists, False otherwise. + """ + selected_artifact = self.get(artifact_name, artifact_version).collect() + + assert ( + len(selected_artifact) < 2 + ), f"""Multiple records found for artifact with name/version: {artifact_name}/{artifact_version}!""" + + return len(selected_artifact) == 1 + + def add( + self, + artifact: artifact.Artifact, + artifact_id: str, + artifact_name: str, + artifact_version: Optional[str] = None, + ) -> artifact.Artifact: + """ + Add a new artifact. + + Args: + artifact: artifact object. 
+ artifact_id: Id of artifact.
+ artifact_name: Name of artifact.
+ artifact_version: Version of artifact.
+
+ Returns:
+ A reference to the artifact.
+ """
+ if artifact_version is None:
+ artifact_version = ""
+ assert artifact_id != "", "Artifact id can't be empty."
+
+ new_artifact = {
+ "ID": artifact_id,
+ "TYPE": artifact.type.value,
+ "NAME": artifact_name,
+ "VERSION": artifact_version,
+ "CREATION_ROLE": self._session.get_current_role(),
+ "CREATION_TIME": formatting.SqlStr("CURRENT_TIMESTAMP()"),
+ "ARTIFACT_SPEC": artifact._spec,
+ }
+
+ # TODO: Consider updating the METADATA table for artifact history tracking as well.
+ table_manager.insert_table_entry(self._session, self._fully_qualified_table_name, new_artifact)
+ artifact._log(name=artifact_name, version=artifact_version, id=artifact_id)
+ return artifact
+
+ def delete(
+ self,
+ artifact_name: str,
+ artifact_version: Optional[str] = None,
+ error_if_not_exist: bool = False,
+ ) -> None:
+ """
+ Remove an artifact.
+
+ Args:
+ artifact_name: Name of artifact.
+ artifact_version: Version of artifact.
+ error_if_not_exist: Whether to raise errors if the target entry doesn't exist. Defaults to False.
+
+ Raises:
+ DataError: If `error_if_not_exist` is True and the artifact doesn't exist in the database.
+ RuntimeError: If the artifact deletion failed.
+ """
+ if not self.exists(artifact_name, artifact_version):
+ if error_if_not_exist:
+ raise connector.DataError(
+ f"Artifact {artifact_name}/{artifact_version} doesn't exist. Deletion failed."
+ )
+ else:
+ return
+
+ if artifact_version is None:
+ artifact_version = ""
+ delete_query = f"""DELETE FROM {self._fully_qualified_table_name}
+ WHERE NAME='{artifact_name}' AND VERSION='{artifact_version}'
+ """
+
+ # TODO: Consider updating the METADATA table for artifact history tracking as well.
+ try:
+ self._session.sql(delete_query).collect()
+ except Exception as e:
+ raise RuntimeError(f"Delete artifact {artifact_name}/{artifact_version} failed due to {e}")
+
+ def get(
+ self,
+ artifact_name: str,
+ artifact_version: Optional[str] = None,
+ ) -> snowpark.DataFrame:
+ """Retrieve the Snowpark dataframe of the artifact matching the provided artifact name and version.
+
+ Given that name and version uniquely identify an artifact in this table,
+ the resulting dataframe should have at most one row.
+
+ Args:
+ artifact_name: Name of artifact.
+ artifact_version: Version of artifact.
+
+ Returns:
+ A Snowpark dataframe representing the artifacts that match the given constraints.
+
+ WARNING:
+ The returned DataFrame is writable and shouldn't be made accessible to users.
+ """ + if artifact_version is None: + artifact_version = "" + + artifacts = self._session.sql(f"SELECT * FROM {self._fully_qualified_table_name}") + target_artifact = artifacts.filter(snowpark.Column("NAME") == artifact_name).filter( + snowpark.Column("VERSION") == artifact_version + ) + return cast(snowpark.DataFrame, target_artifact) diff --git a/snowflake/ml/registry/_ml_artifact_test.py b/snowflake/ml/registry/_artifact_test.py similarity index 56% rename from snowflake/ml/registry/_ml_artifact_test.py rename to snowflake/ml/registry/_artifact_test.py index a684408e..2202c813 100644 --- a/snowflake/ml/registry/_ml_artifact_test.py +++ b/snowflake/ml/registry/_artifact_test.py @@ -5,7 +5,7 @@ from snowflake import connector, snowpark from snowflake.ml._internal.utils import identifier, table_manager -from snowflake.ml.registry import _ml_artifact +from snowflake.ml.registry import _artifact_manager, artifact from snowflake.ml.test_utils import mock_data_frame, mock_session _DATABASE_NAME = identifier.get_inferred_name("_SYSTEM_MODEL_REGISTRY") @@ -55,52 +55,34 @@ def _get_select_artifact(self) -> List[snowpark.Row]: return [ snowpark.Row( id="FAKE_ID", - type=_ml_artifact.ArtifactType.TESTTYPE, - name="FAKE_NAME", - version=None, + type=artifact.ArtifactType.TESTTYPE, creation_time=datetime.datetime(2022, 11, 4, 17, 1, 30, 153000), creation_role="OWNER_ROLE", artifact_spec={}, ) ] - def test_if_artifact_table_exists(self) -> None: - for mock_df, expected_res in [ - (mock_data_frame.MockDataFrame(self._get_show_tables_success(name=_TABLE_NAME)), True), - (mock_data_frame.MockDataFrame([]), False), - ]: - with self.subTest(): - self._session.add_mock_sql( - query=f"SHOW TABLES LIKE '{_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", - result=mock_df, - ) - self.assertEqual( - _ml_artifact.if_artifact_table_exists( - cast(snowpark.Session, self._session), _DATABASE_NAME, _SCHEMA_NAME - ), - expected_res, - ) - def test_if_artifact_exists(self) -> None: for mock_df_collect, expected_res in [ (self._get_select_artifact(), True), ([], False), ]: with self.subTest(): - artifact_id = "FAKE_ID" - artifact_type = _ml_artifact.ArtifactType.TESTTYPE + artifact_name = "FAKE_ID" + artifact_version = "FAKE_VERSION" expected_df = mock_data_frame.MockDataFrame() expected_df.add_operation("filter") expected_df.add_operation("filter") expected_df.add_collect_result(cast(List[snowpark.Row], mock_df_collect)) self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) self.assertEqual( - _ml_artifact.if_artifact_exists( - cast(snowpark.Session, self._session), - _DATABASE_NAME, - _SCHEMA_NAME, - artifact_id, - artifact_type, + _artifact_manager.ArtifactManager( + session=cast(snowpark.Session, self._session), + database_name=_DATABASE_NAME, + schema_name=_SCHEMA_NAME, + ).exists( + artifact_name, + artifact_version, ), expected_res, ) @@ -108,15 +90,8 @@ def test_if_artifact_exists(self) -> None: def test_add_artifact(self) -> None: artifact_id = "FAKE_ID" artifact_name = "FAKE_NAME" - artifact_version = "1.0.0" - artifact_spec = {"description": "mock description"} - - # Mock the get_artifact call - expected_df = mock_data_frame.MockDataFrame() - expected_df.add_operation("filter") - expected_df.add_operation("filter") - expected_df.add_collect_result([]) - self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) + artifact_version = "FAKE_VERSION" + art_obj = artifact.Artifact(type=artifact.ArtifactType.TESTTYPE, 
spec='{"description": "mock description"}') # Mock the insertion call self._session.add_operation("get_current_role", result="current_role") @@ -124,59 +99,58 @@ def test_add_artifact(self) -> None: f"INSERT INTO {_FULLY_QUALIFIED_TABLE_NAME}" " ( ARTIFACT_SPEC,CREATION_ROLE,CREATION_TIME,ID,NAME,TYPE,VERSION )" " SELECT" - " OBJECT_CONSTRUCT('description','mock description'),'current_role',CURRENT_TIMESTAMP()," - "'FAKE_ID','FAKE_NAME','TESTTYPE','1.0.0' " + " '{\"description\": \"mock description\"}','current_role',CURRENT_TIMESTAMP()," + "'FAKE_ID','FAKE_NAME', 'TESTTYPE', 'FAKE_VERSION' " ) self._session.add_mock_sql( query=insert_query, result=mock_data_frame.MockDataFrame([snowpark.Row(**{"number of rows inserted": 1})]), ) - _ml_artifact.add_artifact( - cast(snowpark.Session, self._session), - _DATABASE_NAME, - _SCHEMA_NAME, - artifact_id, - _ml_artifact.ArtifactType.TESTTYPE, - artifact_name, - artifact_version, - artifact_spec, + _artifact_manager.ArtifactManager( + session=cast(snowpark.Session, self._session), + database_name=_DATABASE_NAME, + schema_name=_SCHEMA_NAME, + ).add( + artifact=art_obj, + artifact_id=artifact_id, + artifact_name=artifact_name, + artifact_version=artifact_version, ) def test_delete_artifact(self) -> None: for error_if_not_exist in [True, False]: with self.subTest(): if error_if_not_exist: - artifact_id = "FAKE_ID" + artifact_name = "FAKE_NAME" + artifact_version = "FAKE_VERSION" expected_df = mock_data_frame.MockDataFrame() expected_df.add_operation("filter") expected_df.add_operation("filter") expected_df.add_collect_result([]) self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) with self.assertRaises(connector.DataError): - _ml_artifact.delete_artifact( - cast(snowpark.Session, self._session), - _DATABASE_NAME, - _SCHEMA_NAME, - artifact_id, - _ml_artifact.ArtifactType.TESTTYPE, + _artifact_manager.ArtifactManager( + session=cast(snowpark.Session, self._session), + database_name=_DATABASE_NAME, + schema_name=_SCHEMA_NAME, + ).delete( + artifact_name, + artifact_version, True, ) else: - # Mock the delete call - insert_query = ( - f"DELETE FROM {_FULLY_QUALIFIED_TABLE_NAME}" - f" WHERE ID='{artifact_id}' AND TYPE='{_ml_artifact.ArtifactType.TESTTYPE.name}'" - ) - self._session.add_mock_sql( - query=insert_query, - result=mock_data_frame.MockDataFrame([snowpark.Row(**{"number of rows deleted": 1})]), - ) - _ml_artifact.delete_artifact( - cast(snowpark.Session, self._session), - _DATABASE_NAME, - _SCHEMA_NAME, - artifact_id, - _ml_artifact.ArtifactType.TESTTYPE, + expected_df = mock_data_frame.MockDataFrame() + expected_df.add_operation("filter") + expected_df.add_operation("filter") + expected_df.add_collect_result([]) + self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) + _artifact_manager.ArtifactManager( + session=cast(snowpark.Session, self._session), + database_name=_DATABASE_NAME, + schema_name=_SCHEMA_NAME, + ).delete( + artifact_name, + artifact_version, ) diff --git a/snowflake/ml/registry/_ml_artifact.py b/snowflake/ml/registry/_ml_artifact.py deleted file mode 100644 index fca5de41..00000000 --- a/snowflake/ml/registry/_ml_artifact.py +++ /dev/null @@ -1,181 +0,0 @@ -import enum -from typing import Any, Dict, Optional, cast - -from snowflake import connector, snowpark -from snowflake.ml._internal.utils import formatting, table_manager -from snowflake.ml.registry import _initial_schema - - -# Set of allowed artifact types. 
-class ArtifactType(enum.Enum): - TESTTYPE = "TESTTYPE" # A placeholder type just for unit test - DATASET = "DATASET" - - -def if_artifact_table_exists( - session: snowpark.Session, - database_name: str, - schema_name: str, -) -> bool: - """ - Verify the existence of the artifact table. - - Args: - session: Snowpark session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - - Returns: - bool: True if the artifact table exists, False otherwise. - """ - qualified_schema_name = table_manager.get_fully_qualified_schema_name(database_name, schema_name) - return table_manager.validate_table_exist(session, _initial_schema._ARTIFACT_TABLE_NAME, qualified_schema_name) - - -def if_artifact_exists( - session: snowpark.Session, database_name: str, schema_name: str, artifact_id: str, artifact_type: ArtifactType -) -> bool: - """Validate if a specific artifact record exists in the artifact table. - - Args: - session: Session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - artifact_id: Unique identifier of the target artifact. - artifact_type: Type of the target artifact - - Returns: - bool: True if the artifact exists, False otherwise. - """ - selected_artifact = _get_artifact(session, database_name, schema_name, artifact_id, artifact_type).collect() - - assert ( - len(selected_artifact) < 2 - ), f"Multiple records found for the specified artifact (ID: {artifact_id}, TYPE: {artifact_type.name})!" - - return len(selected_artifact) == 1 - - -def add_artifact( - session: snowpark.Session, - database_name: str, - schema_name: str, - artifact_id: str, - artifact_type: ArtifactType, - artifact_name: str, - artifact_version: Optional[str], - artifact_spec: Dict[str, Any], -) -> None: - """ - Insert a new artifact record into the designated artifact table. - - Args: - session: Session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - artifact_id: Unique identifier for the artifact. - artifact_type: Type of the artifact. - artifact_name: Name of the artifact. - artifact_version: Version of the artifact if applicable. - artifact_spec: Specifications related to the artifact. - - Raises: - TypeError: If the given artifact type isn't valid. - DataError: If the given artifact already exists in the database. - """ - if not isinstance(artifact_type, ArtifactType): - raise TypeError(f"{artifact_type} isn't a recognized artifact type.") - - if if_artifact_exists(session, database_name, schema_name, artifact_id, artifact_type): - raise connector.DataError( - f"artifact with ID {artifact_id} and TYPE {artifact_type.name} already exists. Unable to add the artifact." - ) - - fully_qualified_table_name = table_manager.get_fully_qualified_table_name( - database_name, schema_name, _initial_schema._ARTIFACT_TABLE_NAME - ) - - new_artifact = { - "ID": artifact_id, - "TYPE": artifact_type.name, - "NAME": artifact_name, - "VERSION": artifact_version, - "CREATION_ROLE": session.get_current_role(), - "CREATION_TIME": formatting.SqlStr("CURRENT_TIMESTAMP()"), - "ARTIFACT_SPEC": artifact_spec, - } - - # TODO: Consider updating the METADATA table for artifact history tracking as well. 
- table_manager.insert_table_entry(session, fully_qualified_table_name, new_artifact) - - -def delete_artifact( - session: snowpark.Session, - database_name: str, - schema_name: str, - artifact_id: str, - artifact_type: ArtifactType, - error_if_not_exist: bool = False, -) -> None: - """ - Remove an artifact record from the designated artifact table. - - Args: - session: Session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - artifact_id: Unique identifier for the artifact to be deleted. - artifact_type: Type of the artifact to be deleted. - error_if_not_exist: Whether to raise errors if the target entry doesn't exist. Default to be false. - - Raises: - DataError: If error_if_not_exist is true and the artifact doesn't exist in the database. - RuntimeError: If the artifact deletion failed. - """ - if error_if_not_exist and not if_artifact_exists(session, database_name, schema_name, artifact_id, artifact_type): - raise connector.DataError( - f"Artifact with ID '{artifact_id}' and TYPE '{artifact_type.name}' doesn't exist. Deletion not possible." - ) - - fully_qualified_table_name = table_manager.get_fully_qualified_table_name( - database_name, schema_name, _initial_schema._ARTIFACT_TABLE_NAME - ) - - delete_query = f"DELETE FROM {fully_qualified_table_name} WHERE ID='{artifact_id}' AND TYPE='{artifact_type.name}'" - - # TODO: Consider updating the METADATA table for artifact history tracking as well. - try: - session.sql(delete_query).collect() - except Exception as e: - raise RuntimeError(f"Delete ML artifact (ID: {artifact_id}, TYPE: {artifact_type.name}) failed due to {e}") - - -def _get_artifact( - session: snowpark.Session, database_name: str, schema_name: str, artifact_id: str, artifact_type: ArtifactType -) -> snowpark.DataFrame: - """Retrieve the Snowpark dataframe of the artifact matching the provided artifact id and type. - - Given that ID and TYPE act as a compound primary key for the artifact table, the resulting dataframe should have, - at most, one row. - - Args: - session: Session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - artifact_id: Unique identifier of the target artifact. - artifact_type: Type of the target artifact - - Returns: - A Snowpark dataframe representing the artifacts that match the given constraints. - - WARNING: - The returned DataFrame is writable and shouldn't be made accessible to users. - """ - full_table_path = table_manager.get_fully_qualified_table_name( - database_name, schema_name, _initial_schema._ARTIFACT_TABLE_NAME - ) - artifacts = session.sql(f"SELECT * FROM {full_table_path}") - target_artifact = artifacts.filter(snowpark.Column("ID") == artifact_id).filter( - snowpark.Column("TYPE") == artifact_type.name - ) - return cast(snowpark.DataFrame, target_artifact) diff --git a/snowflake/ml/registry/_schema.py b/snowflake/ml/registry/_schema.py index 40f01fbe..9b5e1609 100644 --- a/snowflake/ml/registry/_schema.py +++ b/snowflake/ml/registry/_schema.py @@ -4,7 +4,7 @@ # BUMP THIS VERSION WHENEVER YOU CHANGE ANY SCHEMA TABLES. # ALSO UPDATE SCHEMA UPGRADE PLANS. 
-_CURRENT_SCHEMA_VERSION = 2
+_CURRENT_SCHEMA_VERSION = 3
 _REGISTRY_TABLE_SCHEMA: List[Tuple[str, str]] = [
 ("CREATION_CONTEXT", "VARCHAR"),
@@ -52,7 +52,7 @@
 ("VERSION", "VARCHAR"),
 ("CREATION_ROLE", "VARCHAR"),
 ("CREATION_TIME", "TIMESTAMP_TZ"),
- ("ARTIFACT_SPEC", "OBJECT"),
+ ("ARTIFACT_SPEC", "VARCHAR"),
 # Below is out-of-line constraints of Snowflake table.
 # See https://docs.snowflake.com/en/sql-reference/sql/create-table
 ("PRIMARY KEY", "(ID, TYPE) RELY"),
@@ -76,6 +76,7 @@
 # NOTE, all version from _INITIAL_VERSION + 1 till _CURRENT_SCHEMA_VERSION must exists.
 1: _schema_upgrade_plans.AddTrainingDatasetIdIfNotExists,
 2: _schema_upgrade_plans.ReplaceTrainingDatasetIdWithArtifactIds,
+ 3: _schema_upgrade_plans.ChangeArtifactSpecFromObjectToVarchar,
 }
 assert len(_SCHEMA_UPGRADE_PLANS) == _CURRENT_SCHEMA_VERSION - _initial_schema._INITIAL_VERSION
diff --git a/snowflake/ml/registry/_schema_upgrade_plans.py b/snowflake/ml/registry/_schema_upgrade_plans.py
index 93205f8a..fa79e539 100644
--- a/snowflake/ml/registry/_schema_upgrade_plans.py
+++ b/snowflake/ml/registry/_schema_upgrade_plans.py
@@ -80,3 +80,37 @@ def upgrade(self) -> None:
 ADD COLUMN {new_column} ARRAY
 """
 ).collect(statement_params=self._statement_params)
+
+
+class ChangeArtifactSpecFromObjectToVarchar(BaseSchemaUpgradePlans):
+ """Change artifact spec type from object to varchar. It's fine to drop the column as it's empty."""
+
+ def __init__(
+ self,
+ session: snowpark.Session,
+ database_name: str,
+ schema_name: str,
+ statement_params: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ super().__init__(session, database_name, schema_name, statement_params)
+
+ def upgrade(self) -> None:
+ full_schema_path = f"{self._database}.{self._schema}"
+ update_col = "ARTIFACT_SPEC"
+ self._session.sql(
+ f"""ALTER TABLE {full_schema_path}.{_initial_schema._ARTIFACT_TABLE_NAME}
+ DROP COLUMN {update_col}
+ """
+ ).collect(statement_params=self._statement_params)
+
+ self._session.sql(
+ f"""ALTER TABLE {full_schema_path}.{_initial_schema._ARTIFACT_TABLE_NAME}
+ ADD COLUMN {update_col} VARCHAR
+ """
+ ).collect(statement_params=self._statement_params)
+
+ self._session.sql(
+ f"""COMMENT ON COLUMN {full_schema_path}.{_initial_schema._ARTIFACT_TABLE_NAME}.{update_col} IS
+ 'This column is VARCHAR but is supposed to store a valid JSON object'
+ """
+ ).collect(statement_params=self._statement_params)
diff --git a/snowflake/ml/registry/artifact.py b/snowflake/ml/registry/artifact.py
new file mode 100644
index 00000000..f6aff3d5
--- /dev/null
+++ b/snowflake/ml/registry/artifact.py
@@ -0,0 +1,46 @@
+import enum
+from typing import Optional
+
+
+# Set of allowed artifact types.
+class ArtifactType(enum.Enum):
+ TESTTYPE = "TESTTYPE" # A placeholder type just for unit test
+ DATASET = "DATASET"
+
+
+class Artifact:
+ """
+ A reference to an artifact.
+
+ Properties:
+ id: A globally unique id that represents this artifact.
+ spec: Specification of the artifact in JSON format.
+ type: Type of artifact.
+ name: Name of artifact.
+ version: Version of artifact.
+ """
+
+ def __init__(self, type: ArtifactType, spec: str) -> None:
+ """Create an artifact.
+
+ Args:
+ type: Type of artifact.
+ spec: Specification in JSON format.
+ """
+ self.type: ArtifactType = type
+ self.name: Optional[str] = None
+ self.version: Optional[str] = None
+ self._spec: str = spec
+ self._id: Optional[str] = None
+
+ def _log(self, name: str, version: str, id: str) -> None:
+ """Record additional information when this artifact is logged.
+
+ Args:
+ name: Name of artifact.
+ version: Version of artifact.
+ id: A globally unique id that represents this artifact.
+ """
+ self.name = name
+ self.version = version
+ self._id = id
diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py
index b1a1251d..c5c52257 100644
--- a/snowflake/ml/registry/model_registry.py
+++ b/snowflake/ml/registry/model_registry.py
@@ -36,7 +36,12 @@
 model_signature,
 type_hints as model_types,
 )
-from snowflake.ml.registry import _initial_schema, _ml_artifact, _schema_version_manager
+from snowflake.ml.registry import (
+ _artifact_manager,
+ _initial_schema,
+ _schema_version_manager,
+ artifact,
+)
 from snowflake.snowpark._internal import utils as snowpark_utils
 if TYPE_CHECKING:
@@ -231,7 +236,10 @@
 {artifact_table_name}.*
 FROM {registry_table_name}
 LEFT JOIN {artifact_table_name}
- ON (ARRAY_CONTAINS({artifact_table_name}.ID::VARIANT, {registry_table_name}.ARTIFACT_IDS))
+ ON (ARRAY_CONTAINS(
+ {artifact_table_name}.ID::VARIANT,
+ {registry_table_name}.ARTIFACT_IDS)
+ )
 """
 ).collect(statement_params=statement_params)
@@ -313,6 +321,7 @@
 self._artifact_view = identifier.concat_names([self._artifact_table, "_VIEW"])
 self._session = session
 self._svm = _schema_version_manager.SchemaVersionManager(self._session, self._name, self._schema)
+ self._artifact_manager = _artifact_manager.ArtifactManager(self._session, self._name, self._schema)
 # A in-memory deployment info cache to store information of temporary deployments
 # TODO(zhe): Use a temporary table to replace the in-memory cache.
@@ -800,7 +809,7 @@
 output_spec: Optional[Dict[str, str]] = None,
 description: Optional[str] = None,
 tags: Optional[Dict[str, str]] = None,
- dataset: Optional[dataset.Dataset] = None,
+ artifacts: Optional[List[artifact.Artifact]] = None,
 ) -> None:
 """Helper function to register model metadata.
@@ -820,9 +829,10 @@
 description: A description for the model. The description can be changed later.
 tags: Key-value pairs of tags to be set for this model. Tags can be modified
 after model registration.
- dataset: An object contains dataset metadata.
+ artifacts: A list of artifact references.
 Raises:
+ ValueError: A given artifact is not found in the model registry.
 DataError: The given model already exists.
 DatabaseError: Unable to register the model properties into table.
""" @@ -838,24 +848,11 @@ def _register_model_with_id( new_model["CREATION_ROLE"] = self._session.get_current_role() new_model["CREATION_ENVIRONMENT_SPEC"] = {"python": ".".join(map(str, sys.version_info[:3]))} - if dataset is not None: - is_artifact_exists = _ml_artifact.if_artifact_exists( - self._session, self._name, self._schema, dataset.id, _ml_artifact.ArtifactType.DATASET - ) - if not is_artifact_exists: - _ml_artifact.add_artifact( - session=self._session, - database_name=self._name, - schema_name=self._schema, - artifact_id=dataset.id, - artifact_type=_ml_artifact.ArtifactType.DATASET, - artifact_name=dataset.name, - artifact_version=dataset.version, - artifact_spec=json.loads(dataset.to_json()), - ) - new_model["ARTIFACT_IDS"] = [dataset.id] - else: - new_model["ARTIFACT_IDS"] = [] + if artifacts is not None: + for atf in artifacts: + if not self._artifact_manager.exists(atf.name if atf.name is not None else "", atf.version): + raise ValueError(f"Artifact {atf.name}/{atf.version} not found in model registry.") + new_model["ARTIFACT_IDS"] = [art._id for art in artifacts] existing_model_nums = self._list_selected_models(model_name=model_name, model_version=model_version).count() if existing_model_nums: @@ -1266,6 +1263,42 @@ def get_metrics(self, model_name: str, model_version: str) -> Dict[str, object]: else: return dict() + @telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, + ) + @snowpark._internal.utils.private_preview(version="1.0.10") + def log_artifact( + self, + artifact: artifact.Artifact, + name: str, + version: Optional[str] = None, + ) -> artifact.Artifact: + """Upload and register an artifact to the Model Registry. + + Args: + artifact: artifact object. + name: name of artifact. + version: version of artifact. + + Raises: + DataError: Artifact with same name and version already exists. + + Returns: + Return a reference to the artifact. + """ + + if self._artifact_manager.exists(name, version): + raise connector.DataError(f"Artifact {name}/{version} already exists.") + + artifact_id = self._get_new_unique_identifier() + return self._artifact_manager.add( + artifact=artifact, + artifact_id=artifact_id, + artifact_name=name, + artifact_version=version, + ) + # Combined Registry and Repository operations. @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, @@ -1284,7 +1317,7 @@ def log_model( pip_requirements: Optional[List[str]] = None, signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, sample_input_data: Optional[Any] = None, - dataset: Optional[dataset.Dataset] = None, + artifacts: Optional[List[artifact.Artifact]] = None, code_paths: Optional[List[str]] = None, options: Optional[model_types.BaseModelSaveOption] = None, ) -> Optional["ModelReference"]: @@ -1304,18 +1337,21 @@ def log_model( pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip requirements. signatures: Signatures of the model, which is a mapping from target method name to signatures of input and - output, which could be inferred by calling `infer_signature` method with sample input data dataset. - sample_input_data: Sample of the input data for the model. - dataset: A dataset metadata object. + output, which could be inferred by calling `infer_signature` method with sample input data. + sample_input_data: Sample of the input data for the model. If artifacts contains a feature store + generated dataset, then sample_input_data is not needed. 
+                If both sample_input_data and a dataset artifact are provided, sample_input_data
+                will be used to infer the model signature.
+            artifacts: A list of artifact references, as returned by log_artifact().
             code_paths: Directory of code to import when loading and deploying the model.
             options: Additional options when saving the model.

         Raises:
-            DataError: Raised when the given model exists.
-            ValueError: Raised in following cases:
-                1) both sample_input_data and dataset are provided;
-                2) signatures and sample_input_data/dataset are both not provided and
-                model is not a snowflake estimator.
+            DataError: Raised when:
+                1) the given model already exists;
+                2) a given artifact does not exist in this registry.
+            ValueError: Raised when: # noqa: DAR402
+                1) none of signatures, sample_input_data, or a dataset artifact is provided and the
+                model is not a Snowflake estimator.
             Exception: Raised when there is any error raised when saving the model.

         Returns:
@@ -1329,15 +1365,18 @@

         self._model_identifier_is_nonempty_or_raise(model_name, model_version)

-        if sample_input_data is not None and dataset is not None:
-            raise ValueError("Only one of sample_input_data and dataset should be provided.")
+        if artifacts is not None:
+            for atf in artifacts:
+                if not self._artifact_manager.exists(atf.name if atf.name is not None else "", atf.version):
+                    raise connector.DataError(f"Artifact {atf.name}/{atf.version} does not exist.")

-        if dataset is not None:
-            sample_input_data = dataset.df
-            if dataset.timestamp_col is not None:
-                sample_input_data = sample_input_data.drop(dataset.timestamp_col)
-            if dataset.label_cols is not None:
-                sample_input_data = sample_input_data.drop(dataset.label_cols)
+        if sample_input_data is None and artifacts is not None:
+            for atf in artifacts:
+                if atf.type == artifact.ArtifactType.DATASET:
+                    ds = self.get_artifact(atf.name if atf.name is not None else "", atf.version)
+                    assert ds is not None  # existence was validated above
+                    sample_input_data = ds.features_df()
+                    break

         existing_model_nums = self._list_selected_models(model_name=model_name, model_version=model_version).count()
         if existing_model_nums:
@@ -1377,7 +1415,7 @@
             uri=uri.get_uri_from_snowflake_stage_path(model_stage_file_path),
             description=description,
             tags=tags,
-            dataset=dataset,
+            artifacts=artifacts,
         )

         return ModelReference(registry=self, model_name=model_name, model_version=model_version)
@@ -1621,24 +1659,41 @@ def get_deployment(self, model_name: str, model_version: str, *, deployment_name
         project=_TELEMETRY_PROJECT,
         subproject=_TELEMETRY_SUBPROJECT,
     )
-    @snowpark._internal.utils.private_preview(version="1.0.1")
-    def get_dataset(self, model_name: str, model_version: str) -> Optional[dataset.Dataset]:
-        """Get dataset of the model with the given (model name + model version).
+    @snowpark._internal.utils.private_preview(version="1.0.11")
+    def get_artifact(self, name: str, version: Optional[str] = None) -> Optional[artifact.Artifact]:
+        """Get the artifact with the given (name, version).

         Args:
-            model_name: Model Name string.
-            model_version: Model Version string.
+            name: Name of the artifact.
+            version: Version of the artifact.

         Returns:
-            Dataset of the model or none if not found.
+            A reference to the artifact if found, otherwise None.
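+
+        Example (an illustrative sketch added for clarity, not part of the original
+        change; assumes an artifact named "MY_TRAINING_DATA" with version "v1" was
+        previously registered via log_artifact()):
+
+            >>> atf = registry.get_artifact(name="MY_TRAINING_DATA", version="v1")
+            >>> if atf is not None:
+            ...     print(atf.type, atf.name, atf.version)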
""" - artifacts = ( - self.list_artifacts(model_name, model_version) - .filter(snowpark.Column("TYPE") == _ml_artifact.ArtifactType.DATASET.value) - .collect() - ) + artifacts = self._artifact_manager.get( + name, + version, + ).collect() + + if len(artifacts) == 0: + return None + + atf = artifacts[0] + if atf["TYPE"] == artifact.ArtifactType.DATASET.value: + ds = dataset.Dataset.from_json(atf["ARTIFACT_SPEC"], self._session) + ds._log(name=atf["NAME"], version=atf["VERSION"], id=atf["ID"]) + return ds - return dataset.Dataset.from_json(artifacts[0]["ARTIFACT_SPEC"], self._session) if len(artifacts) != 0 else None + assert f"Unrecognized artifact type: {atf['TYPE']}" + return None @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, @@ -2019,9 +2066,10 @@ def create_model_registry( statement_params, ) finally: - # Restore the db & schema to the original ones - if old_db is not None and old_db != session.get_current_database(): - session.use_database(old_db) - if old_schema is not None and old_schema != session.get_current_schema(): - session.use_schema(old_schema) + if not snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] + # Restore the db & schema to the original ones + if old_db is not None and old_db != session.get_current_database(): + session.use_database(old_db) + if old_schema is not None and old_schema != session.get_current_schema(): + session.use_schema(old_schema) return True diff --git a/snowflake/ml/registry/model_registry_test.py b/snowflake/ml/registry/model_registry_test.py index a3d3fd35..2487b95e 100644 --- a/snowflake/ml/registry/model_registry_test.py +++ b/snowflake/ml/registry/model_registry_test.py @@ -564,7 +564,9 @@ def setup_create_views_call(self) -> None: {_ARTIFACTS_TABLE_NAME}.* FROM {_REGISTRY_TABLE_NAME} LEFT JOIN {_ARTIFACTS_TABLE_NAME} - ON (ARRAY_CONTAINS({_ARTIFACTS_TABLE_NAME}.ID::VARIANT, {_REGISTRY_TABLE_NAME}.ARTIFACT_IDS)) + ON (ARRAY_CONTAINS( + {_ARTIFACTS_TABLE_NAME}.ID::VARIANT, + {_REGISTRY_TABLE_NAME}.ARTIFACT_IDS)) """ ), result=mock_data_frame.MockDataFrame( @@ -614,6 +616,23 @@ def setup_schema_upgrade_calls(self, statement_params: Dict[str, str]) -> None: query=(f"ALTER TABLE {reg_table_full_path} ADD COLUMN ARTIFACT_IDS ARRAY"), result=mock_data_frame.MockDataFrame([snowpark.Row(status="Statement executed successfully.")]), ) + art_table_full_path = f"{_DATABASE_NAME}.{_SCHEMA_NAME}.{_ARTIFACTS_TABLE_NAME}" + self.add_session_mock_sql( + query=(f"ALTER TABLE {art_table_full_path} DROP COLUMN ARTIFACT_SPEC"), + result=mock_data_frame.MockDataFrame([snowpark.Row(status="Statement executed successfully.")]), + ) + self.add_session_mock_sql( + query=(f"ALTER TABLE {art_table_full_path} ADD COLUMN ARTIFACT_SPEC VARCHAR"), + result=mock_data_frame.MockDataFrame([snowpark.Row(status="Statement executed successfully.")]), + ) + self.add_session_mock_sql( + query=( + f"""COMMENT ON COLUMN {art_table_full_path}.ARTIFACT_SPEC IS + 'This column is VARCHAR but supposed to store a valid JSON object'""" + ), + result=mock_data_frame.MockDataFrame([snowpark.Row(status="Statement executed successfully.")]), + ) + # end schema upgrade plans self._mock_desc_registry_table(statement_params) self._mock_desc_metadata_table(statement_params) @@ -1150,7 +1169,7 @@ def test_log_model(self) -> None: uri=uri.get_uri_from_snowflake_stage_path(model_path), description="description", tags=None, - dataset=None, + artifacts=None, ) self._mock_show_version_table_exists({}) diff --git a/snowflake/ml/registry/notebooks/Deployment to 
Snowpark Container Service Demo.ipynb b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb index 84526ff8..bdd99926 100644 --- a/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb +++ b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb @@ -1,540 +1,382 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "a45960e1", - "metadata": {}, - "source": [ - "# Deployment to Snowpark Container Service Demo" - ] - }, - { - "cell_type": "markdown", - "id": "aa7a329a", - "metadata": {}, - "source": [ - "## Prerequisite\n", - "\n", - "- Install and have a running Docker Client (required only for PrPr for client-side image build)" - ] - }, - { - "cell_type": "markdown", - "id": "3b50d774", - "metadata": {}, - "source": [ - "## Train a model with Snowpark ML API " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "18a75d71", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple\n", - "from snowflake.ml.modeling import linear_model\n", - "from sklearn import datasets\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "def prepare_logistic_model() -> Tuple[linear_model.LogisticRegression, pd.DataFrame]:\n", - " iris = datasets.load_iris()\n", - " df = pd.DataFrame(data=np.c_[iris[\"data\"], iris[\"target\"]], columns=iris[\"feature_names\"] + [\"target\"])\n", - " df.columns = [s.replace(\" (CM)\", \"\").replace(\" \", \"\") for s in df.columns.str.upper()]\n", - "\n", - " input_cols = [\"SEPALLENGTH\", \"SEPALWIDTH\", \"PETALLENGTH\", \"PETALWIDTH\"]\n", - " label_cols = \"TARGET\"\n", - " output_cols = \"PREDICTED_TARGET\"\n", - "\n", - " estimator = linear_model.LogisticRegression(\n", - " input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0, max_iter=1000\n", - " ).fit(df)\n", - "\n", - " return estimator, df.drop(columns=label_cols).head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "db6734fa", - "metadata": {}, - "source": [ - "## Start Snowpark Session" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "58dd3604", - "metadata": {}, - "outputs": [], - "source": [ - "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", - "from snowflake.snowpark import Session\n", - "\n", - "session = Session.builder.configs(SnowflakeLoginOptions()).create()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27dfbc42", - "metadata": {}, - "outputs": [], - "source": [ - "from snowflake.ml.registry import model_registry\n", - "from snowflake.ml._internal.utils import identifier\n", - "\n", - "db = identifier._get_unescaped_name(session.get_current_database())\n", - "schema = identifier._get_unescaped_name(session.get_current_schema())\n", - "\n", - "# will be a no-op if registry already exists\n", - "model_registry.create_model_registry(session=session, database_name=db, schema_name=schema) \n", - "registry = model_registry.ModelRegistry(session=session, database_name=db, schema_name=schema)" - ] - }, - { - "cell_type": "markdown", - "id": "38e0a975", - "metadata": {}, - "source": [ - "## Register SnowML Model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "574e7a43", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. 
\n", - "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. \n" - ] - }, - { - "data": { - "text/plain": [ - "'0aa236602be711ee89915ac3f3b698e1'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logistic_model, test_features = prepare_logistic_model()\n", - "model_name = \"snowpark_ml_logistic\"\n", - "model_version = \"v1\"\n", - "\n", - "model_ref = registry.log_model(\n", - " model_name=model_name,\n", - " model_version=model_version,\n", - " model=logistic_model,\n", - " sample_input_data=test_features,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "054a3862", - "metadata": {}, - "source": [ - "## Model Deployment to Snowpark Container Service" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "72ff114f", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", - "WARNING:root:Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/42374efe274011eea4ff5ac3f3b698e1:latest' in the options field of the deploy() function\n" - ] - } - ], - "source": [ - "from snowflake.ml.model import deploy_platforms\n", - "from snowflake import snowpark\n", - "\n", - "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created compute pool\n", - "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", - "\n", - "model_ref.deploy(\n", - " deployment_name=deployment_name, \n", - " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", - " target_method=\"predict\",\n", - " options={\n", - " \"compute_pool\": compute_pool\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1c754e72", - "metadata": {}, - "source": [ - "## Batch Prediction on Snowpark Container Service" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a5c02328", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SEPALLENGTHSEPALWIDTHPETALLENGTHPETALWIDTHPREDICTED_TARGET
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
55.43.91.70.40.0
64.63.41.40.30.0
75.03.41.50.20.0
84.42.91.40.20.0
94.93.11.50.10.0
\n", - "
" - ], - "text/plain": [ - " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH PREDICTED_TARGET\n", - "0 5.1 3.5 1.4 0.2 0.0\n", - "1 4.9 3.0 1.4 0.2 0.0\n", - "2 4.7 3.2 1.3 0.2 0.0\n", - "3 4.6 3.1 1.5 0.2 0.0\n", - "4 5.0 3.6 1.4 0.2 0.0\n", - "5 5.4 3.9 1.7 0.4 0.0\n", - "6 4.6 3.4 1.4 0.3 0.0\n", - "7 5.0 3.4 1.5 0.2 0.0\n", - "8 4.4 2.9 1.4 0.2 0.0\n", - "9 4.9 3.1 1.5 0.1 0.0" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_ref.predict(deployment_name, test_features)" - ] - }, - { - "cell_type": "markdown", - "id": "9f8c6ce5", - "metadata": {}, - "source": [ - "## Train a HuggingFace Model (cross-encoder/nli-MiniLM2-L6-H768)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "809d5e98", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import pipeline\n", - "from snowflake.ml.model import custom_model\n", - "\n", - "def prepare_cross_encoder_model() -> Tuple[custom_model.CustomModel, pd.DataFrame]:\n", - " \"\"\"\n", - " Pretrained cross encoder model from huggingface.\n", - " \"\"\"\n", - " classifier = pipeline(\"zero-shot-classification\", model='cross-encoder/nli-MiniLM2-L6-H768') \n", - " candidate_labels = ['customer support', 'product experience', 'account issues']\n", - "\n", - " class HuggingFaceModel(custom_model.CustomModel):\n", - " def __init__(self, context: custom_model.ModelContext) -> None:\n", - " super().__init__(context)\n", - " \n", - " @custom_model.inference_api\n", - " def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: \n", - " sequences_to_classify = input_df.values.flatten().tolist()\n", - " data = [classifier(sequence, candidate_labels) for sequence in sequences_to_classify]\n", - " max_score_labels = []\n", - " for record in data:\n", - " max_score_label = max(zip(record['labels'], record['scores']), key=lambda x: x[1])[0]\n", - " max_score_labels.append(max_score_label) \n", - " return pd.DataFrame({\"output\": max_score_labels})\n", - "\n", - " cross_encoder_model = HuggingFaceModel(custom_model.ModelContext())\n", - " test_data = pd.DataFrame([\"The interface gets frozen very often\"])\n", - "\n", - " return cross_encoder_model, test_data" - ] - }, - { - "cell_type": "markdown", - "id": "67d6a7d2", - "metadata": {}, - "source": [ - "## Register Cross Encoder Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9dd84f88", - "metadata": {}, - "outputs": [], - "source": [ - "from snowflake.ml.registry import model_registry\n", - "\n", - "model, test_features = prepare_cross_encoder_model()\n", - "model_name = \"cross_encoder_model\"\n", - "model_version = \"v1\"\n", - "\n", - "model_ref = registry.log_model(\n", - " model_name=model_name,\n", - " model_version=model_version,\n", - " model=model,\n", - " sample_input_data=test_features,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c6db686e", - "metadata": {}, - "source": [ - "## Model Deployment to Snowpark Container Service (GPU)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "701152f7", - "metadata": {}, - "outputs": [], - "source": [ - "from snowflake.ml.model import deploy_platforms\n", - "\n", - "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created\n", - "deployment_name = \"CROSS_ENCODER\" # Name of the resulting UDF\n", - "\n", - "model_ref.deploy(\n", - " deployment_name=deployment_name, \n", - " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", - " target_method=\"predict\",\n", - " options={\n", - " 
\"compute_pool\": compute_pool,\n", - " \"num_gpus\": 1\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "7b0fba61", - "metadata": {}, - "source": [ - "## Zero-Shot Classification" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "936840df", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " input_feature_0\n", - "0 The interface gets frozen very often\n" - ] - } - ], - "source": [ - "print(test_features)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "302daaf9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
output
0product experience
\n", - "
" - ], - "text/plain": [ - " output\n", - "0 product experience" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_ref.predict(deployment_name, test_features)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:local_snowml] *", - "language": "python", - "name": "conda-env-local_snowml-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.17" - } + "cells": [ + { + "cell_type": "markdown", + "id": "a45960e1", + "metadata": {}, + "source": [ + "# Deployment to Snowpark Container Service Demo" + ] + }, + { + "cell_type": "markdown", + "id": "aa7a329a", + "metadata": {}, + "source": [ + "### Snowflake-ML-Python Installation" + ] + }, + { + "cell_type": "markdown", + "id": "cb3d7a96", + "metadata": {}, + "source": [ + "- Please refer to our [landing page](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index) to install `snowflake-ml-python`." + ] + }, + { + "cell_type": "markdown", + "id": "3b50d774", + "metadata": {}, + "source": [ + "## Train a model with Snowpark ML API " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "18a75d71", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "from snowflake.ml.modeling import linear_model\n", + "from sklearn import datasets\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "def prepare_logistic_model() -> Tuple[linear_model.LogisticRegression, pd.DataFrame]:\n", + " iris = datasets.load_iris()\n", + " df = pd.DataFrame(data=np.c_[iris[\"data\"], iris[\"target\"]], columns=iris[\"feature_names\"] + [\"target\"])\n", + " df.columns = [s.replace(\" (CM)\", \"\").replace(\" \", \"\") for s in df.columns.str.upper()]\n", + "\n", + " input_cols = [\"SEPALLENGTH\", \"SEPALWIDTH\", \"PETALLENGTH\", \"PETALWIDTH\"]\n", + " label_cols = \"TARGET\"\n", + " output_cols = \"PREDICTED_TARGET\"\n", + "\n", + " estimator = linear_model.LogisticRegression(\n", + " input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0, max_iter=1000\n", + " ).fit(df)\n", + "\n", + " return estimator, df.drop(columns=label_cols).head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "db6734fa", + "metadata": {}, + "source": [ + "## Start Snowpark Session" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "58dd3604", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", + "from snowflake.snowpark import Session\n", + "\n", + "session = Session.builder.configs(SnowflakeLoginOptions()).create()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27dfbc42", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "from snowflake.ml._internal.utils import identifier\n", + "\n", + "db = identifier._get_unescaped_name(session.get_current_database())\n", + "schema = identifier._get_unescaped_name(session.get_current_schema())\n", + "\n", + "# will be a no-op if registry already exists\n", + "model_registry.create_model_registry(session=session, database_name=db, schema_name=schema) \n", + "registry = model_registry.ModelRegistry(session=session, database_name=db, schema_name=schema)" + ] + }, + { + "cell_type": 
"markdown", + "id": "38e0a975", + "metadata": {}, + "source": [ + "## Register SnowML Model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "574e7a43", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. \n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/plain": [ + "'0aa236602be711ee89915ac3f3b698e1'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logistic_model, test_features = prepare_logistic_model()\n", + "model_name = \"snowpark_ml_logistic\"\n", + "model_version = \"v1\"\n", + "\n", + "model_ref = registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=logistic_model,\n", + " sample_input_data=test_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "054a3862", + "metadata": {}, + "source": [ + "## Model Deployment to Snowpark Container Service" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72ff114f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", + "WARNING:root:Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/42374efe274011eea4ff5ac3f3b698e1:latest' in the options field of the deploy() function\n" + ] + } + ], + "source": [ + "from snowflake.ml.model import deploy_platforms\n", + "from snowflake import snowpark\n", + "\n", + "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created compute pool\n", + "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", + "\n", + "model_ref.deploy(\n", + " deployment_name=deployment_name, \n", + " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", + " target_method=\"predict\",\n", + " options={\n", + " \"compute_pool\": compute_pool,\n", + " #num_gpus: 1 # Specify the number of GPUs for GPU inferenc\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1c754e72", + "metadata": {}, + "source": [ + "## Batch Prediction on Snowpark Container Service" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a5c02328", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SEPALLENGTHSEPALWIDTHPETALLENGTHPETALWIDTHPREDICTED_TARGET
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
55.43.91.70.40.0
64.63.41.40.30.0
75.03.41.50.20.0
84.42.91.40.20.0
94.93.11.50.10.0
\n", + "
" + ], + "text/plain": [ + " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_ref.predict(deployment_name, test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12991f07", + "metadata": {}, + "outputs": [], + "source": [ + "model_ref.delete_deployment(deployment_name=deployment_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09f337d2", + "metadata": {}, + "outputs": [], + "source": [ + "model_ref.delete_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:local_snowml]", + "language": "python", + "name": "conda-env-local_snowml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb b/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb new file mode 100644 index 00000000..6037a7d3 --- /dev/null +++ b/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb @@ -0,0 +1,423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fa0e355f", + "metadata": {}, + "source": [ + "1. Create a conda python3.8 conda env\n", + "`conda create --name snowml python=3.8`\n", + "\n", + "2. You need to install these packages locally\n", + " * peft \n", + " * transformers\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ed66db9", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install /Users/halu/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-1.0.10-py3-none-any.whl" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "292e9f48", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7585077b", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.snowpark import Session\n", + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions" + ] + }, + { + "cell_type": "markdown", + "id": "7a0294ba", + "metadata": {}, + "source": [ + "Connection config available at ~/.snowsql/config" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f876232e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 
\n" + ] + } + ], + "source": [ + "session = Session.builder.configs(SnowflakeLoginOptions('connections.demo')).create()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c6aee8c9", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('\"HALU_FT\"', '\"PUBLIC\"')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.get_current_database(), session.get_current_schema()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "72c16c14", + "metadata": {}, + "outputs": [], + "source": [ + "REGISTRY_DATABASE_NAME = \"HALU_MR\"\n", + "REGISTRY_SCHEMA_NAME = \"PUBLIC\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c420807b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:absl:The schema HALU_MR.PUBLIC already exists. Skipping creation.\n" + ] + } + ], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "\n", + "model_registry.create_model_registry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0adc9637", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model.models import llm" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "18323af6", + "metadata": {}, + "outputs": [], + "source": [ + "options = llm.LLMOptions(token=\"....\")\n", + "model = llm.LLM(\n", + " model_id_or_path=\"/Users/halu/Downloads/halu_peft_ft\",\n", + " options=options\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dac3fc56", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. \n" + ] + } + ], + "source": [ + "svc_model = registry.log_model(\n", + " model_name='halu_ft_model_1',\n", + " model_version='v1',\n", + " model=model,\n", + " options={\"embed_local_ml_library\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b17b1fbb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Similar environment detected. Using existing image sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/4b7980a43a1ff656d23b9401e4471bcd4f021d39:latest to skip image build. 
To disable this feature, set 'force_image_build=True' in deployment options\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'name': 'HALU_MR.PUBLIC.halu_ft_deploy_1',\n",
+       " 'platform': <TargetPlatform.SNOWPARK_CONTAINER_SERVICES: 'SNOWPARK_CONTAINER_SERVICES'>,\n",
+       " 'target_method': 'infer',\n",
+       " 'signature': ModelSignature(\n",
+       "                    inputs=[\n",
+       "                        FeatureSpec(dtype=DataType.STRING, name='input')\n",
+       "                    ],\n",
+       "                    outputs=[\n",
+       "                        FeatureSpec(dtype=DataType.STRING, name='generated_text')\n",
+       "                    ]\n",
+       "                ),\n",
+       " 'options': {'compute_pool': 'BUILD_2023_POOL',\n",
+       "  'num_gpus': 1,\n",
+       "  'image_repo': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo',\n",
+       "  'enable_remote_image_build': True,\n",
+       "  'model_in_image': True},\n",
+       " 'details': {'image_name': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/4b7980a43a1ff656d23b9401e4471bcd4f021d39:latest',\n",
+       "  'service_spec': 'spec:\\n  container:\\n  - env:\\n      NUM_WORKERS: 1\\n      SNOWML_USE_GPU: true\\n      TARGET_METHOD: infer\\n    image: sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/4b7980a43a1ff656d23b9401e4471bcd4f021d39:latest\\n    name: inference-server\\n    readinessProbe:\\n      path: /health\\n      port: 5000\\n    resources:\\n      limits:\\n        nvidia.com/gpu: 1\\n      requests:\\n        nvidia.com/gpu: 1\\n    volumeMounts:\\n    - mountPath: /local/user/vol1\\n      name: vol1\\n  endpoint:\\n  - name: predict\\n    port: 5000\\n  volume:\\n  - name: vol1\\n    source: local\\n',\n",
+       "  'service_function_sql': \"\\n        CREATE OR REPLACE FUNCTION HALU_MR.PUBLIC.halu_ft_deploy_1(input OBJECT)\\n            RETURNS OBJECT\\n            SERVICE=HALU_MR.PUBLIC.service_d289e6506e3111eeb21b769aea86b514\\n            ENDPOINT=predict\\n            MAX_BATCH_ROWS = 1\\n            AS '/predict'\\n        \"}}"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from snowflake.ml.model import deploy_platforms\n",
+    "\n",
+    "deployment_options = {\n",
+    "    \"compute_pool\": 'BUILD_2023_POOL',\n",
+    "    \"num_gpus\": 1,\n",
+    "    \"image_repo\": 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo',\n",
+    "    \"enable_remote_image_build\": True,\n",
+    "    \"model_in_image\": True,\n",
+    "}\n",
+    " \n",
+    "deploy_info = svc_model.deploy(\n",
+    "    deployment_name=\"halu_ft_deploy_1\",\n",
+    "    platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n",
+    "    permanent=True,\n",
+    "    options=deployment_options\n",
+    ")\n",
+    "deploy_info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "b25baf1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = \"\"\"\n",
+    "[INST] <<SYS>>\n",
+    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
+    "<</SYS>>\n",
+    "### Instruction:\n",
+    "Extract JSON response with 'location' and 'toy_list' as keys.\n",
+    "'location': Location string of the caller.\n",
+    "'toy_list': List of toy names from the caller.\n",
+    "### Input:\n",
+    " \"frosty: Hello, good friend! You're talking to Frosty! What's your name?\n",
+    "caller: My name's Oliver. And I'm calling from Perth.\n",
+    "frosty: Nice to meet you, Oliver from Perth! 
So, what's on your wish list this year?\n", + "caller: I want a mickey, please.\n", + "frosty: Look forward to some Mickey adventures!\"\n", + "[/INST]\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2a84b44b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "db0e8d17", + "metadata": {}, + "outputs": [], + "source": [ + "input_df = pd.DataFrame({'input': [sample, sample, sample]})" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "3a98eabd", + "metadata": {}, + "outputs": [], + "source": [ + "res = svc_model.predict(\n", + " deployment_name='halu_ft_deploy_1',\n", + " data=input_df\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f32e6498", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a467afb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
generated_text
0{\"toy_list\": [\"Fisher-Price Little People Mickey and Friends\"], \"location\": \"Perth\"}
1{\"toy_list\": [\"Fisher-Price Little People Mickey and Friends\"], \"location\": \"Perth\"}
2{\"toy_list\": [\"Fisher-Price Little People Mickey and Friends\"], \"location\": \"Perth\"}
\n", + "
" + ], + "text/plain": [ + " generated_text\n", + "0 {\"toy_list\": [\"Fisher-Price Little People Mickey and Friends\"], \"location\": \"Perth\"}\n", + "1 {\"toy_list\": [\"Fisher-Price Little People Mickey and Friends\"], \"location\": \"Perth\"}\n", + "2 {\"toy_list\": [\"Fisher-Price Little People Mickey and Friends\"], \"location\": \"Perth\"}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb index 21e2bca2..3a4f1e59 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "5de3eb26", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "197efd00", "metadata": {}, @@ -19,25 +17,63 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "6ce97b36", "metadata": {}, "source": [ - "### Install `snowflake-ml-python` locally" + "### Snowflake-ML-Python Installation" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1117c596", "metadata": {}, "source": [ - "Please refer to our [landing page](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index) to install `snowflake-ml-python`." + "- Please refer to our [landing page](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index) to install `snowflake-ml-python`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7ed8032a", + "metadata": {}, + "source": [ + "### Local Installation" + ] + }, + { + "cell_type": "markdown", + "id": "741c249e", + "metadata": {}, + "source": [ + "- transformers>=4.31.0 (For GPT-2 and LLAMA 2 model inference example)\n", + "- tokenizers>=0.13.3 (For LLAMA 2 model inference example)\n", + "- tensorflow (For GPT-2 Example)\n", + "- xgboost==1.7.6 (For XGBoost GPU inference example)" + ] + }, + { + "cell_type": "markdown", + "id": "2bde8397", + "metadata": {}, + "source": [ + "### Additional Requirements" + ] + }, + { + "cell_type": "markdown", + "id": "2647a880", + "metadata": {}, + "source": [ + "- SPCS compute pool with at least 1 GPU (For all GPU inference on SPCS examples below)\n", + "\n", + "- Requested access to use LLama 2 model through HuggingFace (For LLAMA 2 model inference example)\n", + "\n", + "- A HuggingFace token with read access (For LLAMA 2 model inference example)\n", + "\n", + "- Download the News Category Dataset from https://www.kaggle.com/datasets/rmisra/news-category-dataset (For LLAMA 2 model inference example)" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "99e58d8c", "metadata": {}, @@ -83,7 +119,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1ac32c6f", "metadata": {}, @@ -119,7 +154,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "dfa9ab88", "metadata": {}, @@ -128,7 +162,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b0a0c8a8", "metadata": {}, @@ -165,7 +198,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d76e14a1", "metadata": {}, @@ -174,7 +206,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c592d46c", "metadata": {}, @@ -183,7 +214,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "378eb3ba", "metadata": {}, @@ -236,7 +266,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "dda57d0b", "metadata": {}, @@ -256,7 +285,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "317e7843", "metadata": {}, @@ -265,7 +293,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "3b482561", "metadata": {}, @@ -309,7 +336,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "45c75e28", "metadata": {}, @@ -318,7 +344,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a8d496db", "metadata": {}, @@ -356,7 +381,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "6c1f3c07", "metadata": {}, @@ -392,7 +416,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "dc2e2f5e", "metadata": {}, @@ -401,17 +424,6 @@ ] }, { - "attachments": {}, - "cell_type": "markdown", - "id": "f2224cc7", - "metadata": {}, - "source": [ - "Requirements:\n", - "- `transformers` and `tensorflow` installed locally." 
- ] - }, - { - "attachments": {}, "cell_type": "markdown", "id": "9bc58b66", "metadata": {}, @@ -434,7 +446,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "03454cba", "metadata": {}, @@ -469,7 +480,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "333118b7", "metadata": {}, @@ -530,7 +540,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "e111b527", "metadata": {}, @@ -539,7 +548,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c27ed16a", "metadata": {}, @@ -582,7 +590,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "e634f4c1", "metadata": {}, @@ -591,7 +598,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "fc0f289d", "metadata": {}, @@ -623,7 +629,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b44a55b7", "metadata": {}, @@ -632,7 +637,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "05e45630", "metadata": {}, @@ -657,7 +661,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "771cad94", "metadata": {}, @@ -742,7 +745,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d4d25ee7", "metadata": {}, @@ -779,7 +781,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2e9446fc", "metadata": {}, @@ -803,7 +804,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5948b7c8", "metadata": {}, @@ -824,7 +824,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "e560bd8d", "metadata": {}, @@ -844,7 +843,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "08614b16", "metadata": {}, @@ -866,7 +864,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d1e99456", "metadata": {}, @@ -906,7 +903,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "6b4eabe1", "metadata": {}, @@ -915,7 +911,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "be5ecdb5", "metadata": {}, @@ -941,16 +936,6 @@ "### Deploy to SPCS and using GPU for inference" ] }, - { - "cell_type": "markdown", - "id": "08bce3c3", - "metadata": {}, - "source": [ - "Requirements:\n", - "- `xgboost==1.7.6` installed locally.\n", - "- a SPCS compute pool with at least 1 GPU." 
- ] - }, { "cell_type": "code", "execution_count": null, @@ -998,18 +983,6 @@ "## Using LLM with HuggingFace Pipeline" ] }, - { - "cell_type": "markdown", - "id": "cd99cd28", - "metadata": {}, - "source": [ - "Requirements:\n", - "- `transformers>=4.31.0` and `tokenizers>=0.13.3` installed locally.\n", - "- a HuggingFace token with read access.\n", - "- a SPCS compute pool with at least 1 GPU.\n", - "- News Category Dataset from https://www.kaggle.com/datasets/rmisra/news-category-dataset" - ] - }, { "cell_type": "markdown", "id": "07bb4d94", @@ -1101,6 +1074,7 @@ " token=\"...\", # Put your HuggingFace token here.\n", " return_full_text=False,\n", " max_new_tokens=100,\n", + " batch_size=1,\n", ")" ] }, @@ -1153,7 +1127,6 @@ " options={\n", " \"compute_pool\": \"...\",\n", " \"num_gpus\": 1,\n", - " \"enable_remote_image_build\": True,\n", " },\n", ")" ] @@ -1400,7 +1373,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1414,7 +1387,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.8.12" }, "vscode": { "interpreter": { diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl index c8c545d1..618e7401 100755 --- a/snowflake/ml/requirements.bzl +++ b/snowflake/ml/requirements.bzl @@ -1,6 +1,6 @@ # DO NOT EDIT! # Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' -EXTRA_REQUIREMENTS = {"all": ["lightgbm==3.3.5", "mlflow>=2.1.0,<2.4", "sentencepiece>=0.1.95,<0.2", "shap==0.42.1", "tensorflow>=2.9,<3", "tokenizers>=0.10,<1", "torchdata>=0.4,<1", "transformers>=4.29.2,<5"], "lightgbm": ["lightgbm==3.3.5"], "mlflow": ["mlflow>=2.1.0,<2.4"], "shap": ["shap==0.42.1"], "tensorflow": ["tensorflow>=2.9,<3"], "torch": ["torchdata>=0.4,<1"], "transformers": ["sentencepiece>=0.1.95,<0.2", "tokenizers>=0.10,<1", "transformers>=4.29.2,<5"]} +EXTRA_REQUIREMENTS = {"all": ["lightgbm==3.3.5", "mlflow>=2.1.0,<2.4", "peft>=0.5.0,<1", "sentencepiece>=0.1.95,<0.2", "shap==0.42.1", "tensorflow>=2.9,<3,!=2.12.0", "tokenizers>=0.10,<1", "torchdata>=0.4,<1", "transformers>=4.29.2,<5"], "lightgbm": ["lightgbm==3.3.5"], "llm": ["peft>=0.5.0,<1"], "mlflow": ["mlflow>=2.1.0,<2.4"], "shap": ["shap==0.42.1"], "tensorflow": ["tensorflow>=2.9,<3,!=2.12.0"], "torch": ["torchdata>=0.4,<1"], "transformers": ["sentencepiece>=0.1.95,<0.2", "tokenizers>=0.10,<1", "transformers>=4.29.2,<5"]} REQUIREMENTS = ["absl-py>=0.15,<2", "anyio>=3.5.0,<4", "cachetools>=3.1.1,<5", "cloudpickle>=2.0.0", "fsspec[http]>=2022.11,<2024", "numpy>=1.23,<2", "packaging>=20.9,<24", "pandas>=1.0.0,<2", "pytimeparse>=1.1.8,<2", "pyyaml>=6.0,<7", "s3fs>=2022.11,<2024", "scikit-learn>=1.2.1,<1.4", "scipy>=1.9,<2", "snowflake-connector-python[pandas]>=3.0.4,<4", "snowflake-snowpark-python>=1.5.1,<2", "sqlparse>=0.4,<1", "typing-extensions>=4.1.0,<5", "xgboost>=1.7.3,<2"] diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index cdeb4317..9dfd36a1 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. 
-VERSION = "1.0.10" +VERSION = "1.0.11" diff --git a/tests/conftest.py b/tests/conftest.py index 2b7d7b96..959a94e7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,8 @@ +import inspect import os from unittest import mock +import cloudpickle as cp import pytest from snowflake.snowpark._internal.utils import TempObjectType @@ -17,4 +19,5 @@ def random_name_for_temp_object_mock(): with mock.patch( "snowflake.ml.modeling._internal.snowpark_handlers.random_name_for_temp_object", _random_name_for_temp_object ) as _fixture: + cp.register_pickle_by_value(inspect.getmodule(_random_name_for_temp_object)) yield _fixture diff --git a/tests/integ/snowflake/ml/_internal/BUILD.bazel b/tests/integ/snowflake/ml/_internal/BUILD.bazel index 9acf5aa9..dd7be706 100644 --- a/tests/integ/snowflake/ml/_internal/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/BUILD.bazel @@ -71,3 +71,16 @@ py_test( "//snowflake/ml/utils:connection_params", ], ) + +py_test( + name = "search_single_node_test", + srcs = ["search_single_node_test.py"], + shard_count = 4, + deps = [ + "//snowflake/ml/modeling/_internal:estimator_utils", + "//snowflake/ml/modeling/model_selection/_internal:_grid_search_cv", + "//snowflake/ml/modeling/model_selection/_internal:_randomized_search_cv", + "//snowflake/ml/modeling/xgboost:xgb_classifier", + "//snowflake/ml/utils:connection_params", + ], +) diff --git a/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py b/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py index 65b85be4..a2bd5ad0 100644 --- a/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py +++ b/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py @@ -1,3 +1,5 @@ +from unittest import mock + import inflection import numpy as np from absl.testing.absltest import TestCase, main @@ -35,7 +37,9 @@ def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-1, atol=1.0e-2) # Do not compare the fit time - def test_fit_and_compare_results(self) -> None: + @mock.patch("snowflake.ml.modeling.model_selection._internal._grid_search_cv.if_single_node") + def test_fit_and_compare_results(self, mock_if_single_node) -> None: + mock_if_single_node.return_value = True # falls back to HPO implementation input_df_pandas = load_diabetes(as_frame=True).frame input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] @@ -56,13 +60,20 @@ def test_fit_and_compare_results(self) -> None: actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) + # the result of SnowML grid search cv should behave the same as sklearn's assert reg._sklearn_object.best_params_ == sklearn_reg.best_params_ np.testing.assert_allclose(reg._sklearn_object.best_score_, sklearn_reg.best_score_) self._compare_cv_results(reg._sklearn_object.cv_results_, sklearn_reg.cv_results_) np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) - def test_fit_xgboost(self) -> None: + # Test on fitting on snowpark Dataframe, and predict on pandas dataframe + actual_arr_pd = reg.predict(input_df.to_pandas()).sort_values(by="INDEX")[output_cols].to_numpy() + np.testing.assert_allclose(actual_arr_pd.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + + 
@mock.patch("snowflake.ml.modeling.model_selection._internal._grid_search_cv.if_single_node") + def test_fit_xgboost(self, mock_if_single_node) -> None: + mock_if_single_node.return_value = True # falls back to HPO implementation input_df_pandas = load_iris(as_frame=True).frame input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] diff --git a/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py b/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py index 6eff1f9c..eac60515 100644 --- a/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py +++ b/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py @@ -1,3 +1,5 @@ +from unittest import mock + import inflection import numpy as np from absl.testing.absltest import TestCase, main @@ -34,7 +36,9 @@ def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-1, atol=1.0e-2) # Do not compare the fit time - def test_fit_and_compare_results(self) -> None: + @mock.patch("snowflake.ml.modeling.model_selection._internal._randomized_search_cv.if_single_node") + def test_fit_and_compare_results(self, mock_if_single_node) -> None: + mock_if_single_node.return_value = True # falls back to HPO implementation input_df_pandas = load_iris(as_frame=True).frame input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] @@ -67,12 +71,19 @@ def test_fit_and_compare_results(self) -> None: actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) - np.testing.assert_allclose(reg._sklearn_object.best_score_, sklearn_reg.best_score_) - assert reg._sklearn_object.best_params_ == sklearn_reg.best_params_ - self._compare_cv_results(reg._sklearn_object.cv_results_, sklearn_reg.cv_results_) + sk_obj = reg.to_sklearn() + + # the result of SnowML grid search cv should behave the same as sklearn's + np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) + assert sk_obj.best_params_ == sklearn_reg.best_params_ + self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + # Test on fitting on snowpark Dataframe, and predict on pandas dataframe + actual_arr_pd = reg.predict(input_df.to_pandas()).sort_values(by="INDEX")[output_cols].to_numpy() + np.testing.assert_allclose(actual_arr_pd.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/_internal/search_single_node_test.py b/tests/integ/snowflake/ml/_internal/search_single_node_test.py new file mode 100644 index 00000000..827b7ccd --- /dev/null +++ b/tests/integ/snowflake/ml/_internal/search_single_node_test.py @@ -0,0 +1,125 @@ +from unittest import mock + +import inflection +from absl.testing import absltest +from sklearn.datasets import load_iris + +from snowflake.ml.modeling.model_selection._internal import ( + GridSearchCV, + RandomizedSearchCV, +) +from snowflake.ml.modeling.xgboost import XGBClassifier +from snowflake.ml.utils.connection_params import SnowflakeLoginOptions +from snowflake.snowpark import Session + + +class 
+    def setUp(self) -> None:
+        self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
+
+    def tearDown(self) -> None:
+        self._session.close()
+
+    @mock.patch("snowflake.ml.modeling.model_selection._internal._grid_search_cv.if_single_node")
+    def test_single_node_grid(self, mock_if_single_node) -> None:
+        mock_if_single_node.return_value = True
+        input_df_pandas = load_iris(as_frame=True).frame
+        input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns]
+        input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")]
+        label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")]
+        input_df_pandas["INDEX"] = input_df_pandas.reset_index().index
+        input_df = self._session.create_dataframe(input_df_pandas)
+
+        parameters = {
+            "learning_rate": [0.1],  # use a single parameter value to keep the test fast
+        }
+
+        estimator = XGBClassifier()
+        reg = GridSearchCV(estimator=estimator, param_grid=parameters, cv=2, verbose=True)
+        reg.set_input_cols(input_cols)
+        output_cols = ["OUTPUT_" + c for c in label_col]
+        reg.set_output_cols(output_cols)
+        reg.set_label_cols(label_col)
+        reg.fit(input_df)
+
+        self.assertEqual(reg.to_sklearn(), reg._sklearn_object)
+
+        self.assertEqual(reg._sklearn_object.n_jobs, -1)
+
+    @mock.patch("snowflake.ml.modeling.model_selection._internal._randomized_search_cv.if_single_node")
+    def test_single_node_random(self, mock_if_single_node) -> None:
+        mock_if_single_node.return_value = True
+        input_df_pandas = load_iris(as_frame=True).frame
+        input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns]
+        input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")]
+        label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")]
+        input_df_pandas["INDEX"] = input_df_pandas.reset_index().index
+        input_df = self._session.create_dataframe(input_df_pandas)
+
+        parameters = {
+            "learning_rate": [0.1],  # use a single parameter value to keep the test fast
+        }
+
+        estimator = XGBClassifier()
+        reg = RandomizedSearchCV(estimator=estimator, param_distributions=parameters, cv=2, verbose=True)
+        reg.set_input_cols(input_cols)
+        output_cols = ["OUTPUT_" + c for c in label_col]
+        reg.set_output_cols(output_cols)
+        reg.set_label_cols(label_col)
+        reg.fit(input_df)
+
+        self.assertEqual(reg.to_sklearn(), reg._sklearn_object)
+
+        self.assertEqual(reg._sklearn_object.n_jobs, -1)
+
+    @mock.patch("snowflake.ml.modeling.model_selection._internal._grid_search_cv.if_single_node")
+    def test_not_single_node_grid(self, mock_if_single_node) -> None:
+        mock_if_single_node.return_value = False
+        input_df_pandas = load_iris(as_frame=True).frame
+        input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns]
+        input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")]
+        label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")]
+        input_df_pandas["INDEX"] = input_df_pandas.reset_index().index
+        input_df = self._session.create_dataframe(input_df_pandas)
+
+        parameters = {
+            "learning_rate": [0.1],
+        }
+
+        estimator = XGBClassifier()
+        reg = GridSearchCV(estimator=estimator, param_grid=parameters, cv=2, verbose=True)
+        reg.set_input_cols(input_cols)
+        output_cols = ["OUTPUT_" + c for c in label_col]
+        reg.set_output_cols(output_cols)
+        reg.set_label_cols(label_col)
+        reg.fit(input_df)
+
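+        # In the distributed (non-single-node) path, the wrapped estimator, not
+        # the search object, is expected to carry the parallelism setting.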
+        self.assertEqual(reg._sklearn_object.estimator.n_jobs, 3)
+
+    @mock.patch("snowflake.ml.modeling.model_selection._internal._randomized_search_cv.if_single_node")
+    def test_not_single_node_random(self, mock_if_single_node) -> None:
+        mock_if_single_node.return_value = False
+        input_df_pandas = load_iris(as_frame=True).frame
+        input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns]
+        input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")]
+        label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")]
+        input_df_pandas["INDEX"] = input_df_pandas.reset_index().index
+        input_df = self._session.create_dataframe(input_df_pandas)
+
+        parameters = {
+            "learning_rate": [0.1],  # use a single parameter value to keep the test fast
+        }
+
+        estimator = XGBClassifier()
+        reg = RandomizedSearchCV(estimator=estimator, param_distributions=parameters, cv=2, verbose=True)
+        reg.set_input_cols(input_cols)
+        output_cols = ["OUTPUT_" + c for c in label_col]
+        reg.set_output_cols(output_cols)
+        reg.set_label_cols(label_col)
+        reg.fit(input_df)
+
+        self.assertEqual(reg._sklearn_object.estimator.n_jobs, 3)
+
+
+if __name__ == "__main__":
+    absltest.main()
diff --git a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel
index aa746846..623b67eb 100644
--- a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel
+++ b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel
@@ -18,6 +18,7 @@ py_test(
         "//snowflake/ml/modeling/linear_model:logistic_regression",
         "//snowflake/ml/modeling/model_selection:grid_search_cv",
         "//snowflake/ml/modeling/svm:svr",
+        "//snowflake/ml/modeling/xgboost:xgb_regressor",
         "//snowflake/ml/utils:connection_params",
     ],
 )
@@ -122,3 +123,12 @@ py_test(
         "//snowflake/ml/utils:connection_params",
     ],
 )
+
+py_test(
+    name = "decimal_type_test",
+    srcs = ["decimal_type_test.py"],
+    deps = [
+        "//snowflake/ml/modeling/linear_model:linear_regression",
+        "//snowflake/ml/utils:connection_params",
+    ],
+)
diff --git a/tests/integ/snowflake/ml/extra_tests/decimal_type_test.py b/tests/integ/snowflake/ml/extra_tests/decimal_type_test.py
new file mode 100644
index 00000000..7b4f8959
--- /dev/null
+++ b/tests/integ/snowflake/ml/extra_tests/decimal_type_test.py
@@ -0,0 +1,56 @@
+from typing import List, Tuple
+
+import inflection
+import numpy as np
+import pandas as pd
+from absl.testing.absltest import TestCase, main
+from sklearn.datasets import load_diabetes
+from sklearn.linear_model import LinearRegression as SkLinearRegression
+
+from snowflake.ml.modeling.linear_model import LinearRegression
+from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
+from snowflake.snowpark import DataFrame, Session, functions, types
+
+
+class DecimalTypeTest(TestCase):
+    def setUp(self) -> None:
+        """Creates Snowpark and Snowflake environments for testing."""
+        self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
+
+    def tearDown(self) -> None:
+        self._session.close()
+
+    def _get_test_dataset(self) -> Tuple[pd.DataFrame, DataFrame, List[str], List[str]]:
+        input_df_pandas = load_diabetes(as_frame=True).frame
+        # Normalize column names
+        input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns]
+        input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")]
+        label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")]
+        input_df = self._session.create_dataframe(input_df_pandas)
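+        # DecimalType as a DataType is newly supported in 1.0.11 (see CHANGELOG);
+        # this test guards that fix end to end.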
+        # Cast every column to DecimalType.
+        fields = input_df.schema.fields
+        selected_cols = []
+        for field in fields:
+            src = field.column_identifier.quoted_name
+            dest = types.DecimalType(15, 10)
+            selected_cols.append(functions.cast(functions.col(src), dest).alias(src))
+        input_df = input_df.select(selected_cols)
+        return (input_df_pandas, input_df, input_cols, label_col)
+
+    def test_decimal_type(self) -> None:
+        input_df_pandas, input_df, input_cols, label_cols = self._get_test_dataset()
+
+        sklearn_reg = SkLinearRegression()
+        reg = LinearRegression(input_cols=input_cols, label_cols=label_cols)
+
+        sklearn_reg.fit(input_df_pandas[input_cols], input_df_pandas[label_cols])
+        reg.fit(input_df)
+
+        actual_results = reg.predict(input_df_pandas)[reg.get_output_cols()].to_numpy()
+        sklearn_results = sklearn_reg.predict(input_df_pandas[input_cols])
+
+        np.testing.assert_allclose(actual_results.flatten(), sklearn_results.flatten())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/integ/snowflake/ml/extra_tests/grid_search_test.py b/tests/integ/snowflake/ml/extra_tests/grid_search_test.py
index 2634e788..601b3c7a 100644
--- a/tests/integ/snowflake/ml/extra_tests/grid_search_test.py
+++ b/tests/integ/snowflake/ml/extra_tests/grid_search_test.py
@@ -5,9 +5,11 @@ from sklearn.model_selection import GridSearchCV as SkGridSearchCV
 from sklearn.svm import SVR as SkSVR
 
 from snowflake.ml.modeling.linear_model.logistic_regression import LogisticRegression
+from xgboost import XGBRegressor as xgboost_regressor
 
 from snowflake.ml.modeling.model_selection import GridSearchCV
 from snowflake.ml.modeling.svm import SVR
+from snowflake.ml.modeling.xgboost import XGBRegressor
 from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
 from snowflake.snowpark import Session
 
@@ -28,20 +30,27 @@ def test_fit_and_compare_results(self) -> None:
         input_df_pandas["INDEX"] = input_df_pandas.reset_index().index
         input_df = self._session.create_dataframe(input_df_pandas)
 
-        sklearn_reg = SkGridSearchCV(estimator=SkSVR(), param_grid={"C": [1, 10], "kernel": ("linear", "rbf")})
-        reg = GridSearchCV(estimator=SVR(), param_grid={"C": [1, 10], "kernel": ("linear", "rbf")})
-        reg.set_input_cols(input_cols)
-        output_cols = ["OUTPUT_" + c for c in label_col]
-        reg.set_output_cols(output_cols)
-        reg.set_label_cols(label_col)
+        for Estimator, SKEstimator, params in [
+            (SVR, SkSVR, {"C": [1, 10], "kernel": ("linear", "rbf")}),
+            (XGBRegressor, xgboost_regressor, {"n_estimators": [5, 10]}),
+        ]:
+            with self.subTest():
+                sklearn_reg = SkGridSearchCV(estimator=SKEstimator(), param_grid=params)
+                reg = GridSearchCV(estimator=Estimator(), param_grid=params)
+                reg.set_input_cols(input_cols)
+                output_cols = ["OUTPUT_" + c for c in label_col]
+                reg.set_output_cols(output_cols)
+                reg.set_label_cols(label_col)
 
-        reg.fit(input_df)
-        sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze())
+                reg.fit(input_df)
+                sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze())
 
-        actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].astype("float64").to_numpy()
-        sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols])
+                actual_arr = (
+                    reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].astype("float64").to_numpy()
+                )
+                sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols])
 
-        np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2)
+                
np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2)
 
     def test_invalid_alias_pattern(self) -> None:
         """
@@ -71,9 +80,6 @@ def test_invalid_alias_pattern(self) -> None:
         reg.set_output_cols(output_cols)
         reg.set_label_cols(label_col)
 
-        q = reg.fit(input_df).predict_proba(input_df).queries["queries"][-1]
-        print(q)
-
         reg.fit(input_df).predict_proba(input_df).collect()
diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel
index b1e012e4..f7e8b307 100644
--- a/tests/integ/snowflake/ml/model/BUILD.bazel
+++ b/tests/integ/snowflake/ml/model/BUILD.bazel
@@ -152,3 +152,21 @@ py_test(
         "//tests/integ/snowflake/ml/test_utils:db_manager",
     ],
 )
+
+# TODO(halu): Needs pip package support for build & test.
+#py_test(
+#    name = "spcs_llm_model_integ_test",
+#    timeout = "eternal",  # 3600s; the GPU image takes very long to build.
+#    srcs = ["spcs_llm_model_integ_test.py"],
+#    compatible_with_snowpark = False,
+#    deps = [
+#        ":warehouse_model_integ_test_utils",
+#        "//snowflake/ml/_internal:env_utils",
+#        "//snowflake/ml/model:type_hints",
+#        "//snowflake/ml/model/models:llm_model",
+#        "//snowflake/ml/utils:connection_params",
+#        "//tests/integ/snowflake/ml/test_utils:db_manager",
+#        "//tests/integ/snowflake/ml/test_utils:spcs_integ_test_base",
+#        "//tests/integ/snowflake/ml/test_utils:test_env_utils",
+#    ],
+#)
diff --git a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py
index 5cbb402e..c1c9133b 100644
--- a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py
+++ b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py
@@ -99,7 +99,7 @@ def test_custom_demo_model(self) -> None:
             model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"),
             model=lm,
             conda_dependencies=[
-                test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python")
+                test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python")
             ],
             sample_input=pd_df,
             metadata={"author": "halu", "version": "1"},
diff --git a/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py b/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py
new file mode 100644
index 00000000..e86cca8f
--- /dev/null
+++ b/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py
@@ -0,0 +1,109 @@
+# import os
+# import tempfile
+# import uuid

+# import pandas as pd
+# from absl.testing import absltest

+# from snowflake.ml.model import (
+#     _deployer,
+#     _model as model_api,
+#     deploy_platforms,
+#     type_hints as model_types,
+# )
+# from snowflake.ml.model.models import llm
+# from tests.integ.snowflake.ml.test_utils import (
+#     db_manager,
+#     spcs_integ_test_base,
+#     test_env_utils,
+# )


+# class TestSPCSLLMModelInteg(spcs_integ_test_base.SpcsIntegTestBase):
+#     @classmethod
+#     def setUpClass(cls) -> None:
+#         super().setUpClass()
+#         cls.cache_dir = tempfile.TemporaryDirectory()
+#         cls._original_hf_home = os.getenv("HF_HOME", None)
+#         os.environ["HF_HOME"] = cls.cache_dir.name

+#     @classmethod
+#     def tearDownClass(cls) -> None:
+#         super().tearDownClass()
+#         if cls._original_hf_home:
+#             os.environ["HF_HOME"] = cls._original_hf_home
+#         else:
+#             del os.environ["HF_HOME"]
+#         cls.cache_dir.cleanup()

+#     def setUp(self) -> None:
+#         # Set up a unique id for each artifact, in addition to the class-level prefix. This is particularly useful
+#         # when differentiating artifacts generated between different test cases, such as service function names.
+#         self.uid = uuid.uuid4().hex[:4]

+#     def test_text_generation_pipeline(
+#         self,
+#     ) -> None:
+#         import peft

+#         ft_model = peft.AutoPeftModelForCausalLM.from_pretrained(
+#             "peft-internal-testing/tiny-OPTForCausalLM-lora",
+#             device_map="auto",
+#         )
+#         tmpdir = self.create_tempdir().full_path
+#         ft_model.save_pretrained(tmpdir)
+#         model = llm.LLM(
+#             model_id_or_path=tmpdir,
+#         )

+#         x_df = pd.DataFrame(
+#             [["Hello world"]],
+#         )
+#         cls = TestSPCSLLMModelInteg
+#         stage_path = f"@{cls._TEST_STAGE}/{self.uid}/model.zip"
+#         deployment_stage_path = f"@{cls._TEST_STAGE}/{self.uid}"
+#         model_api.save_model(  # type: ignore[call-overload]
+#             name="model",
+#             session=self._session,
+#             model_stage_file_path=stage_path,
+#             model=model,
+#             options={"embed_local_ml_library": True},
+#             conda_dependencies=[
+#                 test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python"),
+#             ],
+#         )
+#         svc_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(
+#             self._RUN_ID,
+#             f"func_{self.uid}",
+#         )
+#         deployment_options: model_types.SnowparkContainerServiceDeployOptions = {
+#             "compute_pool": cls._TEST_GPU_COMPUTE_POOL,
+#             "num_gpus": 1,
+#             # TODO(halu): Create a separate testing registry.
+#             # Creating a new registry for each test is costly since no cache hit would ever occur.
+#             "image_repo": "sfengineering-mlplatformtest.registry.snowflakecomputing.com/"
+#             "regtest_db/regtest_schema/halu_test",
+#             "enable_remote_image_build": True,
+#             "model_in_image": True,
+#         }

+#         deploy_info = _deployer.deploy(
+#             name=svc_func_name,
+#             session=cls._session,
+#             model_stage_file_path=stage_path,
+#             deployment_stage_path=deployment_stage_path,
+#             model_id=svc_func_name,
+#             platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
+#             options={
+#                 **deployment_options,  # type: ignore[arg-type]
+#             },  # type: ignore[call-overload]
+#         )
+#         assert deploy_info is not None
+#         res = _deployer.predict(session=cls._session, deployment=deploy_info, X=x_df)
+#         self.assertIn("generated_text", res)
+#         self.assertEqual(len(res["generated_text"]), 1)
+#         self.assertNotEmpty(res["generated_text"][0])


+# if __name__ == "__main__":
+#     absltest.main()
diff --git a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py
index 842ddedc..9e20b8c5 100644
--- a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py
+++ b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py
@@ -34,12 +34,12 @@ def base_test_case(
     version_args: Dict[str, Any] = {}
     tmp_stage = db._session.get_session_stage()
     conda_dependencies = [
-        test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-snowpark-python")
+        test_env_utils.get_latest_package_version_spec_in_server(db._session, "snowflake-snowpark-python")
     ]
     if additional_dependencies:
        conda_dependencies.extend(additional_dependencies)
     # We only test when the test is added before the current version available in the server.
- snowml_req_str = test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-ml-python") + snowml_req_str = test_env_utils.get_latest_package_version_spec_in_server(db._session, "snowflake-ml-python") if permanent_deploy: permanent_deploy_args = {"permanent_udf_stage_location": f"@{full_qual_stage}/"} diff --git a/tests/integ/snowflake/ml/modeling/framework/utils.py b/tests/integ/snowflake/ml/modeling/framework/utils.py index 36b8efc1..cfe5ed98 100644 --- a/tests/integ/snowflake/ml/modeling/framework/utils.py +++ b/tests/integ/snowflake/ml/modeling/framework/utils.py @@ -141,7 +141,7 @@ class DataType(Enum): def gen_fuzz_data( rows: int, types: List[DataType], low: Union[int, List[int]] = MIN_INT, high: Union[int, List[int]] = MAX_INT -) -> Tuple[List[Any], List[str]]: +) -> Tuple[List[Any], List[str], List[str]]: """ Generate random data based on input column types and row count. First column in the result data will be an ID column for indexing. @@ -160,6 +160,7 @@ def gen_fuzz_data( """ data: List[npt.NDArray[Any]] = [np.arange(1, rows + 1, 1)] names = ["ID"] + snowflake_identifiers = ["ID"] for idx, t in enumerate(types): _low = low if isinstance(low, int) else low[idx] @@ -170,9 +171,11 @@ def gen_fuzz_data( data.append(np.random.uniform(_low, _high, rows)) else: raise ValueError(f"Unsupported data type {t}") - names.append(f"COL_{idx}") + names.append(f"col_{idx}") + snowflake_identifiers.append(f'"col_{idx}"') + data = np.core.records.fromarrays(data, names=names).tolist() # type: ignore[call-overload] - return np.core.records.fromarrays(data, names=names).tolist(), names # type: ignore[call-overload] + return data, names, snowflake_identifiers def get_df( @@ -181,12 +184,14 @@ def get_df( schema: List[str], fillna: Optional[Union[object, ArrayLike]] = None, ) -> Tuple[pd.DataFrame, DataFrame]: - """Create pandas dataframe and Snowpark dataframes from input data. + """Create pandas dataframe and Snowpark dataframes from input data. The schema passed should be + a pandas schema, which will be converted to a schema using snowflake identifiers when `session.create_dataframe` + is called. Args: session: Snowpark session object. data: List of input data to convert to dataframe. - schema: Schema for dataframe to be created. + schema: The pandas schema for dataframe to be created. fillna: Value to fill for NA values in the input data. 
Returns: @@ -196,6 +201,8 @@ def get_df( if fillna is not None: df_pandas.fillna(value=fillna, inplace=True) df = session.create_dataframe(df_pandas) + df_pandas.columns = df.columns + return df_pandas, df diff --git a/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py index b30e9c92..95ac3d32 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py @@ -1,6 +1,5 @@ from typing import Any, Dict -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -12,23 +11,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class AccuracyScoreTest(parameterized.TestCase): @@ -57,8 +56,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_score = snowml_metrics.accuracy_score( @@ -91,8 +89,7 @@ def test_normalized(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for normalize in params["normalize"]: actual_score = snowml_metrics.accuracy_score( diff --git a/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py b/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py index c454a72d..c48b402a 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py @@ -10,15 +10,15 @@ from snowflake.ml.utils import connection_params from tests.integ.snowflake.ml.modeling.framework import utils -_DATA, _SCHEMA = utils.gen_fuzz_data( +_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=100, types=[utils.DataType.INTEGER] * 2 + [utils.DataType.FLOAT], low=-1, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_SAMPLE_WEIGHT_COL = _SCHEMA[3] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[3] class ConfusionMatrixTest(parameterized.TestCase): @@ -35,8 +35,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() + pandas_df, input_df = utils.get_df(self._session, _DATA, _PD_SCHEMA) for labels in 
params["labels"]: actual_cm = snowml_metrics.confusion_matrix( @@ -56,8 +55,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, ) def test_sample_weight(self, params: Dict[str, Any]) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() + pandas_df, input_df = utils.get_df(self._session, _DATA, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_cm = snowml_metrics.confusion_matrix( @@ -78,8 +76,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"normalize": ["true", "pred", "all", None]}}, ) def test_normalize(self, params: Dict[str, Any]) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() + pandas_df, input_df = utils.get_df(self._session, _DATA, _PD_SCHEMA) for normalize in params["normalize"]: actual_cm = snowml_metrics.confusion_matrix( @@ -101,7 +98,7 @@ def test_normalize(self, params: Dict[str, Any]) -> None: {"params": {"normalize": "invalid"}}, ) def test_invalid_params(self, params: Dict[str, Any]) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) + input_df = self._session.create_dataframe(_DATA, schema=_SF_SCHEMA) if "labels" in params: with self.assertRaises(ValueError): diff --git a/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py index 1600c144..b61fef1c 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,23 +13,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.d2_absolute_error_score( @@ -90,8 +88,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"multioutput": ["raw_values", "uniform_average", [0.2, 1.0, 1.66]]}}, ) def test_multioutput(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + 
pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) for multioutput in params["multioutput"]: actual_loss = snowml_metrics.d2_absolute_error_score( @@ -108,8 +105,7 @@ def test_multioutput(self, params: Dict[str, Any]) -> None: np.testing.assert_allclose(actual_loss, sklearn_loss) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.d2_absolute_error_score( df=input_df, @@ -124,8 +120,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_loss = snowml_metrics.d2_absolute_error_score( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py index 2d20cedc..e36839a7 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,23 +13,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,8 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.d2_pinball_score( @@ -102,8 +101,7 @@ def test_alpha(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for alpha in params["alpha"]: actual_loss = snowml_metrics.d2_pinball_score( @@ -123,8 +121,7 @@ def test_alpha(self, params: Dict[str, Any]) -> None: {"params": {"multioutput": ["raw_values", "uniform_average", [0.2, 1.0, 1.66]]}}, ) def test_multioutput(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, 
columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) for multioutput in params["multioutput"]: actual_loss = snowml_metrics.d2_pinball_score( @@ -141,8 +138,7 @@ def test_multioutput(self, params: Dict[str, Any]) -> None: np.testing.assert_allclose(actual_loss, sklearn_loss) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.d2_pinball_score( df=input_df, @@ -157,8 +153,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_loss = snowml_metrics.d2_pinball_score( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py index cd307a16..9a79db02 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,23 +13,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.explained_variance_score( @@ -90,8 +88,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"multioutput": ["raw_values", "uniform_average", [0.2, 1.0, 1.66]]}}, ) def test_multioutput(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) for multioutput in params["multioutput"]: actual_loss = snowml_metrics.explained_variance_score( @@ -123,8 +120,7 @@ def 
test_force_finite(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for force_finite in params["force_finite"]: actual_loss = snowml_metrics.explained_variance_score( @@ -141,8 +137,7 @@ def test_force_finite(self, params: Dict[str, Any]) -> None: self.assertAlmostEqual(sklearn_loss, actual_loss) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.explained_variance_score( df=input_df, @@ -157,8 +152,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_loss = snowml_metrics.explained_variance_score( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py index 4e50cee9..9341bfbe 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py @@ -1,7 +1,6 @@ from typing import Any, Dict import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import exceptions, metrics as sklearn_metrics @@ -13,23 +12,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class F1ScoreTest(parameterized.TestCase): @@ -46,8 +45,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_f = snowml_metrics.f1_score( @@ -69,8 +67,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_f = snowml_metrics.f1_score( @@ -92,8 +89,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": 
{"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for average in params["average"]: actual_f = snowml_metrics.f1_score( @@ -119,8 +115,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -154,8 +149,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_f = snowml_metrics.f1_score( @@ -181,8 +175,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: data = [ [0, 0, 0, 0, 0, 0], ] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py index 226f6eff..f40cc576 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py @@ -1,7 +1,6 @@ from typing import Any, Dict import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import exceptions, metrics as sklearn_metrics @@ -13,23 +12,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class FbetaScoreTest(parameterized.TestCase): @@ -58,8 +57,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for beta in params["beta"]: actual_f = snowml_metrics.fbeta_score( @@ -81,8 +79,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = 
self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_f = snowml_metrics.fbeta_score( @@ -106,8 +103,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_f = snowml_metrics.fbeta_score( @@ -131,8 +127,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for average in params["average"]: actual_f = snowml_metrics.fbeta_score( @@ -160,8 +155,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -197,8 +191,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_f = snowml_metrics.fbeta_score( @@ -226,8 +219,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: data = [ [0, 0, 0, 0, 0, 0], ] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py b/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py index 42672bea..f0242535 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py @@ -1,6 +1,5 @@ from typing import Any, Dict -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -12,14 +11,14 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 4 -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=[2, 1, 1, 1, 1], ) -_BINARY_Y_TRUE_COL = _SCHEMA[1] -_BINARY_Y_PRED_COL = _SCHEMA[2] +_BINARY_Y_TRUE_COL = _SF_SCHEMA[1] +_BINARY_Y_PRED_COL = _SF_SCHEMA[2] _MULTICLASS_DATA = [ [0, 2, 0.29, 0.49, 0.22, 0.18], [1, 0, 0.33, 0.16, 0.51, 0.69], @@ -28,9 +27,9 @@ [4, 1, 0.82, 0.12, 0.06, 0.91], [5, 2, 0.08, 0.46, 0.46, 0.76], ] -_MULTICLASS_Y_TRUE_COL = _SCHEMA[1] -_MULTICLASS_Y_PRED_COLS = [_SCHEMA[2], _SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] 
+_MULTICLASS_Y_TRUE_COL = _SF_SCHEMA[1] +_MULTICLASS_Y_PRED_COLS = [_SF_SCHEMA[2], _SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,7 @@ def test_eps(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for eps in params["eps"]: actual_loss = snowml_metrics.log_loss( @@ -101,8 +99,7 @@ def test_normalize(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for normalize in params["normalize"]: actual_loss = snowml_metrics.log_loss( @@ -134,8 +131,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.log_loss( @@ -156,8 +152,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_loss = snowml_metrics.log_loss( @@ -174,8 +169,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: self.assertAlmostEqual(sklearn_loss, actual_loss) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.log_loss( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py index f858fe2c..44e73cfa 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,23 +13,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], 
_SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.mean_absolute_error( @@ -90,8 +88,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"multioutput": ["raw_values", "uniform_average", [0.2, 1.0, 1.66]]}}, ) def test_multioutput(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) for multioutput in params["multioutput"]: actual_loss = snowml_metrics.mean_absolute_error( @@ -108,8 +105,7 @@ def test_multioutput(self, params: Dict[str, Any]) -> None: np.testing.assert_allclose(actual_loss, sklearn_loss) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.mean_absolute_error( df=input_df, @@ -124,8 +120,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_loss = snowml_metrics.mean_absolute_error( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py index 6d0676e1..7b657228 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,23 +13,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - 
pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.mean_absolute_percentage_error( @@ -90,8 +88,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"multioutput": ["raw_values", "uniform_average", [0.2, 1.0, 1.66]]}}, ) def test_multioutput(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) for multioutput in params["multioutput"]: actual_loss = snowml_metrics.mean_absolute_percentage_error( @@ -108,8 +105,7 @@ def test_multioutput(self, params: Dict[str, Any]) -> None: np.testing.assert_allclose(actual_loss, sklearn_loss, rtol=0.000001) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.mean_absolute_percentage_error( df=input_df, @@ -124,8 +120,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_loss = snowml_metrics.mean_absolute_percentage_error( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py b/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py index 996338c2..4a93650a 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,23 +13,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -68,8 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = 
snowml_metrics.mean_squared_error( @@ -90,8 +88,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"multioutput": ["raw_values", "uniform_average", [0.2, 1.0, 1.66]]}}, ) def test_multioutput(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) for multioutput in params["multioutput"]: actual_loss = snowml_metrics.mean_squared_error( @@ -123,8 +120,7 @@ def test_squared(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for squared in params["squared"]: actual_loss = snowml_metrics.mean_squared_error( @@ -141,8 +137,7 @@ def test_squared(self, params: Dict[str, Any]) -> None: self.assertAlmostEqual(sklearn_loss, actual_loss) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_loss = snowml_metrics.mean_squared_error( df=input_df, @@ -157,8 +152,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_loss = snowml_metrics.mean_squared_error( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py b/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py index 49a3a607..6d638664 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py @@ -1,5 +1,4 @@ import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main @@ -10,15 +9,15 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class MetricsUtilsTest(parameterized.TestCase): @@ -38,8 +37,7 @@ def tearDown(self) -> None: normalize=(False, True), ) def test_weighted_sum(self, df, sample_weight_col_name, sample_score_col_name, normalize) -> None: - pandas_df = pd.DataFrame(df, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, df, _PD_SCHEMA) snowpark_weight_col = input_df[sample_weight_col_name] if sample_weight_col_name else None actual_sum = metrics_utils.weighted_sum( diff --git a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py index d5691e64..8c8b7d2f 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py +++ 
b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,15 +13,15 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 2 -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=[2, 1, 1], ) -_Y_TRUE_COL = _SCHEMA[1] -_PROBAS_PRED_COL = _SCHEMA[2] -_SAMPLE_WEIGHT_COL = _SCHEMA[3] +_Y_TRUE_COL = _SF_SCHEMA[1] +_PROBAS_PRED_COL = _SF_SCHEMA[2] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[3] class PrecisionRecallCurveTest(parameterized.TestCase): @@ -39,8 +38,7 @@ def tearDown(self) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( @@ -62,8 +60,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, ) def test_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( @@ -84,8 +81,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py index f5928778..f4d1c06c 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py @@ -1,7 +1,6 @@ from typing import Any, Dict import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import exceptions, metrics as sklearn_metrics @@ -13,23 +12,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], 
_SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class PrecisionRecallFscoreSupportTest(parameterized.TestCase): @@ -58,8 +57,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for beta in params["beta"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -82,8 +80,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -106,8 +103,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -142,8 +138,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -167,8 +162,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for average in params["average"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -195,8 +189,8 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: sample_weight_col_name=(None, _SAMPLE_WEIGHT_COL), ) def test_average_binary_samples(self, y_true, y_pred, average, sample_weight_col_name) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( df=input_df, y_true_col_names=y_true, @@ -221,8 +215,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: [0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], ] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": @@ -265,8 +258,7 
@@ def test_zero_division(self, params: Dict[str, Any]) -> None: def test_no_sample(self) -> None: data = [] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py index fe22be7b..5ec6d5a2 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py @@ -1,7 +1,6 @@ from typing import Any, Dict import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import exceptions, metrics as sklearn_metrics @@ -13,23 +12,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class PrecisionScoreTest(parameterized.TestCase): @@ -46,8 +45,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_p = snowml_metrics.precision_score( @@ -69,8 +67,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_p = snowml_metrics.precision_score( @@ -104,8 +101,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_p = snowml_metrics.precision_score( @@ -128,8 +124,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for average in params["average"]: actual_p = snowml_metrics.precision_score( @@ -155,8 +150,7 @@ def 
test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -182,8 +176,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: [0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], ] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py index 59075ac3..ad1b219e 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py @@ -1,7 +1,6 @@ from typing import Any, Dict import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import exceptions, metrics as sklearn_metrics @@ -13,23 +12,23 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=5, ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_PRED_COL = _SCHEMA[2] -_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] -_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_PRED_COL = _SF_SCHEMA[2] +_Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] +_Y_PRED_COLS = [_SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] class RecallScoreTest(parameterized.TestCase): @@ -46,8 +45,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_r = snowml_metrics.recall_score( @@ -69,8 +67,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_r = snowml_metrics.recall_score( @@ -92,8 +89,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for average in params["average"]: actual_r = snowml_metrics.recall_score( @@ -119,8 +115,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def 
test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -154,8 +149,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_r = snowml_metrics.recall_score( @@ -182,8 +176,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], ] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py index 508fa623..2340c9e7 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py @@ -2,7 +2,6 @@ from unittest import mock import numpy as np -import pandas as pd from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -14,14 +13,14 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 4 -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=[2, 1, 1, 1, 1], ) -_BINARY_Y_TRUE_COL = _SCHEMA[1] -_BINARY_Y_SCORE_COL = _SCHEMA[2] +_BINARY_Y_TRUE_COL = _SF_SCHEMA[1] +_BINARY_Y_SCORE_COL = _SF_SCHEMA[2] _MULTICLASS_DATA = [ [0, 2, 0.29, 0.49, 0.22, 0.18], [1, 0, 0.33, 0.16, 0.51, 0.69], @@ -30,9 +29,9 @@ [4, 1, 0.82, 0.12, 0.06, 0.91], [5, 2, 0.08, 0.46, 0.46, 0.76], ] -_MULTICLASS_Y_TRUE_COL = _SCHEMA[1] -_MULTICLASS_Y_SCORE_COLS = [_SCHEMA[2], _SCHEMA[3], _SCHEMA[4]] -_SAMPLE_WEIGHT_COL = _SCHEMA[5] +_MULTICLASS_Y_TRUE_COL = _SF_SCHEMA[1] +_MULTICLASS_Y_SCORE_COLS = [_SF_SCHEMA[2], _SF_SCHEMA[3], _SF_SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[5] _MULTILABEL_DATA = [ [1, 0, 1, 0.8, 0.3, 0.6], [0, 1, 0, 0.2, 0.7, 0.4], @@ -55,11 +54,10 @@ def tearDown(self) -> None: self._session.close() @parameterized.parameters( # type: ignore[misc] - {"params": {"average": [None, "micro", "macro", "samples", "weighted"]}}, + {"params": {"average": ["weighted"]}}, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for average in params["average"]: actual_auc = snowml_metrics.roc_auc_score( @@ -84,8 +82,7 @@ def test_average_binary(self, params: Dict[str, Any]) -> None: }, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for idx, average in 
enumerate(params["average"]): multi_class = params["multi_class"][idx] @@ -120,8 +117,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_score = values["y_score"] - pandas_df = pd.DataFrame(data, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_auc = snowml_metrics.roc_auc_score( @@ -144,8 +140,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"max_fpr": [None, 0.1, 0.5, 1]}}, ) def test_max_fpr(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for max_fpr in params["max_fpr"]: actual_auc = snowml_metrics.roc_auc_score( @@ -165,8 +160,7 @@ def test_max_fpr(self, params: Dict[str, Any]) -> None: {"params": {"multi_class": ["ovr", "ovo"]}}, ) def test_multi_class(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for multi_class in params["multi_class"]: actual_auc = snowml_metrics.roc_auc_score( @@ -186,8 +180,7 @@ def test_multi_class(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [0, 1, 2]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for labels in params["labels"]: actual_auc = snowml_metrics.roc_auc_score( @@ -206,8 +199,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: self.assertAlmostEqual(sklearn_auc, actual_auc) def test_multilabel(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_auc = snowml_metrics.roc_auc_score( df=input_df, @@ -222,8 +214,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTILABEL_DATA, _MULTILABEL_SCHEMA) actual_auc = snowml_metrics.roc_auc_score( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py b/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py index acdd0015..1b14eba4 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py @@ -16,21 +16,21 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 2 -_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=[2, 1, 1], ) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=[5, 1, 1], ) -_Y_TRUE_COL = _SCHEMA[1] -_Y_SCORE_COL = _SCHEMA[2] -_SAMPLE_WEIGHT_COL = 
_SCHEMA[3] +_Y_TRUE_COL = _SF_SCHEMA[1] +_Y_SCORE_COL = _SF_SCHEMA[2] +_SAMPLE_WEIGHT_COL = _SF_SCHEMA[3] class RocCurveTest(parameterized.TestCase): @@ -47,8 +47,7 @@ def tearDown(self) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_MULTICLASS_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) for pos_label in params["pos_label"]: actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( @@ -71,8 +70,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, ) def test_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( @@ -96,8 +94,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"drop_intermediate": [True, False]}}, ) def test_drop_intermediate(self, params: Dict[str, Any]) -> None: - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) for drop_intermediate in params["drop_intermediate"]: actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( @@ -122,7 +119,7 @@ def test_multi_query_df(self) -> None: self._session.sql(f"create temp stage {stage}").collect() # Load data into the stage. - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_PD_SCHEMA) with tempfile.TemporaryDirectory() as temp_dir: filename = "data.parquet" local_path = os.path.join(temp_dir, filename) @@ -137,6 +134,7 @@ def test_multi_query_df(self) -> None: input_df = df_lhs.join(df_rhs, ["ID"]) pd_df = input_df.to_pandas() + pd_df.columns = input_df.columns actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( df=input_df, @@ -156,8 +154,7 @@ def test_multi_query_df(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: # TODO: somehow confirm that the stage upload code path was taken. 
- pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) - input_df = self._session.create_dataframe(pandas_df) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py b/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py index fd95d5da..34548ca4 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py @@ -130,15 +130,15 @@ def test_fit_fuzz_data(self) -> None: N_BINS = [10, 7] ENCODE = "ordinal" - data, schema = utils.gen_fuzz_data( + data, pd_schema, sf_schema = utils.gen_fuzz_data( rows=1000, types=[utils.DataType.INTEGER, utils.DataType.FLOAT], ) - pandas_df, snowpark_df = utils.get_df(self._session, data, schema) + pandas_df, snowpark_df = utils.get_df(self._session, data, pd_schema) for strategy in self._strategies: sklearn_discretizer = SklearnKBinsDiscretizer(n_bins=N_BINS, encode=ENCODE, strategy=strategy) - sklearn_discretizer.fit(pandas_df[schema[1:]]) + sklearn_discretizer.fit(pandas_df[sf_schema[1:]]) target_n_bins = sklearn_discretizer.n_bins_.tolist() target_bin_edges = sklearn_discretizer.bin_edges_.tolist() @@ -147,7 +147,7 @@ def test_fit_fuzz_data(self) -> None: n_bins=N_BINS, encode=ENCODE, strategy=strategy, - input_cols=schema[1:], + input_cols=sf_schema[1:], ) discretizer.fit(df) actual_edges = discretizer.bin_edges_.tolist() @@ -197,7 +197,7 @@ def test_transform_ordinal_encoding_fuzz_data(self) -> None: ENCODE = "ordinal" OUTPUT_COLS = [f"OUT_{x}" for x in range(len(N_BINS))] - data, schema = utils.gen_fuzz_data( + data, pd_schema, sf_schema = utils.gen_fuzz_data( rows=10000, types=[ utils.DataType.INTEGER, @@ -207,30 +207,32 @@ def test_transform_ordinal_encoding_fuzz_data(self) -> None: low=-999999, high=999999, ) - pandas_df, snowpark_df = utils.get_df(self._session, data, schema) + pandas_df, snowpark_df = utils.get_df(self._session, data, pd_schema) for strategy in self._strategies: # 1. Create OSS SKLearn discretizer sklearn_discretizer = SklearnKBinsDiscretizer(n_bins=N_BINS, encode=ENCODE, strategy=strategy) - sklearn_discretizer.fit(pandas_df[schema[1:]]) - target_output = sklearn_discretizer.transform(pandas_df.sort_values(by=[schema[0]])[schema[1:]]) + sklearn_discretizer.fit(pandas_df[sf_schema[1:]]) + target_output = sklearn_discretizer.transform(pandas_df.sort_values(by=[sf_schema[0]])[sf_schema[1:]]) # 2. Create SnowML discretizer discretizer = KBinsDiscretizer( n_bins=N_BINS, encode=ENCODE, strategy=strategy, - input_cols=schema[1:], + input_cols=sf_schema[1:], output_cols=OUTPUT_COLS, ) discretizer.fit(snowpark_df) # 3. Transform with Snowpark DF and compare - actual_output = discretizer.transform(snowpark_df).sort(schema[0])[OUTPUT_COLS].to_pandas().to_numpy() + actual_output = discretizer.transform(snowpark_df).sort(sf_schema[0])[OUTPUT_COLS].to_pandas().to_numpy() np.testing.assert_allclose(target_output, actual_output) # 4. 
Transform with Pandas DF and compare - pd_actual_output = discretizer.transform(pandas_df.sort_values(by=[schema[0]])[schema[1:]])[OUTPUT_COLS] + pd_actual_output = discretizer.transform(pandas_df.sort_values(by=[sf_schema[0]])[sf_schema[1:]])[ + OUTPUT_COLS + ] np.testing.assert_allclose(target_output, pd_actual_output) def test_transform_onehot_encoding(self) -> None: diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/one_hot_encoder_test.py b/tests/integ/snowflake/ml/modeling/preprocessing/one_hot_encoder_test.py index 89741d85..efb2871a 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/one_hot_encoder_test.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/one_hot_encoder_test.py @@ -23,7 +23,7 @@ ) from snowflake.ml.utils import sparse as utils_sparse from snowflake.ml.utils.connection_params import SnowflakeLoginOptions -from snowflake.snowpark import DataFrame, Session +from snowflake.snowpark import DataFrame, Session, functions, types from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils from tests.integ.snowflake.ml.modeling.framework.utils import ( BOOLEAN_COLS, @@ -120,6 +120,30 @@ def test_fit(self) -> None: """ Verify fitted categories. + Raises + ------ + AssertionError + If the fitted categories do not match those of the sklearn encoder. + """ + input_cols = NUMERIC_COLS + df_pandas, df = framework_utils.get_df(self._session, DATA, SCHEMA, np.nan) + + encoder = OneHotEncoder().set_input_cols(input_cols) + encoder.fit(df) + + actual_categories = encoder._categories_list + + # sklearn + encoder_sklearn = SklearnOneHotEncoder() + encoder_sklearn.fit(df_pandas[input_cols]) + + for actual_cats, sklearn_cats in zip(actual_categories, encoder_sklearn.categories_): + self.assertEqual(set(sklearn_cats.tolist()), set(actual_cats.tolist())) + + def test_fit_decimal(self) -> None: + """ + Verify fitted categories with DecimalType.
+ Raises ------ AssertionError @@ -128,6 +152,18 @@ def test_fit(self) -> None: input_cols = CATEGORICAL_COLS df_pandas, df = framework_utils.get_df(self._session, DATA, SCHEMA, np.nan) + # Map DoubleType to DecimalType + fields = df.schema.fields + selected_cols = [] + for field in fields: + src = field.column_identifier.quoted_name + if isinstance(field.datatype, types.DoubleType): + dest = types.DecimalType(15, 10) + selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) + else: + selected_cols.append(functions.col(src)) + df = df.select(selected_cols) + encoder = OneHotEncoder().set_input_cols(input_cols) encoder.fit(df) diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/standard_scaler_test.py b/tests/integ/snowflake/ml/modeling/preprocessing/standard_scaler_test.py index 0fb68e66..8d784973 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/standard_scaler_test.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/standard_scaler_test.py @@ -16,7 +16,7 @@ StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions -from snowflake.snowpark import Session +from snowflake.snowpark import Session, functions, types from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, @@ -69,6 +69,46 @@ def test_fit(self) -> None: np.testing.assert_allclose(actual_mean, scaler_sklearn.mean_) np.testing.assert_allclose(actual_var, scaler_sklearn.var_) + def test_fit_decimal(self) -> None: + """ + Verify fitted states with DecimalType + + Raises + ------ + AssertionError + If the fitted states do not match those of the sklearn scaler. + """ + input_cols = NUMERIC_COLS + df_pandas, df = framework_utils.get_df(self._session, DATA, SCHEMA, np.nan) + + # Map DoubleType to DecimalType + fields = df.schema.fields + selected_cols = [] + for field in fields: + src = field.column_identifier.quoted_name + if isinstance(field.datatype, types.DoubleType): + dest = types.DecimalType(38, 10) + selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) + else: + selected_cols.append(functions.col(src)) + df = df.select(selected_cols) + + for _df in [df_pandas, df]: + scaler = StandardScaler().set_input_cols(input_cols) + scaler.fit(_df) + + actual_scale = scaler._convert_attribute_dict_to_ndarray(scaler.scale_) + actual_mean = scaler._convert_attribute_dict_to_ndarray(scaler.mean_) + actual_var = scaler._convert_attribute_dict_to_ndarray(scaler.var_) + + # sklearn + scaler_sklearn = SklearnStandardScaler() + scaler_sklearn.fit(df_pandas[input_cols]) + + np.testing.assert_allclose(actual_scale, scaler_sklearn.scale_) + np.testing.assert_allclose(actual_mean, scaler_sklearn.mean_) + np.testing.assert_allclose(actual_var, scaler_sklearn.var_) + def test_transform(self) -> None: """ Verify transformed results. 
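The test hunks above all migrate from the old two-value `gen_fuzz_data` contract, followed by manual `pd.DataFrame(...)` and `create_dataframe(...)` construction, to a three-value contract plus `utils.get_df`. The helpers themselves are not part of this patch, so the following is only a minimal sketch of the assumed new contract, inferred from the call sites: the `DataType` enum values, the column naming, and the identifier normalization are assumptions, not the library's actual implementation.

import enum
import random
from typing import Any, List, Tuple, Union

import pandas as pd
from snowflake.snowpark import DataFrame, Session


class DataType(enum.Enum):
    INTEGER = "integer"
    FLOAT = "float"


def gen_fuzz_data(
    rows: int,
    types: List[DataType],
    low: Union[int, List[int]] = 0,
    high: Union[int, List[int]] = 100,
) -> Tuple[List[List[Any]], List[str], List[str]]:
    """Assumed contract: return (data, pd_schema, sf_schema).

    pd_schema carries the names used to build the pandas DataFrame, while
    sf_schema carries the identifiers as Snowflake resolves them, so tests
    construct frames with pd_schema and reference columns with sf_schema.
    """
    lows = low if isinstance(low, list) else [low] * len(types)
    highs = high if isinstance(high, list) else [high] * len(types)
    data: List[List[Any]] = []
    for row_id in range(rows):
        row: List[Any] = [row_id]  # leading ID column
        for dtype, lo, hi in zip(types, lows, highs):
            val = random.uniform(lo, hi)
            row.append(int(val) if dtype is DataType.INTEGER else val)
        data.append(row)
    pd_schema = ["ID"] + [f"COL_{i}" for i in range(len(types))]
    sf_schema = [name.upper() for name in pd_schema]  # assumed identifier normalization
    return data, pd_schema, sf_schema


def get_df(session: Session, data: List[List[Any]], schema: List[str]) -> Tuple[pd.DataFrame, DataFrame]:
    """Build the pandas frame once and derive the Snowpark frame from it."""
    pandas_df = pd.DataFrame(data, columns=schema)
    return pandas_df, session.create_dataframe(pandas_df)

Under this assumed split, `_PD_SCHEMA` feeds DataFrame construction while `_SF_SCHEMA` supplies the identifiers used in metric calls, which is why the `_Y_TRUE_COL`-style constants above now index `_SF_SCHEMA`.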
diff --git a/tests/integ/snowflake/ml/registry/BUILD.bazel b/tests/integ/snowflake/ml/registry/BUILD.bazel index 85521799..50e2f815 100644 --- a/tests/integ/snowflake/ml/registry/BUILD.bazel +++ b/tests/integ/snowflake/ml/registry/BUILD.bazel @@ -4,7 +4,7 @@ py_test( name = "model_registry_basic_integ_test", srcs = ["model_registry_basic_integ_test.py"], deps = [ - "//snowflake/ml/registry:_ml_artifact", + "//snowflake/ml/registry:artifact_manager", "//snowflake/ml/registry:model_registry", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", @@ -25,6 +25,17 @@ py_test( ], ) +py_test( + name = "model_registry_compat_test", + timeout = "long", + srcs = ["model_registry_compat_test.py"], + deps = [ + "//snowflake/ml/registry:model_registry", + "//tests/integ/snowflake/ml/test_utils:common_test_base", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], +) + py_test( name = "model_registry_schema_evolution_integ_test", timeout = "long", @@ -63,5 +74,6 @@ py_test( name = "model_registry_snowservice_merge_gate_integ_test", timeout = "eternal", # 3600s srcs = ["model_registry_snowservice_merge_gate_integ_test.py"], + shard_count = 2, deps = [":model_registry_snowservice_integ_test_base"], ) diff --git a/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py index 7244ec29..7f9ce834 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py @@ -1,10 +1,10 @@ -import json import uuid from typing import Optional from absl.testing import absltest, parameterized -from snowflake.ml.registry import _ml_artifact, model_registry +from snowflake.ml.registry import model_registry +from snowflake.ml.registry.artifact import Artifact, ArtifactType from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.test_utils import db_manager @@ -168,7 +168,7 @@ def test_create_and_drop_model_registry(self, database_name: str, schema_name: O self._validate_restore_db_and_schema() def test_add_and_delete_ml_artifacts(self) -> None: - """Test add_artifact() and delete_artifact() in `_ml_artifact.py` works as expected.""" + """Test add() and delete() in `_artifact_manager.py` work as expected.""" artifact_registry = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( _RUN_ID, "artifact_registry" @@ -179,78 +179,51 @@ def test_add_and_delete_ml_artifacts(self) -> None: model_registry.create_model_registry( session=self._session, database_name=artifact_registry, schema_name=artifact_registry_schema ) + registry = model_registry.ModelRegistry( + session=self._session, database_name=artifact_registry, schema_name=artifact_registry_schema + ) except Exception as e: self._db_manager.drop_database(artifact_registry) raise Exception(f"Test failed with exception:{e}") - artifact_id = "123" - artifact_type = _ml_artifact.ArtifactType.TESTTYPE - artifact_name = "test_artifact" artifact_version = "test_artifact_version" - artifact_spec = {"test_property": "test_value"} + artifact_id = "test_art_123" artifact_version = "test_artifact_version" + artifact_name = "test_artifact" + artifact = Artifact(type=ArtifactType.DATASET, spec='{"test_property": "test_value"}') try: - self.assertTrue( - _ml_artifact.if_artifact_table_exists(self._session, artifact_registry, artifact_registry_schema) - ) - - # Validate `add_artifact()` can insert entry into the artifact table -
self.assertFalse( - _ml_artifact.if_artifact_exists( - self._session, - artifact_registry, - artifact_registry_schema, - artifact_id=artifact_id, - artifact_type=artifact_type, - ) - ) - _ml_artifact.add_artifact( - self._session, - artifact_registry, - artifact_registry_schema, + art_ref = registry._artifact_manager.add( + artifact=artifact, artifact_id=artifact_id, - artifact_type=artifact_type, artifact_name=artifact_name, artifact_version=artifact_version, - artifact_spec=artifact_spec, ) + self.assertTrue( - _ml_artifact.if_artifact_exists( - self._session, - artifact_registry, - artifact_registry_schema, - artifact_id=artifact_id, - artifact_type=artifact_type, + registry._artifact_manager.exists( + art_ref.name, + art_ref.version, ) ) # Validate the artifact_spec can be parsed as expected - artifact_df = _ml_artifact._get_artifact( - self._session, - artifact_registry, - artifact_registry_schema, - artifact_id=artifact_id, - artifact_type=artifact_type, + retrieved_art_df = registry._artifact_manager.get( + art_ref.name, + art_ref.version, ) - actual_artifact_spec_str = artifact_df.collect()[0]["ARTIFACT_SPEC"] - actual_artifact_spec_dict = json.loads(actual_artifact_spec_str) - self.assertDictEqual(artifact_spec, actual_artifact_spec_dict) + + actual_artifact_spec = retrieved_art_df.collect()[0]["ARTIFACT_SPEC"] + self.assertEqual(artifact._spec, actual_artifact_spec) # Validate that `delete_artifact` can remove entries from the artifact table. - _ml_artifact.delete_artifact( - self._session, - artifact_registry, - artifact_registry_schema, - artifact_id=artifact_id, - artifact_type=artifact_type, + registry._artifact_manager.delete( + art_ref.name, + art_ref.version, ) self.assertFalse( - _ml_artifact.if_artifact_exists( - self._session, - artifact_registry, - artifact_registry_schema, - artifact_id=artifact_id, - artifact_type=artifact_type, + registry._artifact_manager.exists( + art_ref.name, + art_ref.version, ) ) finally: diff --git a/tests/integ/snowflake/ml/registry/model_registry_compat_test.py b/tests/integ/snowflake/ml/registry/model_registry_compat_test.py new file mode 100644 index 00000000..5a5a21e4 --- /dev/null +++ b/tests/integ/snowflake/ml/registry/model_registry_compat_test.py @@ -0,0 +1,62 @@ +import uuid +from typing import Callable, Tuple + +from absl.testing import absltest + +from snowflake.ml.registry import model_registry +from snowflake.snowpark import session +from tests.integ.snowflake.ml.test_utils import common_test_base, db_manager + + +class ModelRegistryCompatTest(common_test_base.CommonTestBase): + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + super().setUp() + self.run_id = uuid.uuid4().hex + self._db_manager = db_manager.DBManager(self.session) + self.current_db = self.session.get_current_database() + self.current_schema = self.session.get_current_schema() + + def _prepare_registry_fn_factory( + self, + ) -> Tuple[Callable[[session.Session, str], None], Tuple[str]]: + self.registry_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "registry_db") + + def prepare_registry(session: session.Session, registry_name: str) -> None: + from snowflake.connector.errors import ProgrammingError + from snowflake.ml.registry import model_registry + + try: + model_registry.create_model_registry(session=session, database_name=registry_name) + except ProgrammingError: + # Previous versions of the library call `use` even in the sproc env, which is not allowed.
+ # This suppresses the resulting error. + pass + + return prepare_registry, (self.registry_name,) + + # Start from 1.0.1, since there was a breaking change at that time. + # TODO: mypy reports the error `Cannot infer type argument 1 of "compatibility_test" of "CommonTestBase" [misc]`. + # Figure out the reason and remove the ignore. + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_prepare_registry_fn_factory, version_range=">=1.0.1,<=1.0.9" # type: ignore[misc] + ) + def test_open_registry_compat_v0(self) -> None: + try: + with self.assertRaisesRegex( + RuntimeError, r"Registry schema version \([0-9]+\) is ahead of deployed schema \(0\)." + ): + model_registry.ModelRegistry( + session=self.session, database_name=self.registry_name, create_if_not_exists=False + ) + model_registry.ModelRegistry( + session=self.session, database_name=self.registry_name, create_if_not_exists=True + ) + finally: + self._db_manager.drop_database(self.registry_name, if_exists=True) + self.session.use_database(self.current_db) + self.session.use_schema(self.current_schema) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py index f280c2cb..83f6cd92 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py @@ -8,7 +8,8 @@ from snowflake import connector from snowflake.ml.dataset import dataset -from snowflake.ml.registry import _ml_artifact, model_registry +from snowflake.ml.registry import model_registry +from snowflake.ml.registry.artifact import ArtifactType from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.test_utils import ( @@ -63,7 +64,7 @@ def test_basic_workflow(self) -> None: model=model, tags=model_tags, conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python") ], sample_input_data=test_features, options={"embed_local_ml_library": True}, @@ -78,7 +79,7 @@ def test_basic_workflow(self) -> None: model=model, tags={"stage": "testing", "classifier_type": "svm.SVC"}, conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python") ], sample_input_data=test_features, options={"embed_local_ml_library": True}, @@ -277,7 +278,7 @@ def test_snowml_model(self) -> None: model_version=model_version, model=model, conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python") ], options={"embed_local_ml_library": True}, ) @@ -326,7 +327,7 @@ def test_snowml_pipeline(self) -> None: model_version=model_version, model=model, conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python") ], options={"embed_local_ml_library": True}, ) @@ -389,63 +390,54 @@ def test_log_model_with_dataset(self) -> None: desc="a dummy dataset metadata", ) cur_user = self._session.sql("SELECT
CURRENT_USER()").collect()[0]["CURRENT_USER()"] - self.assertEqual(dummy_dataset.id, dummy_dataset.id) self.assertEqual(dummy_dataset.owner, cur_user) - self.assertIsNotNone(dummy_dataset.name, dummy_snapshot_table_full_path) + self.assertIsNone(dummy_dataset.name) self.assertIsNotNone(dummy_dataset.generation_timestamp) minimal_dataset = dataset.Dataset( self._session, df=self._session.sql(spine_query), ) - self.assertEqual(minimal_dataset.id, minimal_dataset.id) self.assertEqual(minimal_dataset.owner, cur_user) - self.assertEqual(minimal_dataset.name, "") + self.assertIsNone(minimal_dataset.name) + self.assertIsNone(minimal_dataset.version) self.assertIsNotNone(minimal_dataset.generation_timestamp) - with self.assertRaisesRegex( - ValueError, - "Only one of sample_input_data and dataset should be provided.", - ): - registry.log_model( - model_name=model_name, - model_version=model_version, - model=model, - conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") - ], - sample_input_data=test_features, - dataset=dummy_dataset, - options={"embed_local_ml_library": True}, - ) - test_combinations = [ (model_version, dummy_dataset), (f"{model_version}.2", dummy_dataset), (f"{model_version}.3", minimal_dataset), ] for version, ds in test_combinations: + atf_ref = registry.log_artifact( + artifact=ds, + name=f"ds_{version}", + version=f"{version}.ds", + ) + self.assertEqual(atf_ref.name, f"ds_{version}") + self.assertEqual(atf_ref.version, f"{version}.ds") + registry.log_model( model_name=model_name, model_version=version, model=model, conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python") ], options={"embed_local_ml_library": True}, - dataset=ds, + artifacts=[atf_ref], ) - # test deserialized dataset from get_dataset - des_ds_0 = registry.get_dataset(model_name, version) + # test deserialized dataset from get_artifact + des_ds_0 = registry.get_artifact(atf_ref.name, atf_ref.version) self.assertIsNotNone(des_ds_0) self.assertEqual(des_ds_0, ds) # test deserialized dataset from list_artifacts rows_list = registry.list_artifacts(model_name, version).collect() self.assertEqual(len(rows_list), 1) - self.assertEqual(rows_list[0]["ID"], ds.id) - self.assertEqual(_ml_artifact.ArtifactType[rows_list[0]["TYPE"]], _ml_artifact.ArtifactType.DATASET) + self.assertEqual(rows_list[0]["ID"], des_ds_0._id) + self.assertEqual(ArtifactType[rows_list[0]["TYPE"]], ArtifactType.DATASET) des_ds_1 = dataset.Dataset.from_json(rows_list[0]["ARTIFACT_SPEC"], self._session) self.assertEqual(des_ds_1, ds) diff --git a/tests/integ/snowflake/ml/registry/model_registry_schema_evolution_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_schema_evolution_integ_test.py index 5af0f8de..c4f316c6 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_schema_evolution_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_schema_evolution_integ_test.py @@ -296,7 +296,7 @@ def test_api_schema_validation(self) -> None: model_version="v1", model=model, conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self.session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self.session, "snowflake-snowpark-python") ], ) @@ -314,7 +314,7 @@ def test_api_schema_validation(self) -> None: model_version="v2", model=model, 
conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self.session, "snowflake-snowpark-python") + test_env_utils.get_latest_package_version_spec_in_server(self.session, "snowflake-snowpark-python") ], ) diff --git a/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py b/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py index 71425784..899e5d25 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py +++ b/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py @@ -49,7 +49,6 @@ def _test_snowservice_deployment( embed_local_ml_library: Optional[bool] = True, omit_target_method_when_deploy: bool = False, ) -> None: - model, test_features, *_ = prepare_model_and_feature_fn() if omit_target_method_when_deploy: target_method = deployment_options.pop("target_method") @@ -65,7 +64,7 @@ def _test_snowservice_deployment( # Instead we rely on snowpark version on information.schema table. Note that this will not affect end user # as by the time they use it, the latest snowpark should be available in conda already. conda_dependencies = conda_dependencies or [] - conda_dependencies.append(test_env_utils.get_latest_package_versions_in_conda("snowflake-snowpark-python")) + conda_dependencies.append(test_env_utils.get_latest_package_version_spec_in_conda("snowflake-snowpark-python")) self.registry.log_model( model_name=model_name, diff --git a/tests/integ/snowflake/ml/test_utils/BUILD.bazel b/tests/integ/snowflake/ml/test_utils/BUILD.bazel index 91f7969b..2cde419f 100644 --- a/tests/integ/snowflake/ml/test_utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/test_utils/BUILD.bazel @@ -38,6 +38,7 @@ py_library( ], deps = [ ":_snowml_requirements", + ":test_env_utils", "//snowflake/ml/_internal:file_utils", "//snowflake/ml/utils:connection_params", ], diff --git a/tests/integ/snowflake/ml/test_utils/common_test_base.py b/tests/integ/snowflake/ml/test_utils/common_test_base.py index 3228c590..fdbe874c 100644 --- a/tests/integ/snowflake/ml/test_utils/common_test_base.py +++ b/tests/integ/snowflake/ml/test_utils/common_test_base.py @@ -1,19 +1,40 @@ import functools import inspect +import itertools import os import tempfile -from typing import Any, Callable, Type, TypeVar +from typing import Any, Callable, List, Literal, Optional, Tuple, Type, TypeVar import cloudpickle from absl.testing import absltest, parameterized +from typing_extensions import Concatenate, ParamSpec from snowflake.ml._internal import file_utils from snowflake.ml.utils import connection_params from snowflake.snowpark import functions as F, session -from snowflake.snowpark._internal import utils as snowpark_utils -from tests.integ.snowflake.ml.test_utils import _snowml_requirements - -T = TypeVar("T") +from snowflake.snowpark._internal import udf_utils, utils as snowpark_utils +from tests.integ.snowflake.ml.test_utils import _snowml_requirements, test_env_utils + +_V = TypeVar("_V", bound="CommonTestBase") +_T_args = ParamSpec("_T_args") +_R_args = TypeVar("_R_args") + + +def get_function_body(func: Callable[..., Any]) -> str: + source_lines = inspect.getsourcelines(func)[0] + source_lines_generator = itertools.dropwhile(lambda x: x.startswith("@"), source_lines) + first_line: str = next(source_lines_generator) + indentation = len(first_line) - len(first_line.lstrip()) + first_line = first_line.strip() + if not first_line.startswith("def "): + return first_line.rsplit(":")[-1].strip() + elif not 
first_line.endswith(":"): + for line in source_lines_generator: + line = line.strip() + if line.endswith(":"): + break + # Return the remaining body lines, dedented by the signature's indentation + return "".join([line[indentation:] for line in source_lines_generator]) class CommonTestBase(parameterized.TestCase): @@ -21,84 +42,170 @@ def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" self.session = ( session.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() - if not snowpark_utils.is_in_stored_procedure() + if not snowpark_utils.is_in_stored_procedure() # type: ignore[no-untyped-call] # else session._get_active_session() ) def tearDown(self) -> None: - if not snowpark_utils.is_in_stored_procedure(): + if not snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] self.session.close() @classmethod def sproc_test( - kclass: Type["CommonTestBase"], local: bool = True - ) -> Callable[[Callable[["CommonTestBase", T], None]], Callable[["CommonTestBase", T], None]]: - def decorator(fn: Callable[["CommonTestBase", T], None]) -> Callable[["CommonTestBase", T], None]: + kclass: Type[_V], local: bool = True, test_callers_rights: bool = True + ) -> Callable[[Callable[Concatenate[_V, _T_args], None]], Callable[Concatenate[_V, _T_args], None]]: + def decorator(fn: Callable[Concatenate[_V, _T_args], None]) -> Callable[Concatenate[_V, _T_args], None]: @functools.wraps(fn) - def test_wrapper(self: "CommonTestBase", *args: Any, **kwargs: Any) -> None: - if snowpark_utils.is_in_stored_procedure(): + def test_wrapper(self: _V, /, *args: _T_args.args, **kwargs: _T_args.kwargs) -> None: + if snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] fn(self, *args, **kwargs) return if local: - fn(self, *args, **kwargs) + with self.subTest("Local Test"): + fn(self, *args, **kwargs) + + def _in_sproc_test(execute_as: Literal["owner", "caller"] = "owner") -> None: + test_module = inspect.getmodule(fn) + assert test_module + cloudpickle.register_pickle_by_value(test_module) + assert test_module.__file__ + test_module_path = os.path.abspath(test_module.__file__) + ind = test_module_path.rfind(f"tests{os.sep}") + assert ind > 0 + rel_path = test_module_path[ind:] + rel_path = os.path.splitext(rel_path)[0] + test_module_name = rel_path.replace(os.sep, ".") + test_name = f"{test_module_name}.{fn.__qualname__}" + + with tempfile.TemporaryDirectory() as tmpdir: + snowml_path, snowml_start_path = file_utils.get_package_path("snowflake.ml") + + snowml_zip_module_filename = os.path.join(tmpdir, "snowflake-ml-python.zip") + with file_utils.zip_file_or_directory_to_stream(snowml_path, snowml_start_path) as input_stream: + with open(snowml_zip_module_filename, "wb") as f: + f.write(input_stream.getbuffer()) + + tests_path, tests_start_path = file_utils.get_package_path("tests") + + tests_zip_module_filename = os.path.join(tmpdir, "snowflake-ml-test.zip") + with file_utils.zip_file_or_directory_to_stream(tests_path, tests_start_path) as input_stream: + with open(tests_zip_module_filename, "wb") as f: + f.write(input_stream.getbuffer()) + + imports = [snowml_zip_module_filename, tests_zip_module_filename] + packages = [ + req for req in _snowml_requirements.REQUIREMENTS if "snowflake-connector-python" not in req + ] + + @F.sproc( # type: ignore[misc] + is_permanent=False, + packages=packages, # type: ignore[arg-type] + replace=True, + session=self.session, + anonymous=(execute_as == "caller"), + imports=imports, # type: ignore[arg-type] + execute_as=execute_as, + ) + def
test_in_sproc(sess: session.Session, test_name: str) -> None: + import unittest + + loader = unittest.TestLoader() + + suite = loader.loadTestsFromName(test_name) + result = unittest.TextTestRunner(verbosity=2, failfast=False).run(suite) + if len(result.errors) > 0 or len(result.failures) > 0: + raise RuntimeError( + "Unit test failed unexpectedly with at least one error. " + f"Errors: {result.errors} Failures: {result.failures}" + ) + if result.testsRun == 0: + raise RuntimeError("Unit test does not run any test.") + + test_in_sproc(self.session, test_name) + + cloudpickle.unregister_pickle_by_value(test_module) + + with self.subTest("In-sproc Test (Owner's rights)"): + _in_sproc_test(execute_as="owner") + + if test_callers_rights: + with self.subTest("In-sproc Test (Caller's rights)"): + _in_sproc_test(execute_as="caller") - test_module = inspect.getmodule(fn) - assert test_module - cloudpickle.register_pickle_by_value(test_module) - assert test_module.__file__ - test_module_path = os.path.abspath(test_module.__file__) - ind = test_module_path.rfind(f"tests{os.sep}") - assert ind > 0 - rel_path = test_module_path[ind:] - rel_path = os.path.splitext(rel_path)[0] - test_module_name = rel_path.replace(os.sep, ".") - test_name = f"{test_module_name}.{fn.__qualname__}" - - with tempfile.TemporaryDirectory() as tmpdir: - snowml_path, snowml_start_path = file_utils.get_package_path("snowflake.ml") - - snowml_zip_module_filename = os.path.join(tmpdir, "snowflake-ml-python.zip") - with file_utils.zip_file_or_directory_to_stream(snowml_path, snowml_start_path) as input_stream: - with open(snowml_zip_module_filename, "wb") as f: - f.write(input_stream.getbuffer()) - - tests_path, tests_start_path = file_utils.get_package_path("tests") - - tests_zip_module_filename = os.path.join(tmpdir, "snowflake-ml-test.zip") - with file_utils.zip_file_or_directory_to_stream(tests_path, tests_start_path) as input_stream: - with open(tests_zip_module_filename, "wb") as f: - f.write(input_stream.getbuffer()) - - imports = [snowml_zip_module_filename, tests_zip_module_filename] - packages = [ - req for req in _snowml_requirements.REQUIREMENTS if "snowflake-connector-python" not in req - ] - - @F.sproc( - is_permanent=False, - packages=packages, - replace=True, - session=self.session, - anonymous=True, - imports=imports, - ) - def test_in_sproc(sess: session.Session, test_name: str) -> None: - import unittest - - loader = unittest.TestLoader() - - suite = loader.loadTestsFromName(test_name) - result = unittest.TextTestRunner(verbosity=2, failfast=False).run(suite) - if len(result.errors) > 0 or len(result.failures) > 0: - raise RuntimeError( - "Unit test failed unexpectedly with at least one error. 
" - f"Errors: {result.errors} Failures: {result.failures}" + return test_wrapper + + return decorator + + @classmethod + def compatibility_test( + kclass: Type[_V], + prepare_fn_factory: Callable[[_V], Tuple[Callable[[session.Session, _R_args], None], _R_args]], + version_range: Optional[str] = None, + additional_packages: Optional[List[str]] = None, + ) -> Callable[[Callable[Concatenate[_V, _T_args], None]], Callable[Concatenate[_V, _T_args], None]]: + def decorator(fn: Callable[Concatenate[_V, _T_args], None]) -> Callable[Concatenate[_V, _T_args], None]: + @functools.wraps(fn) + def test_wrapper(self: _V, /, *args: _T_args.args, **kwargs: _T_args.kwargs) -> None: + prepare_fn, prepare_fn_args = prepare_fn_factory(self) + if additional_packages: + packages = additional_packages + else: + packages = [] + + _, _, return_type, input_types = udf_utils.extract_return_input_types( + prepare_fn, return_type=None, input_types=None, object_type=snowpark_utils.TempObjectType.PROCEDURE + ) + + func_body = get_function_body(prepare_fn) + func_params = inspect.signature(prepare_fn).parameters + func_name = prepare_fn.__name__ + + seen_first_arg = False + first_arg_name = None + arg_list = [] + for arg_name in func_params.keys(): + if not seen_first_arg: + seen_first_arg = True + first_arg_name = arg_name + else: + arg_list.append(arg_name) + + assert first_arg_name is not None, "The prepare function must have at least one argument" + func_source = f""" +import snowflake.snowpark + +def {func_name}({first_arg_name}: snowflake.snowpark.Session, {", ".join(arg_list)}): +{func_body} +""" + + for pkg_ver in test_env_utils.get_package_versions_in_server( + self.session, f"snowflake-ml-python{version_range}" + ): + with self.subTest(f"Testing with snowflake-ml-python version {pkg_ver}"): + final_packages = packages[:] + [f"snowflake-ml-python=={pkg_ver}"] + + with tempfile.NamedTemporaryFile( + "w", encoding="utf-8", suffix=".py", delete=False + ) as temp_file: + temp_file.write(func_source) + temp_file.flush() + + # Instead of using decorator, we register from file to prevent pickling anything from + # current env. 
diff --git a/tests/integ/snowflake/ml/test_utils/model_factory.py b/tests/integ/snowflake/ml/test_utils/model_factory.py
index 1d36c955..7516f20d 100644
--- a/tests/integ/snowflake/ml/test_utils/model_factory.py
+++ b/tests/integ/snowflake/ml/test_utils/model_factory.py
@@ -18,7 +18,7 @@
     OneHotEncoder,
 )
 from snowflake.ml.modeling.xgboost import XGBClassifier  # type: ignore[attr-defined]
-from snowflake.snowpark import DataFrame, Session
+from snowflake.snowpark import DataFrame, Session, functions, types


 class DEVICE(Enum):
@@ -88,9 +88,24 @@ def add_simple_category(df: pd.DataFrame) -> pd.DataFrame:
         df["SIMPLE"] = categories
         return df

+    # Add a string column to the dataset.
     df_cat = add_simple_category(df)
     iris_df = session.create_dataframe(df_cat)

+    fields = iris_df.schema.fields
+    # Cast the first DoubleType column to DecimalType so tests also cover decimal inputs.
+    selected_cols = []
+    count = 0
+    for field in fields:
+        src = field.column_identifier.quoted_name
+        if isinstance(field.datatype, types.DoubleType) and count == 0:
+            dest = types.DecimalType(15, 10)
+            selected_cols.append(functions.cast(functions.col(src), dest).alias(src))
+            count += 1
+        else:
+            selected_cols.append(functions.col(src))
+    iris_df = iris_df.select(selected_cols)
+
     numeric_features = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"]
     categorical_features = ["SIMPLE"]
     numeric_features_output = [x + "_O" for x in numeric_features]
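(Not part of the patch.) The loop above rewrites the projection so that exactly one DoubleType column arrives as DECIMAL(15, 10). A standalone sketch of the same pattern, with the input DataFrame assumed given:

# Sketch of the cast pattern above; `df` is assumed to be a Snowpark DataFrame.
from snowflake.snowpark import DataFrame, functions, types


def cast_first_double_to_decimal(df: DataFrame) -> DataFrame:
    selected_cols = []
    cast_done = False
    for field in df.schema.fields:
        src = field.column_identifier.quoted_name
        if isinstance(field.datatype, types.DoubleType) and not cast_done:
            # DECIMAL(15, 10): 15 total digits, 10 after the decimal point.
            selected_cols.append(
                functions.cast(functions.col(src), types.DecimalType(15, 10)).alias(src)
            )
            cast_done = True
        else:
            selected_cols.append(functions.col(src))
    return df.select(selected_cols)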
diff --git a/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py b/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py
index 00e696fd..7421d355 100644
--- a/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py
+++ b/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py
@@ -15,6 +15,8 @@ class SpcsIntegTestBase(absltest.TestCase):
     _RUN_ID = uuid.uuid4().hex[:2]
     _TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper()
     _TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper()
+    _TEST_STAGE = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "stage").upper()
+    _TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper()

     @classmethod
     def setUpClass(cls) -> None:
@@ -35,6 +37,7 @@ def setUpClass(cls) -> None:
         cls._db_manager = db_manager.DBManager(cls._session)
         cls._db_manager.create_database(cls._TEST_DB)
         cls._db_manager.create_schema(cls._TEST_SCHEMA)
+        cls._db_manager.create_stage(cls._TEST_STAGE, cls._TEST_SCHEMA, cls._TEST_DB, sse_encrypted=True)
         cls._db_manager.cleanup_databases(expire_hours=6)

     @classmethod
diff --git a/tests/integ/snowflake/ml/test_utils/test_env_utils.py b/tests/integ/snowflake/ml/test_utils/test_env_utils.py
index 7f4437a8..248e6987 100644
--- a/tests/integ/snowflake/ml/test_utils/test_env_utils.py
+++ b/tests/integ/snowflake/ml/test_utils/test_env_utils.py
@@ -1,8 +1,9 @@
 import functools
 import textwrap
+from typing import List

 import requests
-from packaging import version
+from packaging import requirements, version

 import snowflake.connector
 from snowflake.ml._internal import env
@@ -11,15 +12,18 @@

 @functools.lru_cache
-def get_latest_package_versions_in_server(
-    session: session.Session, package_name: str, python_version: str = env.PYTHON_VERSION
-) -> str:
+def get_package_versions_in_server(
+    session: session.Session,
+    package_req_str: str,
+    python_version: str = env.PYTHON_VERSION,
+) -> List[version.Version]:
+    package_req = requirements.Requirement(package_req_str)
     parsed_python_version = version.Version(python_version)
     sql = textwrap.dedent(
         f"""
         SELECT PACKAGE_NAME, VERSION
         FROM information_schema.packages
-        WHERE package_name = '{package_name}'
+        WHERE package_name = '{package_req.name}'
         AND language = 'python'
         AND runtime_version = '{parsed_python_version.major}.{parsed_python_version.minor}';
         """
     )
@@ -40,14 +44,29 @@ def get_latest_package_versions_in_server(
             req_ver = version.parse(row["VERSION"])
             version_list.append(req_ver)
     except snowflake.connector.DataError:
-        return package_name
-    if len(version_list) == 0:
-        return package_name
-    return f"{package_name}=={max(version_list)}"
+        return []
+    available_version_list = list(package_req.specifier.filter(version_list))
+    return available_version_list
+
+
+@functools.lru_cache
+def get_latest_package_version_spec_in_server(
+    session: session.Session,
+    package_req_str: str,
+    python_version: str = env.PYTHON_VERSION,
+) -> str:
+    package_req = requirements.Requirement(package_req_str)
+    available_version_list = get_package_versions_in_server(session, package_req_str, python_version)
+    if len(available_version_list) == 0:
+        return str(package_req)
+    return f"{package_req.name}=={max(available_version_list)}"


 @functools.lru_cache
-def get_latest_package_versions_in_conda(package_name: str, python_version: str = env.PYTHON_VERSION) -> str:
+def get_package_versions_in_conda(
+    package_req_str: str, python_version: str = env.PYTHON_VERSION
+) -> List[version.Version]:
+    package_req = requirements.Requirement(package_req_str)
     repodata_url = "https://repo.anaconda.com/pkgs/snowflake/linux-64/repodata.json"

     parsed_python_version = version.Version(python_version)
@@ -65,15 +84,25 @@ def get_latest_package_versions_in_conda(package_name: str, python_version: str
             packages_info = repodata["packages"]
             assert isinstance(packages_info, dict)
             for package_info in packages_info.values():
-                if package_info["name"] == package_name and python_version_build_str in package_info["build"]:
+                if package_info["name"] == package_req.name and python_version_build_str in package_info["build"]:
                     version_list.append(version.parse(package_info["version"]))
-            return f"{package_name}=={str(max(version_list))}"
+            available_version_list = list(package_req.specifier.filter(version_list))
+            return available_version_list
         except Exception as e:
             max_retry -= 1
             exc_list.append(e)

     raise RuntimeError(
-        f"Failed to get latest version of package {package_name} in Snowflake Anaconda Channel. "
+        f"Failed to get the latest version of package {package_req} in the Snowflake Anaconda channel. "
         + "Exceptions are " + ", ".join(map(str, exc_list))
     )
+
+
+@functools.lru_cache
+def get_latest_package_version_spec_in_conda(package_req_str: str, python_version: str = env.PYTHON_VERSION) -> str:
+    package_req = requirements.Requirement(package_req_str)
+    available_version_list = get_package_versions_in_conda(package_req_str, python_version)
+    if len(available_version_list) == 0:
+        return str(package_req)
+    return f"{package_req.name}=={max(available_version_list)}"
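(Not part of the patch.) A quick sketch of how the reworked helpers compose; `session` is assumed to be an existing Snowpark session, and the requirement strings are illustrative:

# Usage sketch; `session` and the requirement strings are assumptions.
from packaging import version

from tests.integ.snowflake.ml.test_utils import test_env_utils

# Every server-side version satisfying the specifier, as Version objects:
vers = test_env_utils.get_package_versions_in_server(session, "snowflake-ml-python>=1.0.8,<1.1")
assert all(isinstance(v, version.Version) for v in vers)

# Pinned spec for the newest matching conda package, e.g. "xgboost==1.7.3";
# falls back to the original requirement string when nothing matches:
spec = test_env_utils.get_latest_package_version_spec_in_conda("xgboost<2")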