From f3a83fbb9ff1aef3b55e607f9474aa71fbb991a9 Mon Sep 17 00:00:00 2001 From: Snowflake Provisioner <58576687+snowflake-provisioner@users.noreply.github.com> Date: Thu, 17 Aug 2023 19:33:42 -0700 Subject: [PATCH] Project import generated by Copybara. (#34) GitOrigin-RevId: b8f053f94d5a20e5b28e275eac6c9f0cd1b10eee Co-authored-by: Snowflake Authors --- .bazelrc | 15 +- .gitignore | 2 +- CHANGELOG.md | 20 +- CONTRIBUTING.md | 361 ++++++++++ WORKSPACE | 45 +- bazel/environments/BUILD.bazel | 5 + bazel/environments/conda-env-build.yml | 18 + .../environments/conda-env-snowflake.yml | 14 +- .../environments/conda-env.yml | 15 +- bazel/environments/fetch_conda_env_config.bzl | 40 ++ bazel/get_affected_targets.sh | 7 +- bazel/mypy/main.py | 69 +- bazel/mypy/mypy.bzl | 135 ++-- bazel/mypy/mypy.sh.tpl | 47 -- bazel/py_rules.bzl | 12 + bazel/requirements/BUILD.bazel | 19 +- bazel/requirements/changelog_version_check.sh | 15 + bazel/requirements/requirements.schema.json | 4 +- bazel/requirements/rules.bzl | 12 +- ci/RunTests.sh | 39 +- ci/build_and_run_tests.sh | 34 +- ci/conda_recipe/bld.bat | 12 + ci/conda_recipe/meta.yaml | 6 +- ci/get_excluded_tests.sh | 2 +- ci/skip_merge_gate_targets | 2 + ci/type_check.sh | 7 +- ci/type_ignored_targets | 35 +- codegen/codegen_rules.bzl | 14 +- codegen/estimator_autogen_tool.py | 2 +- codegen/sklearn_wrapper_generator.py | 2 +- codegen/sklearn_wrapper_template.py_template | 237 ++++--- ...nsformer_autogen_test_template.py_template | 37 +- requirements.yml | 143 ++-- .../ml/_internal/exceptions/error_codes.py | 5 + snowflake/ml/_internal/telemetry_test.py | 2 +- snowflake/ml/_internal/utils/identifier.py | 15 + .../ml/_internal/utils/pkg_version_utils.py | 5 +- .../_internal/utils/query_result_checker.py | 8 +- snowflake/ml/model/BUILD.bazel | 24 +- .../_deploy_client/image_builds/BUILD.bazel | 1 - .../image_builds/client_image_builder.py | 16 +- .../image_builds/client_image_builder_test.py | 6 +- .../image_builds/docker_context.py | 8 +- 
.../image_builds/docker_context_test.py | 9 +- .../image_builds/gunicorn_run.sh | 32 +- .../image_builds/inference_server/main.py | 10 +- .../inference_server/main_test.py | 25 +- .../test_fixtures/dockerfile_test_gpu_fixture | 29 - .../_deploy_client/snowservice/deploy.py | 37 +- .../snowservice/deploy_options.py | 12 +- .../_deploy_client/snowservice/deploy_test.py | 5 + .../templates/service_spec_template | 2 + .../utils/snowservice_client.py | 16 +- .../_deploy_client/warehouse/BUILD.bazel | 1 - .../model/_deploy_client/warehouse/deploy.py | 3 +- snowflake/ml/model/_deployer.py | 46 +- snowflake/ml/model/_handlers/xgboost.py | 8 + snowflake/ml/model/_model_test.py | 84 ++- snowflake/ml/model/_signatures/BUILD.bazel | 35 +- .../ml/model/_signatures/builtins_handler.py | 16 +- .../ml/model/_signatures/builtins_test.py | 9 +- snowflake/ml/model/_signatures/core.py | 65 +- snowflake/ml/model/_signatures/core_test.py | 29 +- .../ml/model/_signatures/numpy_handler.py | 14 +- snowflake/ml/model/_signatures/numpy_test.py | 9 +- .../ml/model/_signatures/pandas_handler.py | 57 +- snowflake/ml/model/_signatures/pandas_test.py | 55 +- .../ml/model/_signatures/pytorch_handler.py | 19 +- .../ml/model/_signatures/pytorch_test.py | 55 +- .../ml/model/_signatures/snowpark_handler.py | 33 +- .../ml/model/_signatures/snowpark_test.py | 23 +- .../model/_signatures/tensorflow_handler.py | 29 +- .../ml/model/_signatures/tensorflow_test.py | 45 +- snowflake/ml/model/_signatures/utils.py | 44 +- snowflake/ml/model/_signatures/utils_test.py | 7 +- snowflake/ml/model/deploy_platforms.py | 6 + snowflake/ml/model/model_signature.py | 220 ++++-- snowflake/ml/model/model_signature_test.py | 235 +++++-- snowflake/ml/model/type_hints.py | 15 +- .../ml/modeling/impute/simple_imputer.py | 2 +- snowflake/ml/modeling/metrics/monitor.py | 95 ++- snowflake/ml/modeling/pipeline/pipeline.py | 2 +- .../modeling/preprocessing/BUILD_NATIVE.bzl | 2 + .../preprocessing/k_bins_discretizer.py | 2 +- 
.../modeling/preprocessing/one_hot_encoder.py | 194 ++++-- .../modeling/preprocessing/ordinal_encoder.py | 145 ++-- snowflake/ml/registry/BUILD.bazel | 1 + snowflake/ml/registry/model_registry.py | 182 +++-- snowflake/ml/registry/model_registry_test.py | 38 ++ ...t to Snowpark Container Service Demo.ipynb | 620 +++++++++++++++++ .../notebooks/Model Packaging Example.ipynb | 56 -- .../Model Packaging SnowML Examples.ipynb | 120 +--- .../notebooks/Model Registry Demo.ipynb | 72 +- ...t to Snowpark Container Service Demo.ipynb | 644 ------------------ snowflake/ml/requirements.bzl | 2 +- snowflake/ml/test_utils/BUILD.bazel | 9 + snowflake/ml/test_utils/exception_utils.py | 22 + snowflake/ml/training_dataset/BUILD.bazel | 13 + .../ml/training_dataset/training_dataset.py | 44 ++ snowflake/ml/utils/connection_params.py | 106 ++- snowflake/ml/utils/connection_params_test.py | 193 ++++-- snowflake/ml/version.bzl | 2 +- .../integ/snowflake/ml/_internal/BUILD.bazel | 1 - tests/integ/snowflake/ml/model/BUILD.bazel | 19 +- .../deployment_to_snowservice_integ_test.py | 218 +++--- .../ml/model/model_badcase_integ_test.py | 24 +- .../warehouse_custom_model_integ_test.py | 172 ++--- .../warehouse_mlflow_model_integ_test.py | 28 +- .../model/warehouse_model_integ_test_utils.py | 38 +- .../warehouse_pytorch_model_integ_test.py | 139 ++-- ...ehouse_sklearn_xgboost_model_integ_test.py | 141 +++- .../warehouse_snowml_model_integ_test.py | 40 +- .../warehouse_tensorflow_model_integ_test.py | 103 ++- .../ml/modeling/metrics/test_monitor.py | 70 ++ .../preprocessing/test_label_encoder.py | 7 +- .../preprocessing/test_one_hot_encoder.py | 28 +- .../preprocessing/test_robust_scaler.py | 4 +- .../preprocessing/test_standard_scaler.py | 4 +- tests/integ/snowflake/ml/registry/BUILD.bazel | 22 +- .../model_registry_basic_integ_test.py | 39 +- .../ml/registry/model_registry_integ_test.py | 44 +- .../model_registry_integ_test_snowservice.py | 145 ++++ ...el_registry_integ_test_snowservice_base.py 
| 132 ++++ ...istry_integ_test_snowservice_merge_gate.py | 40 ++ ...el_registry_integ_test_with_snowservice.py | 224 ------ .../integ/snowflake/ml/test_utils/BUILD.bazel | 10 + .../snowflake/ml/test_utils/db_manager.py | 4 +- .../snowflake/ml/test_utils/model_factory.py | 131 +++- .../snowflake/ml/test_utils/test_env_utils.py | 59 ++ third_party/rules_conda/conda.bzl | 37 +- third_party/rules_conda/env.bzl | 21 +- third_party/rules_conda/utils.bzl | 26 +- ...ules_python_description_content_type.patch | 127 ---- 133 files changed, 4683 insertions(+), 2836 deletions(-) create mode 100644 CONTRIBUTING.md create mode 100644 bazel/environments/BUILD.bazel create mode 100644 bazel/environments/conda-env-build.yml rename conda-env-snowflake.yml => bazel/environments/conda-env-snowflake.yml (74%) rename conda-env.yml => bazel/environments/conda-env.yml (74%) create mode 100644 bazel/environments/fetch_conda_env_config.bzl delete mode 100644 bazel/mypy/mypy.sh.tpl create mode 100755 bazel/requirements/changelog_version_check.sh create mode 100644 ci/conda_recipe/bld.bat create mode 100644 ci/skip_merge_gate_targets delete mode 100644 snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture create mode 100644 snowflake/ml/model/deploy_platforms.py create mode 100644 snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb delete mode 100644 snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb create mode 100644 snowflake/ml/test_utils/exception_utils.py create mode 100644 snowflake/ml/training_dataset/BUILD.bazel create mode 100644 snowflake/ml/training_dataset/training_dataset.py create mode 100644 tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice.py create mode 100644 tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py create mode 100644 tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py 
delete mode 100644 tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py create mode 100644 tests/integ/snowflake/ml/test_utils/test_env_utils.py delete mode 100644 third_party/rules_python_description_content_type.patch diff --git a/.bazelrc b/.bazelrc index d49c1672..5d1f58b5 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,12 +1,17 @@ # Make the target platform and the host platform the same -build --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env -test --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --run_under='//bazel:test_wrapper' -run --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env -cquery --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env +build --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BUILD_CONDA_ENV=build +test --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --run_under='//bazel:test_wrapper' --repo_env=BUILD_CONDA_ENV=extended +run --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BUILD_CONDA_ENV=extended +cquery --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BUILD_CONDA_ENV=extended -build:typecheck --aspects //bazel/mypy:mypy.bzl%mypy_aspect --output_groups=mypy +run:pre_build --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BUILD_CONDA_ENV=build +build:typecheck --aspects //bazel/mypy:mypy.bzl%mypy_aspect --output_groups=mypy --repo_env=BUILD_CONDA_ENV=extended # Since integration tests are located in different packages than code under test, # the default instrumentation filter would exclude the code under 
test. This # makes bazel consider all the source code in our repo for coverage. coverage --instrumentation_filter="-//tests[/:]" + +test:sf_only --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --run_under='//bazel:test_wrapper' --repo_env=BUILD_CONDA_ENV=sf_only +run:sf_only --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BUILD_CONDA_ENV=sf_only +cquery:sf_only --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BUILD_CONDA_ENV=sf_only diff --git a/.gitignore b/.gitignore index 0be6c5b0..4fb30098 100644 --- a/.gitignore +++ b/.gitignore @@ -199,7 +199,7 @@ pids lib-cov # Coverage directory used by tools like istanbul -coverage +coverage*.json *.lcov # nyc test coverage diff --git a/CHANGELOG.md b/CHANGELOG.md index 68e0d5fb..9819eae9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,22 @@ # Release History +## 1.0.5 + +### New Features + +- Model Registry: Added support save/load/deploy xgboost Booster model. +- Model Registry: Added support to get the model name and the model version from model references. + +### Bug Fixes + +- Model Registry: Restore the db/schema back to the session after `create_model_registry()`. +- Model Registry: Fixed an issue that the UDF name created when deploying a model is not identical to what is provided and cannot be correctly dropped when deployment getting dropped. +- connection_params.SnowflakeLoginOptions(): Added support for `private_key_path`. + ## 1.0.4 ### New Features + - Model Registry: Added support save/load/deploy Tensorflow models (`tensorflow.Module`). - Model Registry: Added support save/load/deploy MLFlow PyFunc models (`mlflow.pyfunc.PyFuncModel`). - Model Development: Input dataframes can now be joined against data loaded from staged files. 
@@ -15,9 +29,11 @@ ## 1.0.3 (2023-07-14) ### Behavior Changes + - Model Registry: When predicting a model whose output is a list of NumPy ndarray, the output would not be flattened, instead, every ndarray will act as a feature(column) in the output. ### New Features + - Model Registry: Added support save/load/deploy PyTorch models (`torch.nn.Module` and `torch.jit.ScriptModule`). ### Bug Fixes @@ -33,11 +49,13 @@ ## 1.0.2 (2023-06-22) ### Behavior Changes + - Model Registry: Prohibit non-snowflake-native models from being logged. - Model Registry: `_use_local_snowml` parameter in options of `deploy()` has been removed. - Model Registry: A default `False` `embed_local_ml_library` parameter has been added to the options of `log_model()`. With this set to `False` (default), the version of the local snowflake-ml-python library will be recorded and used when deploying the model. With this set to `True`, local snowflake-ml-python library will be embedded into the logged model, and will be used when you load or deploy the model. ### New Features + - Model Registry: A new optional argument named `code_paths` has been added to the arguments of `log_model()` for users to specify additional code paths to be imported when loading and deploying the model. - Model Registry: A new optional argument named `options` has been added to the arguments of `log_model()` to specify any additional options when saving the model. - Model Development: Added metrics: @@ -52,8 +70,8 @@ - Model Development: `accuracy_score()` now works when given label column names are lists of a single value. - ## 1.0.1 (2023-06-16) + ### Behavior Changes - Model Development: Changed Metrics APIs to imitate sklearn metrics modules: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..b244bac9 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,361 @@ +## Build system + +We are using `bazel` as the build system. 
+ +### Installation + +#### Bazel + +Install bazel, if not already done: + +``` +# This installs bazelisk in ~/go/bin/bazelisk +go install github.com/bazelbuild/bazelisk@latest +``` + +Add shortcut in your `~/.bashrc` (or equivalent): + +``` +if [ -f ~/go/bin/bazelisk ]; then + alias bazel=~/go/bin/bazelisk +fi +``` + +#### Buildifier + +This tool helps auto-formatting `BUILD.bazel` file. Installation is similar: + +``` +go install github.com/bazelbuild/buildtools/buildifier@latest +``` + +Add shortcut in your `~/.bashrc` (or equivalent): + +``` +if [ -f ~/go/bin/buildifier ]; then + alias buildifier=~/go/bin/buildifier +fi +``` + +Note: You may need to configure your editor to run this on save. + +### Build + +To build the package, run: + +```shell +> bazel build //snowflake/ml:wheel +``` + +`bazel` can be run from anywhere under the monorepo and it can accept absolute path or a relative path. For example, + +```shell +snowflake/ml> bazel build :wheel +``` + +You can build an entire sub-tree as: + +```shell +> bazel build //snowflake/... +``` + +### Notes when you add new target in a `BUILD.bazel` file + ++ Instead of using `py_binary`, `py_library` and `py_test` rule from bazel, use those from `bazel/py_rules.bzl`. + Example, instead of + ``` + py_library( + name="my_lib", + srcs=["my_lib.py"], + ) + ``` + use the following instead + ``` + load("//bazel:py_rules.bzl", "py_library") + + py_library( + name="my_lib", + srcs=["my_lib.py"], + ) + ``` ++ When using a `genrule` rule whose tool is a `py_binary`, use `py_genrule` from `bazel/py_rules.bzl` instead. 
 Example, instead of + ``` + py_binary( + name="my_tool", + srcs=["my_tool.py"], + ) + + genrule( + name="generate_something", + cmd="$(location :my_tool)", + tools=[":my_tool"] + ) + ``` + use the following instead + ``` + load("//bazel:py_rules.bzl", "py_binary", "py_genrule") + + py_binary( + name="my_tool", + srcs=["my_tool.py"], + ) + + py_genrule( + name="generate_something", + cmd="$(location :my_tool)", + tools=[":my_tool"] + ) + ``` + +### Type-check + +#### mypy + +We use [mypy](https://mypy.readthedocs.io/en/stable/) to type-check our Python source files. mypy is integrated into our bazel environment. + +The version of MyPy is specified in `conda-env.yml`, just like other conda +packages we depend on. + +#### Invoke MyPy locally + +``` +bazel build --config=typecheck +``` + +Or you could run +``` +./ci/type_check.sh -b +``` + +You only need to specify `-b ` if your `bazel` is not in `$PATH` or is an alias. + +### Test + +Similar to `bazel build`, `bazel test` can test any target. The target must be +a test target. It will run the target and report if `PASSED` or `FAILED`. It essentially `build`s the target and then `run`s it. You can also build and run separately. + +TIP: If a test fails, there will be a log file, which is executable. You do not need to open via `less` or `editor`. You can directly paste the path in command line. + +Integration tests are configured to run against an existing Snowflake account. To run tests locally, make sure that you +have configured a SnowSQL `config` file in `~/.snowsql/config` (see Snowflake +[documentation](https://docs.snowflake.com/en/user-guide/snowsql-config) for configuration options). + +For example, to run all autogenerated tests locally: +```bash +# Then run all autogenerated tests +bazel test //... 
--test_tag_filters=autogen + ``` + +### Coverage + +A `lcov` coverage report can be generated by running + +``` +bazel coverage --combined_report=lcov +``` + +To get a human-readable report: + +``` +lcov --list $(bazel info output_path)/_coverage/_coverage_report.dat +``` + +To get an HTML report: + +``` +genhtml --output "$(bazel info output_path)/_coverage/_coverage_report.dat" +``` + +Both `lcov` and `genhtml` are part of the [`lcov`](https://github.com/linux-test-project/lcov) project. To install it on MacOS: + +``` +brew install lcov +``` + +The unit test coverage report is generated periodically by a GitHub +[workflow](https://github.com/snowflakedb/snowflake-ml-python/actions/workflows/continuous_build.yml?query=branch%3Amain). +You can download the report in the artifacts generated by the action runs. + +### Run + +Another useful command is `bazel run`. This builds and then runs the built target directly. Useful for binaries while debugging. + +### Other commands + +`bazel` is pretty powerful and has lots of other commands. Read more [here](https://bazel.build/run/build). + +### Python dependencies + +To introduce a third-party Python dependency, first check if it is available as a package in the +[Snowflake conda channel](https://repo.anaconda.com/pkgs/snowflake/). Then modify +[requirements.yml](https://github.com/snowflakedb/snowflake-ml-python/blob/main/requirements.yml), and run the following to re-generate all requirements files, including +[conda-env.yml](https://github.com/snowflakedb/snowflake-ml-python/blob/main/conda-env.yml): + +``` +bazel run --config=pre_build //bazel/requirements:sync_requirements +``` + +Then, your code can use the package as if it were "installed" in the Python environment. + +### Adding new dependencies + +Please provide the following fields when adding a new record: + +#### Package Name Fields + +`name`: The name of the package. 
Set this if the package is available with the same name and is required in both `PyPI` +and `conda`. + +`name_pypi`: The name of the package in `PyPI`. Set this only to indicate that it is available in `PyPI` only. You can +also set this along with `name_conda` if the package has different names in `PyPI` and `conda`. + +`name_conda`: The name of the package in `conda`. Set this only to indicate that it is available in `conda` only. You +can also set this along with `name_pypi` if the package has different names in `PyPI` and `conda`. + +(At least one of these three fields should be set.) + +#### Development Version Fields + +`dev_version`: The version of the package to be pinned in the dev environment. Set this if the package is available +with the same version and is required in both `PyPI` and conda. + +`dev_version_pypi`: The version from `PyPI` to be pinned in the dev environment. Set this only to indicate that it is +available in `PyPI` only. You can also set this along with `dev_version_conda` if the package has different versions in +`PyPI` and `conda`. + +`dev_version_conda`: The version from `conda` to be pinned in the dev environment. Set this only to indicate that it is +available in `conda` only. You can also set this along with `dev_version_pypi` if the package has different versions in +`PyPI` and `conda`. + +(At least one of these three fields should be set.) + +#### Snowflake Anaconda Channel + +`from_channel`: Set this if the package is not available in the Snowflake Anaconda Channel +(https://repo.anaconda.com/pkgs/snowflake). + +#### Version Requirements Fields (for `snowflake-ml-python` release) + +`version_requirements`: The version requirements specifiers when this requirement is a dependency of the +`snowflake-ml-python` release. Set this if the package is available with the same name and required in both `PyPI` and +`conda`. 
+ +`version_requirements_pypi`: The version requirements specifiers when this requirement is a dependency of the `snowflake-ml-python` release via `PyPI`. Set this only to indicate that it is required by the `PyPI` release only. You +can also set this along with `version_requirements_conda` if the package has different versions in `PyPI` and `conda`. + +`version_requirements_conda`: The version requirements specifiers when this requirement is a dependency of the `snowflake-ml-python` release via `conda`. Set this only to indicate that it is required by the `conda` release only. +You can also set this along with `version_requirements_pypi` if the package has different versions in `PyPI` and `conda`. + +(At least one of these three fields must be set to indicate that this package is a dependency of the release. If you +don't want to constrain the version, set the field to an empty string.) + +#### Extras Tags and Tags + +`requirements_extra_tags`: Set this to indicate that the package is an extras dependency of `snowflake-ml-python`. This requirement will be added to all extras tags specified here, and an `all` extras tag will be auto-generated to include +all extras requirements. All extras requirements will be labeled as `run_constrained` in conda's meta.yaml. + +`tags`: Set tags to filter some of the requirements in specific cases. The current valid tags include: + - `deployment_core`: Used by model deployment to indicate dependencies required to execute model deployment code + on the server-side. + - `build_essential`: Used to indicate the packages composing the build environment. 
+ +Example: + +```yaml +- name: pandas + name_pypi: pandas-pypi-name + dev_version: 1.2.0 + dev_version_pypi: 1.2.0-pypi + version_requirements: ">=1.0.0" + version_requirements_pypi: ">=1.0.0" + from_channel: "conda-forge" + requirements_extra_tags: + - pandas + tags: + - deployment_core + - build_essential +``` + +## Unit Testing + +Write `pytest` or Python `unittest` style unit tests. + +### `unittest` + +Use `absl.testing.absltest` as a drop-in replacement of `unittest`. + +For example: + +``` +# instead of +# import unittest +from absl.testing import absltest + +# instead of +# from unittest import TestCase, main +from absl.testing.absltest import TestCase, main +``` + +`absltest` provides better `bazel` integration which produces a more detailed XML +test report. The test report is picked up by a Github workflow to provide a nice UI +for test results. + +### `pytest` + +Make each unit test file its own runnable `py_test` target and use the `main()` +function provided by `snowflake.ml.test_utils.pytest_driver`. + +For example: + +``` +from snowflake.ml.utils import pytest_driver + +def test_case(): + assert some_feature() + +if __name__ == "__main__": + pytest_driver.main() +``` + +`pytest_driver` contains `bazel` integration that allows `pytest` to produce a XML +test report. + +### Important Notes + +When you add a new test file, you should always ensure the existence of a `if __name__ == "__main__":` block, otherwise, the +test file will not be instructed by bazel. We have a test wrapper [here](./bazel/test_wrapper.sh) to ensure that the +test will fail if you forget that part. + +## `pre-commit` + +Pull requests against the main branch are subject to `pre-commit` checks. Those checks enforce the code style. + +You can make sure the checks can pass by installing the `pre-commit` hooks to your local repo +([instructions](https://pre-commit.com/#installation)). Those hooks will be invoked when you commit locally, +and they fix the style violations in-place. 
+ +Tip: if you want to isolate those fixes, avoid the `-a` option in `git commit`. This way the automated changes +will be unstaged changes. + +### Darglint + +The [darglint](https://github.com/terrencepreilly/darglint) pre-commit hook lints docstrings to make sure they +conform to the [Google style guide for docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings). +Function docstrings must contain "Args" section with input value descriptions, "Returns" section describing output, and +"Raises" section enumerating the exceptions that the function can raise. Darglint will ensure that all input args are present +in the docstring and is sensitive to whitespace (e.g. args should be indented the correct number of spaces). Refer +to the list of [darglint error codes](https://github.com/terrencepreilly/darglint#error-codes) for guidance. + +## Editors + +### VSCode + +Here are a few good plugins to use: + +1. [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python) +1. [Pylance static checking](https://marketplace.visualstudio.com/items?itemName=ms-python.vscode-pylance) +1. [Bazel](https://marketplace.visualstudio.com/items?itemName=BazelBuild.vscode-bazel) + - You need to configure `buildifier` in settings for auto-formatting `BUILD.bazel` files +1. [Black Python Formatter](https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter) +1. 
[Flake8 Linter](https://marketplace.visualstudio.com/items?itemName=ms-python.flake8) diff --git a/WORKSPACE b/WORKSPACE index d7711872..ce7cf7b4 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -5,10 +5,10 @@ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") http_jar( name = "bazel_diff", + sha256 = "9c4546623a8b9444c06370165ea79a897fcb9881573b18fa5c9ee5c8ba0867e2", urls = [ "https://github.com/Tinder/bazel-diff/releases/download/4.3.0/bazel-diff_deploy.jar", ], - sha256 = "9c4546623a8b9444c06370165ea79a897fcb9881573b18fa5c9ee5c8ba0867e2", ) http_archive( @@ -27,9 +27,9 @@ bazel_skylib_workspace() # Latest @ 2023-06-20 # Replace with released version once newer version released. git_repository( - name="rules_python", - commit="0d59fcf561f6d2c4705924bc17c151fb4b998841", - remote="https://github.com/bazelbuild/rules_python.git" + name = "rules_python", + commit = "0d59fcf561f6d2c4705924bc17c151fb4b998841", + remote = "https://github.com/bazelbuild/rules_python.git", ) load("//third_party/rules_conda:defs.bzl", "conda_create", "load_conda", "register_toolchain") @@ -44,6 +44,7 @@ http_archive( load("@aspect_bazel_lib//lib:repositories.bzl", "aspect_bazel_lib_dependencies", "register_yq_toolchains") aspect_bazel_lib_dependencies() + register_yq_toolchains() # Below two conda environments (toolchains) are created and they require different @@ -58,40 +59,28 @@ register_yq_toolchains() # The default platform when --platforms flag is not set, is specified in # .bazelrc . 
-load_conda(conda_repo_name = "snowflake_conda", quiet = True) +load("@SnowML//bazel/environments:fetch_conda_env_config.bzl", "fetch_conda_env_config") +fetch_conda_env_config(name = "fetch_conda_env_config_repo") +load("@fetch_conda_env_config_repo//:config.bzl", "NAME", "ENVIRONMENT", "COMPATIBLE_TARGET") -conda_create( - name = "py3_env_snowflake_conda_only", - conda_repo_name = "snowflake_conda", - timeout = 3600, - clean = False, - environment = "@//:conda-env-snowflake.yml", - coverage_tool = "@//bazel/coverage_tool:coverage_tool.py", +load_conda( + conda_repo_name = "{}_conda".format(NAME), quiet = True, ) -register_toolchain( - name = "py3_env_snowflake_conda_only_repo", - env = "py3_env_snowflake_conda_only", - target_compatible_with=["@SnowML//bazel/platforms:snowflake_conda_channel"], - toolchain_name = "py3_toolchain_snowflake_conda_only", -) - -load_conda(conda_repo_name = "extended_conda", quiet = True) - conda_create( - name = "py3_env_extended_channels", - conda_repo_name = "extended_conda", + name = "{}_env".format(NAME), timeout = 3600, clean = False, - environment = "@//:conda-env.yml", + conda_repo_name = "{}_conda".format(NAME), coverage_tool = "@//bazel/coverage_tool:coverage_tool.py", + environment = ENVIRONMENT, quiet = True, ) register_toolchain( - name = "py3_env_extended_channels_repo", - env = "py3_env_extended_channels", - target_compatible_with=["@SnowML//bazel/platforms:extended_conda_channels"], - toolchain_name = "py3_toolchain_extended_channels", + name = "{}_env_repo".format(NAME), + env = "{}_env".format(NAME), + target_compatible_with = COMPATIBLE_TARGET, + toolchain_name = "py3_toolchain_{}_env".format(NAME), ) diff --git a/bazel/environments/BUILD.bazel b/bazel/environments/BUILD.bazel new file mode 100644 index 00000000..9ddc0d99 --- /dev/null +++ b/bazel/environments/BUILD.bazel @@ -0,0 +1,5 @@ +exports_files([ + "conda-env-snowflake.yml", + "conda-env-build.yml", + "conda-env.yml", +]) diff --git 
a/bazel/environments/conda-env-build.yml b/bazel/environments/conda-env-build.yml new file mode 100644 index 00000000..be19154f --- /dev/null +++ b/bazel/environments/conda-env-build.yml @@ -0,0 +1,18 @@ +# DO NOT EDIT! +# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' + +channels: +- https://repo.anaconda.com/pkgs/snowflake +- nodefaults +dependencies: +- absl-py==1.3.0 +- conda-libmamba-solver==23.3.0 +- inflection==0.5.1 +- jsonschema==3.2.0 +- lightgbm==3.3.5 +- mypy==0.981 +- numpy==1.24.3 +- packaging==23.0 +- pyyaml==6.0 +- scikit-learn==1.2.2 +- xgboost==1.7.3 diff --git a/conda-env-snowflake.yml b/bazel/environments/conda-env-snowflake.yml similarity index 74% rename from conda-env-snowflake.yml rename to bazel/environments/conda-env-snowflake.yml index 250e318a..13a21ec4 100644 --- a/conda-env-snowflake.yml +++ b/bazel/environments/conda-env-snowflake.yml @@ -1,16 +1,18 @@ # DO NOT EDIT! -# Generate by running 'bazel run //bazel/requirements:sync_requirements' +# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' channels: - https://repo.anaconda.com/pkgs/snowflake - nodefaults dependencies: - absl-py==1.3.0 +- aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 - cloudpickle==2.0.0 - conda-libmamba-solver==23.3.0 - coverage==6.3.2 +- cryptography==39.0.1 - flask-cors==3.0.10 - flask==2.1.3 - fsspec==2022.11.0 @@ -23,13 +25,15 @@ dependencies: - moto==4.0.11 - mypy==0.981 - networkx==2.8.4 -- numpy==1.23.4 +- numpy==1.24.3 - packaging==23.0 - pandas==1.4.4 +- protobuf==3.20.3 - pytest==7.1.2 - python==3.8.13 -- pytorch==1.13.1 +- pytorch==2.0.1 - pyyaml==6.0 +- requests==2.29.0 - ruamel.yaml==0.17.21 - s3fs==2022.11.0 - scikit-learn==1.2.2 @@ -37,8 +41,8 @@ dependencies: - snowflake-connector-python==3.0.3 - snowflake-snowpark-python==1.5.1 - sqlparse==0.4.3 -- tensorflow==2.11.0 -- torchdata==0.5.1 +- tensorflow==2.10.0 - transformers==4.29.2 +- types-protobuf==4.23.0.1 - 
typing-extensions==4.5.0 - xgboost==1.7.3 diff --git a/conda-env.yml b/bazel/environments/conda-env.yml similarity index 74% rename from conda-env.yml rename to bazel/environments/conda-env.yml index ed189434..fa6d69d8 100644 --- a/conda-env.yml +++ b/bazel/environments/conda-env.yml @@ -1,11 +1,12 @@ # DO NOT EDIT! -# Generate by running 'bazel run //bazel/requirements:sync_requirements' +# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' channels: - https://repo.anaconda.com/pkgs/snowflake - nodefaults dependencies: - absl-py==1.3.0 +- aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 - cloudpickle==2.0.0 @@ -13,6 +14,7 @@ dependencies: - conda-forge::types-PyYAML==6.0.12 - conda-libmamba-solver==23.3.0 - coverage==6.3.2 +- cryptography==39.0.1 - flask-cors==3.0.10 - flask==2.1.3 - fsspec==2022.11.0 @@ -25,13 +27,16 @@ dependencies: - moto==4.0.11 - mypy==0.981 - networkx==2.8.4 -- numpy==1.23.4 +- numpy==1.24.3 - packaging==23.0 - pandas==1.4.4 +- protobuf==3.20.3 - pytest==7.1.2 - python==3.8.13 -- pytorch==1.13.1 +- pytorch::torchdata==0.6.1 +- pytorch==2.0.1 - pyyaml==6.0 +- requests==2.29.0 - ruamel.yaml==0.17.21 - s3fs==2022.11.0 - scikit-learn==1.2.2 @@ -39,8 +44,8 @@ dependencies: - snowflake-connector-python==3.0.3 - snowflake-snowpark-python==1.5.1 - sqlparse==0.4.3 -- tensorflow==2.11.0 -- torchdata==0.5.1 +- tensorflow==2.10.0 - transformers==4.29.2 +- types-protobuf==4.23.0.1 - typing-extensions==4.5.0 - xgboost==1.7.3 diff --git a/bazel/environments/fetch_conda_env_config.bzl b/bazel/environments/fetch_conda_env_config.bzl new file mode 100644 index 00000000..e779bc12 --- /dev/null +++ b/bazel/environments/fetch_conda_env_config.bzl @@ -0,0 +1,40 @@ +def _fetch_conda_env_config_impl(rctx): + # read the particular environment variable we are interested in + config = rctx.os.environ.get("BUILD_CONDA_ENV", "extended").lower() + + # necessary to create empty BUILD file for this rule + # which will be located 
somewhere in the Bazel build files + rctx.file("BUILD") + + conda_env_map = { + "build":{ + "environment": "@//bazel/environments:conda-env-build.yml", + "compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"] + }, + "sf_only":{ + "environment": "@//bazel/environments:conda-env-snowflake.yml", + "compatible_target": ["@SnowML//bazel/platforms:snowflake_conda_channel"] + }, + "extended":{ + "environment": "@//bazel/environments:conda-env.yml", + "compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"] + }, + } + + if config not in conda_env_map.keys(): + fail("Unsupported conda env {} specified. Only {} is supported.".format(config, repr(conda_env_map.keys()))) + + # create a temporary file called config.bzl to be loaded into WORKSPACE + # passing in any desired information from this rule implementation + rctx.file("config.bzl", content = """ +NAME = {} +ENVIRONMENT = {} +COMPATIBLE_TARGET = {} +""".format(repr(config), repr(conda_env_map[config]["environment"]), repr(conda_env_map[config]["compatible_target"])) + ) + + +fetch_conda_env_config = repository_rule( + implementation=_fetch_conda_env_config_impl, + environ = ["BUILD_CONDA_ENV"] +) diff --git a/bazel/get_affected_targets.sh b/bazel/get_affected_targets.sh index c3e0f38d..dfd5bd13 100755 --- a/bazel/get_affected_targets.sh +++ b/bazel/get_affected_targets.sh @@ -71,21 +71,22 @@ final_hashes_json="${working_dir}/final_hashes.json" impacted_targets_path="${working_dir}/impacted_targets.txt" bazel_diff="${working_dir}/bazel_diff" -"${bazel}" run :bazel-diff --script_path="${bazel_diff}" +"${bazel}" run --config=pre_build :bazel-diff --script_path="${bazel_diff}" git -C "${workspace_path}" checkout "${pr_revision}" --quiet echo "Generating Hashes for Revision '${pr_revision}'" -"${bazel_diff}" generate-hashes -w "$workspace_path" -b "${bazel}" "${starting_hashes_json}" +"${bazel_diff}" generate-hashes -w "$workspace_path" -b "${bazel}" "${final_hashes_json}" 
MERGE_BASE_MAIN=$(git merge-base "${pr_revision}" main) git -C "${workspace_path}" checkout "${MERGE_BASE_MAIN}" --quiet echo "Generating Hashes for merge base ${MERGE_BASE_MAIN}" -$bazel_diff generate-hashes -w "${workspace_path}" -b "${bazel}" "${final_hashes_json}" +$bazel_diff generate-hashes -w "${workspace_path}" -b "${bazel}" "${starting_hashes_json}" +git -C "${workspace_path}" checkout "${pr_revision}" --quiet echo "Determining Impacted Targets and output to ${output_path}" $bazel_diff get-impacted-targets -sh "${starting_hashes_json}" -fh "${final_hashes_json}" -o "${impacted_targets_path}" diff --git a/bazel/mypy/main.py b/bazel/mypy/main.py index d00bc164..e246a877 100644 --- a/bazel/mypy/main.py +++ b/bazel/mypy/main.py @@ -1,16 +1,75 @@ -"""This is the mypy "tool" bazel "builds" for the mypy actions. - -It relies on mypy being available in the python environment that bazel uses. -""" +import argparse +import json +import os +import subprocess +import sys +import tempfile +MYPY_ENTRYPOINT_CODE = """ import sys try: from mypy.main import main except ImportError as e: raise ImportError( - f"Unable to import mypy. Make sure mypy is added to the bazel conda environment. Actual error: {e}" + f"Unable to import mypy. Make sure mypy is added to the bazel conda environment. Actual error: {{e}}" ) if __name__ == "__main__": main(stdout=sys.stdout, stderr=sys.stderr) + +""" + +MYPY_CACHE_DIR = ".mypy_cache" + + +def mypy_checker() -> None: + # To parse the arguments that bazel provides. + parser = argparse.ArgumentParser( + # Without this, the second path documented in main below fails. 
+ fromfile_prefix_chars="@" + ) + parser.add_argument("--out") + parser.add_argument("--persistent_worker", action="store_true") + + args = parser.parse_args() + + os.makedirs(MYPY_CACHE_DIR, exist_ok=True) + + with tempfile.NamedTemporaryFile(suffix=".py") as mypy_entrypoint: + mypy_entrypoint.write(MYPY_ENTRYPOINT_CODE.encode()) + mypy_entrypoint.flush() + first_run = True + while args.persistent_worker or first_run: + data = sys.stdin.readline() + req = json.loads(data) + mypy_args = req["arguments"] + mypy_args = ["--cache-dir", MYPY_CACHE_DIR] + mypy_args + process = subprocess.Popen( + # We use this to make sure we are invoking mypy that is installed in the same environment of the current + # Python. + [sys.executable, mypy_entrypoint.name] + mypy_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + process.wait() + text, err = process.communicate() + message = text.decode() + err.decode() + with open(args.out, "w") as output: + output.write(message) + sys.stderr.flush() + sys.stdout.write( + json.dumps( + { + "exitCode": process.returncode, + "output": message, + "requestId": req.get("requestId", 0), + } + ) + ) + sys.stdout.flush() + first_run = False + + +if __name__ == "__main__": + mypy_checker() diff --git a/bazel/mypy/mypy.bzl b/bazel/mypy/mypy.bzl index ea36c1d2..43ea6fea 100644 --- a/bazel/mypy/mypy.bzl +++ b/bazel/mypy/mypy.bzl @@ -1,13 +1,18 @@ "Public API" -load("@bazel_skylib//lib:shell.bzl", "shell") -load("@bazel_skylib//lib:sets.bzl", "sets") load("//bazel/mypy:rules.bzl", "MyPyStubsInfo") MyPyAspectInfo = provider( - "TODO: documentation", + """This is an aspect attaching to the original Python build graph to type-checking Python source files. + For every target it collects all transitive dependencies as well as direct sources and use symbol link to create + a folder ends with .mypy_runfiles. Mypy will be invoked to check the direct sources. 
+ + This aspect uses persistent worker to make full use of mypy's cache which is defined in main.py in the same + directory. The mypy cache will be put into bazel's execroot/SnowML/.mypy_cache .""", fields = { "exe": "Used to pass the rule implementation built exe back to calling aspect.", + "args": "Used to pass the arguments sent to mypy executable.", + "runfiles": "Used to pass the inputs file for mypy executable.", "out": "Used to pass the dummy output file back to calling aspect.", }, ) @@ -19,10 +24,6 @@ DEBUG = False VALID_EXTENSIONS = ["py", "pyi"] DEFAULT_ATTRS = { - "_template": attr.label( - default = Label("//bazel/mypy:mypy.sh.tpl"), - allow_single_file = True, - ), "_mypy_cli": attr.label( default = Label("//bazel/mypy:mypy"), executable = True, @@ -34,17 +35,6 @@ ), } -def _sources_to_cache_map_triples(srcs): - triples_as_flat_list = [] - for f in srcs: - f_path = f.path - triples_as_flat_list.extend([ - shell.quote(f_path), - shell.quote("{}.meta.json".format(f_path)), - shell.quote("{}.data.json".format(f_path)), - ]) - return triples_as_flat_list - def _is_external_dep(dep): return dep.label.workspace_root.startswith("external/") @@ -78,26 +68,11 @@ def _extract_stub_deps(deps): stub_files.append(src_f) return stub_files -def _extract_imports(imports, label): - # NOTE: Bazel's implementation of this for py_binary, py_test is at - # src/main/java/com/google/devtools/build/lib/bazel/rules/python/BazelPythonSemantics.java - mypypath_parts = [] - for import_ in imports: - if import_.startswith("/"): - # buildifier: disable=print - print("ignoring invalid absolute path '{}'".format(import_)) - elif import_ in ["", "."]: - mypypath_parts.append(label.package) - else: - mypypath_parts.append("{}/{}".format(label.package, import_)) - return mypypath_parts - def _mypy_rule_impl(ctx): base_rule = ctx.rule mypy_config_file = ctx.file._mypy_config - mypypath_parts = [] direct_src_files = [] transitive_srcs_depsets = [] stub_files = [] @@ 
-109,82 +84,82 @@ def _mypy_rule_impl(ctx): transitive_srcs_depsets = _extract_transitive_deps(base_rule.attr.deps) stub_files = _extract_stub_deps(base_rule.attr.deps) - if hasattr(base_rule.attr, "imports"): - mypypath_parts = _extract_imports(base_rule.attr.imports, ctx.label) - final_srcs_depset = depset(transitive = transitive_srcs_depsets + [depset(direct = direct_src_files)]) src_files = [f for f in final_srcs_depset.to_list() if not _is_external_src(f)] if not src_files: return None - mypypath_parts += [src_f.dirname for src_f in stub_files] - mypypath = ":".join(mypypath_parts) - out = ctx.actions.declare_file("%s_dummy_out" % ctx.rule.attr.name) - exe = ctx.actions.declare_file( - "%s_mypy_exe" % ctx.rule.attr.name, - ) + runfiles_name = "%s.mypy_runfiles" % ctx.rule.attr.name # Compose a list of the files needed for use. Note that aspect rules can use # the project version of mypy however, other rules should fall back on their # relative runfiles. - runfiles = ctx.runfiles(files = src_files + stub_files + [mypy_config_file]) - src_root_paths = sets.to_list( - sets.make([f.root.path for f in src_files]), + src_run_files = [] + direct_src_run_files = [] + stub_run_files = [] + + for f in src_files + stub_files: + run_file_path = runfiles_name + "/" + f.short_path + run_file = ctx.actions.declare_file(run_file_path) + ctx.actions.symlink( + output = run_file, + target_file = f, + ) + if f in src_files: + src_run_files.append(run_file) + if f in direct_src_files: + direct_src_run_files.append(run_file) + if f in stub_files: + stub_run_files.append(run_file) + + src_root_path = src_run_files[0].path + src_root_path = src_root_path[0:(src_root_path.find(runfiles_name) + len(runfiles_name))] + + # arguments sent to mypy + args = [ + "--enable-incomplete-features", + ] + ["--package-root", src_root_path, "--config-file", mypy_config_file.path] + [f.path for f in direct_src_run_files] + + worker_arg_file = ctx.actions.declare_file(ctx.rule.attr.name + 
".worker_args") + ctx.actions.write( + output = worker_arg_file, + content = "\n".join(args), ) - ctx.actions.expand_template( - template = ctx.file._template, - output = exe, - substitutions = { - "{MYPY_EXE}": ctx.executable._mypy_cli.path, - "{MYPY_ROOT}": ctx.executable._mypy_cli.root.path, - "{CACHE_MAP_TRIPLES}": " ".join(_sources_to_cache_map_triples(src_files)), - "{PACKAGE_ROOTS}": " ".join([ - "--package-root " + shell.quote(path or ".") - for path in src_root_paths - ]), - "{SRCS}": " ".join([ - shell.quote(f.path) - for f in src_files - ]), - "{VERBOSE_OPT}": "--verbose" if DEBUG else "", - "{VERBOSE_BASH}": "set -x" if DEBUG else "", - "{OUTPUT}": out.path if out else "", - "{MYPYPATH_PATH}": mypypath if mypypath else "", - "{MYPY_INI_PATH}": mypy_config_file.path, - }, - is_executable = True, + return MyPyAspectInfo( + exe = ctx.executable._mypy_cli, + args = worker_arg_file, + runfiles = src_run_files + stub_run_files + [mypy_config_file, worker_arg_file], + out = out, ) - return [ - DefaultInfo(executable = exe, runfiles = runfiles), - MyPyAspectInfo(exe = exe, out = out), - ] - def _mypy_aspect_impl(_, ctx): if (ctx.rule.kind not in ["py_binary", "py_library", "py_test", "mypy_test"] or ctx.label.workspace_root.startswith("external")): return [] - providers = _mypy_rule_impl( - ctx + aspect_info = _mypy_rule_impl( + ctx, ) - if not providers: + if not aspect_info: return [] - info = providers[0] - aspect_info = providers[1] - ctx.actions.run( outputs = [aspect_info.out], - inputs = info.default_runfiles.files, - tools = [ctx.executable._mypy_cli], + inputs = aspect_info.runfiles, + tools = [aspect_info.exe], executable = aspect_info.exe, mnemonic = "MyPy", progress_message = "Type-checking %s" % ctx.label, + execution_requirements = { + "supports-workers": "1", + "requires-worker-protocol": "json", + }, + # out is required for worker to write the output. 
+ arguments = ["--out", aspect_info.out.path, "@" + aspect_info.args.path], use_default_shell_env = True, ) return [ diff --git a/bazel/mypy/mypy.sh.tpl b/bazel/mypy/mypy.sh.tpl deleted file mode 100644 index 855853f9..00000000 --- a/bazel/mypy/mypy.sh.tpl +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -{VERBOSE_BASH} -set -o errexit -set -o nounset -set -o pipefail - -main() { - local output - local report_file - local status - local root - local mypy - - report_file="{OUTPUT}" - root="{MYPY_ROOT}/" - mypy="{MYPY_EXE}" - - # TODO(Jonathon): Consider UX improvements using https://mypy.readthedocs.io/en/stable/command_line.html#configuring-error-messages - - export MYPYPATH="$(pwd):{MYPYPATH_PATH}" - - # Workspace rules run in a different location from aspect rules. Here we - # normalize if the external source isn't found. - if [ ! -f $mypy ]; then - mypy=${mypy#${root}} - fi - - set +o errexit - output=$($mypy {VERBOSE_OPT} --bazel {PACKAGE_ROOTS} --config-file {MYPY_INI_PATH} --enable-incomplete-features --cache-map {CACHE_MAP_TRIPLES} -- {SRCS} 2>&1) - status=$? - set -o errexit - - if [ ! 
-z "$report_file" ]; then - echo "${output}" > "${report_file}" - fi - - if [[ $status -ne 0 ]]; then - printf "\033[0;31m======== BEGIN MYPY ERROR ========\033[0m\n" - echo "${output}" # Show MyPy's error to end-user via Bazel's console logging - printf "\033[0;31m======== END MYPY ERROR ========\033[0m\n" - exit 1 - fi - -} - -main "$@" diff --git a/bazel/py_rules.bzl b/bazel/py_rules.bzl index ce0045fe..d01667e2 100644 --- a/bazel/py_rules.bzl +++ b/bazel/py_rules.bzl @@ -34,6 +34,14 @@ load( load("@rules_python//python:packaging.bzl", native_py_wheel = "py_wheel") load(":repo_paths.bzl", "check_for_experimental_dependencies", "check_for_tests_dependencies") +def py_genrule(**attrs): + orginal_cmd = attrs["cmd"] + attrs["cmd"] = select({ + "@bazel_tools//src/conditions:windows": "CONDA_DLL_SEARCH_MODIFICATION_ENABLE=1 " + orginal_cmd, + "//conditions:default": orginal_cmd, + }) + native.genrule(**attrs) + _COMPATIBLE_WITH_SNOWPARK_TAG = "wheel_compatible_with_snowpark" def _add_target_compatiblity_labels(compatible_with_snowpark, attrs): @@ -68,6 +76,10 @@ def py_binary(compatible_with_snowpark = True, **attrs): # * https://bazel.build/reference/be/python#py_test.legacy_create_init # * https://github.com/bazelbuild/rules_python/issues/55 attrs["legacy_create_init"] = 0 + attrs["env"] = select({ + "@bazel_tools//src/conditions:windows": {"CONDA_DLL_SEARCH_MODIFICATION_ENABLE": "1"}, + "//conditions:default": {}, + }) native_py_binary(**attrs) def py_library(compatible_with_snowpark = True, **attrs): diff --git a/bazel/requirements/BUILD.bazel b/bazel/requirements/BUILD.bazel index ef0758ef..42a61468 100644 --- a/bazel/requirements/BUILD.bazel +++ b/bazel/requirements/BUILD.bazel @@ -17,7 +17,7 @@ _CURRENT_PATH = "bazel/requirements" _SYNC_RULE_NAME = "sync_requirements" -_SYNC_BAZEL_CMD = "bazel run //{current_path}:{sync_rule}".format( +_SYNC_BAZEL_CMD = "bazel run --config=pre_build //{current_path}:{sync_rule}".format( current_path = _CURRENT_PATH, sync_rule = 
_SYNC_RULE_NAME, ) @@ -31,12 +31,17 @@ _GENERATED_REQUIREMENTS_FILES = { "conda_env_yml": { "cmd": "--mode dev_version --format conda_env", "generated": "conda-env.yml", - "target": "//:conda-env.yml", + "target": "//bazel/environments:conda-env.yml", + }, + "conda_env_build_yml": { + "cmd": "--mode dev_version --format conda_env --filter_by_tag build_essential", + "generated": "conda-env-build.yml", + "target": "//bazel/environments:conda-env-build.yml", }, "conda_env_snowflake_yml": { "cmd": "--mode dev_version --format conda_env --snowflake_channel_only", "generated": "conda-env-snowflake.yml", - "target": "//:conda-env-snowflake.yml", + "target": "//bazel/environments:conda-env-snowflake.yml", }, "conda_meta": { "cmd": "--mode version_requirements --format conda_meta --version " + VERSION, @@ -81,3 +86,11 @@ sync_target( root_path = _CURRENT_PATH, targets = _GENERATED_REQUIREMENTS_FILES.values(), ) + + +sh_test( + name = "changelog_version_check", + srcs = ["changelog_version_check.sh"], + args = [VERSION, "$(location //:CHANGELOG.md)"], + data = ["//:CHANGELOG.md"], +) diff --git a/bazel/requirements/changelog_version_check.sh b/bazel/requirements/changelog_version_check.sh new file mode 100755 index 00000000..7f862fb0 --- /dev/null +++ b/bazel/requirements/changelog_version_check.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env sh + +# Usage +# changelog_version_check.sh +# +# Action +# - Check if the section corresponding to the provided version exists in CHANGELOG. + +version=$1 +changelog_path=$2 + +version_escaped=$(echo "${version}" | sed 's/[^^]/[&]/g; s/\^/\\^/g' ) + +grep -E "##\s+${version_escaped}" "${changelog_path}" || \ +(echo "CHANGELOG.md was not updated, please update by adding new section for the new version." 
&& exit 1) diff --git a/bazel/requirements/requirements.schema.json b/bazel/requirements/requirements.schema.json index d055fe51..c82e803e 100644 --- a/bazel/requirements/requirements.schema.json +++ b/bazel/requirements/requirements.schema.json @@ -65,11 +65,13 @@ "type": "array", "items": [ { - "type": "string" + "type": "string", + "enum": ["deployment_core", "build_essential"] } ] } }, + "additionalProperties":false, "allOf": [ { "anyOf": [ diff --git a/bazel/requirements/rules.bzl b/bazel/requirements/rules.bzl index 1082a08a..0f5d6d38 100644 --- a/bazel/requirements/rules.bzl +++ b/bazel/requirements/rules.bzl @@ -1,6 +1,7 @@ load("@bazel_skylib//rules:diff_test.bzl", "diff_test") load("@bazel_skylib//rules:write_file.bzl", "write_file") load("@aspect_bazel_lib//lib:yq.bzl", "yq") +load("//bazel:py_rules.bzl", "py_genrule") _AUTOGEN_HEADERS = """# DO NOT EDIT! # Generate by running '{generation_cmd}' @@ -23,7 +24,7 @@ def generate_requirement_file( cmd, src_requirement_file, generation_cmd): - native.genrule( + py_genrule( name = "gen_{name}_body".format(name = name), srcs = [ src_requirement_file, @@ -41,12 +42,11 @@ def generate_requirement_file( outs = [generated_file], cmd = "(echo -e \"" + _AUTOGEN_HEADERS.format(generation_cmd = generation_cmd) + "\" ; cat $(location :{generated}.body) ) > $@".format( generated = generated_file, - ), - tools = [_GENERATE_TOOL], + ) ) diff_test( name = "check_{name}".format(name = name), - failure_message = "Please run: bazel run {generation_cmd}".format(generation_cmd = generation_cmd), + failure_message = "Please run: bazel run --config=pre_build {generation_cmd}".format(generation_cmd = generation_cmd), file1 = ":{generated}".format(generated = generated_file), file2 = target, ) @@ -59,7 +59,7 @@ def generate_requirement_file_yaml( cmd, src_requirement_file, generation_cmd): - native.genrule( + py_genrule( name = "gen_{name}_body".format(name = name), srcs = [ src_requirement_file, @@ -91,7 +91,7 @@ def 
generate_requirement_file_yaml( diff_test( name = "check_{name}".format(name = name), - failure_message = "Please run: bazel run {generation_cmd}".format(generation_cmd = generation_cmd), + failure_message = "Please run: bazel run --config=pre_build {generation_cmd}".format(generation_cmd = generation_cmd), file1 = ":{generated}".format(generated = generated_file), file2 = target, ) diff --git a/ci/RunTests.sh b/ci/RunTests.sh index c0199829..4341f6fa 100755 --- a/ci/RunTests.sh +++ b/ci/RunTests.sh @@ -5,34 +5,35 @@ # CLI TOOLS : bazel # # Usage -# RunTests.sh [-b ] [-m diff-only|standard] +# RunTests.sh [-b ] [-m merge_gate|continuous_run] # # Flags # -b: specify path to bazel. # -m: specify the mode from the following options -# diff-only: run affected tests only. (For merge gate) -# standard (default): run all tests except auto-generated tests. (For nightly run.) +# merge_gate: run affected tests only. +# continuous_run (default): run all tests except auto-generated tests and tests with +# 'skip_continuous_test' filter. (For nightly run.) # set -o pipefail set -u +set -e bazel="bazel" -mode="standard" +mode="continuous_run" PROG=$0 help() { local exit_code=$1 - echo "Usage: ${PROG} [-b ] [-m diff-only|standard]" + echo "Usage: ${PROG} [-b ] [-m merge_gate|continuous_run]" exit "${exit_code}" } while getopts "b:m:h" opt; do case "${opt}" in m) - mode="${OPTARG}" - if ! 
[[ "${mode}" = "diff-only" || "${mode}" = "standard" ]]; then - help 1 + if [[ "${OPTARG}" = "merge_gate" || "${OPTARG}" = "continuous_run" ]] ; then + mode="${OPTARG}" fi ;; b) @@ -55,15 +56,20 @@ done working_dir=$(mktemp -d "/tmp/tmp_XXXXX") trap 'rm -rf "${working_dir}"' EXIT +tag_filter="--test_tag_filters=" + case "${mode}" in -diff-only) +merge_gate) affected_targets_file="${working_dir}/affected_targets" ./bazel/get_affected_targets.sh -b "${bazel}" -f "${affected_targets_file}" - test_targets=$(${bazel} query "kind('py_test rule', rdeps(//..., set($(<"${affected_targets_file}"))))") + tag_filter="--test_tag_filters=-autogen,-skip_continuous_test" + + # Notice that we should include all kinds of test here. + test_targets=$(${bazel} query "kind('.*_test rule', rdeps(//... - //snowflake/ml/experimental/... - set($( "${test_targets_file}" +printf "%s" "${test_targets}" >"${test_targets_file}" +set +e "${bazel}" test --cache_test_results=no \ --test_output=errors \ - --test_tag_filters=-autogen \ + "${tag_filter}" \ --target_pattern_file "${test_targets_file}" bazel_exit_code=$? # Bazel exit code # 0: Success; # 4: Build Successful but no tests found # See https://bazel.build/run/scripts#exit-codes -if [[ ${mode} = "diff-only" && ${bazel_exit_code} -eq 4 ]] ; then - exit 0 +if [[ ${mode} = "merge_gate" && ${bazel_exit_code} -eq 4 ]]; then + exit 0 fi exit $bazel_exit_code diff --git a/ci/build_and_run_tests.sh b/ci/build_and_run_tests.sh index 39db4936..1db5b70d 100755 --- a/ci/build_and_run_tests.sh +++ b/ci/build_and_run_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash # Usage -# build_and_run_tests.sh [-b ] [--env pip|conda] [--mode diff-only|standard|release] [--with-snowpark] +# build_and_run_tests.sh [-b ] [--env pip|conda] [--mode merge_gate|continuous_run|release] [--with-snowpark] # # Args # workspace: path to the workspace, SnowML code should be in snowml directory. 
@@ -10,8 +10,8 @@ # b: specify path to bazel # env: Set the environment, choose from pip and conda # mode: Set the tests set to be run. -# diff-only: run affected tests only. (For merge gate) -# standard (default): run all tests except auto-generated tests. (For nightly run.) +# merge_gate: run affected tests only. +# continuous_run (default): run all tests except auto-generated tests. (For nightly run.) # release: run all tests including auto-generated tests. (For releasing) # with-snowpark: Build and test with snowpark in snowpark-python directory in the workspace. # @@ -21,12 +21,13 @@ set -o pipefail set -u +set -e PROG=$0 help() { local exit_code=$1 - echo "Usage: ${PROG} [-b ] [--env pip|conda] [--mode diff-only|standard|release] [--with-snowpark]" + echo "Usage: ${PROG} [-b ] [--env pip|conda] [--mode merge_gate|continuous_run|release] [--with-snowpark]" exit "${exit_code}" } @@ -34,7 +35,7 @@ WORKSPACE=$1 && shift || help 1 BAZEL="bazel" ENV="pip" WITH_SNOWPARK=false -MODE="standard" +MODE="continuous_run" SNOWML_DIR="snowml" SNOWPARK_DIR="snowpark-python" @@ -57,7 +58,7 @@ while (($#)); do ;; --mode) shift - if [[ $1 = "diff-only" || $1 = "standard" || $1 = "release" ]]; then + if [[ $1 = "merge_gate" || $1 = "continuous_run" || $1 = "release" ]]; then MODE=$1 else help 1 @@ -116,9 +117,9 @@ fi # Compare test required dependencies with wheel pkg dependencies and exclude tests if necessary EXCLUDE_TESTS=$(mktemp "${TEMP_TEST_DIR}/exclude_tests_XXXXX") -if [[ ${MODE} = "standard" ]]; then +if [[ ${MODE} = "continuous_run" ]]; then ./ci/get_excluded_tests.sh -f "${EXCLUDE_TESTS}" -m unused -b "${BAZEL}" -elif [[ ${MODE} = "diff-only" ]]; then +elif [[ ${MODE} = "merge_gate" ]]; then ./ci/get_excluded_tests.sh -f "${EXCLUDE_TESTS}" -m all -b "${BAZEL}" fi # Copy tests into temp directory @@ -154,6 +155,9 @@ if [ "${ENV}" = "pip" ]; then else which conda + # Clean conda cache + conda clean --all --force-pkgs-dirs -y + # Clean conda build workspace rm -rf 
"${WORKSPACE}/conda-bld" @@ -199,13 +203,9 @@ if [ "${ENV}" = "pip" ]; then fi python3.8 -m pip list - # Set up pip specific pytest flags - PIP_PYTEST_FLAG=() - PIP_PYTEST_FLAG+=(-m "not pip_incompatible") # Filter out those pip incompatible tests. - # Run the tests set +e - TEST_SRCDIR="${TEMP_TEST_DIR}" python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" "${PIP_PYTEST_FLAG[@]}" tests/ + TEST_SRCDIR="${TEMP_TEST_DIR}" python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" -m "not pip_incompatible" tests/integ/ TEST_RETCODE=$? set -e else @@ -219,9 +219,9 @@ else conda create -y -p testenv -c "file://${WORKSPACE}/conda-bld" -c "https://repo.anaconda.com/pkgs/snowflake/" --override-channels "python=3.8" snowflake-ml-python pytest-xdist inflection "${OPTIONAL_REQUIREMENTS[@]}" conda list -p testenv - # Run the tests + # Run integration tests set +e - TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/ + TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/integ/ TEST_RETCODE=$? 
set -e @@ -236,7 +236,7 @@ echo "Done running ${PROG}" # 0: Success; # 5: no tests found # See https://docs.pytest.org/en/7.1.x/reference/exit-codes.html -if [[ ${MODE} = "diff-only" && ${TEST_RETCODE} -eq 5 ]] ; then - exit 0 +if [[ ${MODE} = "merge_gate" && ${TEST_RETCODE} -eq 5 ]]; then + exit 0 fi exit ${TEST_RETCODE} diff --git a/ci/conda_recipe/bld.bat b/ci/conda_recipe/bld.bat new file mode 100644 index 00000000..e4aa272b --- /dev/null +++ b/ci/conda_recipe/bld.bat @@ -0,0 +1,12 @@ +@echo off +setlocal EnableDelayedExpansion + +bazel "--output_user_root=C:\broot" "build" "--repository_cache=" "--nobuild_python_zip" "--enable_runfiles" --action_env="USERPROFILE=%USERPROFILE%" --host_action_env="USERPROFILE=%USERPROFILE%" "//snowflake/ml:wheel" +SET BAZEL_BIN_PATH= +FOR /f "delims=" %%a in ('bazel --output_user_root=C:\broot info bazel-bin') DO (SET "BAZEL_BIN_PATH=!BAZEL_BIN_PATH!%%a") +SET WHEEL_PATH_PATTERN="!BAZEL_BIN_PATH:/=\!\snowflake\ml\*.whl" +SET WHEEL_PATH= +FOR /f "delims=" %%a in ('dir /b/s !WHEEL_PATH_PATTERN!') DO (SET "WHEEL_PATH=!WHEEL_PATH!%%a") +pip "install" "--no-dependencies" "!WHEEL_PATH!" +bazel "clean" "--expunge" +bazel "shutdown" diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 0f45d49c..6b1b3b43 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -1,5 +1,5 @@ # DO NOT EDIT! 
-# Generate by running 'bazel run //bazel/requirements:sync_requirements' +# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' --- about: @@ -17,13 +17,14 @@ build: noarch: python package: name: snowflake-ml-python - version: 1.0.4 + version: 1.0.5 requirements: build: - python - bazel >=6.0.0 run: - absl-py>=0.15,<2 + - aiohttp!=4.0.0a0, !=4.0.0a1 - anyio>=3.5.0,<4 - cloudpickle - fsspec>=2022.11,<=2023.1 @@ -32,6 +33,7 @@ requirements: - pandas>=1.0.0,<2 - python - pyyaml>=6.0,<7 + - requests - scikit-learn>=1.2.1,<1.3 - scipy>=1.9,<2 - snowflake-connector-python>=3.0.3,<4 diff --git a/ci/get_excluded_tests.sh b/ci/get_excluded_tests.sh index b143566f..9d852c72 100755 --- a/ci/get_excluded_tests.sh +++ b/ci/get_excluded_tests.sh @@ -115,7 +115,7 @@ esac excluded_test_source_rule_file=${working_dir}/excluded_test_source_rule -printf "kind('source file', set(%s)" "$(<"${targets_to_exclude_file}"))" > "${excluded_test_source_rule_file}" +printf "kind('source file', set(%s) + set($( "${excluded_test_source_rule_file}" ${bazel} query --query_file="${excluded_test_source_rule_file}" \ --output location | diff --git a/ci/skip_merge_gate_targets b/ci/skip_merge_gate_targets new file mode 100644 index 00000000..de6afd17 --- /dev/null +++ b/ci/skip_merge_gate_targets @@ -0,0 +1,2 @@ +//tests/integ/snowflake/ml/model:deployment_to_snowservice_integ_test +//tests/integ/snowflake/ml/registry:model_registry_integ_test_snowservice diff --git a/ci/type_check.sh b/ci/type_check.sh index 0c676224..67a9b4bc 100755 --- a/ci/type_check.sh +++ b/ci/type_check.sh @@ -20,12 +20,13 @@ # Otherwise exits with bazel's exit code. # # NOTE: -# 1. Ignores all targets that depends on (1) targets with tag "skip_mypy_check" (2) targets in `type_ignored_targets`. +# 1. Ignores all targets that depends on targets in `type_ignored_targets`. # 2. Affected targets also include raw python files on top of bazel build targets whereas ignored_targets don't. 
Hence # we used `kind('py_.* rule')` filter. set -o pipefail set -u +set -e bazel="bazel" affected_targets="" @@ -71,12 +72,12 @@ fi printf \ "let type_ignored_targets = set(%s) in \ let affected_targets = kind('py_.* rule', set(%s)) in \ - let skipped_targets = attr('tags', '[\[ ]skip_mypy_check[,\]]', \$affected_targets) in \ - let rdeps_targets = rdeps(//..., \$type_ignored_targets) union rdeps(//..., \$skipped_targets) in \ + let rdeps_targets = rdeps(//..., \$type_ignored_targets) in \ \$affected_targets except \$rdeps_targets" \ "$("${working_dir}/type_checked_targets_query" "${bazel}" query --query_file="${working_dir}/type_checked_targets_query" >"${working_dir}/type_checked_targets" echo "Type checking the following targets:" "$(<"${working_dir}/type_checked_targets")" + set +e "${bazel}" build \ --keep_going \ diff --git a/ci/type_ignored_targets b/ci/type_ignored_targets index 3123cc2d..e66fc8ed 100644 --- a/ci/type_ignored_targets +++ b/ci/type_ignored_targets @@ -1,37 +1,12 @@ //snowflake/ml/experimental/... +//snowflake/ml/modeling/xgboost/... //tests/integ/snowflake/ml/_internal/... //tests/integ/snowflake/ml/extra_tests/... //tests/integ/snowflake/ml/modeling/impute/... +//tests/integ/snowflake/ml/modeling/linear_model/... //tests/integ/snowflake/ml/modeling/metrics/... //tests/integ/snowflake/ml/modeling/pipeline/... //tests/integ/snowflake/ml/modeling/preprocessing/... - -//snowflake/ml/modeling/linear_model/... -//snowflake/ml/modeling/ensemble/... -//snowflake/ml/modeling/svm/... -//snowflake/ml/modeling/neural_network/... -//snowflake/ml/modeling/tree/... -//snowflake/ml/modeling/calibration/... -//snowflake/ml/modeling/cluster/... -//snowflake/ml/modeling/compose/... -//snowflake/ml/modeling/covariance/... -//snowflake/ml/modeling/decomposition/... -//snowflake/ml/modeling/discriminant_analysis/... -//snowflake/ml/modeling/feature_selection/... -//snowflake/ml/modeling/gaussian_process/... 
-//snowflake/ml/modeling/impute:iterative_imputer -//snowflake/ml/modeling/impute:knn_imputer -//snowflake/ml/modeling/impute:missing_indicator -//snowflake/ml/modeling/isotonic/... -//snowflake/ml/modeling/kernel_approximation/... -//snowflake/ml/modeling/kernel_ridge/... -//snowflake/ml/modeling/lightgbm/... -//snowflake/ml/modeling/manifold/... -//snowflake/ml/modeling/mixture/... -//snowflake/ml/modeling/model_selection/... -//snowflake/ml/modeling/multiclass/... -//snowflake/ml/modeling/multioutput/... -//snowflake/ml/modeling/naive_bayes/... -//snowflake/ml/modeling/neighbors/... -//snowflake/ml/modeling/semi_supervised/... -//snowflake/ml/modeling/xgboost/... +//tests/integ/snowflake/ml/test_utils/... +//tests/integ/snowflake/ml/registry:model_registry_integ_test_snowservice_base +//tests/integ/snowflake/ml/registry:model_registry_integ_test_snowservice diff --git a/codegen/codegen_rules.bzl b/codegen/codegen_rules.bzl index 437a01e3..b0bce0b9 100644 --- a/codegen/codegen_rules.bzl +++ b/codegen/codegen_rules.bzl @@ -7,7 +7,7 @@ Helper functions to autogenerate genrules and build rules for the following """ load("@rules_python//python:packaging.bzl", native_py_package = "py_package") -load("//bazel:py_rules.bzl", "py_library", "py_test") +load("//bazel:py_rules.bzl", "py_library", "py_test", "py_genrule") AUTO_GEN_TOOL_BAZEL_PATH = "//codegen:estimator_autogen_tool" ESTIMATOR_TEMPLATE_BAZEL_PATH = "//codegen:sklearn_wrapper_template.py_template" @@ -29,7 +29,7 @@ def autogen_init_file_for_module(module): module (str) : Name of the module to auto-generate init file for. 
""" - native.genrule( + py_genrule( name = "generate_init_file", outs = ["__init__.py"], tools = [AUTO_GEN_TOOL_BAZEL_PATH], @@ -42,7 +42,6 @@ def autogen_init_file_for_module(module): name = "init", srcs = [":generate_init_file"], deps = ["//snowflake/ml/_internal:init_utils"], - tags = ["skip_mypy_check"], ) def get_genrule_cmd(gen_mode, template_path, module, output_path): @@ -70,7 +69,7 @@ def autogen_estimators(module, estimator_info_list): ) for e in estimator_info_list: - native.genrule( + py_genrule( name = "generate_{}".format(e.normalized_class_name), outs = ["{}.py".format(e.normalized_class_name)], tools = [AUTO_GEN_TOOL_BAZEL_PATH], @@ -86,6 +85,7 @@ def autogen_estimators(module, estimator_info_list): ":init", "//snowflake/ml/modeling/framework:framework", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/_internal/utils:temp_file_utils", "//snowflake/ml/_internal/utils:query_result_checker", "//snowflake/ml/_internal/utils:pkg_version_utils", @@ -93,7 +93,6 @@ def autogen_estimators(module, estimator_info_list): "//snowflake/ml/model:model_signature", "//snowflake/ml/model/_signatures:utils", ], - tags = ["skip_mypy_check"], ) native_py_package( @@ -103,7 +102,6 @@ def autogen_estimators(module, estimator_info_list): ":{}".format(e.normalized_class_name) for e in estimator_info_list ], - tags = ["skip_mypy_check"], ) def autogen_tests_for_estimators(module, module_root_dir, estimator_info_list): @@ -122,7 +120,7 @@ def autogen_tests_for_estimators(module, module_root_dir, estimator_info_list): ) for e in estimator_info_list: - native.genrule( + py_genrule( name = "generate_test_{}".format(e.normalized_class_name), outs = ["test_{}.py".format(e.normalized_class_name)], tools = [AUTO_GEN_TOOL_BAZEL_PATH], @@ -141,5 +139,5 @@ def autogen_tests_for_estimators(module, module_root_dir, estimator_info_list): timeout = "long", legacy_create_init = 0, shard_count = 5, - tags = ["autogen", 
"skip_mypy_check"], + tags = ["autogen"], ) diff --git a/codegen/estimator_autogen_tool.py b/codegen/estimator_autogen_tool.py index 4d5b3d7d..dbb6e9f8 100644 --- a/codegen/estimator_autogen_tool.py +++ b/codegen/estimator_autogen_tool.py @@ -72,7 +72,7 @@ def main(argv: List[str]) -> None: expected_suffix = AutogenTool.module_root_dir(module_name=FLAGS.module) expected_suffix = os.path.normpath(os.path.join(actual_output_path, expected_suffix)) - bazel_out_dir = FLAGS.bazel_out_dir + bazel_out_dir = os.path.normpath(FLAGS.bazel_out_dir) if not bazel_out_dir.endswith(expected_suffix): raise AssertionError( f"genrule output dir $(RULEDIR) {bazel_out_dir} is expected to end with suffix {expected_suffix}" diff --git a/codegen/sklearn_wrapper_generator.py b/codegen/sklearn_wrapper_generator.py index a4fc0aa7..664dee3c 100644 --- a/codegen/sklearn_wrapper_generator.py +++ b/codegen/sklearn_wrapper_generator.py @@ -749,7 +749,7 @@ def _populate_integ_test_fields(self) -> None: self.test_estimator_imports_list.extend( [ f"from {self.root_module_name} import {self.original_class_name} as Sk{self.original_class_name}", - f"from {snow_ml_module_name} import {self.original_class_name}", + f"from {snow_ml_module_name} import {self.original_class_name} # type: ignore[attr-defined]", f"from sklearn.datasets import {self.test_dataset_func}", ] ) diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template index 77512e34..4187a093 100644 --- a/codegen/sklearn_wrapper_template.py_template +++ b/codegen/sklearn_wrapper_template.py_template @@ -5,6 +5,7 @@ import inspect import os import posixpath from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set +from typing_extensions import TypeGuard from uuid import uuid4 import cloudpickle as cp @@ -16,6 +17,7 @@ from sklearn.utils.metaestimators import available_if from snowflake.ml.modeling.framework.base import BaseTransformer from snowflake.ml.modeling.framework._utils 
import to_native_format from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator from snowflake.ml._internal.utils import pkg_version_utils, identifier from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get_temp_file_path @@ -29,6 +31,7 @@ from snowflake.ml.model.model_signature import ( FeatureSpec, ModelSignature, _infer_signature, + BaseFeatureSpec, ) from snowflake.ml.model._signatures import utils as model_signature_utils @@ -48,7 +51,7 @@ def _original_estimator_has_callable(attr : str) -> Callable[[Any], bool]: Returns: A function which checks for the existance of callable `attr` on the given object. """ - def check(self) -> bool: + def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]: """ Check for the existance of callable `attr` in self. Returns: @@ -58,6 +61,7 @@ def _original_estimator_has_callable(attr : str) -> Callable[[Any], bool]: return check + def _gather_dependencies(obj: Any) -> Set[str]: """ Gethers dependencies from the SnowML Estimator and Transformer objects. @@ -78,6 +82,7 @@ def _gather_dependencies(obj: Any) -> Set[str]: else: return set() + def _transform_snowml_obj_to_sklearn_obj(obj: Any) -> Any: """Converts SnowML Estimator and Transformer objects to equivalent SKLearn objects. @@ -113,7 +118,7 @@ def _validate_sklearn_args(args: Dict[str, Any], klass: type) -> Dict[str, Any]: Raises an expception if a user specified arg is not supported by current version of sklearn/xgboost. """ result = {{}} - signature = inspect.signature(klass.__init__) + signature = inspect.signature(klass.__init__) # type: ignore for k, v in args.items(): if k not in signature.parameters.keys(): # Arg is not supported. 
if ( @@ -122,7 +127,12 @@ def _validate_sklearn_args(args: Dict[str, Any], klass: type) -> Dict[str, Any]: v[0] != v[1] # Value is not same as default. and not (isinstance(v[0], float) and np.isnan(v[0]) and np.isnan(v[1]))) # both are not NANs ): - raise RuntimeError(f"Arg {{k}} is not supported by current version of SKLearn/XGBoost.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.DEPENDENCY_VERSION_ERROR, + original_exception=RuntimeError( + f"Arg {{k}} is not supported by current version of SKLearn/XGBoost." + ), + ) else: result[k] = v[0] return result @@ -132,7 +142,7 @@ class {transform.original_class_name}(BaseTransformer): r"""{transform.estimator_class_docstring} """ - def __init__( + def __init__( # type: ignore {transform.estimator_init_signature} ) -> None: super().__init__() @@ -148,7 +158,7 @@ class {transform.original_class_name}(BaseTransformer): self._sklearn_object = {transform.root_module_name}.{transform.original_class_name}( {transform.sklearn_init_arguments} ) - self._model_signature_dict = None + self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None {transform.estimator_init_member_args} def _get_rand_id(self) -> str: @@ -178,6 +188,15 @@ class {transform.original_class_name}(BaseTransformer): cols = [identifier.concat_names(ids=['OUTPUT_', c]) for c in self.label_cols] self.set_output_cols(output_cols=cols) + def _get_active_columns(self) -> List[str]: + """"Get the list of columns that are relevant to the transformer.""" + selected_cols = ( + self.input_cols + + self.label_cols + + ([self.sample_weight_col] if self.sample_weight_col is not None else []) + ) + return selected_cols + @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, @@ -205,20 +224,17 @@ class {transform.original_class_name}(BaseTransformer): def _fit_snowpark(self, dataset: DataFrame) -> None: session = dataset._session + assert session is not None # keep mypy happy # Validate that key package version in user 
workspace are supported in snowflake conda channel # If customer doesn't have package in conda channel, replace the ones have the closest versions self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT) - + # Specify input columns so column pruing will be enforced - selected_cols = ( - self.input_cols + - self.label_cols + - [self.sample_weight_col] if self.sample_weight_col is not None else [] - ) + selected_cols = self._get_active_columns() if len(selected_cols) > 0: dataset = dataset.select(selected_cols) - + # Extract query that generated the datafrome. We will need to pass it to the fit procedure. queries = dataset.queries["queries"] @@ -261,10 +277,10 @@ class {transform.original_class_name}(BaseTransformer): statement_params=statement_params ) - @sproc( + @sproc( # type: ignore is_permanent=False, name=fit_sproc_name, - packages=["snowflake-snowpark-python"] + self._get_dependencies(), + packages=["snowflake-snowpark-python"] + self._get_dependencies(), # type: ignore replace=True, session=session, statement_params=statement_params, @@ -375,6 +391,7 @@ class {transform.original_class_name}(BaseTransformer): cleanup_temp_files([local_transform_file_name, local_result_file_name]) def _fit_pandas(self, dataset: pd.DataFrame) -> None: + assert self._sklearn_object is not None and hasattr(self._sklearn_object, "fit") # keep mypy happy argspec = inspect.getfullargspec(self._sklearn_object.fit) args = {{'X': dataset[self.input_cols]}} if self.label_cols: @@ -402,9 +419,21 @@ class {transform.original_class_name}(BaseTransformer): """Util method to create UDF and run batch inference. 
""" if not self._is_fitted: - raise RuntimeError(f"Estimator not fitted before calling {{inference_method}} method.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=RuntimeError( + f"Estimator {{self.__class__.__name__}} not fitted before calling {{inference_method}} method." + ), + ) session = dataset._session + if session is None: + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=ValueError( + "Session must not specified for snowpark dataset." + ), + ) # Validate that key package version in user workspace are supported in snowflake conda channel pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT) @@ -432,15 +461,15 @@ class {transform.original_class_name}(BaseTransformer): custom_tags=dict([("autogen", True)]), ) - @pandas_udf( + @pandas_udf( # type: ignore is_permanent=False, name=batch_inference_udf_name, - packages= self._get_dependencies(), + packages=self._get_dependencies(), # type: ignore replace=True, session=session, statement_params=statement_params, ) - def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]: + def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]: # type: ignore import pandas as pd import numpy as np @@ -494,13 +523,15 @@ class {transform.original_class_name}(BaseTransformer): # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with # each element being a list. 
if len(expected_output_cols_list) != 1: - raise TypeError("expected_output_cols_list must be same length as transformed array or " - "should be of length 1") + raise TypeError( + "expected_output_cols_list must be same length as transformed array or " + "should be of length 1" + ) series = pd.Series(transformed_numpy_array.tolist()) transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols_list) else: transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols_list) - return transformed_pandas_df.to_dict("records") + return transformed_pandas_df.to_dict("records") # type: ignore batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{{safe_id}}".format( safe_id=self._get_rand_id() @@ -570,9 +601,9 @@ class {transform.original_class_name}(BaseTransformer): for i, f in enumerate(features_required_by_estimator): if ( i >= len(input_cols) - or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f) - or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset - and quoted_input_cols[i] not in features_in_dataset) + or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f) + or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset + and quoted_input_cols[i] not in features_in_dataset) ): missing_features.append(f) elif input_cols[i] in features_in_dataset: @@ -583,10 +614,13 @@ class {transform.original_class_name}(BaseTransformer): columns_to_select.append(quoted_input_cols[i]) if len(missing_features) > 0: - raise ValueError( - "The feature names should match with those that were passed during fit.\n" - f"Features seen during fit call but not present in the input: {{missing_features}}\n" - f"Features in the input dataframe : {{input_cols}}\n" + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=ValueError( + "The feature names should 
match with those that were passed during fit.\n" + f"Features seen during fit call but not present in the input: {{missing_features}}\n" + f"Features in the input dataframe : {{input_cols}}\n" + ), ) input_df = dataset[columns_to_select] input_df.columns = features_required_by_estimator @@ -626,9 +660,12 @@ class {transform.original_class_name}(BaseTransformer): shape = transformed_numpy_array.shape if shape[1] != len(output_cols): if len(output_cols) != 1: - raise TypeError( - "expected_output_cols_list must be same length as transformed array or " - "should be of length 1 or should be of length number of label columns" + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=TypeError( + "expected_output_cols_list must be same length as transformed array or " + "should be of length 1" + ), ) actual_output_cols = [] for i in range(shape[1]): @@ -642,7 +679,7 @@ class {transform.original_class_name}(BaseTransformer): dataset[output_cols] = transformed_numpy_array return dataset - @available_if(_original_estimator_has_callable("predict")) + @available_if(_original_estimator_has_callable("predict")) # type: ignore @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, @@ -659,6 +696,7 @@ class {transform.original_class_name}(BaseTransformer): Returns: Transformed dataset. """ + super()._check_dataset_type(dataset) if isinstance(dataset, DataFrame): expected_type_inferred = "{transform.udf_datatype}" # when it is classifier, infer the datatype from label columns @@ -678,15 +716,10 @@ class {transform.original_class_name}(BaseTransformer): dataset=dataset, inference_method="predict", expected_output_cols_list=self.output_cols,) - else: - raise TypeError( - f"Unexpected dataset type: {{type(dataset)}}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) return output_df - @available_if(_original_estimator_has_callable("transform")) + @available_if(_original_estimator_has_callable("transform")) # type: ignore @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, @@ -703,6 +736,7 @@ class {transform.original_class_name}(BaseTransformer): Returns: Transformed dataset. """ + super()._check_dataset_type(dataset) if isinstance(dataset, DataFrame): expected_dtype = "{transform.udf_datatype}" if {transform._is_heterogeneous_ensemble}: # is child of _BaseHeterogeneousEnsemble @@ -724,11 +758,6 @@ class {transform.original_class_name}(BaseTransformer): inference_method="transform", expected_output_cols_list=self.output_cols, ) - else: - raise TypeError( - f"Unexpected dataset type: {{type(dataset)}}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) return output_df @@ -739,6 +768,7 @@ class {transform.original_class_name}(BaseTransformer): if getattr(self._sklearn_object, "classes_", None) is None: return [output_cols_prefix] + assert self._sklearn_object is not None # keep mypy happy classes = self._sklearn_object.classes_ if isinstance(classes, numpy.ndarray): return [f'{{output_cols_prefix}}{{c}}' for c in classes.tolist()] @@ -757,7 +787,7 @@ class {transform.original_class_name}(BaseTransformer): return output_cols return [] - @available_if(_original_estimator_has_callable("predict_proba")) + @available_if(_original_estimator_has_callable("predict_proba")) # type: ignore @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, @@ -777,6 +807,7 @@ class {transform.original_class_name}(BaseTransformer): Returns: Output dataset with probability of the sample for each class in the model. 
""" + super()._check_dataset_type(dataset) if isinstance(dataset, DataFrame): output_df = self._batch_inference( dataset=dataset, @@ -790,15 +821,10 @@ class {transform.original_class_name}(BaseTransformer): inference_method="predict_proba", expected_output_cols_list=self._get_output_column_names(output_cols_prefix), ) - else: - raise TypeError( - f"Unexpected dataset type: {{type(dataset)}}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) - + return output_df - @available_if(_original_estimator_has_callable("predict_log_proba")) + @available_if(_original_estimator_has_callable("predict_log_proba")) # type: ignore @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, @@ -819,6 +845,7 @@ class {transform.original_class_name}(BaseTransformer): Returns: Output dataset with log probability of the sample for each class in the model. """ + super()._check_dataset_type(dataset) if isinstance(dataset, DataFrame): output_df = self._batch_inference( dataset=dataset, @@ -832,15 +859,10 @@ class {transform.original_class_name}(BaseTransformer): inference_method="predict_log_proba", expected_output_cols_list=self._get_output_column_names(output_cols_prefix), ) - else: - raise TypeError( - f"Unexpected dataset type: {{type(dataset)}}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) return output_df - @available_if(_original_estimator_has_callable("decision_function")) + @available_if(_original_estimator_has_callable("decision_function")) # type: ignore @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, @@ -861,6 +883,7 @@ class {transform.original_class_name}(BaseTransformer): Returns: Output dataset with results of the decision function for the samples in input dataset. 
""" + super()._check_dataset_type(dataset) if isinstance(dataset, DataFrame): output_df = self._batch_inference( dataset=dataset, @@ -874,15 +897,10 @@ class {transform.original_class_name}(BaseTransformer): inference_method="decision_function", expected_output_cols_list=self._get_output_column_names(output_cols_prefix), ) - else: - raise TypeError( - f"Unexpected dataset type: {{type(dataset)}}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) return output_df - @available_if(_original_estimator_has_callable("score")) + @available_if(_original_estimator_has_callable("score")) # type: ignore def score(self, dataset: Union[DataFrame, pd.DataFrame]) -> float: """{transform.score_docstring} @@ -890,24 +908,27 @@ class {transform.original_class_name}(BaseTransformer): Score. """ self._infer_input_output_cols(dataset) + super()._check_dataset_type(dataset) if isinstance(dataset, pd.DataFrame): - return self._score_sklearn(dataset) + output_score = self._score_sklearn(dataset) elif isinstance(dataset, DataFrame): - return self._score_snowpark(dataset) - else: - raise TypeError( - f"Unexpected dataset type: {{type(dataset)}}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + output_score = self._score_snowpark(dataset) + return output_score def _score_sklearn(self, dataset: pd.DataFrame) -> float: + assert self._sklearn_object is not None and hasattr(self._sklearn_object, "score") # make type checker happy argspec = inspect.getfullargspec(self._sklearn_object.score) if "X" in argspec.args: args = {{'X': dataset[self.input_cols]}} elif "X_test" in argspec.args: args = {{'X_test': dataset[self.input_cols]}} else: - raise RuntimeError("Neither 'X' or 'X_test' exist in argument") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError( + "Neither 'X' or 'X_test' exist in argument" + ), + ) if self.label_cols: label_arg_name = "Y" if "Y" in argspec.args else "y" @@ -921,11 +942,7 @@ class {transform.original_class_name}(BaseTransformer): def _score_snowpark(self, dataset: DataFrame) -> float: # Specify input columns so column pruing will be enforced - selected_cols = ( - self.input_cols + self.label_cols + [self.sample_weight_col] - if self.sample_weight_col is not None - else [] - ) + selected_cols = self._get_active_columns() if len(selected_cols) > 0: dataset = dataset.select(selected_cols) @@ -940,6 +957,7 @@ class {transform.original_class_name}(BaseTransformer): # Create temp stage to run score. 
score_stage_name = "SNOWML_SCORE_{{safe_id}}".format(safe_id=self._get_rand_id()) session = dataset._session + assert session is not None # keep mypy happy stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {{score_stage_name}};" SqlResultValidator( session=session, @@ -972,7 +990,7 @@ class {transform.original_class_name}(BaseTransformer): @sproc( is_permanent=False, name=score_sproc_name, - packages=["snowflake-snowpark-python"] + self._get_dependencies(), + packages=["snowflake-snowpark-python"] + self._get_dependencies(), # type: ignore replace=True, session=session, statement_params=statement_params, @@ -1024,7 +1042,7 @@ class {transform.original_class_name}(BaseTransformer): if sample_weight_col is not None and "sample_weight" in argspec.args: args['sample_weight'] = df[sample_weight_col].squeeze() - result = estimator.score(**args) + result: float = estimator.score(**args) return result # Call score sproc @@ -1037,7 +1055,7 @@ class {transform.original_class_name}(BaseTransformer): api_calls=[Session.call], custom_tags=dict([("autogen", True)]), ) - score = score_wrapper_sproc( + score: float = score_wrapper_sproc( session, queries, stage_score_file_name, @@ -1052,35 +1070,54 @@ class {transform.original_class_name}(BaseTransformer): return score def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None: - self._model_signature_dict: Dict[str, ModelSignature] = dict() + self._model_signature_dict = dict() PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"] - inputs = _infer_signature(dataset[self.input_cols], "input") + inputs = list(_infer_signature(dataset[self.input_cols], "input")) + outputs: List[BaseFeatureSpec] = [] if hasattr(self, "predict"): + # keep mypy happy + assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type") # For classifier, the type of predict is the same as the type of label if self._sklearn_object._estimator_type == 'classifier': - outputs = 
_infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output - outputs = model_signature_utils.rename_features(outputs, self.output_cols) # rename the output columns - self._model_signature_dict["predict"] = ModelSignature(inputs, - ([] if self._drop_input_cols else inputs) + outputs) + # label columns is the desired type for output + outputs = _infer_signature(dataset[self.label_cols], "output") + # rename the output columns + outputs = model_signature_utils.rename_features(outputs, self.output_cols) + self._model_signature_dict["predict"] = ModelSignature(inputs, + ([] if self._drop_input_cols else inputs) + + outputs) # For regressor, the type of predict is float64 elif self._sklearn_object._estimator_type == 'regressor': outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols] - self._model_signature_dict["predict"] = ModelSignature(inputs, - ([] if self._drop_input_cols else inputs) + outputs) + self._model_signature_dict["predict"] = ModelSignature(inputs, + ([] if self._drop_input_cols else inputs) + + outputs) for prob_func in PROB_FUNCTIONS: if hasattr(self, prob_func): output_cols_prefix: str = f"{{prob_func}}_" output_column_names = self._get_output_column_names(output_cols_prefix) outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names] - self._model_signature_dict[prob_func] = ModelSignature(inputs, - ([] if self._drop_input_cols else inputs) + outputs) + self._model_signature_dict[prob_func] = ModelSignature(inputs, + ([] if self._drop_input_cols else inputs) + + outputs) @property def model_signatures(self) -> Dict[str, ModelSignature]: + """Returns model signature of current class. 
+ + Raises: + exceptions.SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred + + Returns: + Dict[str, ModelSignature]: each method and its input output signature + """ if self._model_signature_dict is None: - raise RuntimeError("Estimator not fitted before accessing property model_signatures! ") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"), + ) return self._model_signature_dict def {transform.supported_export_method}(self) -> Any: @@ -1089,10 +1126,26 @@ class {transform.original_class_name}(BaseTransformer): return self._sklearn_object def {transform.unsupported_export_methods[0]}(self) -> Any: - raise AttributeError("Estimator doesn't support {transform.unsupported_export_methods[0]}(). Please use {transform.supported_export_method}()") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=AttributeError( + modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format( + "{transform.unsupported_export_methods[0]}()", + "{transform.supported_export_method}()" + ) + ), + ) def {transform.unsupported_export_methods[1]}(self) -> Any: - raise AttributeError("Estimator doesn't support {transform.unsupported_export_methods[1]}(). 
Please use {transform.supported_export_method}()") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=AttributeError( + modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format( + "{transform.unsupported_export_methods[1]}()", + "{transform.supported_export_method}()" + ) + ), + ) def _get_dependencies(self) -> List[str]: return self._deps diff --git a/codegen/transformer_autogen_test_template.py_template b/codegen/transformer_autogen_test_template.py_template index 962eb0d2..d63d8489 100644 --- a/codegen/transformer_autogen_test_template.py_template +++ b/codegen/transformer_autogen_test_template.py_template @@ -9,7 +9,7 @@ import json import random import pytest -from typing import Optional, Any +from typing import Optional, Any, Tuple, List from absl.testing.absltest import TestCase, main {transform.test_estimator_imports} from snowflake.ml.utils.connection_params import SnowflakeLoginOptions @@ -18,14 +18,16 @@ from snowflake.snowpark import Session, DataFrame @pytest.mark.pip_incompatible class {transform.test_class_name}(TestCase): - def setUp(self): + def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - def tearDown(self): + def tearDown(self) -> None: self._session.close() - def _get_test_dataset(self, sklearn_obj: Optional[Any] = None, add_sample_weight_col: bool = False): + def _get_test_dataset( + self, sklearn_obj: Optional[Any] = None, add_sample_weight_col: bool = False + ) -> Tuple[pd.DataFrame, List[str], List[str]]: """ Constructs input dataset to be used in the integration test. Args: @@ -34,13 +36,14 @@ class {transform.test_class_name}(TestCase): add_sample_weight_col: If true and addiptional column named "SAMPLE_WEIGHT" will be added to the dataset representing the weight of each sample. 
- Retrurns: + Returns: A tuple containing pandas dataframe, list of input columns names, and list of lable column names. """ input_df_pandas = {transform.test_dataset_func}(as_frame=True).frame # Some of the estimators inherit from MultiOutputMixin class but don't actually support multi task learning. # Those estimators can be identified by calling _is_multitask() method or checking "multioutput" tag. + assert sklearn_obj is not None if ( {transform._is_multioutput} and ( @@ -106,6 +109,9 @@ class {transform.test_class_name}(TestCase): reg.set_output_cols(output_cols) reg.set_label_cols(label_col) + # Assert that we will filter on the correct columns + self.assertCountEqual(reg._get_active_columns(), input_cols + label_col) + args = {{ 'X':input_df_pandas[input_cols], 'y':input_df_pandas[label_col].squeeze() @@ -113,6 +119,7 @@ class {transform.test_class_name}(TestCase): if use_weighted_dataset: reg.set_sample_weight_col("SAMPLE_WEIGHT") args['sample_weight'] = input_df_pandas["SAMPLE_WEIGHT"].squeeze() + self.assertCountEqual(reg._get_active_columns(), input_cols + label_col + ["SAMPLE_WEIGHT"]) if fit_with_sproc: reg.fit(input_df) @@ -222,7 +229,7 @@ class {transform.test_class_name}(TestCase): if inference_with_udf: actual_score = getattr(reg, "score")(dataset=input_df) if isinstance(actual_score, DataFrame): - actual_score.to_pandas().sort_values(by="INDEX")[output_cols].to_numpy(dtype=np.float).squeeze() + actual_score.to_pandas().sort_values(by="INDEX")[output_cols].to_numpy(dtype=np.float_).squeeze() else: actual_score = getattr(reg, "score")(dataset=input_df_pandas) if isinstance(actual_score, pd.DataFrame): @@ -230,23 +237,23 @@ class {transform.test_class_name}(TestCase): c for c in actual_score.columns if any([c.find(colName) >= 0 for colName in output_cols]) ] - actual_score = actual_score[actual_output_cols].to_numpy(dtype=np.float).squeeze() + actual_score = actual_score[actual_output_cols].to_numpy(dtype=np.float_).squeeze() sklearn_score = 
getattr(sklearn_reg, "score")(**args) np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.e-1, atol=1.e-2) - def test_fit_with_sproc_infer_with_udf_non_weighted_datasets(self): + def test_fit_with_sproc_infer_with_udf_non_weighted_datasets(self) -> None: self._fit_and_compare_results(use_weighted_dataset=False, fit_with_sproc = True, inference_with_udf = True) - def test_fit_with_sproc_infer_with_pandas_non_weighted_datasets(self): + def test_fit_with_sproc_infer_with_pandas_non_weighted_datasets(self) -> None: self._fit_and_compare_results(use_weighted_dataset=False, fit_with_sproc = True, inference_with_udf = False) - def test_fit_with_pandas_infer_with_pandas_non_weighted_datasets(self): + def test_fit_with_pandas_infer_with_pandas_non_weighted_datasets(self) -> None: self._fit_and_compare_results(use_weighted_dataset=False, fit_with_sproc = False, inference_with_udf = False) - def test_fit_with_pandas_infer_with_udf_non_weighted_datasets(self): + def test_fit_with_pandas_infer_with_udf_non_weighted_datasets(self) -> None: self._fit_and_compare_results(use_weighted_dataset=False, fit_with_sproc = False, inference_with_udf = True) def _is_weighted_dataset_supported(self, klass: type) -> bool: @@ -257,19 +264,19 @@ class {transform.test_class_name}(TestCase): is_weighted_dataset_supported = True if "sample_weight" in argspec.args else False return is_weighted_dataset_supported - def test_fit_with_sproc_infer_with_udf_weighted_datasets(self): + def test_fit_with_sproc_infer_with_udf_weighted_datasets(self) -> None: if self._is_weighted_dataset_supported(Sk{transform.original_class_name}): self._fit_and_compare_results(use_weighted_dataset=True, fit_with_sproc = True, inference_with_udf = True) - def test_fit_with_sproc_infer_with_pandas_weighted_datasets(self): + def test_fit_with_sproc_infer_with_pandas_weighted_datasets(self) -> None: if self._is_weighted_dataset_supported(Sk{transform.original_class_name}): 
self._fit_and_compare_results(use_weighted_dataset=True, fit_with_sproc = True, inference_with_udf = False) - def test_fit_with_pandas_infer_with_pandas_weighted_datasets(self): + def test_fit_with_pandas_infer_with_pandas_weighted_datasets(self) -> None: if self._is_weighted_dataset_supported(Sk{transform.original_class_name}): self._fit_and_compare_results(use_weighted_dataset=True, fit_with_sproc = False, inference_with_udf = False) - def test_fit_with_pandas_infer_with_udf_weighted_datasets(self): + def test_fit_with_pandas_infer_with_udf_weighted_datasets(self) -> None: if self._is_weighted_dataset_supported(Sk{transform.original_class_name}): self._fit_and_compare_results(use_weighted_dataset=True, fit_with_sproc = False, inference_with_udf = True) diff --git a/requirements.yml b/requirements.yml index b7f47b29..3aa82dd4 100644 --- a/requirements.yml +++ b/requirements.yml @@ -1,46 +1,77 @@ # Add requirements information here and use `bazel run //bazel/requirements:sync_requirements` # to generate all other requirements files. -# Fields: -# name: The name of the package. Set if it is available with the same name and required both in PyPI and conda. -# name_pypi: The name of the package in PyPI. Set this only to indicate it is a requirements available in PyPI only, -# or set this with name_conda to indicates that it has different name in PyPI and conda. -# name_conda: The name of the package in conda. Set this only to indicate it is a requirements available in conda only, -# or set this with name_pypi to indicates that it has different name in PyPI and conda. -# At least 1 of these 3 fields should be set. -# -# dev_version: The version of the package to be pinned in the dev environment. -# Set if it is available with the same version and required both in PyPI and conda. -# dev_version_pypi: The version from PyPI to be pinned in the dev environment. 
Set this only to indicate -# it is a requirements available in PyPI only, or set this with dev_version_conda to indicates that -# it has different version in PyPI and conda. -# dev_version_conda: The version from conda to be pinned in the dev environment. Set this only to indicate -# it is a requirements available in conda only, or set this with dev_version_pypi to indicates that -# it has different version in PyPI and conda. -# from_channel: Set this if the package is not available in Snowflake Anaconda Channel -# (https://repo.anaconda.com/pkgs/snowflake). Each dependency must be accompanied with a JIRA ticket requesting it -# to be added to the Snowflake channel. -# At least 1 of these 3 fields should be set. -# -# version_requirements: The version requirements specifiers when this requirement is a dependency of SnowML release. -# Set if it is available with the same name and required both in PyPI and conda. -# version_requirements_pypi: The version requirements specifiers when this requirement is a dependency of -# SnowML release via PyPI. Set this only to indicate it is a requirements required by PyPI release only, -# or set this with version_requirements_conda to indicates that it has different version in PyPI and conda. -# version_requirements_conda: The version requirements specifiers when this requirement is a dependency of -# SnowML release via conda. Set this only to indicate it is a requirements required by conda release only, -# or set this with version_requirements_pypi to indicates that it has different version in PyPI and conda. -# At least 1 of these 3 fields but be set to indicate that this package is a dependency of release. -# If you don't want to constrain version, set the field to empty string. -# -# requirements_extra_tags: PyPI release only. Set this to indicate the package is a extras dependency of the SnowML. 
-# This requirements will be then added to all extras tags set here, and an all extras tag will be auto -# generated to include all extras requirements. All extras requirements will be labeled as run_constrained in conda -# meta.yaml. -# tags: Set tag to to filter some of the requirements in some cases. +# Please provide the following fields when adding a new record: + +# # Package Name Fields + +# `name`: The name of the package. Set this if the package is available with the same name and is required in both `PyPI` +# and `conda`. + +# `name_pypi`: The name of the package in `PyPI`. Set this only to indicate that it is available in `PyPI` only. You can +# also set this along with `name_conda` if the package has different names in `PyPI` and `conda`. + +# `name_conda`: The name of the package in `conda`. Set this only to indicate that it is available in `conda` only. You +# can also set this along with `name_pypi` if the package has different names in `PyPI` and `conda`. + +# (At least one of these three fields should be set.) + +# # Development Version Fields + +# `dev_version`: The version of the package to be pinned in the dev environment. Set this if the package is available +# with the same version and is required in both `PyPI` and conda. + +# `dev_version_pypi`: The version from `PyPI` to be pinned in the dev environment. Set this only to indicate that it is +# available in `PyPI` only. You can also set this along with `dev_version_conda` if the package has different versions in +# `PyPI` and `conda`. + +# `dev_version_conda`: The version from `conda` to be pinned in the dev environment. Set this only to indicate that it is +# available in `conda` only. You can also set this along with `dev_version_pypi` if the package has different versions in +# `PyPI` and `conda`. + +# (At least one of these three fields should be set.) 
+
+# # Snowflake Anaconda Channel
+
+# `from_channel`: Set this if the package is not available in the Snowflake Anaconda Channel
+# (https://repo.anaconda.com/pkgs/snowflake).
+
+# # Version Requirements Fields (for `snowflake-ml-python` release)
+
+# `version_requirements`: The version requirements specifiers when this requirement is a dependency of the
+# `snowflake-ml-python` release. Set this if the package is available with the same name and required in both `PyPI` and
+# `conda`.
+
+# `version_requirements_pypi`: The version requirements specifiers when this requirement is a dependency of the
+# `snowflake-ml-python` release via `PyPI`. Set this only to indicate that it is required by the `PyPI` release only. You
+# can also set this along with `version_requirements_conda` if the package has different versions in `PyPI` and `conda`.
+
+# `version_requirements_conda`: The version requirements specifiers when this requirement is a dependency of the
+# `snowflake-ml-python` release via `conda`. Set this only to indicate that it is required by the `conda` release only.
+# You can also set this along with `version_requirements_pypi` if the package has different versions in `PyPI` and `conda`.
+
+# (At least one of these three fields must be set to indicate that this package is a dependency of the release. If you
+# don't want to constrain the version, set the field to an empty string.)
+
+# # Extras Tags and Tags
+
+# `requirements_extra_tags`: Set this to indicate that the package is an extras dependency of `snowflake-ml-python`.
+# This requirement will be added to all extras tags specified here, and an `all` extras tag will be auto-generated to include
+# all extras requirements. All extras requirements will be labeled as `run_constrained` in conda's meta.yaml.
+
+# `tags`: Set tags to filter some of the requirements in specific cases. 
The current valid tags include: +# - `deployment_core`: Used by model deployment to indicate dependencies required to execute model deployment code +# on the server-side. +# - `build_essential`: Used to indicate the packages composing the build environment. - name: absl-py dev_version: "1.3.0" version_requirements: ">=0.15,<2" + tags: + - build_essential +# For fsspec[http] in conda +- name_conda: aiohttp + dev_version_conda: "3.8.3" + version_requirements_conda: "!=4.0.0a0, !=4.0.0a1" - name: anyio dev_version: "3.5.0" version_requirements: ">=3.5.0,<4" @@ -50,11 +81,17 @@ dev_version: "1.24.28" - name_conda: conda-libmamba-solver dev_version_conda: "23.3.0" + tags: + - build_essential - name: cloudpickle dev_version: "2.0.0" version_requirements: "" tags: - deployment_core +- name: cryptography + dev_version: "39.0.1" + # Skipping version requirements as it should come as part of connector. + # Only used in connection_params.py, which is an util library anyways. - name: coverage dev_version: "6.3.2" - name: flask-cors @@ -69,8 +106,12 @@ dev_version: "0.23.0" - name: inflection dev_version: "0.5.1" + tags: + - build_essential - name: jsonschema dev_version: "3.2.0" + tags: + - build_essential - name: joblib dev_version: "1.1.1" - name: lightgbm @@ -78,6 +119,8 @@ version_requirements: "==3.3.5" requirements_extra_tags: - lightgbm + tags: + - build_essential - name: mlflow dev_version: "2.3.1" version_requirements: ">=2.1.0,<3" @@ -87,23 +130,29 @@ dev_version: "4.0.11" - name: mypy dev_version: "0.981" + tags: + - build_essential - name: networkx dev_version: "2.8.4" - name: numpy - dev_version: "1.23.4" + dev_version: "1.24.3" version_requirements: ">=1.23,<2" tags: - deployment_core + - build_essential - name: packaging dev_version: "23.0" version_requirements: ">=20.9,<24" tags: - deployment_core + - build_essential - name: pandas dev_version: "1.4.4" version_requirements: ">=1.0.0,<2" tags: - deployment_core +- name: protobuf + dev_version: "3.20.3" - name: 
pytest dev_version: "7.1.2" - name_conda: python @@ -111,12 +160,17 @@ version_requirements_conda: "" - name_pypi: torch name_conda: pytorch - dev_version: "1.13.1" + dev_version: "2.0.1" - name: pyyaml dev_version: "6.0" version_requirements: ">=6.0,<7" tags: - deployment_core + - build_essential +# For fsspec[http] in conda +- name_conda: requests + dev_version_conda: "2.29.0" + version_requirements_conda: "" - name: ruamel.yaml dev_version: "0.17.21" - name: s3fs @@ -124,6 +178,8 @@ - name: scikit-learn dev_version: "1.2.2" version_requirements: ">=1.2.1,<1.3" + tags: + - build_essential - name: scipy dev_version: "1.9.3" version_requirements: ">=1.9,<2" @@ -143,17 +199,20 @@ dev_version: "0.4.3" version_requirements: ">=0.4,<1" - name: tensorflow - dev_version: "2.11.0" + dev_version: "2.10.0" version_requirements: ">=2.9,<3" requirements_extra_tags: - tensorflow - name: torchdata - dev_version: "0.5.1" + dev_version: "0.6.1" + from_channel: pytorch version_requirements: ">=0.4,<1" requirements_extra_tags: - torch - name: transformers dev_version: "4.29.2" +- name: types-protobuf + dev_version: "4.23.0.1" - name: types-PyYAML dev_version: "6.0.12" from_channel: conda-forge @@ -165,3 +224,5 @@ - name: xgboost dev_version: "1.7.3" version_requirements: ">=1.7.3,<2" + tags: + - build_essential diff --git a/snowflake/ml/_internal/exceptions/error_codes.py b/snowflake/ml/_internal/exceptions/error_codes.py index dd52525f..93e2ca20 100644 --- a/snowflake/ml/_internal/exceptions/error_codes.py +++ b/snowflake/ml/_internal/exceptions/error_codes.py @@ -40,6 +40,9 @@ # The method is known but is not supported by the target resource. For example, calling `to_xgboost` is not allowed by # Snowpark ML models based on scikit-learn. METHOD_NOT_ALLOWED = "2102" +# Not implemented. +NOT_IMPLEMENTED = "2103" + # Calling an API with unsupported keywords/values. 
INVALID_ARGUMENT = "2110" # Object has invalid attributes caused by invalid/unsupported value, unsupported data type, size mismatch, etc. @@ -48,6 +51,8 @@ INVALID_DATA = "2112" # Invalid data type in the processed data. For example, an API handling numeric columns gets a string column. INVALID_DATA_TYPE = "2113" +# Calling an API with unsupported value type, or perform actions on objects with incorrect types. +INVALID_TYPE = "2114" # Indicates the creation of underlying resources (files, stages, tables, etc) failed, which can be caused by duplicated # name, invalid permission, etc. diff --git a/snowflake/ml/_internal/telemetry_test.py b/snowflake/ml/_internal/telemetry_test.py index 32cc79a0..43dc2624 100644 --- a/snowflake/ml/_internal/telemetry_test.py +++ b/snowflake/ml/_internal/telemetry_test.py @@ -333,7 +333,7 @@ def foo2(self) -> "DummyObject": utils_telemetry.TelemetryField.KEY_CUSTOM_TAGS.value: {"custom_tag": "tag"}, } self.assertIsNotNone(actual_statement_params) - assert actual_statement_params is not None, "actual_statement_params cannot be None" # mypy + assert actual_statement_params is not None # mypy self.assertLessEqual(expected_statement_params.items(), actual_statement_params.items()) self.assertIn("DummyObject.foo", actual_statement_params[utils_telemetry.TelemetryField.KEY_FUNC_NAME.value]) self.assertFalse(hasattr(test_obj.foo2(), "_statement_params")) diff --git a/snowflake/ml/_internal/utils/identifier.py b/snowflake/ml/_internal/utils/identifier.py index 042926dd..3b54d7a3 100644 --- a/snowflake/ml/_internal/utils/identifier.py +++ b/snowflake/ml/_internal/utils/identifier.py @@ -200,6 +200,21 @@ def get_unescaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[ raise ValueError("Unsupported type. Only string or list of string are supported for selecting columns.") +@overload +def get_escaped_names(ids: None) -> None: + ... + + +@overload +def get_escaped_names(ids: str) -> str: + ... 
+ + +@overload +def get_escaped_names(ids: List[str]) -> List[str]: + ... + + def get_escaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[str, List[str]]]: """Given a user provided identifier(s), this method will compute the equivalent column name identifier(s) in case of column name contains special characters, and maintains case-sensitivity diff --git a/snowflake/ml/_internal/utils/pkg_version_utils.py b/snowflake/ml/_internal/utils/pkg_version_utils.py index 242bdd22..11546405 100644 --- a/snowflake/ml/_internal/utils/pkg_version_utils.py +++ b/snowflake/ml/_internal/utils/pkg_version_utils.py @@ -23,8 +23,8 @@ def is_relaxed() -> bool: def get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions: List[str], session: Session, subproject: Optional[str] = None -) -> List[Optional[str]]: - pkg_version_conda_list: List[Optional[str]] = [] +) -> List[str]: + pkg_version_conda_list: List[str] = [] pkg_version_warning_list: List[List[str]] = [] for pkg_version in pkg_versions: conda_pkg_version = _validate_pkg_version_supported_in_snowflake_conda_channel( @@ -39,6 +39,7 @@ def get_valid_pkg_versions_supported_in_snowflake_conda_channel( f"python runtime {_RUNTIME_VERSION}." 
) else: + tokens = pkg_version.split("==") pkg_name = tokens[0] pkg_version_conda_list.append(f"{pkg_name}=={conda_pkg_version}") diff --git a/snowflake/ml/_internal/utils/query_result_checker.py b/snowflake/ml/_internal/utils/query_result_checker.py index b572319a..15c4b89e 100644 --- a/snowflake/ml/_internal/utils/query_result_checker.py +++ b/snowflake/ml/_internal/utils/query_result_checker.py @@ -1,7 +1,7 @@ from __future__ import annotations # for return self methods from functools import partial -from typing import Any, Callable +from typing import Any, Callable, Optional from snowflake import connector, snowpark from snowflake.ml._internal.utils import formatting @@ -13,7 +13,7 @@ def _query_log(sql: str | None) -> str: def result_dimension_matcher( - expected_rows: int, expected_cols: int, result: list[snowpark.Row], sql: str | None = None + expected_rows: Optional[int], expected_cols: Optional[int], result: list[snowpark.Row], sql: str | None = None ) -> bool: """Check result dimensions of the collected result dataframe of a Snowflake SQL operation. @@ -32,7 +32,7 @@ def result_dimension_matcher( DataError: In case the validation failed. """ actual_rows = len(result) - if expected_rows and actual_rows != expected_rows: + if expected_rows is not None and actual_rows != expected_rows: raise connector.DataError( formatting.unwrap( f"""Query Result did not match expected number of rows. 
Expected {expected_rows} rows, found: @@ -40,7 +40,7 @@ def result_dimension_matcher( ) ) - if expected_cols: + if expected_cols is not None: if not result: raise connector.DataError( formatting.unwrap( diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index 03394896..1b025d63 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -1,15 +1,17 @@ -load("//bazel:py_rules.bzl", "py_library", "py_test") +load("//bazel:py_rules.bzl", "py_library", "py_test", "py_genrule") package(default_visibility = ["//visibility:public"]) -genrule( +GEN_CORE_REQ_CMD = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode version_requirements --format python --filter_by_tag deployment_core > $@" + +py_genrule( name = "gen_core_requirements", srcs = [ "//:requirements.yml", "//bazel/requirements:requirements.schema.json", ], outs = ["_core_requirements.py"], - cmd = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode version_requirements --format python --filter_by_tag deployment_core > $@", + cmd = GEN_CORE_REQ_CMD, tools = ["//bazel/requirements:parse_and_generate_requirements"], ) @@ -26,18 +28,24 @@ py_library( ], ) +py_library( + name = "deploy_platforms", + srcs = ["deploy_platforms.py"], +) + py_library( name = "model_signature", srcs = ["model_signature.py"], deps = [ ":type_hints", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/_internal/utils:formatting", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/model/_deploy_client/warehouse:infer_template", "//snowflake/ml/model/_signatures:base_handler", - "//snowflake/ml/model/_signatures:core", "//snowflake/ml/model/_signatures:builtins_handler", + 
"//snowflake/ml/model/_signatures:core", "//snowflake/ml/model/_signatures:numpy_handler", "//snowflake/ml/model/_signatures:pandas_handler", "//snowflake/ml/model/_signatures:pytorch_handler", @@ -86,6 +94,7 @@ py_library( deps = [ ":model_signature", ":type_hints", + ":deploy_platforms", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/model/_deploy_client/snowservice:deploy", "//snowflake/ml/model/_deploy_client/warehouse:deploy", @@ -149,6 +158,7 @@ py_test( srcs = ["model_signature_test.py"], deps = [ ":model_signature", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -174,10 +184,10 @@ py_test( ":custom_model", ":model_signature", ":type_hints", - "//snowflake/ml/modeling/linear_model:linear_regression", - "//snowflake/ml/test_utils:mock_session", - "//snowflake/ml/model/_signatures:utils", "//snowflake/ml/model/_signatures:pytorch_handler", "//snowflake/ml/model/_signatures:tensorflow_handler", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/test_utils:mock_session", ], ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel index df3e9816..e5da3b2d 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel @@ -49,7 +49,6 @@ py_test( ], data = [ "test_fixtures/dockerfile_test_fixture", - "test_fixtures/dockerfile_test_gpu_fixture" ] ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py index a223c4ab..8b491b0f 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py @@ -14,6 +14,8 @@ docker_context, ) +logger = logging.getLogger(__name__) + class Platform(Enum): LINUX_AMD64 = "linux/amd64" @@ -30,22 +32,18 
@@ class ClientImageBuilder(base_image_builder.ImageBuilder): """ - def __init__( - self, *, id: str, image_repo: str, model_dir: str, session: snowpark.Session, use_gpu: bool = False - ) -> None: + def __init__(self, *, id: str, image_repo: str, model_dir: str, session: snowpark.Session) -> None: """Initialization Args: id: A hexadecimal string used for naming the image tag. image_repo: Path to image repository. model_dir: Local model directory, downloaded form stage and extracted. - use_gpu: Boolean flag for generating the CPU or GPU base image. session: Snowpark session """ self.image_tag = "/".join([image_repo.rstrip("/"), id]) + ":latest" self.image_repo = image_repo self.model_dir = model_dir - self.use_gpu = use_gpu self.session = session def build_and_upload_image(self) -> str: @@ -82,7 +80,7 @@ def _cleanup_local_image() -> None: pass else: commands = ["docker", "--config", config_dir, "rmi", self.image_tag] - logging.info(f"Removing local image: {self.image_tag}") + logger.debug(f"Removing local image: {self.image_tag}") self._run_docker_commands(commands) self.validate_docker_client_env() @@ -151,7 +149,7 @@ def _build(self, docker_config_dir: str) -> None: """ with tempfile.TemporaryDirectory() as context_dir: - dc = docker_context.DockerContext(context_dir=context_dir, model_dir=self.model_dir, use_gpu=self.use_gpu) + dc = docker_context.DockerContext(context_dir=context_dir, model_dir=self.model_dir) dc.build() self._build_image_from_context(context_dir=context_dir, docker_config_dir=docker_config_dir) @@ -170,7 +168,7 @@ def _run_docker_commands(self, commands: List[str]) -> None: if proc.stdout: for line in iter(proc.stdout.readline, ""): output_lines.append(line) - logging.info(line) + logger.debug(line) if proc.wait(): raise RuntimeError(f"Docker commands failed: \n {''.join(output_lines)}") @@ -226,6 +224,6 @@ def _upload(self, docker_config_dir: str) -> None: commands = ["docker", "--config", docker_config_dir, "login", self.image_tag] 
self._run_docker_commands(commands) - logging.info(f"Pushing image to image repo {self.image_tag}") + logger.debug(f"Pushing image to image repo {self.image_tag}") commands = ["docker", "--config", docker_config_dir, "push", self.image_tag] self._run_docker_commands(commands) diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py index 907e8ab6..5b1c6fc3 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py @@ -16,14 +16,12 @@ def setUp(self) -> None: self.unique_id = "mock_id" self.image_repo = "mock_image_repo" self.model_dir = "local/dir/model.zip" - self.use_gpu = True self.client_image_builder = client_image_builder.ClientImageBuilder( id=self.unique_id, image_repo=self.image_repo, model_dir=self.model_dir, session=self.m_session, - use_gpu=self.use_gpu, ) @mock.patch( @@ -49,9 +47,7 @@ def test_build(self, m_tempdir: mock.MagicMock, m_docker_context_class: mock.Mag self.client_image_builder, "_build_image_from_context" ) as m_build_image_from_context: self.client_image_builder._build(m_docker_config_dir) - m_docker_context_class.assert_called_once_with( - context_dir=m_context_dir, model_dir=self.model_dir, use_gpu=True - ) + m_docker_context_class.assert_called_once_with(context_dir=m_context_dir, model_dir=self.model_dir) m_build.assert_called_once() m_build_image_from_context.assert_called_once_with( context_dir=m_context_dir, docker_config_dir=m_docker_config_dir diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py index 21d7c785..931d4c69 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -11,17 +11,15 @@ class DockerContext(ABC): 
Constructs the Docker context directory required for image building. """ - def __init__(self, context_dir: str, model_dir: str, *, use_gpu: bool = False) -> None: + def __init__(self, context_dir: str, model_dir: str) -> None: """Initialization Args: context_dir: Path to context directory. model_dir: Path to local model directory. - use_gpu: Boolean flag for generating the CPU or GPU base image. """ self.context_dir = context_dir self.model_dir = model_dir - self.use_gpu = use_gpu def build(self) -> None: """ @@ -57,9 +55,7 @@ def _generate_docker_file(self) -> None: dockerfile_content = string.Template(template.read()).safe_substitute( { # TODO(shchen): SNOW-835411, Support overwriting base image - "base_image": "mambaorg/micromamba:focal-cuda-11.7.1" - if self.use_gpu - else "mambaorg/micromamba:1.4.3", + "base_image": "mambaorg/micromamba:1.4.3", "model_env_folder": constants.MODEL_ENV_FOLDER, "inference_server_dir": constants.INFERENCE_SERVER_DIR, "entrypoint_script": constants.ENTRYPOINT_SCRIPT, diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py index 68dfd823..7f9a6fc0 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py @@ -35,8 +35,7 @@ def setUp(self) -> None: sample_input=_IRIS_X, ) - self.use_gpu = False - self.docker_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir, use_gpu=False) + self.docker_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir) def tearDown(self) -> None: shutil.rmtree(self.model_dir) @@ -71,12 +70,10 @@ def test_docker_file_content(self) -> None: self.assertEqual(actual, expected, "Generated dockerfile is not aligned with the docker template") def test_docker_file_content_with_gpu(self) -> None: - gpu_context = docker_context.DockerContext(self.context_dir, 
model_dir=self.model_dir, use_gpu=True) + gpu_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir) gpu_context.build() dockerfile_path = os.path.join(self.context_dir, "Dockerfile") - dockerfile_fixture_path = os.path.join( - os.path.dirname(__file__), "test_fixtures", "dockerfile_test_gpu_fixture" - ) + dockerfile_fixture_path = os.path.join(os.path.dirname(__file__), "test_fixtures", "dockerfile_test_fixture") with open(dockerfile_path) as dockerfile, open(dockerfile_fixture_path) as expected_dockerfile: actual = dockerfile.read() expected = expected_dockerfile.read() diff --git a/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh b/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh index 5d1c745c..81984c67 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +++ b/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh @@ -1,24 +1,36 @@ -#!/bin/sh +#!/bin/bash set -eu OS=$(uname) -if [ "${OS}" = "Linux" ]; then +if [[ ${OS} = "Linux" ]]; then NUM_CORES=$(nproc) -elif [ "${OS}" = "Darwin" ]; then +elif [[ ${OS} = "Darwin" ]]; then # macOS NUM_CORES=$(sysctl -n hw.ncpu) -elif [ "${OS}" = "Windows" ]; then +elif [[ ${OS} = "Windows" ]]; then NUM_CORES=$(wmic cpu get NumberOfCores | grep -Eo '[0-9]+') else echo "Unsupported operating system: ${OS}" exit 1 fi -# Based on the Gunicorn documentation, set the number of workers to number_of_cores * 2 + 1. This assumption is -# based on an ideal scenario where one core is handling two processes simultaneously, while one process is dedicated to -# IO operations and the other process is performing compute tasks. 
-NUM_WORKERS=$((NUM_CORES * 2 + 1)) +# Check if the "NUM_WORKERS" variable is set by the user +if [[ -n "${NUM_WORKERS-}" && "${NUM_WORKERS}" != "None" ]]; then + # If the user has set the "num_workers" variable, use it to overwrite the default value + FINAL_NUM_WORKERS=${NUM_WORKERS} +else + # Based on the Gunicorn documentation, set the number of workers to number_of_cores * 2 + 1. This assumption is + # based on an ideal scenario where one core is handling two processes simultaneously, while one process is dedicated to + # IO operations and the other process is performing compute tasks. + # However, in case when the model is large, we will run into OOM error as each process will need to load the model + # into memory. In such cases, we require the user to pass in "num_workers" to overwrite the default. + FINAL_NUM_WORKERS=$((NUM_CORES * 2 + 1)) +fi + echo "Number of CPU cores: $NUM_CORES" -echo "Setting number of workers to $NUM_WORKERS" -exec /opt/conda/bin/gunicorn --preload -w "$NUM_WORKERS" -k uvicorn.workers.UvicornWorker -b 0.0.0.0:5000 inference_server.main:app +echo "Setting number of workers to $FINAL_NUM_WORKERS" + +# Exclude preload option as it won't work with non-thread-safe model, and no easy way to detect whether model is +# thread-safe or not. Defer the optimization later. 
+exec /opt/conda/bin/gunicorn -w "$FINAL_NUM_WORKERS" -k uvicorn.workers.UvicornWorker -b 0.0.0.0:5000 inference_server.main:app diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py index cd7aa438..42135dfa 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py @@ -11,7 +11,6 @@ logger = logging.getLogger(__name__) _LOADED_MODEL = None _LOADED_META = None -TARGET_METHOD = "predict" MODEL_CODE_DIR = "code" @@ -27,6 +26,7 @@ def _run_setup() -> None: MODEL_ZIP_STAGE_PATH = os.getenv("MODEL_ZIP_STAGE_PATH") assert MODEL_ZIP_STAGE_PATH, "Missing environment variable MODEL_ZIP_STAGE_PATH" + root_path = os.path.abspath(os.sep) model_zip_stage_path = os.path.join(root_path, MODEL_ZIP_STAGE_PATH) @@ -80,6 +80,10 @@ async def predict(request: requests.Request) -> responses.JSONResponse: """ assert _LOADED_MODEL, "model is not loaded" assert _LOADED_META, "model metadata is not loaded" + + TARGET_METHOD = os.getenv("TARGET_METHOD") + assert TARGET_METHOD, "Missing environment variable TARGET_METHOD" + from snowflake.ml.model.model_signature import FeatureSpec try: @@ -99,9 +103,7 @@ async def predict(request: requests.Request) -> responses.JSONResponse: return responses.JSONResponse({"error": error_message}, status_code=400) try: - # TODO(shchen): SNOW-835369, Support target method in inference server (Multi-task model). - # Mypy ignore will be fixed along with the above ticket. - predictions_df = _LOADED_MODEL.predict(x) # type: ignore[attr-defined] + predictions_df = getattr(_LOADED_MODEL, TARGET_METHOD)(x) predictions_df.columns = output_cols # Use _ID to keep the order of prediction result and associated features. 
_KEEP_ORDER_COL_NAME = "_ID" diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py index 63630742..87d33668 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py @@ -88,16 +88,21 @@ def test_predict_endpoint_happy_path(self) -> None: ] } - with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( + "main._LOADED_MODEL", loaded_model + ), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json=data) - self.assertEqual(response.status_code, 200) - expected_response = {"data": [[0, {"output_feature_0": 1, "_ID": 0}], [1, {"output_feature_0": 2, "_ID": 1}]]} - self.assertEqual(response.json(), expected_response) + self.assertEqual(response.status_code, 200) + expected_response = { + "data": [[0, {"output_feature_0": 1, "_ID": 0}], [1, {"output_feature_0": 2, "_ID": 1}]] + } + self.assertEqual(response.json(), expected_response) - # def test_predict_endpoint_with_invalid_input(self) -> None: loaded_model, loaded_meta = self.get_custom_sklearn_model() - with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( + "main._LOADED_MODEL", loaded_model + ), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json={}) self.assertEqual(response.status_code, 400) self.assertRegex(response.text, "Input data malformed: missing data field in the request input") @@ -150,7 +155,9 @@ def test_predict_with_misshaped_data(self) -> None: ] } - with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", 
loaded_meta): + with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( + "main._LOADED_MODEL", loaded_model + ), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json=data) self.assertEqual(response.status_code, 400) self.assertRegex(response.text, r"Input data malformed: .*dtype mappings argument.*") @@ -172,7 +179,9 @@ def test_predict_with_incorrect_data_type(self) -> None: ] } - with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( + "main._LOADED_MODEL", loaded_model + ), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json=data) self.assertEqual(response.status_code, 400) self.assertRegex(response.text, "Input data malformed: could not convert string to float") diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture deleted file mode 100644 index 6c76264a..00000000 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture +++ /dev/null @@ -1,29 +0,0 @@ -FROM mambaorg/micromamba:focal-cuda-11.7.1 as build - -COPY env/conda.yaml conda.yaml -COPY env/requirements.txt requirements.txt -ARG MAMBA_DOCKERFILE_ACTIVATE=1 -RUN --mount=type=cache,target=/opt/conda/pkgs micromamba install -y -n base -f conda.yaml && \ - python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ - python -m pip install -r requirements.txt - -FROM debian:buster-slim AS runtime - -ENV USER nonrootuser -ENV UID 1000 -ENV HOME /home/$USER -RUN adduser --disabled-password \ - --gecos "A non-root user for running inference server" \ - --uid $UID \ - --home $HOME \ - $USER - -COPY inference_server ./inference_server -COPY gunicorn_run.sh ./gunicorn_run.sh -RUN chmod +x 
/gunicorn_run.sh -COPY --from=build /opt/conda /opt/conda -EXPOSE 5000 - -USER nonrootuser - -CMD ["/gunicorn_run.sh"] diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy.py b/snowflake/ml/model/_deploy_client/snowservice/deploy.py index 83a16269..aaee5b81 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy.py @@ -6,6 +6,7 @@ from abc import ABC from typing import Any, Dict, Optional, cast +import yaml from typing_extensions import Unpack from snowflake.ml._internal import file_utils @@ -15,6 +16,8 @@ from snowflake.ml.model._deploy_client.utils import constants, snowservice_client from snowflake.snowpark import FileOperation, Session +logger = logging.getLogger(__name__) + def _deploy( session: Session, @@ -23,6 +26,7 @@ def _deploy( service_func_name: str, model_zip_stage_path: str, deployment_stage_path: str, + target_method: str, **kwargs: Unpack[type_hints.SnowparkContainerServiceDeployOptions], ) -> _model_meta.ModelMetadata: """Entrypoint for model deployment to SnowService. This function will trigger a docker image build followed by @@ -34,6 +38,7 @@ def _deploy( service_func_name: The service function name in SnowService associated with the created service. model_zip_stage_path: Path to model zip file in stage. Note that this path has a "@" prefix. deployment_stage_path: Path to stage containing deployment artifacts. + target_method: The name of the target method to be deployed. **kwargs: various SnowService deployment options. 
Raises: @@ -91,6 +96,7 @@ def _deploy( model_zip_stage_path=model_zip_stage_path, # Pass down model_zip_stage_path for service spec file deployment_stage_path=deployment_stage_path, model_dir=temp_local_model_dir_path, + target_method=target_method, options=options, ) ss_deployment.deploy() @@ -144,6 +150,7 @@ def __init__( model_dir: str, model_zip_stage_path: str, deployment_stage_path: str, + target_method: str, options: deploy_options.SnowServiceDeployOptions, ) -> None: """Initialization @@ -156,6 +163,7 @@ def __init__( model_dir: Local model directory, downloaded form stage and extracted. model_zip_stage_path: Path to model zip file in stage. deployment_stage_path: Path to stage containing deployment artifacts. + target_method: The name of the target method to be deployed. options: A SnowServiceDeployOptions object containing deployment options. """ @@ -165,6 +173,7 @@ def __init__( self.model_zip_stage_path = model_zip_stage_path self.model_dir = model_dir self.options = options + self.target_method = target_method self._service_name = f"service_{model_id}" # Spec file and future deployment related artifacts will be stored under {stage}/models/{model_id} self._model_artifact_stage_location = posixpath.join(deployment_stage_path, "models", self.id) @@ -175,15 +184,15 @@ def deploy(self) -> None: """ if self.options.prebuilt_snowflake_image: image = self.options.prebuilt_snowflake_image - logging.warning(f"Skipped image build. Use prebuilt image: {self.options.prebuilt_snowflake_image}") + logger.warning(f"Skipped image build. Use prebuilt image: {self.options.prebuilt_snowflake_image}") else: - logging.warning( + logger.warning( "Building the Docker image and deploying to Snowpark Container Service. " "This process may take a few minutes." ) image = self._build_and_upload_image() - logging.warning( + logger.warning( f"Image successfully built! 
To prevent the need for rebuilding the Docker image in future deployments, " f"simply specify 'prebuilt_snowflake_image': '{image}' in the options field of the deploy() function" ) @@ -201,7 +210,6 @@ def _build_and_upload_image(self) -> str: image_repo=image_repo, model_dir=self.model_dir, session=self.session, - use_gpu=True if self.options.use_gpu else False, ) return image_builder.build_and_upload_image() @@ -217,7 +225,7 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: spec_file_path = os.path.join(tempdir, f"{constants.SERVICE_SPEC}.yaml") with open(spec_template_path, encoding="utf-8") as template, open( - spec_file_path, "w", encoding="utf-8" + spec_file_path, "w+", encoding="utf-8" ) as spec_file: content = string.Template(template.read()).substitute( { @@ -226,10 +234,23 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: "model_stage": self.model_zip_stage_path[1:].split("/")[0], # Reserve only the stage name "model_zip_stage_path": self.model_zip_stage_path[1:], # Remove the @ prefix "inference_server_container_name": constants.INFERENCE_SERVER_CONTAINER, + "target_method": self.target_method, + "num_workers": self.options.num_workers, } ) - spec_file.write(content) - logging.info(f"Create service spec: \n {content}") + content_dict = yaml.safe_load(content) + if self.options.num_gpus is not None and self.options.num_gpus > 0: + container = content_dict["spec"]["container"][0] + # TODO[shchen]: SNOW-871538, external dependency that only single GPU is supported on SnowService. + # GPU limit has to be specified in order to trigger the workload to be run on GPU in SnowService. 
+ container["resources"] = { + "limits": {"nvidia.com/gpu": self.options.num_gpus}, + "requests": {"nvidia.com/gpu": self.options.num_gpus}, + } + + yaml.dump(content_dict, spec_file) + spec_file.seek(0) + logger.debug(f"Create service spec: \n {spec_file.read()}") self.session.file.put( local_file_name=spec_file_path, @@ -237,7 +258,7 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: auto_compress=False, overwrite=True, ) - logging.info( + logger.debug( f"Uploaded spec file {os.path.basename(spec_file_path)} " f"to {self._model_artifact_stage_location}" ) diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py index 08cd018b..db35fb3f 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py @@ -14,7 +14,8 @@ def __init__( max_instances: Optional[int] = 1, endpoint: Optional[str] = constants.PREDICT, prebuilt_snowflake_image: Optional[str] = None, - use_gpu: Optional[bool] = False, + num_gpus: Optional[int] = 0, + num_workers: Optional[int] = None, ) -> None: """Initialization @@ -33,8 +34,10 @@ def __init__( Snowflake is used as is. This option is for users who consistently use the same image for multiple use cases, allowing faster deployment. The snowflake image used for deployment is logged to the console for future use. Default to None. - use_gpu: When set to True, a CUDA-enabled Docker image will be used to provide a runtime CUDA environment. - Default to False. + num_gpus: Number of GPUs to be used for the service. Default to 0. + num_workers: Number of workers used for model inference. Please ensure that the number of workers is set + lower than the total available memory divided by the size of model to prevent memory-related issues. + Default is number of CPU cores * 2 + 1. 
""" self.compute_pool = compute_pool @@ -43,7 +46,8 @@ def __init__( self.max_instances = max_instances self.endpoint = endpoint self.prebuilt_snowflake_image = prebuilt_snowflake_image - self.use_gpu = use_gpu + self.num_gpus = num_gpus + self.num_workers = num_workers @classmethod def from_dict(cls, options_dict: Dict[str, Any]) -> "SnowServiceDeployOptions": diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py index c8bdeb9e..6fc058d6 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py @@ -52,6 +52,7 @@ def test_deploy_with_model_id( service_func_name="mock_service_func", model_zip_stage_path=m_model_zip_stage_path, deployment_stage_path=m_deployment_stage_path, + target_method=constants.PREDICT, **self.options, ) @@ -65,6 +66,7 @@ def test_deploy_with_model_id( model_zip_stage_path=m_model_zip_stage_path, deployment_stage_path=m_deployment_stage_path, model_dir=mock.ANY, + target_method=constants.PREDICT, options=mock.ANY, ) m_deployment.deploy.assert_called_once() @@ -78,6 +80,7 @@ def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> model_id="", model_zip_stage_path="@mock_model_zip_stage_path/model.zip", deployment_stage_path="@mock_model_deployment_stage_path", + target_method=constants.PREDICT, **self.options, ) @@ -93,6 +96,7 @@ def test_deploy_with_missing_required_options(self, m_deployment_class: mock.Mag model_id="mock_model_id", model_zip_stage_path="@mock_model_zip_stage_path/model.zip", deployment_stage_path="@mock_model_deployment_stage_path", + target_method=constants.PREDICT, **options, ) m_deployment_class.assert_not_called() @@ -147,6 +151,7 @@ def setUp(self) -> None: model_dir=self.m_model_dir, model_zip_stage_path=self.m_model_zip_stage_path, deployment_stage_path=self.m_deployment_stage_path, + target_method=constants.PREDICT, 
options=deploy_options.SnowServiceDeployOptions.from_dict(self.m_options), ) diff --git a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template index 39994ed0..0dc9520a 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +++ b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template @@ -4,6 +4,8 @@ spec: image: ${image} env: MODEL_ZIP_STAGE_PATH: ${model_zip_stage_path} + TARGET_METHOD: ${target_method} + NUM_WORKERS: ${num_workers} readinessProbe: port: 5000 path: /health diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py index 4300bc13..2bf9de48 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py @@ -6,6 +6,8 @@ from snowflake.ml.model._deploy_client.utils import constants from snowflake.snowpark import Session +logger = logging.getLogger(__name__) + class SnowServiceClient: """ @@ -52,7 +54,7 @@ def create_or_replace_service( COMPUTE_POOL={compute_pool} SPEC={spec_stage_location} """ - logging.info(f"Create service with SQL: \n {sql}") + logger.debug(f"Create service with SQL: \n {sql}") self.session.sql(sql).collect() def _drop_service_if_exists(self, service_name: str) -> None: @@ -89,9 +91,9 @@ def create_or_replace_service_function( ENDPOINT={endpoint_name} AS '/{path_at_service_endpoint}' """ - logging.info(f"Create service function with SQL: \n {sql}") + logger.debug(f"Create service function with SQL: \n {sql}") self.session.sql(sql).collect() - logging.info(f"Successfully created service function: {service_func_name}") + logger.debug(f"Successfully created service function: {service_func_name}") def block_until_resource_is_ready( self, @@ -99,7 +101,7 @@ def block_until_resource_is_ready( resource_type: 
constants.ResourceType, *, max_retries: int = 60, - retry_interval_secs: int = 5, + retry_interval_secs: int = 10, ) -> None: """Blocks execution until the specified resource is ready. Note that this is a best-effort approach because when launching a service, it's possible for it to initially @@ -110,7 +112,7 @@ def block_until_resource_is_ready( resource_name: Name of the resource. resource_type: Type of the resource. max_retries: The maximum number of retries to check the resource readiness (default: 60). - retry_interval_secs: The number of seconds to wait between each retry (default: 5). + retry_interval_secs: The number of seconds to wait between each retry (default: 10). Raises: RuntimeError: If the resource received the following status [failed, not_found, internal_error, deleting] @@ -173,11 +175,11 @@ def get_resource_status( except Exception as e: raise RuntimeError(f"Error while querying the {resource_type} {resource_name} status: {str(e)}") resource_metadata = json.loads(row[0][status_func])[0] - logging.info(f"Resource status metadata: {resource_metadata}") + logger.debug(f"Resource status metadata: {resource_metadata}") if resource_metadata and resource_metadata["status"]: try: status = resource_metadata["status"] return constants.ResourceStatus(status) except ValueError: - logging.warning(f"Unknown status returned: {status}") + logger.warning(f"Unknown status returned: {status}") return None diff --git a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel index e408b590..ee87b150 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel @@ -15,7 +15,6 @@ py_library( "//snowflake/ml/_internal:env", "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/model:_model", "//snowflake/ml/model:_model_meta", 
"//snowflake/ml/model:type_hints", diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy.py b/snowflake/ml/model/_deploy_client/warehouse/deploy.py index 8a43ae88..ac174d78 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy.py @@ -7,7 +7,6 @@ from typing_extensions import Unpack from snowflake.ml._internal import env_utils, file_utils -from snowflake.ml._internal.utils import identifier from snowflake.ml.model import _model, _model_meta, type_hints as model_types from snowflake.ml.model._deploy_client.warehouse import infer_template from snowflake.snowpark import session as snowpark_session, types as st @@ -92,7 +91,7 @@ class _UDFParams(TypedDict): params = _UDFParams( file_path=f.name, func_name="infer", - name=identifier.get_inferred_name(udf_name), + name=udf_name, return_type=st.PandasSeriesType(st.MapType(st.StringType(), st.VariantType())), input_types=[st.PandasDataFrameType([st.MapType()])], imports=list(imports), diff --git a/snowflake/ml/model/_deployer.py b/snowflake/ml/model/_deployer.py index 4e262375..0a5231a8 100644 --- a/snowflake/ml/model/_deployer.py +++ b/snowflake/ml/model/_deployer.py @@ -1,12 +1,15 @@ import traceback -from enum import Enum from typing import Optional, TypedDict, Union, cast, overload import pandas as pd from typing_extensions import Required from snowflake.ml._internal.utils import identifier -from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model import ( + deploy_platforms, + model_signature, + type_hints as model_types, +) from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_deploy from snowflake.ml.model._deploy_client.utils import constants as snowservice_constants from snowflake.ml.model._deploy_client.warehouse import ( @@ -17,23 +20,6 @@ from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session, functions as F -class TargetPlatform(Enum): - WAREHOUSE = 
"warehouse" - SNOWPARK_CONTAINER_SERVICE = "snowpark_container_service" - - def __repr__(self) -> str: - """Construct a string format that works with the "ModelReference" in model_registry.py. Fundamentally, - ModelReference uses the TargetPlatform enum type when constructing the "deploy" function through exec(). - Since "exec" in Python takes input as a string, we need to dynamically construct a full path so that the - enum can be loaded successfully. - - Returns: - A enum string representation. - """ - - return f"{__name__.split('.')[-1]}.{self.__class__.__name__}.{self.name}" - - class Deployment(TypedDict): """Deployment information. @@ -45,7 +31,7 @@ class Deployment(TypedDict): """ name: Required[str] - platform: Required[TargetPlatform] + platform: Required[deploy_platforms.TargetPlatform] signature: model_signature.ModelSignature options: Required[model_types.DeployOptions] @@ -55,7 +41,7 @@ def deploy( session: Session, *, name: str, - platform: TargetPlatform, + platform: deploy_platforms.TargetPlatform, target_method: str, model_dir_path: str, options: Optional[model_types.DeployOptions], @@ -79,7 +65,7 @@ def deploy( session: Session, *, name: str, - platform: TargetPlatform, + platform: deploy_platforms.TargetPlatform, target_method: str, model_stage_file_path: str, options: Optional[model_types.DeployOptions], @@ -104,7 +90,7 @@ def deploy( *, model_id: str, name: str, - platform: TargetPlatform, + platform: deploy_platforms.TargetPlatform, target_method: str, model_stage_file_path: str, deployment_stage_path: str, @@ -130,7 +116,7 @@ def deploy( session: Session, *, name: str, - platform: TargetPlatform, + platform: deploy_platforms.TargetPlatform, target_method: str, model_dir_path: Optional[str] = None, model_stage_file_path: Optional[str] = None, @@ -172,7 +158,7 @@ def deploy( if not options: options = {} - if platform == TargetPlatform.WAREHOUSE: + if platform == deploy_platforms.TargetPlatform.WAREHOUSE: try: meta = 
warehouse_deploy._deploy_to_warehouse( session=session, @@ -185,7 +171,7 @@ def deploy( except Exception: raise RuntimeError("Error happened when deploying to the warehouse: " + traceback.format_exc()) - elif platform == TargetPlatform.SNOWPARK_CONTAINER_SERVICE: + elif platform == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES: options = cast(model_types.SnowparkContainerServiceDeployOptions, options) assert model_id, "Require 'model_id' for Snowpark container service deployment" assert model_stage_file_path, "Require 'model_stage_file_path' for Snowpark container service deployment" @@ -199,6 +185,7 @@ def deploy( service_func_name=name, model_zip_stage_path=model_stage_file_path, deployment_stage_path=deployment_stage_path, + target_method=target_method, **options, ) except Exception: @@ -260,7 +247,6 @@ def predict( sig = deployment["signature"] keep_order = deployment["options"].get("keep_order", True) output_with_input_features = deployment["options"].get("output_with_input_features", False) - platform = deployment["platform"] # Validate and prepare input if not isinstance(X, SnowparkDataFrame): @@ -292,11 +278,7 @@ def predict( # TODO[shchen]: SNOW-870032, For SnowService, external function name cannot be double quoted, else it results in # external function no found. 
- udf_name = ( - deployment["name"] - if platform == TargetPlatform.SNOWPARK_CONTAINER_SERVICE - else identifier.get_inferred_name(deployment["name"]) - ) + udf_name = deployment["name"] output_obj = F.call_udf(udf_name, F.object_construct(*input_cols)) if output_with_input_features: diff --git a/snowflake/ml/model/_handlers/xgboost.py b/snowflake/ml/model/_handlers/xgboost.py index 27d6936a..d3143518 100644 --- a/snowflake/ml/model/_handlers/xgboost.py +++ b/snowflake/ml/model/_handlers/xgboost.py @@ -77,6 +77,9 @@ def get_prediction( if not isinstance(sample_input, (pd.DataFrame, np.ndarray)): sample_input = model_signature._convert_local_data_to_df(sample_input) + if isinstance(model, xgboost.Booster): + sample_input = xgboost.DMatrix(sample_input) + target_method = getattr(model, target_method_name, None) assert callable(target_method) predictions_df = target_method(sample_input) @@ -145,6 +148,8 @@ def _load_as_custom_model( Returns: The model object as a custom model. """ + import xgboost + from snowflake.ml.model import custom_model def _create_custom_model( @@ -158,6 +163,9 @@ def fn_factory( ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]: @custom_model.inference_api def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: + if isinstance(raw_model, xgboost.Booster): + X = xgboost.DMatrix(X) + res = getattr(raw_model, target_method)(X) if isinstance(res, list) and len(res) > 0 and isinstance(res[0], np.ndarray): diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py index 319aefe8..92561115 100644 --- a/snowflake/ml/model/_model_test.py +++ b/snowflake/ml/model/_model_test.py @@ -28,7 +28,9 @@ tensorflow_handler, utils as model_signature_utils, ) -from snowflake.ml.modeling.linear_model import LinearRegression +from snowflake.ml.modeling.linear_model import ( # type:ignore[attr-defined] + LinearRegression, +) from snowflake.ml.test_utils import mock_session from snowflake.snowpark import 
FileOperation, Session @@ -776,6 +778,62 @@ def test_skl(self) -> None: assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(iris_X_df[:1])) + def test_xgb_booster(self) -> None: + cal_data = datasets.load_breast_cancer() + cal_X = pd.DataFrame(cal_data.data, columns=cal_data.feature_names) + cal_y = pd.Series(cal_data.target) + cal_X_train, cal_X_test, cal_y_train, cal_y_test = model_selection.train_test_split(cal_X, cal_y) + params = dict(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, objective="binary:logistic") + regressor = xgboost.train(params, xgboost.DMatrix(data=cal_X_train, label=cal_y_train)) + y_pred = regressor.predict(xgboost.DMatrix(data=cal_X_test)) + with tempfile.TemporaryDirectory() as tmpdir: + s = {"predict": model_signature.infer_signature(cal_X_test, y_pred)} + with self.assertRaises(ValueError): + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=regressor, + signatures={**s, "another_predict": s["predict"]}, + metadata={"author": "halu", "version": "1"}, + ) + + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=regressor, + signatures=s, + metadata={"author": "halu", "version": "1"}, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + assert isinstance(m, xgboost.Booster) + np.testing.assert_allclose(m.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) + + model_api.save_model( + name="model1_no_sig", + model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + model=regressor, + sample_input=cal_X_test, + metadata={"author": "halu", "version": "1"}, + 
) + + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + assert isinstance(m, xgboost.Booster) + np.testing.assert_allclose(m.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) + self.assertEqual(s["predict"], meta.signatures["predict"]) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) + def test_xgb(self) -> None: cal_data = datasets.load_breast_cancer() cal_X = pd.DataFrame(cal_data.data, columns=cal_data.feature_names) @@ -1062,11 +1120,11 @@ def test_pytorch(self) -> None: m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x)[0], y_pred) # type:ignore[attr-defined] + torch.testing.assert_close(m.forward(data_x)[0], y_pred) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( predict_method(x_df), s["forward"].outputs )[0], @@ -1083,13 +1141,13 @@ def test_pytorch(self) -> None: m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x)[0], y_pred) # type:ignore[attr-defined] + torch.testing.assert_close(m.forward(data_x)[0], y_pred) self.assertEqual(s["forward"], meta.signatures["forward"]) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( 
pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ 0 ], @@ -1132,11 +1190,11 @@ def test_torchscript(self) -> None: m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x)[0], y_pred) # type:ignore[attr-defined] + torch.testing.assert_close(m.forward(data_x)[0], y_pred) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( predict_method(x_df), s["forward"].outputs )[0], @@ -1153,13 +1211,13 @@ def test_torchscript(self) -> None: m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x)[0], y_pred) # type:ignore[attr-defined] + torch.testing.assert_close(m.forward(data_x)[0], y_pred) self.assertEqual(s["forward"], meta.signatures["forward"]) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ 0 ], @@ -1189,12 +1247,12 @@ def test_torch_df_sample_input(self) -> None: m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x)[0], y_pred) # type:ignore[attr-defined] + torch.testing.assert_close(m.forward(data_x)[0], y_pred) m_udf, _ = 
model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred ) @@ -1211,12 +1269,12 @@ def test_torch_df_sample_input(self) -> None: m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_2")) assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x)[0], y_pred) # type:ignore[attr-defined] + torch.testing.assert_close(m.forward(data_x)[0], y_pred) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_2")) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred ) diff --git a/snowflake/ml/model/_signatures/BUILD.bazel b/snowflake/ml/model/_signatures/BUILD.bazel index 6c775c66..b4e9b311 100644 --- a/snowflake/ml/model/_signatures/BUILD.bazel +++ b/snowflake/ml/model/_signatures/BUILD.bazel @@ -14,6 +14,9 @@ py_library( py_library( name = "core", srcs = ["core.py"], + deps = [ + "//snowflake/ml/_internal/exceptions", + ], ) py_test( @@ -21,6 +24,7 @@ py_test( srcs = ["core_test.py"], deps = [ ":core", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -28,9 +32,10 @@ py_library( name = "pandas_handler", srcs = ["pandas_handler.py"], deps = [ - ":core", ":base_handler", + ":core", ":utils", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/model:type_hints", ], ) @@ -41,6 +46,7 @@ py_test( deps = [ ":core", ":pandas_handler", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -48,8 +54,9 @@ py_library( name = "numpy_handler", srcs = ["numpy_handler.py"], deps = [ - 
":core", ":base_handler", + ":core", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/model:type_hints", ], ) @@ -60,6 +67,7 @@ py_test( deps = [ ":core", ":numpy_handler", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -67,9 +75,10 @@ py_library( name = "pytorch_handler", srcs = ["pytorch_handler.py"], deps = [ - ":core", ":base_handler", + ":core", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/model:type_hints", ], ) @@ -81,6 +90,7 @@ py_test( ":core", ":pytorch_handler", ":utils", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -88,9 +98,10 @@ py_library( name = "tensorflow_handler", srcs = ["tensorflow_handler.py"], deps = [ - ":core", ":base_handler", + ":core", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/model:type_hints", ], ) @@ -102,6 +113,7 @@ py_test( ":core", ":tensorflow_handler", ":utils", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -109,9 +121,10 @@ py_library( name = "builtins_handler", srcs = ["builtins_handler.py"], deps = [ - ":core", ":base_handler", + ":core", ":pandas_handler", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/model:type_hints", ], ) @@ -120,8 +133,9 @@ py_test( name = "builtins_test", srcs = ["builtins_test.py"], deps = [ - ":core", ":builtins_handler", + ":core", + "//snowflake/ml/test_utils:exception_utils", ], ) @@ -130,6 +144,7 @@ py_library( srcs = ["utils.py"], deps = [ ":core", + "//snowflake/ml/_internal/exceptions", ], ) @@ -139,20 +154,21 @@ py_test( deps = [ ":core", ":utils", + "//snowflake/ml/test_utils:exception_utils", ], ) - py_library( name = "snowpark_handler", srcs = ["snowpark_handler.py"], deps = [ - ":core", ":base_handler", + ":core", ":pandas_handler", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/model/_deploy_client/warehouse:infer_template", "//snowflake/ml/model:type_hints", + 
"//snowflake/ml/model/_deploy_client/warehouse:infer_template", ], ) @@ -162,6 +178,7 @@ py_test( deps = [ ":core", ":snowpark_handler", + "//snowflake/ml/test_utils:exception_utils", "//snowflake/ml/model:model_signature", "//snowflake/ml/utils:connection_params", ], diff --git a/snowflake/ml/model/_signatures/builtins_handler.py b/snowflake/ml/model/_signatures/builtins_handler.py index 9129059c..76a4bc2f 100644 --- a/snowflake/ml/model/_signatures/builtins_handler.py +++ b/snowflake/ml/model/_signatures/builtins_handler.py @@ -3,6 +3,10 @@ import pandas as pd from typing_extensions import TypeGuard +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._signatures import base_handler, core, pandas_handler @@ -27,10 +31,18 @@ def truncate(data: model_types._SupportedBuiltinsList) -> model_types._Supported @staticmethod def validate(data: model_types._SupportedBuiltinsList) -> None: if not all(isinstance(data_row, type(data[0])) for data_row in data): - raise ValueError(f"Data Validation Error: Inconsistent type of object found in data {data}.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error: Inconsistent type of object found in data {data}." 
+ ), + ) df = pd.DataFrame(data) if df.isnull().values.any(): - raise ValueError(f"Data Validation Error: Ill-shaped list data {data} confronted.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(f"Data Validation Error: Ill-shaped list data {data} confronted."), + ) @staticmethod def infer_signature( diff --git a/snowflake/ml/model/_signatures/builtins_test.py b/snowflake/ml/model/_signatures/builtins_test.py index 0e4e7a6f..ff5a1b42 100644 --- a/snowflake/ml/model/_signatures/builtins_test.py +++ b/snowflake/ml/model/_signatures/builtins_test.py @@ -2,16 +2,21 @@ from absl.testing import absltest from snowflake.ml.model._signatures import builtins_handler, core +from snowflake.ml.test_utils import exception_utils class ListOfBuiltinsHandlerTest(absltest.TestCase): def test_validate_list_builtins(self) -> None: lt6 = ["Hello", [2, 3]] - with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in data"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Inconsistent type of object found in data" + ): builtins_handler.ListOfBuiltinHandler.validate(lt6) # type:ignore[arg-type] lt7 = [[1], [2, 3]] - with self.assertRaisesRegex(ValueError, "Ill-shaped list data"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Ill-shaped list data" + ): builtins_handler.ListOfBuiltinHandler.validate(lt7) lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] diff --git a/snowflake/ml/model/_signatures/core.py b/snowflake/ml/model/_signatures/core.py index 1f07b5c0..3d9f8234 100644 --- a/snowflake/ml/model/_signatures/core.py +++ b/snowflake/ml/model/_signatures/core.py @@ -19,6 +19,10 @@ import numpy.typing as npt import snowflake.snowpark.types as spt +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) if TYPE_CHECKING: import 
mlflow @@ -67,7 +71,7 @@ def from_numpy_type(cls, np_type: npt.DTypeLike) -> "DataType": np_type: The numpy dtype. Raises: - NotImplementedError: Raised when the given numpy type is not supported. + SnowflakeMLException: NotImplementedError: Raised when the given numpy type is not supported. Returns: Corresponding DataType. @@ -77,7 +81,10 @@ def from_numpy_type(cls, np_type: npt.DTypeLike) -> "DataType": if np.can_cast(np_type, potential_type, casting="no"): # This is used since the same dtype might represented in different ways. return np_to_snowml_type_mapping[potential_type] - raise NotImplementedError(f"Type {np_type} is not supported as a DataType.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError(f"Type {np_type} is not supported as a DataType."), + ) @classmethod def from_torch_type(cls, torch_type: "torch.dtype") -> "DataType": @@ -111,7 +118,7 @@ def from_snowpark_type(cls, snowpark_type: spt.DataType) -> "DataType": snowpark_type: The snowpark type. Raises: - NotImplementedError: Raised when the given numpy type is not supported. + SnowflakeMLException: NotImplementedError: Raised when the given numpy type is not supported. Returns: Corresponding DataType. @@ -134,7 +141,10 @@ def from_snowpark_type(cls, snowpark_type: spt.DataType) -> "DataType": if isinstance(snowpark_type, spt.DecimalType): if snowpark_type.scale == 0: return DataType.INT64 - raise NotImplementedError(f"Type {snowpark_type} is not supported as a DataType.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError(f"Type {snowpark_type} is not supported as a DataType."), + ) def is_same_snowpark_type(self, incoming_snowpark_type: spt.DataType) -> bool: """Check if provided snowpark type is the same as Data Type. 
@@ -143,7 +153,7 @@ def is_same_snowpark_type(self, incoming_snowpark_type: spt.DataType) -> bool: incoming_snowpark_type: The snowpark type. Raises: - NotImplementedError: Raised when the given numpy type is not supported. + SnowflakeMLException: NotImplementedError: Raised when the given numpy type is not supported. Returns: If the provided snowpark type is the same as the DataType. @@ -152,7 +162,12 @@ def is_same_snowpark_type(self, incoming_snowpark_type: spt.DataType) -> bool: if isinstance(incoming_snowpark_type, spt.DecimalType): if incoming_snowpark_type.scale == 0: return self == DataType.INT64 or self == DataType.UINT64 - raise NotImplementedError(f"Type {incoming_snowpark_type} is not supported as a DataType.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError( + f"Type {incoming_snowpark_type} is not supported as a DataType." + ), + ) return isinstance(incoming_snowpark_type, self._snowpark_type) @@ -210,17 +225,23 @@ def __init__( (d1, d2, d3): 3d tensor. Raises: - TypeError: Raised when the dtype input type is incorrect. - TypeError: Raised when the shape input type is incorrect. + SnowflakeMLException: TypeError: Raised when the dtype input type is incorrect. + SnowflakeMLException: TypeError: Raised when the shape input type is incorrect. 
""" super().__init__(name=name) if not isinstance(dtype, DataType): - raise TypeError("dtype should be a model signature datatype.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_TYPE, + original_exception=TypeError("dtype should be a model signature datatype."), + ) self._dtype = dtype if shape and not isinstance(shape, tuple): - raise TypeError("Shape should be a tuple if presented.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_TYPE, + original_exception=TypeError("Shape should be a tuple if presented."), + ) self._shape = shape def as_snowpark_type(self) -> spt.DataType: @@ -300,7 +321,10 @@ def from_mlflow_spec( name = feature_name return FeatureSpec(name=name, dtype=DataType.from_numpy_type(input_spec.type), shape=shape) else: - raise NotImplementedError(f"MLFlow schema type {type(input_spec)} is not supported.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError(f"MLFlow schema type {type(input_spec)} is not supported."), + ) class FeatureGroupSpec(BaseFeatureSpec): @@ -320,15 +344,26 @@ def __init__(self, name: str, specs: List[FeatureSpec]) -> None: def _validate(self) -> None: if len(self._specs) == 0: - raise ValueError("No children feature specs.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, original_exception=ValueError("No children feature specs.") + ) # each has to have name, and same type if not all(s._name is not None for s in self._specs): - raise ValueError("All children feature specs have to have name.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError("All children feature specs have to have name."), + ) if not (all(s._shape is None for s in self._specs) or all(s._shape is not None for s in self._specs)): - raise ValueError("All children feature specs have to have same shape.") 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError("All children feature specs have to have same shape."), + ) first_type = self._specs[0]._dtype if not all(s._dtype == first_type for s in self._specs): - raise ValueError("All children feature specs have to have same type.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError("All children feature specs have to have same type."), + ) def as_snowpark_type(self) -> spt.DataType: first_type = self._specs[0].as_snowpark_type() diff --git a/snowflake/ml/model/_signatures/core_test.py b/snowflake/ml/model/_signatures/core_test.py index a6f065b8..5173334c 100644 --- a/snowflake/ml/model/_signatures/core_test.py +++ b/snowflake/ml/model/_signatures/core_test.py @@ -3,6 +3,7 @@ import snowflake.snowpark.types as spt from snowflake.ml.model._signatures import core +from snowflake.ml.test_utils import exception_utils class DataTypeTest(absltest.TestCase): @@ -24,7 +25,11 @@ def test_snowpark_type(self) -> None: self.assertEqual(core.DataType.FLOAT, core.DataType.from_snowpark_type(spt.FloatType())) self.assertEqual(core.DataType.DOUBLE, core.DataType.from_snowpark_type(spt.DoubleType())) - with self.assertRaises(NotImplementedError): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=NotImplementedError, + expected_regex="Type .+ is not supported as a DataType.", + ): core.DataType.from_snowpark_type(spt.DecimalType(38, 6)) self.assertEqual(core.DataType.BOOL, core.DataType.from_snowpark_type(spt.BooleanType())) @@ -62,21 +67,35 @@ def test_feature_spec(self) -> None: class FeatureGroupSpecTest(absltest.TestCase): def test_feature_group_spec(self) -> None: - with self.assertRaisesRegex(ValueError, "No children feature specs."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="No 
children feature specs." + ): _ = core.FeatureGroupSpec(name="features", specs=[]) - with self.assertRaisesRegex(ValueError, "All children feature specs have to have name."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="All children feature specs have to have name.", + ): ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.INT64) ft2._name = None # type: ignore[assignment] _ = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - with self.assertRaisesRegex(ValueError, "All children feature specs have to have same type."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="All children feature specs have to have same type.", + ): ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.FLOAT) _ = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - with self.assertRaisesRegex(ValueError, "All children feature specs have to have same shape."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="All children feature specs have to have same shape.", + ): ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.INT64, shape=(2,)) fts = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) diff --git a/snowflake/ml/model/_signatures/numpy_handler.py b/snowflake/ml/model/_signatures/numpy_handler.py index 73347474..73a13bf7 100644 --- a/snowflake/ml/model/_signatures/numpy_handler.py +++ b/snowflake/ml/model/_signatures/numpy_handler.py @@ -4,6 +4,10 @@ import pandas as pd from typing_extensions import TypeGuard +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml.model import 
type_hints as model_types from snowflake.ml.model._signatures import base_handler, core @@ -25,11 +29,17 @@ def truncate(data: model_types._SupportedNumpyArray) -> model_types._SupportedNu def validate(data: model_types._SupportedNumpyArray) -> None: if data.shape == (0,): # Empty array - raise ValueError("Data Validation Error: Empty data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Empty data is found."), + ) if data.shape == (): # scalar - raise ValueError("Data Validation Error: Scalar data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Scalar data is found."), + ) @staticmethod def infer_signature( diff --git a/snowflake/ml/model/_signatures/numpy_test.py b/snowflake/ml/model/_signatures/numpy_test.py index db0d27ee..e0a1b904 100644 --- a/snowflake/ml/model/_signatures/numpy_test.py +++ b/snowflake/ml/model/_signatures/numpy_test.py @@ -3,16 +3,21 @@ from absl.testing import absltest from snowflake.ml.model._signatures import core, numpy_handler +from snowflake.ml.test_utils import exception_utils class NumpyArrayHandlerTest(absltest.TestCase): def test_validate_np_ndarray(self) -> None: arr = np.array([]) - with self.assertRaisesRegex(ValueError, "Empty data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Empty data is found." + ): numpy_handler.NumpyArrayHandler.validate(arr) arr = np.array(1) - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." 
+ ): numpy_handler.NumpyArrayHandler.validate(arr) def test_trunc_np_ndarray(self) -> None: diff --git a/snowflake/ml/model/_signatures/pandas_handler.py b/snowflake/ml/model/_signatures/pandas_handler.py index 43a80639..fedc86cb 100644 --- a/snowflake/ml/model/_signatures/pandas_handler.py +++ b/snowflake/ml/model/_signatures/pandas_handler.py @@ -4,6 +4,10 @@ import pandas as pd from typing_extensions import TypeGuard +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._signatures import base_handler, core, utils @@ -26,12 +30,22 @@ def validate(data: pd.DataFrame) -> None: df_cols = data.columns if df_cols.has_duplicates: # Rule out categorical index with duplicates - raise ValueError("Data Validation Error: Duplicate column index is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Duplicate column index is found."), + ) - assert all(hasattr(data[col], "dtype") for col in data.columns), f"Unknown column confronted in {data}" + if not all(hasattr(data[col], "dtype") for col in data.columns): + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(f"Unknown column confronted in {data}"), + ) if len(df_cols) == 0: - raise ValueError("Data Validation Error: Empty data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Empty data is found."), + ) if df_cols.dtype not in [ np.int64, @@ -39,15 +53,21 @@ def validate(data: pd.DataFrame) -> None: np.float64, np.object_, ]: # To keep compatibility with Pandas 2.x and 1.x - raise ValueError("Data Validation Error: Unsupported column index type is found.") + raise snowml_exceptions.SnowflakeMLException( + 
error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Unsupported column index type is found."), + ) df_col_dtypes = [data[col].dtype for col in data.columns] for df_col, df_col_dtype in zip(df_cols, df_col_dtypes): if df_col_dtype == np.dtype("O"): # Check if all objects have the same type if not all(isinstance(data_row, type(data[df_col][0])) for data_row in data[df_col]): - raise ValueError( - f"Data Validation Error: Inconsistent type of object found in column data {data[df_col]}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error: Inconsistent type of object found in column data {data[df_col]}." + ), ) if isinstance(data[df_col][0], list): @@ -60,21 +80,32 @@ def validate(data: pd.DataFrame) -> None: core.DataType.from_numpy_type(converted_data.dtype) == arr_dtype for converted_data in converted_data_list ): - raise ValueError( - "Data Validation Error: " - + f"Inconsistent type of element in object found in column data {data[df_col]}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + "Data Validation Error: " + + f"Inconsistent type of element in object found in column data {data[df_col]}." + ), ) elif isinstance(data[df_col][0], np.ndarray): arr_dtype = core.DataType.from_numpy_type(data[df_col][0].dtype) if not all(core.DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]): - raise ValueError( - "Data Validation Error: " - + f"Inconsistent type of element in object found in column data {data[df_col]}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + "Data Validation Error: " + + f"Inconsistent type of element in object found in column data {data[df_col]}." 
+ ), ) elif not isinstance(data[df_col][0], (str, bytes)): - raise ValueError(f"Data Validation Error: Unsupported type confronted in {data[df_col]}") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error: Unsupported type confronted in {data[df_col]}" + ), + ) @staticmethod def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[core.BaseFeatureSpec]: diff --git a/snowflake/ml/model/_signatures/pandas_test.py b/snowflake/ml/model/_signatures/pandas_test.py index fc38ef43..53975f3e 100644 --- a/snowflake/ml/model/_signatures/pandas_test.py +++ b/snowflake/ml/model/_signatures/pandas_test.py @@ -3,60 +3,91 @@ from absl.testing import absltest from snowflake.ml.model._signatures import core, pandas_handler +from snowflake.ml.test_utils import exception_utils class PandasDataFrameHandlerTest(absltest.TestCase): def test_validate_pd_DataFrame(self) -> None: df = pd.DataFrame([]) - with self.assertRaisesRegex(ValueError, "Empty data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Empty data is found." 
+ ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) - with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Duplicate column index is found" + ): pandas_handler.PandasDataFrameHandler.validate(df) sub_df = pd.DataFrame([2.5, 6.8]) df = pd.DataFrame([[1, sub_df], [2, sub_df]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Unsupported type confronted in"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unsupported type confronted in" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame( [[1, 2.0, 1, 2.0, 1, 2.0], [2, 4.0, 2, 4.0, 2, 4.0]], columns=pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), ) - with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Duplicate column index is found" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) - with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Duplicate column index is found" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, "Hello"], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Inconsistent type of object" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, 2], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of 
object"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Inconsistent type of object" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, [2, [6]]], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Ragged nested or Unsupported list-like data" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, [2, 6]], [2, [2, [6]]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Ragged nested or Unsupported list-like data" + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Inconsistent type of element in object found in column data", + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2, 6])]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Inconsistent type of element in object found in column data", + ): pandas_handler.PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, 6]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in column data"): + with 
exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Inconsistent type of object found in column data", + ): pandas_handler.PandasDataFrameHandler.validate(df) def test_trunc_pd_DataFrame(self) -> None: diff --git a/snowflake/ml/model/_signatures/pytorch_handler.py b/snowflake/ml/model/_signatures/pytorch_handler.py index cebd4af1..f81c917b 100644 --- a/snowflake/ml/model/_signatures/pytorch_handler.py +++ b/snowflake/ml/model/_signatures/pytorch_handler.py @@ -5,6 +5,10 @@ from typing_extensions import TypeGuard from snowflake.ml._internal import type_utils +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._signatures import base_handler, core @@ -41,11 +45,17 @@ def validate(data: Sequence["torch.Tensor"]) -> None: for data_col in data: if data_col.shape == torch.Size([0]): # Empty array - raise ValueError("Data Validation Error: Empty data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Empty data is found."), + ) if data_col.shape == torch.Size([1]): # scalar - raise ValueError("Data Validation Error: Scalar data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Scalar data is found."), + ) @staticmethod def infer_signature( @@ -86,7 +96,10 @@ def convert_from_df( if features: for feature in features: if isinstance(feature, core.FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError("FeatureGroupSpec is not supported."), + ) assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." 
res.append(torch.from_numpy(np.stack(df[feature.name].to_numpy()).astype(feature._dtype._numpy_type))) return res diff --git a/snowflake/ml/model/_signatures/pytorch_test.py b/snowflake/ml/model/_signatures/pytorch_test.py index f144390b..c89cb30c 100644 --- a/snowflake/ml/model/_signatures/pytorch_test.py +++ b/snowflake/ml/model/_signatures/pytorch_test.py @@ -4,6 +4,7 @@ from absl.testing import absltest from snowflake.ml.model._signatures import core, pytorch_handler, utils +from snowflake.ml.test_utils import exception_utils class SeqOfPyTorchTensorHandlerTest(absltest.TestCase): @@ -19,36 +20,42 @@ def test_validate_list_of_pytorch_tensor(self) -> None: def test_validate_torch_tensor(self) -> None: t = [torch.Tensor([])] - with self.assertRaisesRegex(ValueError, "Empty data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Empty data is found." + ): pytorch_handler.SeqOfPyTorchTensorHandler.validate(t) t = [torch.Tensor(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): pytorch_handler.SeqOfPyTorchTensorHandler.validate(t) t = [torch.Tensor([1, 2]), torch.Tensor(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." 
+ ): pytorch_handler.SeqOfPyTorchTensorHandler.validate(t) def test_trunc_torch_tensor(self) -> None: t = [torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts ) t = [torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1))] for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), ts ) t = [torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] + torch.testing.assert_close( torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts ) @@ -58,9 +65,7 @@ def test_trunc_torch_tensor(self) -> None: ] for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] - torch.Tensor([1]), ts - ) + torch.testing.assert_close(torch.Tensor([1]), ts) def test_infer_schema_torch_tensor(self) -> None: t1 = [torch.IntTensor([1, 2, 3, 4])] @@ -224,7 +229,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t1) ) ): - torch.testing.assert_close(t, t1[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t1[idx]) t2 = [torch.DoubleTensor([1, 2, 3, 4])] for idx, t in enumerate( @@ -232,7 +237,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t2) ) ): - torch.testing.assert_close(t, 
t2[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t2[idx]) t3 = [torch.LongTensor([[1, 1], [2, 2], [3, 3], [4, 4]])] for idx, t in enumerate( @@ -240,7 +245,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t3) ) ): - torch.testing.assert_close(t, t3[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t3[idx]) t4 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] for idx, t in enumerate( @@ -248,7 +253,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t4) ) ): - torch.testing.assert_close(t, t4[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t4[idx]) t5 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] for idx, t in enumerate( @@ -256,7 +261,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t5) ) ): - torch.testing.assert_close(t, t5[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t5[idx]) t6 = [torch.DoubleTensor([1.2, 2.4]), torch.LongTensor([3, 4])] for idx, t in enumerate( @@ -264,7 +269,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t6) ) ): - torch.testing.assert_close(t, t6[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t6[idx]) t7 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] for idx, t in enumerate( @@ -272,7 +277,7 @@ def test_convert_from_df_torch_tensor(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t7) ) ): - torch.testing.assert_close(t, t7[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t7[idx]) t8 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] for idx, t in enumerate( @@ -280,7 +285,7 @@ def test_convert_from_df_torch_tensor(self) -> None: 
pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t8) ) ): - torch.testing.assert_close(t, t8[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t8[idx]) t9 = [torch.IntTensor([1, 2, 3, 4])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t9, role="input") @@ -290,7 +295,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t9[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t9[idx]) t10 = [torch.tensor([1.2, 3.4])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t10, role="input") @@ -300,7 +305,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t10[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t10[idx]) t11 = [torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4]])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t11, role="input") @@ -310,7 +315,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t11[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t11[idx]) t12 = [torch.tensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t12, role="input") @@ -320,7 +325,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t12[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t12[idx]) t13 = [torch.tensor([1, 2]), torch.tensor([3, 4])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t13, role="input") @@ -330,7 +335,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t13[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t13[idx]) t14 = [torch.tensor([1.2, 2.4]), torch.tensor([3, 4])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t14, role="input") @@ -340,7 +345,7 @@ def test_convert_from_df_torch_tensor(self) 
-> None: fts, ) ): - torch.testing.assert_close(t, t14[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t14[idx]) t15 = [torch.tensor([[1, 1], [2, 2]]), torch.tensor([[3, 3], [4, 4]])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t15, role="input") @@ -350,7 +355,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t15[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t15[idx]) t16 = [torch.tensor([[1, 1], [2, 2]]), torch.tensor([[1.5, 6.8], [2.9, 9.2]])] fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t16, role="input") @@ -360,7 +365,7 @@ def test_convert_from_df_torch_tensor(self) -> None: fts, ) ): - torch.testing.assert_close(t, t16[idx]) # type:ignore[attr-defined] + torch.testing.assert_close(t, t16[idx]) if __name__ == "__main__": diff --git a/snowflake/ml/model/_signatures/snowpark_handler.py b/snowflake/ml/model/_signatures/snowpark_handler.py index 4c37a5ab..e7cd59cf 100644 --- a/snowflake/ml/model/_signatures/snowpark_handler.py +++ b/snowflake/ml/model/_signatures/snowpark_handler.py @@ -7,6 +7,10 @@ import snowflake.snowpark import snowflake.snowpark.types as spt +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml._internal.utils import identifier from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._deploy_client.warehouse import infer_template @@ -36,8 +40,11 @@ def validate(data: snowflake.snowpark.DataFrame) -> None: else: actual_data_type = data_type if not any(type.is_same_snowpark_type(actual_data_type) for type in core.DataType): - raise ValueError( - f"Data Validation Error: Unsupported data type {field.datatype} in column {field.name}." 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error: Unsupported data type {field.datatype} in column {field.name}." + ), ) @staticmethod @@ -49,7 +56,12 @@ def infer_signature( for field in schema.fields: name = identifier.get_unescaped_names(field.name) if isinstance(field.datatype, spt.ArrayType): - raise NotImplementedError("Cannot infer model signature from Snowpark DataFrame with Array Type.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError( + "Cannot infer model signature from Snowpark DataFrame with Array Type." + ), + ) else: features.append(core.FeatureSpec(name=name, dtype=core.DataType.from_snowpark_type(field.datatype))) return features @@ -65,7 +77,10 @@ def convert_to_df( if features: for feature in features: if isinstance(feature, core.FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError("FeatureGroupSpec is not supported."), + ) assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." dtype_map[feature.name] = feature.as_dtype() df_local = data.to_pandas() @@ -98,13 +113,19 @@ def convert_from_df( df = pandas_handler.PandasDataFrameHandler.convert_to_df(df) df_cols = df.columns if df_cols.dtype != np.object_: - raise ValueError("Cannot convert a Pandas DataFrame whose column index is not a string") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=ValueError("Cannot convert a Pandas DataFrame whose column index is not a string"), + ) features = pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input") # Role will be no effect on the column index. That is to say, the feature name is the actual column name. 
schema_list = [] for feature in features: if isinstance(feature, core.FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError("FeatureGroupSpec is not supported."), + ) assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." schema_list.append( spt.StructField( diff --git a/snowflake/ml/model/_signatures/snowpark_test.py b/snowflake/ml/model/_signatures/snowpark_test.py index 9a843e92..8284f935 100644 --- a/snowflake/ml/model/_signatures/snowpark_test.py +++ b/snowflake/ml/model/_signatures/snowpark_test.py @@ -5,6 +5,7 @@ import snowflake.snowpark.types as spt from snowflake.ml.model import model_signature from snowflake.ml.model._signatures import core, snowpark_handler +from snowflake.ml.test_utils import exception_utils from snowflake.ml.utils import connection_params from snowflake.snowpark import Session @@ -21,7 +22,9 @@ def tearDownClass(cls) -> None: def test_validate_snowpark_df(self) -> None: schema = spt.StructType([spt.StructField('"a"', spt.VariantType()), spt.StructField('"b"', spt.StringType())]) df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - with self.assertRaisesRegex(ValueError, "Unsupported data type"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unsupported data type" + ): snowpark_handler.SnowparkDataFrameHandler.validate(df) def test_infer_schema_snowpark_df(self) -> None: @@ -47,7 +50,11 @@ def test_infer_schema_snowpark_df(self) -> None: schema = spt.StructType([spt.StructField('"""a"""', spt.ArrayType(spt.LongType()))]) df = self._session.create_dataframe([[[1, 3]]], schema) - with self.assertRaises(NotImplementedError): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=NotImplementedError, + expected_regex="Cannot infer model 
signature from Snowpark DataFrame with Array Type.", + ): snowpark_handler.SnowparkDataFrameHandler.infer_signature(df, role="input"), def test_validate_data_with_features(self) -> None: @@ -69,16 +76,22 @@ def test_validate_data_with_features(self) -> None: schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.IntegerType())]) df = self._session.create_dataframe([[1, 3], [3, 9]], schema) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Feature type [^\\s]* is not met by column" + ): model_signature._validate_snowpark_data(df, fts) schema = spt.StructType([spt.StructField('"a1"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="feature [^\\s]* does not exist in data." 
+ ): model_signature._validate_snowpark_data(df, fts) df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Feature type [^\\s]* is not met by column" + ): model_signature._validate_snowpark_data(df, fts) fts = [ diff --git a/snowflake/ml/model/_signatures/tensorflow_handler.py b/snowflake/ml/model/_signatures/tensorflow_handler.py index 58518379..49a8c953 100644 --- a/snowflake/ml/model/_signatures/tensorflow_handler.py +++ b/snowflake/ml/model/_signatures/tensorflow_handler.py @@ -5,6 +5,10 @@ from typing_extensions import TypeGuard from snowflake.ml._internal import type_utils +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._signatures import base_handler, core @@ -42,7 +46,10 @@ def count(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) -> shapes = data_col.shape.as_list() if data_col.shape == tf.TensorShape(None) or (not shapes) or (shapes[0] is None): # Unknown shape array - raise ValueError("Data Validation Error: Unknown shape data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Unknown shape data is found."), + ) # Make mypy happy assert isinstance(shapes[0], int) @@ -68,15 +75,24 @@ def validate(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) for data_col in data: if data_col.shape == tf.TensorShape(None) or any(dim is None for dim in data_col.shape.as_list()): # Unknown shape array - raise ValueError("Data Validation Error: Unknown shape data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + 
original_exception=ValueError("Data Validation Error: Unknown shape data is found."), + ) if data_col.shape == tf.TensorShape([0]): # Empty array - raise ValueError("Data Validation Error: Empty data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Empty data is found."), + ) if data_col.shape == tf.TensorShape([1]) or data_col.shape == tf.TensorShape([]): # scalar - raise ValueError("Data Validation Error: Scalar data is found.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Data Validation Error: Scalar data is found."), + ) @staticmethod def infer_signature( @@ -116,7 +132,10 @@ def convert_from_df( if features: for feature in features: if isinstance(feature, core.FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError("FeatureGroupSpec is not supported."), + ) assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." 
res.append( tf.convert_to_tensor(np.stack(df[feature.name].to_numpy()).astype(feature._dtype._numpy_type)) diff --git a/snowflake/ml/model/_signatures/tensorflow_test.py b/snowflake/ml/model/_signatures/tensorflow_test.py index d626b65c..ca96f422 100644 --- a/snowflake/ml/model/_signatures/tensorflow_test.py +++ b/snowflake/ml/model/_signatures/tensorflow_test.py @@ -4,6 +4,7 @@ from absl.testing import absltest from snowflake.ml.model._signatures import core, tensorflow_handler, utils +from snowflake.ml.test_utils import exception_utils class SeqOfTensorflowTensorHandlerTest(absltest.TestCase): @@ -25,39 +26,57 @@ def test_validate_list_of_tf_tensor(self) -> None: def test_validate_tf_tensor(self) -> None: t = [tf.constant([])] - with self.assertRaisesRegex(ValueError, "Empty data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Empty data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable([1, 2], shape=tf.TensorShape(None))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unknown shape data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable([[1, 2]], shape=tf.TensorShape([None, 2]))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unknown shape data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable([[1, 2]], shape=tf.TensorShape([1, None]))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unknown shape data is found." 
+ ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.constant(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.constant([1])] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable([1])] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.constant([1, 2]), tf.constant(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) def test_count_tf_tensor(self) -> None: @@ -71,11 +90,15 @@ def test_count_tf_tensor(self) -> None: self.assertEqual(tensorflow_handler.SeqOfTensorflowTensorHandler.count(t), 2) t = [tf.Variable([1, 2], shape=tf.TensorShape(None))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unknown shape data is found." 
+ ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable([[1, 2]], shape=tf.TensorShape([None, 2]))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Unknown shape data is found." + ): tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) t = [tf.Variable([[1, 2]], shape=tf.TensorShape([1, None]))] diff --git a/snowflake/ml/model/_signatures/utils.py b/snowflake/ml/model/_signatures/utils.py index 9be3a8a9..2788acad 100644 --- a/snowflake/ml/model/_signatures/utils.py +++ b/snowflake/ml/model/_signatures/utils.py @@ -5,6 +5,10 @@ import numpy.typing as npt import pandas as pd +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml.model._signatures import core @@ -15,8 +19,8 @@ def convert_list_to_ndarray(data: List[Any]) -> npt.NDArray[Any]: data: List or nested list. Raises: - ValueError: Raised when ragged nested list or list containing non-basic type confronted. - ValueError: Raised when ragged nested list or list containing non-basic type confronted. + SnowflakeMLException: ValueError: Raised when ragged nested list or list containing non-basic type confronted. + SnowflakeMLException: ValueError: Raised when ragged nested list or list containing non-basic type confronted. Returns: The converted numpy array. @@ -24,16 +28,22 @@ def convert_list_to_ndarray(data: List[Any]) -> npt.NDArray[Any]: warnings.filterwarnings("error", category=np.VisibleDeprecationWarning) try: arr = np.array(data) - except np.VisibleDeprecationWarning: + except (np.VisibleDeprecationWarning, ValueError): # In recent version of numpy, this warning should be raised when bad list provided. - raise ValueError( - f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." + ), ) warnings.filterwarnings("default", category=np.VisibleDeprecationWarning) if arr.dtype == object: # If not raised, then a array of object would be created. - raise ValueError( - f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." + ), ) return arr @@ -48,7 +58,7 @@ def rename_features( feature_names: A list of names to assign to features and feature groups. Defaults to None. Raises: - ValueError: Raised when provided feature_names does not match the data shape. + SnowflakeMLException: ValueError: Raised when provided feature_names does not match the data shape. Returns: A sequence of feature specifications and feature group specifications being renamed if names provided. @@ -58,8 +68,11 @@ def rename_features( for ft, ft_name in zip(features, feature_names): ft._name = ft_name else: - raise ValueError( - f"{len(feature_names)} feature names are provided, while there are {len(features)} features." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + f"{len(feature_names)} feature names are provided, while there are {len(features)} features." + ), ) return features @@ -72,7 +85,7 @@ def rename_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec features: A sequence of feature specifications and feature group specifications to rename the dataframe. Raises: - ValueError: Raised when the data does not have the same number of features as signature. 
+ SnowflakeMLException: ValueError: Raised when the data does not have the same number of features as signature. Returns: A pandas dataframe with columns renamed. @@ -80,9 +93,12 @@ def rename_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec df_cols = data.columns if df_cols.dtype in [np.int64, np.uint64, np.float64]: if len(features) != len(data.columns): - raise ValueError( - "Data does not have the same number of features as signature. " - + f"Signature requires {len(features)} features, but have {len(data.columns)} in input data." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + "Data does not have the same number of features as signature. " + + f"Signature requires {len(features)} features, but have {len(data.columns)} in input data." + ), ) data.columns = pd.Index([feature.name for feature in features]) return data diff --git a/snowflake/ml/model/_signatures/utils_test.py b/snowflake/ml/model/_signatures/utils_test.py index 70e6c34a..cb6f6b6f 100644 --- a/snowflake/ml/model/_signatures/utils_test.py +++ b/snowflake/ml/model/_signatures/utils_test.py @@ -2,6 +2,7 @@ from absl.testing import absltest from snowflake.ml.model._signatures import core, utils +from snowflake.ml.test_utils import exception_utils class ModelSignatureMiscTest(absltest.TestCase): @@ -23,7 +24,11 @@ def testrename_features(self) -> None: fts = [core.FeatureSpec("a", core.DataType.INT64, shape=(2,))] utils.rename_features(fts) - with self.assertRaises(ValueError): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="\\d+ feature names are provided, while there are \\d+ features.", + ): fts = [core.FeatureSpec("a", core.DataType.INT64, shape=(2,))] utils.rename_features(fts, ["b", "c"]) diff --git a/snowflake/ml/model/deploy_platforms.py b/snowflake/ml/model/deploy_platforms.py new file mode 100644 index 00000000..65bd6440 --- 
/dev/null +++ b/snowflake/ml/model/deploy_platforms.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class TargetPlatform(Enum): + WAREHOUSE = "warehouse" + SNOWPARK_CONTAINER_SERVICES = "SNOWPARK_CONTAINER_SERVICES" diff --git a/snowflake/ml/model/model_signature.py b/snowflake/ml/model/model_signature.py index 47fa74d1..c9fbc40e 100644 --- a/snowflake/ml/model/model_signature.py +++ b/snowflake/ml/model/model_signature.py @@ -6,6 +6,10 @@ import snowflake.snowpark import snowflake.snowpark.types as spt +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml._internal.utils import formatting, identifier from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._signatures import ( @@ -56,8 +60,11 @@ def _truncate_data(data: model_types.SupportedDataType) -> model_types.Supported category=UserWarning, ) return handler.truncate(data) - raise NotImplementedError( - f"Unable to infer model signature: Un-supported type provided {type(data)} for data truncate." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError( + f"Unable to infer model signature: Un-supported type provided {type(data)} for data truncate." + ), ) @@ -73,7 +80,7 @@ def _infer_signature( role: a flag indicating that if this is to infer an input or output feature. Raises: - NotImplementedError: Raised when an unsupported data type is provided. + SnowflakeMLException: NotImplementedError: Raised when an unsupported data type is provided. Returns: A sequence of feature specifications and feature group specifications. @@ -82,8 +89,11 @@ def _infer_signature( if handler.can_handle(data): handler.validate(data) return handler.infer_signature(data, role) - raise NotImplementedError( - f"Unable to infer model signature: Un-supported type provided {type(data)} for X type inference." 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError( + f"Unable to infer model signature: Un-supported type provided {type(data)} for X type inference." + ), ) @@ -95,48 +105,63 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS features: A sequence of feature specifications and feature group specifications, where the dataframe should fit. Raises: - NotImplementedError: FeatureGroupSpec is not supported. - ValueError: Raised when a feature cannot be found. - ValueError: Raised when feature is scalar but confront list element. - ValueError: Raised when feature type is not aligned in list element. - ValueError: Raised when feature shape is not aligned in list element. - ValueError: Raised when feature is scalar but confront array element. - ValueError: Raised when feature type is not aligned in numpy array element. - ValueError: Raised when feature shape is not aligned in numpy array element. - ValueError: Raised when feature type is not aligned in string element. - ValueError: Raised when feature type is not aligned in bytes element. + SnowflakeMLException: NotImplementedError: FeatureGroupSpec is not supported. + SnowflakeMLException: ValueError: Raised when a feature cannot be found. + SnowflakeMLException: ValueError: Raised when feature is scalar but confront list element. + SnowflakeMLException: ValueError: Raised when feature type is not aligned in list element. + SnowflakeMLException: ValueError: Raised when feature shape is not aligned in list element. + SnowflakeMLException: ValueError: Raised when feature is scalar but confront array element. + SnowflakeMLException: ValueError: Raised when feature type is not aligned in numpy array element. + SnowflakeMLException: ValueError: Raised when feature shape is not aligned in numpy array element. + SnowflakeMLException: ValueError: Raised when feature type is not aligned in string element. 
+ SnowflakeMLException: ValueError: Raised when feature type is not aligned in bytes element. """ for feature in features: ft_name = feature.name try: data_col = data[ft_name] except KeyError: - raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(f"Data Validation Error: feature {ft_name} does not exist in data."), + ) df_col_dtype = data_col.dtype if isinstance(feature, core.FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError("FeatureGroupSpec is not supported."), + ) - assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." + assert isinstance(feature, core.FeatureSpec) # assert for mypy. ft_type = feature._dtype ft_shape = feature._shape if df_col_dtype != np.dtype("O"): if ft_type != core.DataType.from_numpy_type(df_col_dtype): - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by all elements in {data_col}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ), ) elif ft_shape is not None: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + "Feature is a array type feature while scalar data is provided." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a array type feature while scalar data is provided." 
+ ), ) else: if isinstance(data_col[0], list): if not ft_shape: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + "Feature is a scalar feature while list data is provided." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a scalar feature while list data is provided." + ), ) converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data_col] @@ -145,59 +170,91 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS core.DataType.from_numpy_type(converted_data.dtype) == ft_type for converted_data in converted_data_list ): - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by all elements in {data_col}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ), ) if ft_shape and ft_shape != (-1,): if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list): - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature shape {ft_shape} is not met by all elements in {data_col}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature shape {ft_shape} is not met by all elements in {data_col}." + ), ) + elif isinstance(data_col[0], np.ndarray): if not ft_shape: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + "Feature is a scalar feature while array data is provided." 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a scalar feature while array data is provided." + ), ) if not all(core.DataType.from_numpy_type(data_row.dtype) == ft_type for data_row in data_col): - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by all elements in {data_col}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ), ) ft_shape = feature._shape if ft_shape and ft_shape != (-1,): if not all(np.shape(data_row) == ft_shape for data_row in data_col): ft_shape = (-1,) - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature shape {ft_shape} is not met by all elements in {data_col}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature shape {ft_shape} is not met by all elements in {data_col}." + ), ) + elif isinstance(data_col[0], str): if ft_shape is not None: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + "Feature is a array type feature while scalar data is provided." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a array type feature while scalar data is provided." + ), ) + if ft_type != core.DataType.STRING: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by all elements in {data_col}." 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ), ) + elif isinstance(data_col[0], bytes): if ft_shape is not None: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + "Feature is a array type feature while scalar data is provided." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a array type feature while scalar data is provided." + ), ) + if ft_type != core.DataType.BYTES: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by all elements in {data_col}." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ), ) @@ -209,9 +266,9 @@ def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequen features: A sequence of feature specifications and feature group specifications, where the dataframe should fit. Raises: - NotImplementedError: FeatureGroupSpec is not supported. - ValueError: Raised when confronting invalid feature. - ValueError: Raised when a feature cannot be found. + SnowflakeMLException: NotImplementedError: FeatureGroupSpec is not supported. + SnowflakeMLException: ValueError: Raised when confronting invalid feature. + SnowflakeMLException: ValueError: Raised when a feature cannot be found. 
""" schema = data.schema for feature in features: @@ -228,33 +285,48 @@ def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequen category=RuntimeWarning, ) if isinstance(feature, core.FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError("FeatureGroupSpec is not supported."), + ) + assert isinstance(feature, core.FeatureSpec) # mypy ft_type = feature._dtype field_data_type = field.datatype if isinstance(field_data_type, spt.ArrayType): if feature._shape is None: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature is a array feature, while {field.name} is not." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature is a array feature, while {field.name} is not." + ), ) warnings.warn( - f"Warn in feature {ft_name}: Feature is a array feature," + " type validation cannot happen.", + f"Warn in feature {ft_name}: Feature is a array feature, type validation cannot happen.", category=RuntimeWarning, ) else: if feature._shape: - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature is a scalar feature, while {field.name} is not." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature is a scalar feature, while {field.name} is not." + ), ) if not ft_type.is_same_snowpark_type(field_data_type): - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by column {field.name}." 
+ raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by column {field.name}." + ), ) if not found: - raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(f"Data Validation Error: feature {ft_name} does not exist in data."), + ) def _convert_local_data_to_df(data: model_types.SupportedLocalDataType) -> pd.DataFrame: @@ -264,7 +336,7 @@ def _convert_local_data_to_df(data: model_types.SupportedLocalDataType) -> pd.Da data: The provided data. Raises: - ValueError: Raised when data cannot be handled by any data handler. + SnowflakeMLException: NotImplementedError: Raised when data cannot be handled by any data handler. Returns: The converted dataframe with renamed column index. @@ -276,7 +348,11 @@ def _convert_local_data_to_df(data: model_types.SupportedLocalDataType) -> pd.Da df = handler.convert_to_df(data, ensure_serializable=False) break if df is None: - raise ValueError(f"Data Validation Error: Un-supported type {type(data)} provided.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_IMPLEMENTED, + original_exception=NotImplementedError(f"Data Validation Error: Un-supported type {type(data)} provided."), + ) + return df diff --git a/snowflake/ml/model/model_signature_test.py b/snowflake/ml/model/model_signature_test.py index 7a9580b2..567b2e17 100644 --- a/snowflake/ml/model/model_signature_test.py +++ b/snowflake/ml/model/model_signature_test.py @@ -5,6 +5,7 @@ from absl.testing import absltest from snowflake.ml.model import model_signature +from snowflake.ml.test_utils import exception_utils class ModelSignatureMiscTest(absltest.TestCase): @@ -91,16 +92,26 @@ def test_infer_signature(self) -> None: df = pd.DataFrame([1, 2, 3, 4]) 
lt = [df, arr] - with self.assertRaises(NotImplementedError): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=NotImplementedError, expected_regex="Un-supported type provided" + ): model_signature._infer_signature(lt, role="input") - with self.assertRaises(ValueError): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Inconsistent type of object found in data", + ): model_signature._infer_signature([True, 1], role="input") - with self.assertRaises(NotImplementedError): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=NotImplementedError, expected_regex="Un-supported type provided" + ): model_signature._infer_signature(1, role="input") - with self.assertRaises(NotImplementedError): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=NotImplementedError, expected_regex="Un-supported type provided" + ): model_signature._infer_signature([], role="input") def test_validate_pandas_df(self) -> None: @@ -111,18 +122,30 @@ def test_validate_pandas_df(self) -> None: model_signature._validate_pandas_df(pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), fts) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df(pd.DataFrame([[2.5, 5], [6.8, 8]], columns=["a", "b"]), fts) - with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="feature [^\\s]* does not exist in data." 
+ ): model_signature._validate_pandas_df(pd.DataFrame([5, 6], columns=["a"]), fts) model_signature._validate_pandas_df(pd.DataFrame([5, 6], columns=["a"]), fts[:1]) - with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="feature [^\\s]* does not exist in data." + ): model_signature._validate_pandas_df(pd.DataFrame([[2, 5], [6, 8]], columns=["c", "d"]), fts) - with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while list data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a scalar feature while list data is provided.", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts ) @@ -134,47 +157,83 @@ def test_validate_pandas_df(self) -> None: model_signature._validate_pandas_df(pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts) - with self.assertRaisesRegex(ValueError, "Feature is a array type feature while scalar data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a array type feature while scalar data is provided.", + ): model_signature._validate_pandas_df(pd.DataFrame([[2, 2.5], [6, 6.8]], columns=["a", "b"]), fts) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, [2.5, 6.8, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with 
exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df(pd.DataFrame([[1, [2, 5]], [2, [6, 8]]], columns=["a", "b"]), fts) model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([2.5, 6.8, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([2, 5])], 
[2, np.array([6, 8])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature is a array type feature while scalar data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a array type feature while scalar data is provided.", + ): model_signature._validate_pandas_df( pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["b"]), fts[-1:] ) - with self.assertRaisesRegex(ValueError, "Feature is a array type feature while scalar data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a array type feature while scalar data is provided.", + ): model_signature._validate_pandas_df(pd.DataFrame(["a", "b", "c", "d"], columns=["b"]), fts[-1:]) fts = [ @@ -192,7 +251,11 @@ def test_validate_pandas_df(self) -> None: pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df(pd.DataFrame([[1, [2, 5]], [2, [6, 8]]], columns=["a", "b"]), fts) model_signature._validate_pandas_df( @@ -207,7 +270,11 @@ def test_validate_pandas_df(self) -> None: pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([2, 5])], [2, np.array([6, 8])]], columns=["a", "b"]), fts ) @@ -221,17 
+288,29 @@ def test_validate_pandas_df(self) -> None: pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8], [6.8]]]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, [[2], [5]]], [2, [[6], [8]]]], columns=["a", "b"]), fts ) @@ -240,18 +319,30 @@ def test_validate_pandas_df(self) -> None: pd.DataFrame([[1, np.array([[2.5], [6.8]])], [2, np.array([[2.5], [6.8]])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([[2.5], [6.8]])], [2, np.array([[2.5], [6.8], [6.8]])]], columns=["a", "b"]), fts, ) - with self.assertRaisesRegex(ValueError, "Feature 
shape [\\(\\)0-9,\\s-]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature shape [\\(\\)0-9,\\s-]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8])]], columns=["a", "b"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([[1, np.array([[2], [5]])], [2, np.array([[6], [8]])]], columns=["a", "b"]), fts ) @@ -259,15 +350,27 @@ def test_validate_pandas_df(self) -> None: fts = [model_signature.FeatureSpec("a", model_signature.DataType.STRING)] model_signature._validate_pandas_df(pd.DataFrame(["a", "b", "c", "d"], columns=["a"]), fts) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df( pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while list data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a scalar feature while list data is provided.", + ): model_signature._validate_pandas_df(pd.DataFrame(data={"a": [[1, 2]]}), fts) - with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while array data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a scalar 
feature while array data is provided.", + ): model_signature._validate_pandas_df(pd.DataFrame(data={"a": [np.array([1, 2])]}), fts) fts = [model_signature.FeatureSpec("a", model_signature.DataType.BYTES)] @@ -275,13 +378,25 @@ def test_validate_pandas_df(self) -> None: pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]), fts ) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._validate_pandas_df(pd.DataFrame(["a", "b", "c", "d"], columns=["a"]), fts) - with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while list data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a scalar feature while list data is provided.", + ): model_signature._validate_pandas_df(pd.DataFrame(data={"a": [[1, 2]]}), fts) - with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while array data is provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature is a scalar feature while array data is provided.", + ): model_signature._validate_pandas_df(pd.DataFrame(data={"a": [np.array([1, 2])]}), fts) def test_validate_data_with_features(self) -> None: @@ -290,37 +405,73 @@ def test_validate_data_with_features(self) -> None: model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), ] - with self.assertRaisesRegex(ValueError, "Empty data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Empty data is found." 
+ ): model_signature._convert_and_validate_local_data(np.array([]), fts) - with self.assertRaisesRegex(ValueError, "Scalar data is found."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Scalar data is found." + ): model_signature._convert_and_validate_local_data(np.array(5), fts) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._convert_and_validate_local_data(np.array([[2.5, 5], [6.8, 8]]), fts) - with self.assertRaisesRegex(ValueError, "Un-supported type provided."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Un-supported type provided.", + ): model_signature._convert_and_validate_local_data([], fts) - with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in data"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Inconsistent type of object found in data", + ): model_signature._convert_and_validate_local_data([1, [1, 1]], fts) - with self.assertRaisesRegex(ValueError, "Ill-shaped list data"): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Ill-shaped list data" + ): model_signature._convert_and_validate_local_data([[1], [1, 1]], fts) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._convert_and_validate_local_data([[2.1, 5.0], [6.8, 8.0]], fts) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* 
is not met by all elements"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Feature type [^\\s]* is not met by all elements", + ): model_signature._convert_and_validate_local_data(pd.DataFrame([[2.5, 5], [6.8, 8]]), fts) - with self.assertRaisesRegex(ValueError, "Data does not have the same number of features as signature"): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Data does not have the same number of features as signature", + ): model_signature._convert_and_validate_local_data(pd.DataFrame([5, 6]), fts) - with self.assertRaisesRegex(ValueError, "Data does not have the same number of features as signature."): + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Data does not have the same number of features as signature.", + ): model_signature._convert_and_validate_local_data(np.array([5, 6]), fts) - with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="feature [^\\s]* does not exist in data." 
+ ): model_signature._convert_and_validate_local_data(pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), fts) df = model_signature._convert_and_validate_local_data(np.array([5, 6]), fts[:1]) diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index c40fc6e9..2ec25b49 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -31,9 +31,9 @@ "np.uint16", "np.uint32", "np.uint64", - "np.bool8", - "np.str0", - "np.bytes0", + "np.bool_", + "np.str_", + "np.bytes_", ] _SupportedNumpyArray = npt.NDArray[_SupportedNumpyDtype] _SupportedBuiltinsList = Sequence[_SupportedBuiltins] @@ -132,8 +132,10 @@ class SnowparkContainerServiceDeployOptions(DeployOptions): Snowflake is used as is. This option is for users who consistently use the same image for multiple use cases, allowing faster deployment. The snowflake image used for deployment is logged to the console for future use. Default to None. - use_gpu: When set to True, a CUDA-enabled Docker image will be used to provide a runtime CUDA environment. - Default to False. + num_gpus: Number of GPUs to be used for the service. Default to 0. + num_workers: Number of workers used for model inference. Please ensure that the number of workers is set lower than + the total available memory divided by the size of model to prevent memory-related issues. Default is number of + CPU cores * 2 + 1. 
""" compute_pool: str @@ -142,7 +144,8 @@ class SnowparkContainerServiceDeployOptions(DeployOptions): max_instances: NotRequired[int] endpoint: NotRequired[str] prebuilt_snowflake_image: NotRequired[str] - use_gpu: NotRequired[bool] + num_gpus: NotRequired[int] + num_workers: NotRequired[int] class BaseModelSaveOption(TypedDict): diff --git a/snowflake/ml/modeling/impute/simple_imputer.py b/snowflake/ml/modeling/impute/simple_imputer.py index 5ce07e9e..793e0378 100644 --- a/snowflake/ml/modeling/impute/simple_imputer.py +++ b/snowflake/ml/modeling/impute/simple_imputer.py @@ -264,7 +264,7 @@ def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer": self.statistics_[input_col] = self.fill_value else: state = STRATEGY_TO_STATE_DICT[self.strategy] - assert state is not None, "state cannot be None" + assert state is not None dataset_copy = copy.copy(dataset) if not pd.isna(self.missing_values): # Replace `self.missing_values` with null to avoid including it when computing states. diff --git a/snowflake/ml/modeling/metrics/monitor.py b/snowflake/ml/modeling/metrics/monitor.py index 21e4679d..a6b0bf83 100644 --- a/snowflake/ml/modeling/metrics/monitor.py +++ b/snowflake/ml/modeling/metrics/monitor.py @@ -7,9 +7,10 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.snowpark import functions _PROJECT = "ModelDevelopment" -_SUBPROJECT = "Metrics" +_SUBPROJECT = "Monitor" class BucketConfig(TypedDict): @@ -96,6 +97,98 @@ def get_basic_stats(df: snowpark.DataFrame) -> Tuple[Dict[str, int], Dict[str, i return d1, d2 +@telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, +) +def jensenshannon(df1: snowpark.DataFrame, colname1: str, df2: snowpark.DataFrame, colname2: str) -> float: + """ + Similar to scipy implementation: + https://github.com/scipy/scipy/blob/e4dec2c5993faa381bb4f76dce551d0d79734f8f/scipy/spatial/distance.py#L1174 + It's server solution, all computing being in Snowflake warehouse, so 
will be significantly faster than client.
+
+    Args:
+        df1: 1st Snowpark Dataframe;
+        colname1: the col to be selected in df1
+        df2: 2nd Snowpark Dataframe;
+        colname2: the col to be selected in df2
+        Supported data type: any data type that Snowflake supports, including VARIANT, OBJECT...etc.
+
+    Returns:
+        a jensenshannon value
+    """
+    df1 = df1.select(colname1)
+    df1 = (
+        df1.group_by(colname1)
+        .agg(functions.count(colname1).alias("c1"))
+        .select(functions.col(colname1).alias("d1"), "c1")
+    )
+    df2 = df2.select(colname2)
+    df2 = (
+        df2.group_by(colname2)
+        .agg(functions.count(colname2).alias("c2"))
+        .select(functions.col(colname2).alias("d2"), "c2")
+    )
+
+    dfsum = df1.select("c1").agg(functions.sum("c1").alias("SUM1"))
+    sum1 = dfsum.collect()[0].as_dict()["SUM1"]
+    dfsum = df2.select("c2").agg(functions.sum("c2").alias("SUM2"))
+    sum2 = dfsum.collect()[0].as_dict()["SUM2"]
+
+    df1 = df1.select("d1", functions.sql_expr("c1 / " + str(sum1)).alias("p"))
+    minp = df1.select(functions.min("P").alias("MINP")).collect()[0].as_dict()["MINP"]
+    df2 = df2.select("d2", functions.sql_expr("c2 / " + str(sum2)).alias("q"))
+    minq = df2.select(functions.min("Q").alias("MINQ")).collect()[0].as_dict()["MINQ"]
+
+    DECAY_FACTOR = 0.5
+    df = df1.join(df2, df1.d1 == df2.d2, "fullouter").select(
+        "d1",
+        "d2",
+        functions.sql_expr(
+            """
+            CASE
+                WHEN p is NULL THEN {}*{}
+                ELSE p
+            END
+            """.format(
+                minp, DECAY_FACTOR
+            )
+        ).alias("p"),
+        functions.sql_expr(
+            """
+            CASE
+                WHEN q is NULL THEN {}*{}
+                ELSE q
+            END
+            """.format(
+                minq, DECAY_FACTOR
+            )
+        ).alias("q"),
+    )
+
+    df = df.select("p", "q", functions.sql_expr("(p+q)/2.0").alias("m"))
+    df = df.select(
+        functions.sql_expr(
+            """
+            CASE
+                WHEN p > 0 AND m > 0 THEN p * LOG(2, p/m)
+                ELSE 0
+            END
+            """
+        ).alias("left"),
+        functions.sql_expr(
+            """
+            CASE
+                WHEN q > 0 AND m > 0 THEN q * LOG(2, q/m)
+                ELSE 0
+            END
+            """
+        ).alias("right"),
+    )
+    resdf = df.select(functions.sql_expr("sqrt((sum(left) + sum(right)) / 
2.0)").alias("JS")) + return float(resdf.collect()[0].as_dict()["JS"]) + + def _get_udf_query_str(name: str, col: str, df: snowpark.DataFrame, bucket_config: BucketConfig = None) -> str: if bucket_config: return "select count(1) as {}, width_bucket({}, {}, {}, {}) bucket from ({}) group by bucket".format( diff --git a/snowflake/ml/modeling/pipeline/pipeline.py b/snowflake/ml/modeling/pipeline/pipeline.py index 2a05ed07..17d142ca 100644 --- a/snowflake/ml/modeling/pipeline/pipeline.py +++ b/snowflake/ml/modeling/pipeline/pipeline.py @@ -454,7 +454,7 @@ def _invoke_estimator_func( transformed_dataset = self._transform_dataset(dataset=dataset) estimator = self._get_estimator() - assert estimator is not None, "estimator cannot be None" + assert estimator is not None res: snowpark.DataFrame = getattr(estimator[1], func_name)(transformed_dataset) return res diff --git a/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl b/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl index 24d16931..20dc3482 100644 --- a/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl +++ b/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl @@ -99,6 +99,7 @@ def get_build_rules_for_native_impl(): ":init", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/model:model_signature", "//snowflake/ml/modeling/framework", @@ -114,6 +115,7 @@ def get_build_rules_for_native_impl(): ":init", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/modeling/framework", ], diff --git a/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py index f48f194f..649a198c 100644 --- a/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +++ 
b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py @@ -206,7 +206,7 @@ def _fit_snowpark(self, dataset: snowpark.DataFrame) -> None: self._handle_uniform(dataset) elif self.strategy == "kmeans": raise exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ATTRIBUTE, + error_code=error_codes.NOT_IMPLEMENTED, original_exception=NotImplementedError("kmeans not supported yet"), ) diff --git a/snowflake/ml/modeling/preprocessing/one_hot_encoder.py b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py index 7fea93c1..acf20cc8 100644 --- a/snowflake/ml/modeling/preprocessing/one_hot_encoder.py +++ b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py @@ -16,6 +16,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml._internal.utils import identifier from snowflake.ml.model import model_signature from snowflake.ml.modeling.framework import _utils, base @@ -208,6 +209,7 @@ def __init__( self.drop_idx_: Optional[npt.NDArray[np.int_]] = None self._drop_idx_after_grouping: Optional[npt.NDArray[np.int_]] = None self._n_features_outs: List[int] = [] + self._snowpark_cols: Dict[str, List[str]] = dict() # Fit state if output columns are set before fitting self._dense_output_cols_mappings: Dict[str, List[str]] = {} @@ -264,23 +266,16 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "OneHotEncode Returns: Fitted encoder. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ self._reset() self._validate_keywords() super()._check_input_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + self._fit_snowpark(dataset) self._is_fitted = True if not self.sparse and self.output_cols: @@ -303,10 +298,6 @@ def _fit_sklearn(self, dataset: pd.DataFrame) -> None: self._drop_idx_after_grouping = sklearn_encoder.drop_idx_ self._n_features_outs = sklearn_encoder._n_features_outs - # Set `categories_` - if len(self.input_cols) != len(self._categories_list): - raise ValueError("The derived categories mismatch the supplied input columns.") - _state_pandas_counts: List[pd.DataFrame] = [] for idx, input_col in enumerate(self.input_cols): self.categories_[input_col] = self._categories_list[idx] @@ -331,6 +322,8 @@ def _fit_sklearn(self, dataset: pd.DataFrame) -> None: def _fit_snowpark(self, dataset: snowpark.DataFrame) -> None: # StructType[[StructField(COLUMN, TYPE, nullable=True), ...] self._dataset_schema = dataset.schema + self._snowpark_cols["input_cols"] = dataset.select(self.input_cols).columns + self._snowpark_cols["sorted_input_cols"] = dataset.select(sorted(self.input_cols)).columns fit_results = self._fit_category_state(dataset, return_counts=self._infrequent_enabled) if self._infrequent_enabled: self._fit_infrequent_category_mapping(fit_results["n_samples"], fit_results["category_counts"]) @@ -351,7 +344,7 @@ def _fit_category_state(self, dataset: snowpark.DataFrame, return_counts: bool) Dict with `n_samples` and (optionally) `category_counts` of the dataset. Raises: - ValueError: Empty data. + SnowflakeMLException: Empty data. 
""" # columns: COLUMN_NAME, CATEGORY, COUNT state_df = self._get_category_count_state_df(dataset) @@ -359,7 +352,10 @@ def _fit_category_state(self, dataset: snowpark.DataFrame, return_counts: bool) statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__) ) if self._state_pandas.empty: - raise ValueError("Empty data while a minimum of 1 sample is required.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError("Empty data while a minimum of 1 sample is required."), + ) # columns: COLUMN_NAME, STATE # state object: {category: count} @@ -391,7 +387,7 @@ def _get_category_count_state_df(self, dataset: snowpark.DataFrame) -> snowpark. State dataframe with columns [COLUMN_NAME, CATEGORY, COUNT]. Raises: - ValueError: If `self.categories` is provided, `self.handle_unknown="error"`, + SnowflakeMLException: If `self.categories` is provided, `self.handle_unknown="error"`, and unknown categories exist in dataset. """ # states of categories found in dataset @@ -418,15 +414,15 @@ def _get_category_count_state_df(self, dataset: snowpark.DataFrame) -> snowpark. 
temp_df = dataset.select(state_columns).distinct() found_state_df = found_state_df.union_by_name(temp_df) if found_state_df is not None else temp_df - assert found_state_df is not None, "found_state_df cannot be None" + assert found_state_df is not None if self.categories != "auto": state_data = [] - assert isinstance(self.categories, dict), "self.categories must be dict" + assert isinstance(self.categories, dict) for input_col, cats in self.categories.items(): for cat in cats.tolist(): state_data.append([input_col, cat]) # states of given categories - assert dataset._session is not None, "dataset._session cannot be None" + assert dataset._session is not None given_state_df = dataset._session.create_dataframe(data=state_data, schema=[_COLUMN_NAME, _CATEGORY]) given_state_df = ( given_state_df.join( @@ -456,8 +452,12 @@ def _get_category_count_state_df(self, dataset: snowpark.DataFrame) -> snowpark. ) ) if not unknown_pandas.empty: - msg = f"Found unknown categories during fit:\n{unknown_pandas.to_string()}" - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Found unknown categories during fit:\n{unknown_pandas.to_string()}" + ), + ) return given_state_df @@ -488,11 +488,14 @@ def _assign_categories(self, state_object_pandas: pd.DataFrame) -> None: where STATE contains state objects: {category: count}. Raises: - ValueError: If `self.categories` is an unsupported value. + SnowflakeMLException: If `self.categories` is an unsupported value. 
""" if isinstance(self.categories, str): if self.categories != "auto": - raise ValueError(f"Unsupported value {self.categories} for parameter `categories`.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"Unsupported value {self.categories} for parameter `categories`."), + ) categories_col = "CATEGORIES" @@ -511,8 +514,15 @@ def _assign_categories(self, state_object_pandas: pd.DataFrame) -> None: categories_col ] # Giving the original type back to categories. - for k, v in categories.items(): - snowml_type = model_signature.DataType.from_snowpark_type(self._dataset_schema[k].datatype) + for idx, k in enumerate(categories.keys()): + v = categories[k] + # Schema column names are case insensitive. Using snowpark's dataset to maintain schema's + # column name consistency. Because the key of categories_pandas is sorted, we need sorted + # input cols from snowpark as well. + _dataset_schema_key = self._snowpark_cols["sorted_input_cols"][idx] + snowml_type = model_signature.DataType.from_snowpark_type( + self._dataset_schema[_dataset_schema_key].datatype + ) # Don't convert the boolean type, numpy is unable to switch from string to boolean. 
# Boolean types would be treated as string if snowml_type not in [model_signature.DataType.BOOL]: @@ -602,7 +612,10 @@ def map_encoding(row: pd.Series) -> int: cat = row[_CATEGORY] if hasattr(self, "_dataset_schema") and not pd.isnull(cat): # Do not convert when it is null row_element = np.array([row[_CATEGORY]]) - snowml_type = model_signature.DataType.from_snowpark_type(self._dataset_schema[input_col].datatype) + _dataset_schema_key = self._snowpark_cols["input_cols"][col_idx] + snowml_type = model_signature.DataType.from_snowpark_type( + self._dataset_schema[_dataset_schema_key].datatype + ) # Don't convert the boolean type, it would be treated as string if snowml_type not in [model_signature.DataType.BOOL]: cat = row_element.astype(snowml_type._numpy_type)[0] @@ -617,10 +630,13 @@ def map_encoding(row: pd.Series) -> int: ) if has_infrequent_categories: if self._default_to_infrequent_mappings[col_idx] is None: - msg = ( - "`self._default_to_infrequent_mappings[{}]` is None while infrequent categories exist in '{}'." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_PYTHON_ERROR, + original_exception=RuntimeError( + f"`self._default_to_infrequent_mappings[{col_idx}]` is None while infrequent categories " + f"exist in '{input_col}'." + ), ) - raise RuntimeError(msg.format(col_idx, input_col)) encoding: int = self._default_to_infrequent_mappings[col_idx][cat_idx] else: encoding = cat_idx @@ -659,13 +675,11 @@ def transform( - If input is DataFrame, returns DataFrame - If input is a pd.DataFrame and `self.sparse=True`, returns `csr_matrix` - If input is a pd.DataFrame and `self.sparse=False`, returns `pd.DataFrame` - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. 
""" self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) # output columns are unset before fitting if not self.sparse and not self._dense_output_cols_mappings: @@ -673,13 +687,8 @@ def transform( if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -730,7 +739,7 @@ def map_encoded_value(row: pd.Series) -> Dict[str, Any]: state_pandas[_ENCODED_VALUE] = state_pandas.apply(lambda x: map_encoded_value(x), axis=1) # columns: COLUMN_NAME, CATEGORY, COUNT, FITTED_CATEGORY, ENCODING, N_FEATURES_OUT, ENCODED_VALUE - assert dataset._session is not None, "dataset._session cannot be None" + assert dataset._session is not None state_df = dataset._session.create_dataframe(state_pandas) suffix = "_" + uuid.uuid4().hex.upper() @@ -811,7 +820,7 @@ def map_encoded_value(row: pd.Series) -> List[int]: state_pandas = state_pandas.merge(split_pandas, on=[_COLUMN_NAME, _CATEGORY], how="left") # columns: COLUMN_NAME, CATEGORY, COUNT, FITTED_CATEGORY, ENCODING, N_FEATURES_OUT, ENCODED_VALUE, OUTPUT_CATs - assert dataset._session is not None, "dataset._session cannot be None" + assert dataset._session is not None state_df = dataset._session.create_dataframe(state_pandas) transformed_dataset = dataset @@ -963,22 +972,36 @@ def _create_sklearn_object(self) -> preprocessing.OneHotEncoder: def _validate_keywords(self) -> None: # categories if isinstance(self.categories, str) and self.categories != "auto": - msg = "`categories` must be 'auto' or a dictionary, got {}." 
- raise ValueError(msg.format(self.categories)) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"Unsupported `categories` value: {self.categories}."), + ) elif isinstance(self.categories, dict): if len(self.categories) != len(self.input_cols): - msg = "`categories` must have length equal to the number of input columns ({}), got {}." - raise ValueError(msg.format(len(self.input_cols), len(self.categories))) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"The number of categories ({len(self.categories)}) mismatches the number of input columns " + f"({len(self.input_cols)})." + ), + ) elif set(self.categories.keys()) != set(self.input_cols): - msg = "`categories` must have keys equal to input columns." - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "The column names of categories mismatch the column names of input columns." + ), + ) # drop: array-like object is validated in `_compute_drop_idx` if isinstance(self.drop, str) and self.drop not in {"first", "if_binary"}: - msg = ( - "`drop` must be one of 'first', 'if_binary', an array-like of shape (n_features,), or None, " "got {}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "`drop` must be one of 'first', 'if_binary', an array-like of shape (n_features,), or None, " + f"got {self.drop}." + ), ) - raise ValueError(msg.format(self.drop)) # handle_unknown # TODO(hayu): [SNOW-752263] Support OneHotEncoder handle_unknown="infrequent_if_exist". @@ -987,18 +1010,32 @@ def _validate_keywords(self) -> None: # msg = "`handle_unknown` must be one of 'error', 'ignore', 'infrequent_if_exist', got {}." 
# raise ValueError(msg.format(self.handle_unknown)) if self.handle_unknown not in {"error", "ignore"}: - msg = "`handle_unknown` must be one of 'error', 'ignore', got {}." - raise ValueError(msg.format(self.handle_unknown)) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"`handle_unknown` must be one of 'error', 'ignore', got {self.handle_unknown}." + ), + ) # min_frequency if isinstance(self.min_frequency, numbers.Integral): if not int(self.min_frequency) >= 1: - msg = "`min_frequency` must be an integer at least 1, a float in (0.0, 1.0), or None, " "got integer {}" - raise ValueError(msg.format(self.min_frequency)) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "`min_frequency` must be an integer at least 1, a float in (0.0, 1.0), or None, " + f"got integer {self.min_frequency}." + ), + ) elif isinstance(self.min_frequency, numbers.Real): if not (0.0 < float(self.min_frequency) < 1.0): - msg = "`min_frequency` must be an integer at least 1, a float in (0.0, 1.0), or None, " "got float {}" - raise ValueError(msg.format(self.min_frequency)) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "`min_frequency` must be an integer at least 1, a float in (0.0, 1.0), or None, " + f"got float {self.min_frequency}." + ), + ) def _handle_unknown_in_transform( self, @@ -1016,7 +1053,7 @@ def _handle_unknown_in_transform( Transformed dataset with unknown values handled. Raises: - ValueError: If `self.handle_unknown="error"` and unknown values exist in the + SnowflakeMLException: If `self.handle_unknown="error"` and unknown values exist in the transformed dataset. 
""" if self.handle_unknown == "error": @@ -1056,8 +1093,12 @@ def _handle_unknown_in_transform( ) ) if not unknown_pandas.empty: - msg = f"Found unknown categories during transform:\n{unknown_pandas.to_string()}" - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Found unknown categories during transform:\n{unknown_pandas.to_string()}" + ), + ) if self.handle_unknown == "ignore" and not self.sparse: transformed_dataset = transformed_dataset.na.fill(0, self._inferred_output_cols) @@ -1099,7 +1140,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx: int, drop_idx: int) -> int: Converted drop index with infrequent encoding considered. Raises: - ValueError: If the category to drop is infrequent. + SnowflakeMLException: If the category to drop is infrequent. """ if not self._infrequent_enabled: return drop_idx @@ -1112,9 +1153,12 @@ def _map_drop_idx_to_infrequent(self, feature_idx: int, drop_idx: int) -> int: infrequent_indices = self._infrequent_indices[feature_idx] if infrequent_indices is not None and drop_idx in infrequent_indices: categories = self._categories_list[feature_idx] - raise ValueError( - f"Unable to drop category {categories[drop_idx]!r} from feature" - f" {feature_idx} because it is infrequent" + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Unable to drop category {categories[drop_idx]!r} from feature " + f"{feature_idx} because it is infrequent." + ), ) return default_to_infrequent[drop_idx] @@ -1147,7 +1191,7 @@ def _set_drop_idx(self) -> None: `drop_idx_=_drop_idx_after_grouping`. Raises: - ValueError: If `self.drop` is array-like: + SnowflakeMLException: If `self.drop` is array-like: - `self.drop` cannot be converted to a ndarray. - The length of `self.drop` is not equal to the number of input columns. - The categories to drop are not found. 
@@ -1174,14 +1218,21 @@ def _set_drop_idx(self) -> None: drop_array = np.asarray(self.drop, dtype=object) droplen = len(drop_array) except (ValueError, TypeError): - msg = ( - "`drop` must be one of 'first', 'if_binary', an array-like of shape (n_features,), or None, " - "got {}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "`drop` must be one of 'first', 'if_binary', an array-like of " # type: ignore[str-bytes-safe] + f"shape (n_features,), or None, got {self.drop}." + ), ) - raise ValueError(msg.format(self.drop)) if droplen != len(self._categories_list): - msg = "`drop` must have length equal to the number of features ({}), got {}." - raise ValueError(msg.format(len(self._categories_list), droplen)) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"`drop` must have length equal to the number of features ({len(self._categories_list)}), " + f"got {droplen}." 
+ ), + ) missing_drops = [] drop_indices = [] for feature_idx, (drop_val, cat_list) in enumerate(zip(drop_array, self._categories_list)): @@ -1207,7 +1258,10 @@ def _set_drop_idx(self) -> None: "dropped, but were not found in the training " "data.\n{}".format("\n".join([f"Category: {c}, Feature: {v}" for c, v in missing_drops])) ) - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(msg), + ) drop_idx_after_grouping = np.array(drop_indices, dtype=object) # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent diff --git a/snowflake/ml/modeling/preprocessing/ordinal_encoder.py b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py index 307c70ee..fef7aca3 100644 --- a/snowflake/ml/modeling/preprocessing/ordinal_encoder.py +++ b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py @@ -12,6 +12,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml._internal.utils import identifier from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F, types as T @@ -158,23 +159,16 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "OrdinalEncod Returns: Fitted encoder. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ self._reset() self._validate_keywords() super()._check_input_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + self._fit_snowpark(dataset) self._validate_unknown_value() self._check_missing_categories() @@ -188,10 +182,6 @@ def _fit_sklearn(self, dataset: pd.DataFrame) -> None: self._categories_list = sklearn_encoder.categories_ - # Set `categories_` and `_state_pandas` - if len(self.input_cols) != len(self._categories_list): - raise ValueError("The derived categories mismatch the supplied input columns.") - _state_pandas_ordinals: List[pd.DataFrame] = [] for idx, input_col in enumerate(sorted(self.input_cols)): self.categories_[input_col] = self._categories_list[idx] @@ -257,7 +247,7 @@ def _get_category_index_state_df(self, dataset: snowpark.DataFrame) -> snowpark. State dataframe with columns [COLUMN_NAME, CATEGORY, INDEX]. Raises: - ValueError: If `self.categories` is provided, `self.handle_unknown="error"`, + SnowflakeMLException: If `self.categories` is provided, `self.handle_unknown="error"`, and unknown categories exist in dataset. """ # states of categories found in dataset @@ -295,15 +285,15 @@ def _get_category_index_state_df(self, dataset: snowpark.DataFrame) -> snowpark. found_state_df.union(all_encoded_value_df) if found_state_df is not None else all_encoded_value_df ) - assert found_state_df is not None, "found_state_df cannot be None" + assert found_state_df is not None if self.categories != "auto": state_data = [] - assert isinstance(self.categories, dict), "self.categories must be dict" + assert isinstance(self.categories, dict) for input_col, cats in self.categories.items(): for idx, cat in enumerate(cats.tolist()): state_data.append([input_col, cat, idx]) # states of given categories - assert dataset._session is not None, "dataset._session cannot be None" + assert dataset._session is not None given_state_df = dataset._session.create_dataframe( data=state_data, schema=[_COLUMN_NAME, _CATEGORY, _INDEX] ) @@ -322,23 +312,18 @@ def _get_category_index_state_df(self, dataset: snowpark.DataFrame) -> snowpark. 
) if not unknown_pandas.empty: msg = f"Found unknown categories during fit:\n{unknown_pandas.to_string()}" - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(msg), + ) return given_state_df return found_state_df def _assign_categories(self) -> None: - """ - Assign the categories to the object. - - Raises: - ValueError: If `self.categories` is an unsupported value. - """ + """Assign the categories to the object.""" if isinstance(self.categories, str): - if self.categories != "auto": - raise ValueError(f"Unsupported value {self.categories} for parameter `categories`.") - partial_state_arr = self._state_pandas[[_COLUMN_NAME, _CATEGORY]].to_numpy() column_names_arr = partial_state_arr[:, 0] categories_arr = partial_state_arr[:, 1] @@ -368,16 +353,19 @@ def _validate_unknown_value(self) -> None: `self.unknown_value` is not used to encode any known category. Raises: - ValueError: If unknown categories exist in the fitted dataset. + SnowflakeMLException: If unknown categories exist in the fitted dataset. """ if self.handle_unknown == "use_encoded_value": for feature_cats in self._categories_list: if isinstance(self.unknown_value, numbers.Integral) and 0 <= self.unknown_value < len(feature_cats): - raise ValueError( - "The used value for unknown_value " - f"{self.unknown_value} is one of the " - "values already used for encoding the " - "seen categories." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "The used value for unknown_value " + f"{self.unknown_value} is one of the " + "values already used for encoding the " + "seen categories." + ), ) def _check_missing_categories(self) -> None: @@ -400,7 +388,7 @@ def _validate_encoded_missing_value(self) -> None: is not used to encode any known category. 
Raises: - ValueError: If missing categories exist and `self.encoded_missing_value` is already + SnowflakeMLException: If missing categories exist and `self.encoded_missing_value` is already used to encode a known category. """ if self._missing_indices: @@ -415,10 +403,12 @@ def _validate_encoded_missing_value(self) -> None: ] if invalid_features: - raise ValueError( - f"encoded_missing_value ({self.encoded_missing_value}) " - "is already used to encode a known category in features: " - f"{invalid_features}" + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"encoded_missing_value ({self.encoded_missing_value}) is already used to encode a known " + f"category in features: {invalid_features}." + ), ) @telemetry.send_api_usage_telemetry( @@ -438,23 +428,16 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -469,7 +452,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Output dataset. 
""" passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] - assert dataset._session is not None, "dataset._session cannot be None" + assert dataset._session is not None state_df = ( dataset._session.table(self._vocab_table_name) if _utils.table_exists( @@ -548,32 +531,53 @@ def _create_sklearn_object(self) -> preprocessing.OrdinalEncoder: def _validate_keywords(self) -> None: if isinstance(self.categories, str) and self.categories != "auto": - raise ValueError(f"Unsupported value {self.categories} for parameter `categories`.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"Unsupported `categories` value: {self.categories}."), + ) elif isinstance(self.categories, dict): if len(self.categories) != len(self.input_cols): - raise ValueError("The number of categories mismatches the number of input columns.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"The number of categories ({len(self.categories)}) mismatches the number of input columns " + f"({len(self.input_cols)})." + ), + ) elif set(self.categories.keys()) != set(self.input_cols): - raise ValueError("The column names of categories mismatch the column names of input columns.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + "The column names of categories mismatch the column names of input columns." + ), + ) if self.handle_unknown not in {"error", "use_encoded_value"}: - msg = "handle_unknown should be one of 'error', 'use_encoded_value' " f"got {self.handle_unknown}." - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"`handle_unknown` must be one of 'error', 'use_encoded_value', got {self.handle_unknown}." 
+ ), + ) if self.handle_unknown == "use_encoded_value": if not ( sklearn_utils.is_scalar_nan(self.unknown_value) or isinstance(self.unknown_value, numbers.Integral) ): - raise TypeError( - "unknown_value should be an integer or " - "np.nan when " - "handle_unknown is 'use_encoded_value', " - f"got {self.unknown_value}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=TypeError( + "`unknown_value` must be an integer or np.nan when `handle_unknown` is 'use_encoded_value', " + f"got {self.unknown_value}." + ), ) elif self.unknown_value is not None: - raise TypeError( - "unknown_value should only be set when " - "handle_unknown is 'use_encoded_value', " - f"got {self.unknown_value}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=TypeError( + "`unknown_value` must only be set when `handle_unknown` is 'use_encoded_value', " + f"got {self.unknown_value}." + ), ) def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) -> snowpark.DataFrame: @@ -587,7 +591,7 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) Transformed dataset with unknown values handled. Raises: - ValueError: If `self.handle_unknown="error"` and unknown values exist in the + SnowflakeMLException: If `self.handle_unknown="error"` and unknown values exist in the transformed dataset. """ if self.handle_unknown == "error": @@ -610,14 +614,23 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) unknown_df = unknown_df.union_by_name(temp_df) if unknown_df is not None else temp_df if unknown_df is None: - raise ValueError("snowml internal error caused by handle_unknown='error': empty input columns") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_PYTHON_ERROR, + original_exception=ValueError( + "Internal error caused by handle_unknown='error': empty input columns." 
+ ), + ) unknown_pandas = unknown_df.to_pandas( statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__) ) if not unknown_pandas.empty: - msg = f"Found unknown categories during transform:\n{unknown_pandas.to_string()}" - raise ValueError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + f"Found unknown categories during transform:\n{unknown_pandas.to_string()}" + ), + ) if self.handle_unknown == "use_encoded_value": # left outer join has already filled unknown values with null diff --git a/snowflake/ml/registry/BUILD.bazel b/snowflake/ml/registry/BUILD.bazel index 8b4a2630..240dfa9a 100644 --- a/snowflake/ml/registry/BUILD.bazel +++ b/snowflake/ml/registry/BUILD.bazel @@ -16,6 +16,7 @@ py_library( "//snowflake/ml/_internal:telemetry", "//snowflake/ml/model:_model", "//snowflake/ml/model:_deployer", + "//snowflake/ml/model:deploy_platforms", "//snowflake/ml/modeling/framework:framework" ], ) diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index 2e1bb7f5..3e84e8b2 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -6,7 +6,7 @@ import tempfile import types import zipfile -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Union, cast from uuid import uuid1 from absl import logging @@ -22,6 +22,7 @@ from snowflake.ml.model import ( _deployer, _model as model_api, + deploy_platforms, model_signature, type_hints as model_types, ) @@ -82,6 +83,9 @@ def create_model_registry( True if the creation of the model registry internal data structures was successful, False otherwise. """ + # Get the db & schema of the current session + old_db = session.get_current_database() + old_schema = session.get_current_schema() # These might be exposed as parameters in the future. 
database_name = identifier.get_inferred_name(database_name) @@ -94,27 +98,33 @@ def create_model_registry( subproject=_TELEMETRY_SUBPROJECT, function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), ""), ) - - _create_registry_database(session, database_name, statement_params) - _create_registry_schema(session, database_name, schema_name, statement_params) - _create_registry_tables( - session, - database_name, - schema_name, - registry_table_name, - metadata_table_name, - deployment_table_name, - statement_params, - ) - _create_registry_views( - session, - database_name, - schema_name, - registry_table_name, - metadata_table_name, - deployment_table_name, - statement_params, - ) + try: + _create_registry_database(session, database_name, statement_params) + _create_registry_schema(session, database_name, schema_name, statement_params) + _create_registry_tables( + session, + database_name, + schema_name, + registry_table_name, + metadata_table_name, + deployment_table_name, + statement_params, + ) + _create_registry_views( + session, + database_name, + schema_name, + registry_table_name, + metadata_table_name, + deployment_table_name, + statement_params, + ) + finally: + # Restore the db & schema to the original ones + if old_db is not None: + session.use_database(old_db) + if old_schema is not None: + session.use_schema(old_schema) return True @@ -457,6 +467,8 @@ def _check_access(self) -> None: ), ).has_dimensions(expected_rows=1).validate() + self._validate_registry_table_schema(add_if_not_exists=set()) + query_result_checker.SqlResultValidator( self._session, query=formatting.unwrap( @@ -475,7 +487,40 @@ def _check_access(self) -> None: ), ).has_dimensions(expected_rows=1).validate() - # TODO(zzhu): Also check validity of views. Consider checking schema as well. + # TODO(zzhu): Also check validity of views. + + # TODO checks type as well. note type in _schema doesn't match with it appears in 'DESC TABLE'. 
+ # We need another layer of mapping. This function can also be extended to other tables as well. + def _validate_registry_table_schema(self, add_if_not_exists: Set[str]) -> None: + """Validate the table schema to check for any missing columns. + + Args: + add_if_not_exists: column names that will be created if not found in existing tables. + + Raises: + TypeError: required column not exists in schema table and not defined in add_if_not_exists. + """ + + for k in add_if_not_exists: + assert k in _schema._REGISTRY_TABLE_SCHEMA + + actual_table_rows = self._session.sql(f"DESC TABLE {self._fully_qualified_registry_table_name()}").collect() + actual_schema_dict = {} + for row in actual_table_rows: + actual_schema_dict[row["name"]] = row["type"] + + for col_name, col_type in _schema._REGISTRY_TABLE_SCHEMA.items(): + if col_name not in actual_schema_dict: + if col_name not in add_if_not_exists: + raise TypeError( + f"Registry table:{self._fully_qualified_registry_table_name()}" + f" doesn't have required column:'{col_name}'." + ) + else: + self._session.sql( + f"""ALTER TABLE {self._fully_qualified_registry_table_name()} + ADD COLUMN {col_name} {col_type}""" + ).collect() def _get_statement_params(self, frame: Optional[types.FrameType]) -> Dict[str, Any]: return telemetry.get_function_usage_statement_params( @@ -1598,7 +1643,7 @@ def deploy( deployment_name: str, target_method: str, permanent: bool = False, - platform: _deployer.TargetPlatform = _deployer.TargetPlatform.WAREHOUSE, + platform: deploy_platforms.TargetPlatform = deploy_platforms.TargetPlatform.WAREHOUSE, options: Optional[ Union[model_types.WarehouseDeployOptions, model_types.SnowparkContainerServiceDeployOptions] ] = None, @@ -1613,7 +1658,7 @@ def deploy( permanent: Whether the deployment is permanent or not. Permanent deployment will generate a permanent UDF. (Only applicable for Warehouse deployment) platform: Target platform to deploy the model to. 
Currently supported platforms are - ['warehouse', 'snowpark_container_service'] + ['warehouse', 'SNOWPARK_CONTAINER_SERVICES'] options: Optional options for model deployment. Defaults to None. Raises: @@ -1624,11 +1669,11 @@ def deploy( deployment_stage_path = "" - if platform == _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE: + if platform == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES: permanent = True options = cast(model_types.SnowparkContainerServiceDeployOptions, options) deployment_stage_path = f"{self._prepare_deployment_stage()}/{deployment_name}/" - elif platform == _deployer.TargetPlatform.WAREHOUSE: + elif platform == deploy_platforms.TargetPlatform.WAREHOUSE: options = cast(model_types.WarehouseDeployOptions, options) if permanent: # Every deployment-generated UDF should reside in its own unique directory. As long as each deployment @@ -1649,7 +1694,7 @@ def deploy( # artifacts. However, UDF generation fails when importing from a mix of encrypted and unencrypted stages. # The following workaround copies model between stages (PrPr as of July 7th, 2023) to transfer the SSE # encrypted model zip from model stage to the temporary unencrypted stage. - if not permanent and platform == _deployer.TargetPlatform.WAREHOUSE: + if not permanent and platform == deploy_platforms.TargetPlatform.WAREHOUSE: schema = self._fully_qualified_schema_name() unencrypted_stage = f"@{schema}.TEMP_UNENCRYPTED_{self._get_new_unique_identifier()}" self._session.sql(f"CREATE TEMPORARY STAGE {unencrypted_stage[1:]}").collect() @@ -1657,9 +1702,8 @@ def deploy( self._session.sql(f"COPY FILES INTO {unencrypted_stage} from {remote_model_path}").collect() except Exception: raise RuntimeError( - "Please ensure parameters are enabled in your Snowflake account by running " - "'ALTER ACCOUNT SET ENABLE_COPY_FILES=TRUE, " - "ENABLE_COPY_FILES_API_IN_STORAGE=TRUE'" + "Temporary deployment to the warehouse is currently not supported. 
Please use " + "permanent deployment by setting the 'permanent' parameter to True" ) remote_model_path = f"{unencrypted_stage}/{os.path.basename(remote_model_path)}" @@ -1824,6 +1868,14 @@ def delete_deployment(self, model_name: str, model_version: str, *, deployment_n operation=_DROP_METADATA_OPERATION, ) + # Optional Step 5: Delete Snowpark container service. + if deployment["TARGET_PLATFORM"] == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES.value: + service_name = f"service_{deployment['MODEL_ID']}" + query_result_checker.SqlResultValidator( + self._session, + f"DROP SERVICE IF EXISTS {service_name}", + ).validate() + @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, @@ -1878,13 +1930,6 @@ def delete_model( ) -_TEMPLATE_MODEL_REF_METHOD_DEFN = """ -@telemetry.send_api_usage_telemetry(project='{project}', subproject='{subproject}') -def {name}{signature}: - return self._registry.{name}({arguments}) -""" - - class ModelReference: """Wrapper class for ModelReference objects that proxy model metadata operations.""" @@ -1984,43 +2029,35 @@ def __init__( # Ensure that we are not silently overwriting existing functions. 
assert not hasattr(self.__class__, name) - # logging.info("TEST: Adding function: " + name) - old_sig = inspect.signature(obj) - removed_none_type = map( - lambda x: x.replace(annotation=str(x.annotation)), - filter(lambda p: p.name not in ["model_name", "model_version"], old_sig.parameters.values()), - ) - new_sig = old_sig.replace( - parameters=list(removed_none_type), return_annotation=str(old_sig.return_annotation) - ) - arguments = ", ".join( - ["model_name=self._model_name"] - + ["model_version=self._model_version"] - + [ - "{p.name}={p.name}".format(p=p) - for p in filter( - lambda p: p.name not in ["id", "model_name", "model_version", "self"], - old_sig.parameters.values(), - ) - ] - ) + def build_method(m: Callable[..., Any]) -> Callable[..., Any]: + return lambda self, *args, **kwargs: m( + self._registry, self._model_name, self._model_version, *args, **kwargs + ) + + method = build_method(m=obj) + setattr(self.__class__, name, method) + docstring = self._remove_arg_from_docstring("model_name", obj.__doc__) if docstring and "model_version" in docstring: docstring = self._remove_arg_from_docstring("model_version", docstring) - exec( - _TEMPLATE_MODEL_REF_METHOD_DEFN.format( - name=name, - signature=new_sig, - arguments=arguments, - project=_TELEMETRY_PROJECT, - subproject=_TELEMETRY_SUBPROJECT, - ) - ) - setattr(self.__class__, name, locals()[name]) setattr(self.__class__.__dict__[name], "__doc__", docstring) # NoQA setattr(self.__class__, "init_complete", True) # NoQA + @telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, + ) + def get_name(self) -> str: + return self._model_name + + @telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, + ) + def get_version(self) -> str: + return self._model_version + @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, @@ -2052,13 +2089,16 @@ def predict(self, deployment_name: 
str, data: Any) -> "pd.DataFrame": deployment = self._registry.get_deployment( self._model_name, self._model_version, deployment_name=deployment_name ).collect()[0] - platform = _deployer.TargetPlatform(deployment["TARGET_PLATFORM"]) + platform = deploy_platforms.TargetPlatform(deployment["TARGET_PLATFORM"]) signature = model_signature.ModelSignature.from_dict(json.loads(deployment["SIGNATURE"])) options_dict = cast(Dict[str, Any], json.loads(deployment["OPTIONS"])) platform_options = { - _deployer.TargetPlatform.WAREHOUSE: model_types.WarehouseDeployOptions, - _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE: model_types.SnowparkContainerServiceDeployOptions, + deploy_platforms.TargetPlatform.WAREHOUSE: model_types.WarehouseDeployOptions, + deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES: ( + model_types.SnowparkContainerServiceDeployOptions + ), } + if platform not in platform_options: raise ValueError(f"Unsupported target Platform: {platform}") options = platform_options[platform](options_dict) diff --git a/snowflake/ml/registry/model_registry_test.py b/snowflake/ml/registry/model_registry_test.py index 4e6c3e6d..50758cee 100644 --- a/snowflake/ml/registry/model_registry_test.py +++ b/snowflake/ml/registry/model_registry_test.py @@ -57,10 +57,19 @@ def setUp(self) -> None: self.model_version = "abc" self.datetime = datetime.datetime(2022, 11, 4, 17, 1, 30, 153000) + self._setup_mock_session() + def tearDown(self) -> None: """Complete test case. 
Ensure all expected operations have been observed.""" self._session.finalize() + def _setup_mock_session(self) -> None: + """Equip the mock session with mock variable/methods just for model registry.""" + self._session.get_current_database = absltest.mock.MagicMock(return_value=_DATABASE_NAME) + self._session.get_current_schema = absltest.mock.MagicMock(return_value=_SCHEMA_NAME) + self._session.use_database = absltest.mock.MagicMock() + self._session.use_schema = absltest.mock.MagicMock() + def add_session_mock_sql(self, query: str, result: Any) -> None: self._session.add_mock_sql(query=query, result=result) @@ -130,6 +139,23 @@ def get_show_databases_success(self, name: str) -> List[snowpark.Row]: ) ] + def get_desc_registry_table_success(self) -> List[snowpark.Row]: + """Helper method that returns a DataFrame that looks like the response of from a successful desc table.""" + return [ + snowpark.Row(name="CREATION_CONTEXT", type="VARCHAR"), + snowpark.Row(name="CREATION_ENVIRONMENT_SPEC", type="OBJECT"), + snowpark.Row(name="CREATION_ROLE", type="VARCHAR"), + snowpark.Row(name="CREATION_TIME", type="TIMESTAMP_TZ"), + snowpark.Row(name="ID", type="VARCHAR PRIMARY KEY RELY"), + snowpark.Row(name="INPUT_SPEC", type="OBJECT"), + snowpark.Row(name="NAME", type="VARCHAR"), + snowpark.Row(name="OUTPUT_SPEC", type="OBJECT"), + snowpark.Row(name="RUNTIME_ENVIRONMENT_SPEC", type="OBJECT"), + snowpark.Row(name="TYPE", type="VARCHAR"), + snowpark.Row(name="URI", type="VARCHAR"), + snowpark.Row(name="VERSION", type="VARCHAR"), + ] + def setup_open_call(self) -> None: self.add_session_mock_sql( query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", @@ -149,6 +175,12 @@ def setup_open_call(self) -> None: self.get_show_tables_success(name=_REGISTRY_TABLE_NAME) ).add_collect_result(self.get_show_tables_success(name=_REGISTRY_TABLE_NAME)), ) + self.add_session_mock_sql( + query=f"DESC TABLE {_FULLY_QUALIFIED_REGISTRY_TABLE_NAME}", + 
result=mock_data_frame.MockDataFrame(self.get_desc_registry_table_success()).add_collect_result( + self.get_desc_registry_table_success() + ), + ) self.add_session_mock_sql( query=f"SHOW TABLES LIKE '{_METADATA_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", result=mock_data_frame.MockDataFrame( @@ -476,6 +508,12 @@ def test_open_existing(self) -> None: query=f"SHOW TABLES LIKE '{_REGISTRY_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_REGISTRY_TABLE_NAME)), ) + self.add_session_mock_sql( + query=f"DESC TABLE {_FULLY_QUALIFIED_REGISTRY_TABLE_NAME}", + result=mock_data_frame.MockDataFrame(self.get_desc_registry_table_success()).add_collect_result( + self.get_desc_registry_table_success() + ), + ) self.add_session_mock_sql( query=f"SHOW TABLES LIKE '{_METADATA_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_METADATA_TABLE_NAME)), diff --git a/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb new file mode 100644 index 00000000..36417955 --- /dev/null +++ b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb @@ -0,0 +1,620 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a45960e1", + "metadata": {}, + "source": [ + "# Deployment to Snowpark Container Service Demo" + ] + }, + { + "cell_type": "markdown", + "id": "aa7a329a", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "- Install and have a running Docker Client (required only for PrPr for client-side image build)" + ] + }, + { + "cell_type": "markdown", + "id": "3b50d774", + "metadata": {}, + "source": [ + "## Train a model with Snowpark ML API " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "18a75d71", + "metadata": {}, + "outputs": [], + "source": [ + "from typing 
import Tuple\n", + "from snowflake.ml.modeling import linear_model\n", + "from sklearn import datasets\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "def prepare_logistic_model() -> Tuple[linear_model.LogisticRegression, pd.DataFrame]:\n", + " iris = datasets.load_iris()\n", + " df = pd.DataFrame(data=np.c_[iris[\"data\"], iris[\"target\"]], columns=iris[\"feature_names\"] + [\"target\"])\n", + " df.columns = [s.replace(\" (CM)\", \"\").replace(\" \", \"\") for s in df.columns.str.upper()]\n", + "\n", + " input_cols = [\"SEPALLENGTH\", \"SEPALWIDTH\", \"PETALLENGTH\", \"PETALWIDTH\"]\n", + " label_cols = \"TARGET\"\n", + " output_cols = \"PREDICTED_TARGET\"\n", + "\n", + " estimator = linear_model.LogisticRegression(\n", + " input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0, max_iter=1000\n", + " ).fit(df)\n", + "\n", + " return estimator, df.drop(columns=label_cols).head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "db6734fa", + "metadata": {}, + "source": [ + "## Start Snowpark Session" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "58dd3604", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", + "from snowflake.snowpark import Session\n", + "\n", + "session = Session.builder.configs(SnowflakeLoginOptions()).create()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "27dfbc42", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:The database INFERENCE_CONTAINER_DB already exists. Skipping creation.\n", + "WARNING:absl:The schema INFERENCE_CONTAINER_DB.INFERENCE_CONTAINER_SCHEMAalready exists. 
Skipping creation.\n" + ] + } + ], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "from snowflake.ml._internal.utils import identifier\n", + "\n", + "db = identifier._get_unescaped_name(session.get_current_database())\n", + "schema = identifier._get_unescaped_name(session.get_current_schema())\n", + "\n", + "# will be a no-op if registry already exists\n", + "model_registry.create_model_registry(session=session, database_name=db, schema_name=schema) \n", + "registry = model_registry.ModelRegistry(session=session, database_name=db, schema_name=schema)" + ] + }, + { + "cell_type": "markdown", + "id": "38e0a975", + "metadata": {}, + "source": [ + "## Register SnowML Model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "574e7a43", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. 
\n" + ] + }, + { + "data": { + "text/plain": [ + "'0aa236602be711ee89915ac3f3b698e1'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logistic_model, test_features = prepare_logistic_model()\n", + "model_name = \"snowpark_ml_logistic\"\n", + "model_version = \"v1\"\n", + "\n", + "registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=logistic_model,\n", + " sample_input_data=test_features,\n", + " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "054a3862", + "metadata": {}, + "source": [ + "## Model Deployment to Snowpark Container Service" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72ff114f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", + "WARNING:root:Image successfully built! 
To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/42374efe274011eea4ff5ac3f3b698e1:latest' in the options field of the deploy() function\n" + ] + } + ], + "source": [ + "from snowflake.ml.model import deploy_platforms\n", + "from snowflake import snowpark\n", + "\n", + "model_ref = model_registry.ModelReference(\n", + " registry=registry, model_name=model_name, model_version=model_version\n", + ")\n", + "\n", + "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created compute pool\n", + "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", + "\n", + "model_ref.deploy(\n", + " deployment_name=deployment_name, \n", + " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", + " target_method=\"predict\",\n", + " options={\n", + " \"compute_pool\": compute_pool\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1c754e72", + "metadata": {}, + "source": [ + "## Batch Prediction on Snowpark Container Service" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a5c02328", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SEPALLENGTHSEPALWIDTHPETALLENGTHPETALWIDTHPREDICTED_TARGET
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
55.43.91.70.40.0
64.63.41.40.30.0
75.03.41.50.20.0
84.42.91.40.20.0
94.93.11.50.10.0
\n", + "
" + ], + "text/plain": [ + " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_ref.predict(deployment_name, test_features)" + ] + }, + { + "cell_type": "markdown", + "id": "9f8c6ce5", + "metadata": {}, + "source": [ + "## Train a HuggingFace Model (cross-encoder/nli-MiniLM2-L6-H768)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "809d5e98", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import pipeline\n", + "from snowflake.ml.model import custom_model\n", + "\n", + "def prepare_cross_encoder_model() -> Tuple[custom_model.CustomModel, pd.DataFrame]:\n", + " \"\"\"\n", + " Pretrained cross encoder model from huggingface.\n", + " \"\"\"\n", + " classifier = pipeline(\"zero-shot-classification\", model='cross-encoder/nli-MiniLM2-L6-H768') \n", + " candidate_labels = ['customer support', 'product experience', 'account issues']\n", + "\n", + " class HuggingFaceModel(custom_model.CustomModel):\n", + " def __init__(self, context: custom_model.ModelContext) -> None:\n", + " super().__init__(context)\n", + " \n", + " @custom_model.inference_api\n", + " def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: \n", + " sequences_to_classify = input_df.values.flatten().tolist()\n", + " data = [classifier(sequence, candidate_labels) for sequence in sequences_to_classify]\n", + " max_score_labels = []\n", + " for record in data:\n", + " max_score_label = max(zip(record['labels'], record['scores']), key=lambda x: x[1])[0]\n", + " max_score_labels.append(max_score_label) \n", + " return pd.DataFrame({\"output\": 
max_score_labels})\n", + "\n", + " cross_encoder_model = HuggingFaceModel(custom_model.ModelContext())\n", + " test_data = pd.DataFrame([\"The interface gets frozen very often\"])\n", + "\n", + " return cross_encoder_model, test_data" + ] + }, + { + "cell_type": "markdown", + "id": "67d6a7d2", + "metadata": {}, + "source": [ + "## Register Cross Encoder Model" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9dd84f88", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bafae568275d11ee95175ac3f3b698e1'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "\n", + "model, test_features = prepare_cross_encoder_model()\n", + "model_name = \"cross_encoder_model\"\n", + "model_version = \"v1\"\n", + "\n", + "registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=model,\n", + " conda_dependencies=[\"pytorch::pytorch==2.0.1\", \"conda-forge::transformers==4.18.0\"],\n", + " sample_input_data=test_features,\n", + " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c6db686e", + "metadata": {}, + "source": [ + "## Model Deployment to Snowpark Container Service (GPU)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "701152f7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/bafae568275d11ee95175ac3f3b698e1:latest' in the options field of the deploy() function\n" + ] + } + ], + "source": [ + "from snowflake.ml.model import deploy_platforms\n", + "\n", + "model_ref = model_registry.ModelReference(\n", + " registry=registry, model_name=model_name, model_version=model_version\n", + ")\n", + "\n", + "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created\n", + "deployment_name = \"CROSS_ENCODER\" # Name of the resulting UDF\n", + "\n", + "model_ref.deploy(\n", + " deployment_name=deployment_name, \n", + " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", + " target_method=\"predict\",\n", + " options={\n", + " \"compute_pool\": compute_pool,\n", + " \"use_gpu\": True\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b0fba61", + "metadata": {}, + "source": [ + "## Zero-Shot Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "936840df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + " input_feature_0\n", + "0 The interface gets frozen very often\n" + ] + } + ], + "source": [ + "print(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "302daaf9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
output
0product experience
\n", + "
" + ], + "text/plain": [ + " output\n", + "0 product experience" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_ref.predict(deployment_name, test_features)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:local_snowml] *", + "language": "python", + "name": "conda-env-local_snowml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb index a6cf8eac..c7bb52e8 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb @@ -312,62 +312,6 @@ "print(\"Registered new model:\", model_id)" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "735f0ac3", - "metadata": {}, - "source": [ - "### Load Model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f778a9ad", - "metadata": {}, - "source": [ - "We can also restore the model we saved to the registry and load it back into the local context to make predictions." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2796f2e0", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=SVC_MODEL_NAME, model_version=SVC_MODEL_VERSION)\n", - "restored_clf = model.load_model()\n", - "\n", - "restored_prediction = restored_clf.predict(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])\n", - "\n", - "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3717853f", - "metadata": {}, - "outputs": [], - "source": [ - "restored_prediction_proba = restored_clf.predict_proba(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction_proba[:10])\n", - "print(\"Restored prediction:\", restored_prediction_proba[:10])\n", - "\n", - "print(\"Result comparison:\", np.array_equal(prediction_proba, restored_prediction_proba))" - ] - }, { "attachments": {}, "cell_type": "markdown", diff --git a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb index 5b971804..a79d6aec 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb @@ -275,37 +275,6 @@ "print(\"Registered new model:\", model_id)" ] }, - { - "cell_type": "markdown", - "id": "fccfb1af", - "metadata": {}, - "source": [ - "### Test on the result using load_model " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf9a3596", - "metadata": { - "code_folding": [] - }, - "outputs": [], - "source": [ - "registry = 
model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "restored_clf = model.load_model()\n", - "\n", - "restored_prediction = restored_clf.predict(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])\n", - "\n", - "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" - ] - }, { "cell_type": "markdown", "id": "fe9e2081", @@ -474,35 +443,6 @@ "print(\"Registered new model:\", model_id)" ] }, - { - "cell_type": "markdown", - "id": "1c8e87fc", - "metadata": {}, - "source": [ - "#### Comparsion between load_model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cf1db785", - "metadata": {}, - "outputs": [], - "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "restored_clf = model.load_model()\n", - "\n", - "restored_prediction = restored_clf.predict(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])\n", - "\n", - "print(\"Result comparison:\", np.array_equal(prediction[\"PREDICTED_TARGET\"], restored_prediction[prediction.columns]))" - ] - }, { "cell_type": "markdown", "id": "cefbad30", @@ -689,35 +629,6 @@ "print(\"Registered new model:\", model_id)" ] }, - { - "cell_type": "markdown", - "id": "39bdfe5a", - "metadata": {}, - "source": [ - "#### Comparison between load_model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5fcc75b", - "metadata": {}, - "outputs": [], - 
"source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "restored_clf = model.load_model()\n", - "\n", - "restored_prediction = restored_clf.predict(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])\n", - "\n", - "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" - ] - }, { "cell_type": "markdown", "id": "83ff0a1b", @@ -1005,35 +916,6 @@ "print(\"Registered new model:\", model_id)" ] }, - { - "cell_type": "markdown", - "id": "9cd6b554", - "metadata": {}, - "source": [ - "#### Comparison between load_model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ade4d099", - "metadata": {}, - "outputs": [], - "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "restored_clf = model.load_model()\n", - "\n", - "restored_prediction = restored_clf.predict(iris_df_test.to_pandas())\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])\n", - "\n", - "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" - ] - }, { "cell_type": "markdown", "id": "2ca2e15e", @@ -1102,7 +984,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.12" }, "toc": { "base_numbering": 1, diff --git a/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb b/snowflake/ml/registry/notebooks/Model 
Registry Demo.ipynb index 87db29d2..aedf48b4 100644 --- a/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb +++ b/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb @@ -506,76 +506,6 @@ ").show(3) " ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "735f0ac3", - "metadata": {}, - "source": [ - "## Load Model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "0a43d2b6", - "metadata": {}, - "source": [ - "We can also restore the model we saved to the registry and load it back into the local context to make predictions." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5e065521", - "metadata": {}, - "source": [ - "### Relational API" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc0512e1", - "metadata": {}, - "outputs": [], - "source": [ - "registry = model_registry.ModelRegistry(session=session, database_name=registry_name)\n", - "\n", - "restored_clf = registry.load_model(model_name=model_name, model_version=model_version)\n", - "\n", - "restored_prediction = restored_clf.predict(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4fbc0793", - "metadata": {}, - "source": [ - "### Object API" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2796f2e0", - "metadata": {}, - "outputs": [], - "source": [ - "registry = model_registry.ModelRegistry(session=session, database_name=registry_name)\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "restored_clf = model.load_model()\n", - "\n", - "restored_prediction = restored_clf.predict(test_features)\n", - "\n", - "print(\"Original prediction:\", prediction[:10])\n", - "print(\"Restored prediction:\", restored_prediction[:10])" - ] - }, { "attachments": {}, "cell_type": 
"markdown", @@ -817,7 +747,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.12" }, "vscode": { "interpreter": { diff --git a/snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb b/snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb deleted file mode 100644 index d83ecc27..00000000 --- a/snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb +++ /dev/null @@ -1,644 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a45960e1", - "metadata": {}, - "source": [ - "# Snowpark ML - Deployment to Snowpark Container Service Demo" - ] - }, - { - "cell_type": "markdown", - "id": "aa7a329a", - "metadata": {}, - "source": [ - "## Prerequisite\n", - "\n", - "- Install and have a running Docker Client (required only for PrPr for client-side image build)" - ] - }, - { - "cell_type": "markdown", - "id": "3b50d774", - "metadata": {}, - "source": [ - "## Train a model with Snowpark ML API " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "18a75d71", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple\n", - "from snowflake.ml.modeling.linear_model import LogisticRegression\n", - "from sklearn import datasets\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "def prepare_logistic_model() -> Tuple[LogisticRegression, pd.DataFrame]:\n", - " iris = datasets.load_iris()\n", - " df = pd.DataFrame(data=np.c_[iris[\"data\"], iris[\"target\"]], columns=iris[\"feature_names\"] + [\"target\"])\n", - " df.columns = [s.replace(\" (CM)\", \"\").replace(\" \", \"\") for s in df.columns.str.upper()]\n", - "\n", - " input_cols = [\"SEPALLENGTH\", \"SEPALWIDTH\", \"PETALLENGTH\", \"PETALWIDTH\"]\n", - " label_cols = \"TARGET\"\n", - " output_cols = \"PREDICTED_TARGET\"\n", - "\n", - " estimator = LogisticRegression(\n", - 
" input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0, max_iter=1000\n", - " ).fit(df)\n", - "\n", - " return estimator, df.drop(columns=label_cols).head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "db25f95b", - "metadata": {}, - "source": [ - "## Train a HuggingFace Model (cross-encoder/nli-MiniLM2-L6-H768)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e319bd2", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification\n", - "from snowflake.ml.model import custom_model\n", - "import torch\n", - "\n", - "def prepare_cross_encoder_model() -> Tuple[custom_model.CustomModel, pd.DataFrame]:\n", - " \"\"\"\n", - " Pretrained cross encoder model from huggingface.\n", - " \"\"\"\n", - " classifier = pipeline(\"zero-shot-classification\", model='cross-encoder/nli-MiniLM2-L6-H768') \n", - " candidate_labels = ['customer support', 'product experience', 'account issues']\n", - "\n", - " class HuggingFaceModel(custom_model.CustomModel):\n", - " def __init__(self, context: custom_model.ModelContext) -> None:\n", - " super().__init__(context)\n", - " \n", - " @custom_model.inference_api\n", - " def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: \n", - " sequences_to_classify = input_df.values.flatten().tolist()\n", - " data = [classifier(sequence, candidate_labels) for sequence in sequences_to_classify]\n", - " max_score_labels = []\n", - " for record in data:\n", - " max_score_label = max(zip(record['labels'], record['scores']), key=lambda x: x[1])[0]\n", - " max_score_labels.append(max_score_label) \n", - " return pd.DataFrame({\"output\": max_score_labels})\n", - "\n", - " cross_encoder_model = HuggingFaceModel(custom_model.ModelContext())\n", - " test_data = pd.DataFrame([\"The interface gets frozen very often\"])\n", - "\n", - " return cross_encoder_model, test_data" - ] - }, - { - "cell_type": "markdown", - "id": 
"db6734fa", - "metadata": {}, - "source": [ - "## Start Snowpark Session\n", - "\n", - "To avoid exposing credentials in Github, we use a small utility `SnowflakeLoginOptions`. It allows you to score your default credentials in `~/.snowsql/config` in the following format:\n", - "```\n", - "[connections]\n", - "accountname = # Account identifier to connect to Snowflake.\n", - "username = # User name in the account.\n", - "password = # User password.\n", - "dbname = # Default database.\n", - "schemaname = # Default schema.\n", - "warehousename = # Default warehouse.\n", - "rolename = # Default role.\n", - "```\n", - "Please follow [this](https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "58dd3604", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. \n" - ] - } - ], - "source": [ - "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", - "from snowflake.snowpark import Session, Column, functions\n", - "\n", - "session = Session.builder.configs(SnowflakeLoginOptions()).create()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "27dfbc42", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", - "WARNING:absl:The database INFERENCE_CONTAINER_DB already exists. Skipping creation.\n", - "WARNING:absl:The schema INFERENCE_CONTAINER_DB.INFERENCE_CONTAINER_SCHEMAalready exists. 
Skipping creation.\n" - ] - } - ], - "source": [ - "from snowflake.ml.registry import model_registry\n", - "\n", - "conn = session._conn._conn\n", - "# will be a no-op if registry already exists\n", - "model_registry.create_model_registry(session=session, database_name=conn._database, schema_name=conn._schema) \n", - "registry = model_registry.ModelRegistry(session=session, database_name=conn._database, schema_name=conn._schema)" - ] - }, - { - "cell_type": "markdown", - "id": "38e0a975", - "metadata": {}, - "source": [ - "## Register SnowML Model" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "574e7a43", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", - "WARNING:absl:The database INFERENCE_CONTAINER_DB already exists. Skipping creation.\n", - "WARNING:absl:The schema INFERENCE_CONTAINER_DB.INFERENCE_CONTAINER_SCHEMAalready exists. Skipping creation.\n", - "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", - "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. 
\n" - ] - }, - { - "data": { - "text/plain": [ - "'42374efe274011eea4ff5ac3f3b698e1'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logistic_model, test_features = prepare_logistic_model()\n", - "model_name = \"snowpark_ml_logistic\"\n", - "model_version = \"v2\"\n", - "\n", - "registry.log_model(\n", - " model_name=model_name,\n", - " model_version=model_version,\n", - " model=logistic_model,\n", - " sample_input_data=test_features,\n", - " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "054a3862", - "metadata": {}, - "source": [ - "## Model Deployment to Snowpark Container Service" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "72ff114f", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", - "WARNING:root:Image successfully built! 
To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/42374efe274011eea4ff5ac3f3b698e1:latest' in the options field of the deploy() function\n" - ] - } - ], - "source": [ - "from snowflake.ml.model import _deployer\n", - "from snowflake import snowpark\n", - "\n", - "model_ref = model_registry.ModelReference(\n", - " registry=registry, model_name=model_name, model_version=model_version\n", - ")\n", - "\n", - "compute_pool = \"SHULIN_GPU_POOL\" # Pre-created\n", - "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", - "\n", - "model_ref.deploy(\n", - " deployment_name=deployment_name, \n", - " platform=_deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE,\n", - " target_method=\"predict\",\n", - " options={\n", - " \"compute_pool\": compute_pool\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1c754e72", - "metadata": {}, - "source": [ - "## Batch Prediction on Snowpark Container Service" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a5c02328", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SEPALLENGTHSEPALWIDTHPETALLENGTHPETALWIDTHPREDICTED_TARGET
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
55.43.91.70.40.0
64.63.41.40.30.0
75.03.41.50.20.0
84.42.91.40.20.0
94.93.11.50.10.0
\n", - "
" - ], - "text/plain": [ - " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH PREDICTED_TARGET\n", - "0 5.1 3.5 1.4 0.2 0.0\n", - "1 4.9 3.0 1.4 0.2 0.0\n", - "2 4.7 3.2 1.3 0.2 0.0\n", - "3 4.6 3.1 1.5 0.2 0.0\n", - "4 5.0 3.6 1.4 0.2 0.0\n", - "5 5.4 3.9 1.7 0.4 0.0\n", - "6 4.6 3.4 1.4 0.3 0.0\n", - "7 5.0 3.4 1.5 0.2 0.0\n", - "8 4.4 2.9 1.4 0.2 0.0\n", - "9 4.9 3.1 1.5 0.1 0.0" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_ref.predict(deployment_name, test_features)" - ] - }, - { - "cell_type": "markdown", - "id": "67d6a7d2", - "metadata": {}, - "source": [ - "## Register Cross Encoder Model" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "9dd84f88", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'bafae568275d11ee95175ac3f3b698e1'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from snowflake.ml.registry import model_registry\n", - "\n", - "model, test_features = prepare_cross_encoder_model()\n", - "model_name = \"cross_encoder_model\"\n", - "model_version = \"v2\"\n", - "\n", - "registry.log_model(\n", - " model_name=model_name,\n", - " model_version=model_version,\n", - " model=model,\n", - " conda_dependencies=[\"pytorch::pytorch==2.0.1\", \"conda-forge::transformers==4.18.0\"],\n", - " sample_input_data=test_features,\n", - " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c6db686e", - "metadata": {}, - "source": [ - "## Model Deployment to Snowpark Container Service (GPU)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "701152f7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. 
This process may take a few minutes.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/bafae568275d11ee95175ac3f3b698e1:latest' in the options field of the deploy() function\n" - ] - } - ], - "source": [ - "from snowflake.ml.model import _deployer\n", - "from snowflake import snowpark\n", - "\n", - "model_ref = model_registry.ModelReference(\n", - " registry=registry, model_name=model_name, model_version=model_version\n", - ")\n", - "\n", - "compute_pool = \"SHULIN_GPU_POOL\" # Pre-created\n", - "deployment_name = \"CROSS_ENCODER\" # Name of the resulting UDF\n", - "\n", - "model_ref.deploy(\n", - " deployment_name=deployment_name, \n", - " platform=_deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE,\n", - " target_method=\"predict\",\n", - " options={\n", - " \"compute_pool\": compute_pool,\n", - " \"use_gpu\": True\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "7b0fba61", - "metadata": {}, - "source": [ - "## Zero-Shot Classification" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "936840df", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - " input_feature_0\n", - "0 The interface gets frozen very often\n" - ] - } - ], - "source": [ - "print(test_features)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "302daaf9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
output
0product experience
\n", - "
" - ], - "text/plain": [ - " output\n", - "0 product experience" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_ref.predict(deployment_name, test_features)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:local_snowml] *", - "language": "python", - "name": "conda-env-local_snowml-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.17" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl index 17a32df5..6069e6a9 100755 --- a/snowflake/ml/requirements.bzl +++ b/snowflake/ml/requirements.bzl @@ -1,5 +1,5 @@ # DO NOT EDIT! -# Generate by running 'bazel run //bazel/requirements:sync_requirements' +# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'mlflow': ['mlflow>=2.1.0,<3'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'all': ['lightgbm==3.3.5', 'mlflow>=2.1.0,<3', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1']} diff --git a/snowflake/ml/test_utils/BUILD.bazel b/snowflake/ml/test_utils/BUILD.bazel index 9999763b..32c78a6d 100644 --- a/snowflake/ml/test_utils/BUILD.bazel +++ b/snowflake/ml/test_utils/BUILD.bazel @@ -2,6 +2,15 @@ load("//bazel:py_rules.bzl", "py_library", "py_test") package(default_visibility = ["//visibility:public"]) +py_library( + name = "exception_utils", + testonly = True, + srcs = ["exception_utils.py"], + deps = [ + "//snowflake/ml/_internal/exceptions:exceptions", + ], +) + py_library( name = "mock_snowml_base", testonly = True, diff --git a/snowflake/ml/test_utils/exception_utils.py b/snowflake/ml/test_utils/exception_utils.py new file mode 100644 index 
00000000..66a8f74c --- /dev/null +++ b/snowflake/ml/test_utils/exception_utils.py @@ -0,0 +1,22 @@ +import contextlib +from typing import Generator, Optional, Type + +from absl.testing import absltest + +from snowflake.ml._internal.exceptions import exceptions + + +@contextlib.contextmanager +def assert_snowml_exceptions( + test_case: absltest.TestCase, + *, + expected_error_code: Optional[str] = None, + expected_original_error_type: Optional[Type[Exception]] = None, + expected_regex: str = "", +) -> Generator[None, None, None]: + with test_case.assertRaisesRegex(exceptions.SnowflakeMLException, expected_regex) as exc: + yield + if expected_error_code: + test_case.assertEqual(exc.exception.error_code, expected_error_code) + if expected_original_error_type: + test_case.assertIsInstance(exc.exception.original_exception, expected_original_error_type) diff --git a/snowflake/ml/training_dataset/BUILD.bazel b/snowflake/ml/training_dataset/BUILD.bazel new file mode 100644 index 00000000..31435e7e --- /dev/null +++ b/snowflake/ml/training_dataset/BUILD.bazel @@ -0,0 +1,13 @@ +load("//bazel:py_rules.bzl", "py_library") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "training_dataset", + srcs = [ + "training_dataset.py", + ], + deps = [ + "//snowflake/ml/_internal/utils:query_result_checker", + ], +) diff --git a/snowflake/ml/training_dataset/training_dataset.py b/snowflake/ml/training_dataset/training_dataset.py new file mode 100644 index 00000000..09c584b4 --- /dev/null +++ b/snowflake/ml/training_dataset/training_dataset.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional + +from snowflake.snowpark import DataFrame + + +@dataclass(frozen=True) +class FeatureStoreMetadata: + """ + Feature store metadata. + + Properties: + spine_query: The input query on source table which will be joined with features. + connection_params: a config contains feature store metadata. 
+ features: A list of feature serialized object in the feature store. + + """ + + spine_query: str + connection_params: Dict[str, str] + features: List[str] + + +@dataclass(frozen=True) +class TrainingDataset: + """ + Training dataset object contains the metadata and async job object if training task is still running. + + Properties: + df: A dataframe object representing the training dataset generation. + materialized_table: The destination table name which training data will writes into. + timestamp_col: Name of timestamp column in spine_df that will be used to join time-series features. + If spine_timestamp_col is not none, the input features also must have timestamp_col. + label_cols: Name of colum(s) in materialized_table that contains training labels. + feature_store_metadata: A feature store metadata object. + desc: A description about this training dataset. + """ + + df: DataFrame + materialized_table: Optional[str] + timestamp_col: Optional[str] + label_cols: Optional[List[str]] + feature_store_metadata: Optional[FeatureStoreMetadata] + desc: str diff --git a/snowflake/ml/utils/connection_params.py b/snowflake/ml/utils/connection_params.py index ac0344c1..44e1f323 100644 --- a/snowflake/ml/utils/connection_params.py +++ b/snowflake/ml/utils/connection_params.py @@ -1,8 +1,10 @@ import configparser import os -from typing import Dict, Optional +from typing import Dict, Optional, Union from absl import logging +from cryptography.hazmat import backends +from cryptography.hazmat.primitives import serialization from snowflake import snowpark @@ -32,23 +34,73 @@ def _read_token(token_file: str = "") -> str: return token +_ENCRYPTED_PKCS8_PK_HEADER = b"-----BEGIN ENCRYPTED PRIVATE KEY-----" +_UNENCRYPTED_PKCS8_PK_HEADER = b"-----BEGIN PRIVATE KEY-----" + + +def _load_pem_to_der(private_key_path: str) -> bytes: + """Given a private key file path (in PEM format), decode key data into DER format.""" + with open(private_key_path, "rb") as f: + private_key_pem = 
f.read() + private_key_passphrase: Optional[str] = os.getenv("SNOWFLAKE_PRIVATE_KEY_PASSPHRASE", None) + + # Only PKCS#8 format key will be accepted. However, openssl + # transparently handle PKCS#8 and PKCS#1 format (by some fallback + # logic) and their is no function to distinguish between them. By + # reading openssl source code, apparently they also relies on header + # to determine if give bytes is PKCS#8 format or not + if not private_key_pem.startswith(_ENCRYPTED_PKCS8_PK_HEADER) and not private_key_pem.startswith( + _UNENCRYPTED_PKCS8_PK_HEADER + ): + raise Exception("Private key provided is not in PKCS#8 format. Please use correct format.") + + if private_key_pem.startswith(_ENCRYPTED_PKCS8_PK_HEADER) and private_key_passphrase is None: + raise Exception( + "Private key is encrypted but passphrase could not be found. " + "Please set SNOWFLAKE_PRIVATE_KEY_PASSPHRASE env variable." + ) + + if private_key_pem.startswith(_UNENCRYPTED_PKCS8_PK_HEADER): + private_key_passphrase = None + + private_key = serialization.load_pem_private_key( + private_key_pem, + str.encode(private_key_passphrase) if private_key_passphrase is not None else private_key_passphrase, + backends.default_backend(), + ) + + return private_key.private_bytes( + encoding=serialization.Encoding.DER, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), + ) + + def _connection_properties_from_env() -> Dict[str, str]: """Returns a dict with all possible login related env variables.""" sf_conn_prop = { # Mandatory fields "account": os.environ["SNOWFLAKE_ACCOUNT"], - "user": os.getenv("SNOWFLAKE_USER", ""), "database": os.environ["SNOWFLAKE_DATABASE"], - # With empty default value - "authenticator": os.getenv("SNOWFLAKE_AUTHENTICATOR", ""), - "password": os.getenv("SNOWFLAKE_PASSWORD", ""), + # With a default value "token_file": os.getenv("SNOWFLAKE_TOKEN_FILE", "/snowflake/session/token"), - "host": os.getenv("SNOWFLAKE_HOST", ""), - "port": 
os.getenv("SNOWFLAKE_PORT", ""), - "schema": os.getenv("SNOWFLAKE_SCHEMA", "basic"), - "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE", ""), "ssl": os.getenv("SNOWFLAKE_SSL", "on"), + "protocol": os.getenv("SNOWFLAKE_PROTOCOL", "https"), } + # With empty default value + for key, env_var in { + "user": "SNOWFLAKE_USER", + "authenticator": "SNOWFLAKE_AUTHENTICATOR", + "password": "SNOWFLAKE_PASSWORD", + "host": "SNOWFLAKE_HOST", + "port": "SNOWFLAKE_PORT", + "schema": "SNOWFLAKE_SCHEMA", + "warehouse": "SNOWFLAKE_WAREHOUSE", + "private_key_path": "SNOWFLAKE_PRIVATE_KEY_PATH", + }.items(): + value = os.getenv(env_var, "") + if value: + sf_conn_prop[key] = value return sf_conn_prop @@ -81,7 +133,7 @@ def _load_from_snowsql_config_file(connection_name: str, login_file: str = "") - @snowpark._internal.utils.private_preview(version="0.2.0") -def SnowflakeLoginOptions(connection_name: str = "", login_file: Optional[str] = None) -> Dict[str, str]: +def SnowflakeLoginOptions(connection_name: str = "", login_file: Optional[str] = None) -> Dict[str, Union[str, bytes]]: """Returns a dict that can be used directly into snowflake python connector or Snowpark session config. NOTE: Token/Auth information is sideloaded in all cases above, if provided in following order: @@ -112,40 +164,30 @@ def SnowflakeLoginOptions(connection_name: str = "", login_file: Optional[str] = Raises: Exception: if none of config file and environment variable are present. """ - conn_prop = {} + conn_prop: Dict[str, Union[str, bytes]] = {} login_file = login_file or os.path.expanduser(_DEFAULT_CONNECTION_FILE) # If login file exists, use this exclusively. if os.path.exists(login_file): - conn_prop = _load_from_snowsql_config_file(connection_name, login_file) + conn_prop = {**(_load_from_snowsql_config_file(connection_name, login_file))} else: # If environment exists for SNOWFLAKE_ACCOUNT, assume everything # comes from environment. Mixing it not allowed. 
account = os.getenv("SNOWFLAKE_ACCOUNT", "") if account: - conn_prop = _connection_properties_from_env() + conn_prop = {**_connection_properties_from_env()} else: raise Exception("Snowflake credential is neither set in env nor a login file was provided.") # Token, if specified, is always side-loaded in all cases. - conn_prop["token"] = _read_token(conn_prop["token_file"] if "token_file" in conn_prop else "") - data = { - "account": conn_prop["account"], - } - for field in ["database", "schema", "warehouse", "host", "port", "role", "session_parameters"]: - if field in conn_prop and conn_prop[field]: - data[field] = conn_prop[field] - - if "authenticator" in conn_prop and conn_prop["authenticator"] == "externalbrowser": - data["authenticator"] = conn_prop["authenticator"] - data["user"] = conn_prop["user"] - elif conn_prop["token"]: - data["token"] = conn_prop["token"] - data["authenticator"] = "oauth" - else: - data["user"] = conn_prop["user"] - data["password"] = conn_prop["password"] + token = _read_token(str(conn_prop["token_file"]) if "token_file" in conn_prop else "") + if token: + conn_prop["token"] = token + if "authenticator" not in conn_prop or conn_prop["authenticator"]: + conn_prop["authenticator"] = "oauth" + elif "private_key_path" in conn_prop and "private_key" not in conn_prop: + conn_prop["private_key"] = _load_pem_to_der(str(conn_prop["private_key_path"])) if "ssl" in conn_prop and conn_prop["ssl"].lower() == "off": - data["protocol"] = "http" + conn_prop["protocol"] = "http" - return data + return conn_prop diff --git a/snowflake/ml/utils/connection_params_test.py b/snowflake/ml/utils/connection_params_test.py index 139674d9..c6aa6911 100644 --- a/snowflake/ml/utils/connection_params_test.py +++ b/snowflake/ml/utils/connection_params_test.py @@ -1,9 +1,12 @@ import configparser import os import tempfile +from typing import Optional import connection_params from absl.testing import absltest +from cryptography.hazmat.primitives import serialization 
+from cryptography.hazmat.primitives.asymmetric import rsa class SnowflakeLoginOptionsTest(absltest.TestCase): # # type: ignore @@ -95,6 +98,10 @@ def setUp(self) -> None: "schema": "public", "user": "admin2", "warehouse": "env_warehouse", + # Default + "protocol": "https", + "ssl": "on", + "token_file": "/snowflake/session/token", } # Default token file @@ -102,6 +109,19 @@ def setUp(self) -> None: self._token_file.write(b"login_file_token") self._token_file.flush() + @staticmethod + def genPrivateRsaKey(key_password: Optional[bytes] = None) -> bytes: + "Generate a new RSA private key and return." + private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048) + encryption_algorithm: serialization.KeySerializationEncryption = serialization.NoEncryption() + if key_password: + encryption_algorithm = serialization.BestAvailableEncryption(key_password) + return private_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=encryption_algorithm, + ) + def testReadInvalidSnowSQLConfigFile(self) -> None: """Tests if given snowsql config file is invalid, it raises exception.""" with self.assertRaises(configparser.ParsingError): @@ -117,57 +137,30 @@ def testReadFromEnv(self) -> None: connection_params._DEFAULT_CONNECTION_FILE = "/does/not/exist" with absltest.mock.patch.dict(os.environ, self._default_env_variable_dict): params = connection_params.SnowflakeLoginOptions() + # TODO - SUMIT self.assertEqual(params, self._connection_dict_from_env) - def testTokenOverrideUserPasswordAsWellAsTokenFile(self) -> None: - """Tests if token overrides user/password & token_file from environment.""" + def testOptionalEmptyEnvVarRemoved(self) -> None: + """Tests that empty optional env variables are skipped.""" connection_params._DEFAULT_CONNECTION_FILE = "/does/not/exist" env_vars = self._default_env_variable_dict - env_vars["SNOWFLAKE_TOKEN"] = "env_token" env_vars["SNOWFLAKE_TOKEN_FILE"] = 
self._token_file.name - with absltest.mock.patch.dict(os.environ, env_vars): + env_vars["SNOWFLAKE_USER"] = "" # Optional field empty env var => will not come in result + del env_vars["SNOWFLAKE_PASSWORD"] # Removing env var for password => will not come in result + with absltest.mock.patch.dict(os.environ, env_vars, clear=True): params = connection_params.SnowflakeLoginOptions() expected = self._connection_dict_from_env del expected["user"] - del expected["password"] - expected["token"] = "env_token" - expected["authenticator"] = "oauth" - self.assertEqual(params, expected) - - def testTokenFileOverrideEnvUserPassword(self) -> None: - """Tests if token file overrides user/password from environment.""" - connection_params._DEFAULT_CONNECTION_FILE = "/does/not/exist" - env_vars = self._default_env_variable_dict - env_vars["SNOWFLAKE_TOKEN_FILE"] = self._token_file.name - with absltest.mock.patch.dict(os.environ, self._default_env_variable_dict): - params = connection_params.SnowflakeLoginOptions() - expected = self._connection_dict_from_env - del expected["user"] - del expected["password"] + del expected["password"] # No env var => will not come in result + expected["token_file"] = self._token_file.name expected["token"] = "login_file_token" expected["authenticator"] = "oauth" self.assertEqual(params, expected) - @absltest.mock.patch.dict( # type: ignore - os.environ, - {"SNOWFLAKE_ACCOUNT": "", "SNOWFLAKE_TOKEN": "env_token"}, - clear=True, - ) - def testTokenFileOverridesLoginFile(self) -> None: - """Tests if token overrides user/password from file.""" - connection_params._DEFAULT_CONNECTION_FILE = self._login_file_toml.name - params = connection_params.SnowflakeLoginOptions("foo") - expected = self._connection_dict_from_toml_foo - del expected["user"] - del expected["password"] - expected["token"] = "env_token" - expected["authenticator"] = "oauth" - self.assertEqual(params, expected) - def testAllOptionalFieldsMissing(self) -> None: """Tests if ommitting all 
optional fields parses correctly.""" - self._minimal_login_file = tempfile.NamedTemporaryFile(suffix=".config") - self._minimal_login_file.write( + minimal_login_file = tempfile.NamedTemporaryFile(suffix=".config") + minimal_login_file.write( bytes( """ [connections] @@ -178,8 +171,8 @@ def testAllOptionalFieldsMissing(self) -> None: "utf-8", ) ) - self._minimal_login_file.flush() - connection_params._DEFAULT_CONNECTION_FILE = self._minimal_login_file.name + minimal_login_file.flush() + connection_params._DEFAULT_CONNECTION_FILE = minimal_login_file.name params = connection_params.SnowflakeLoginOptions() expected = { "account": "snowflake", @@ -190,8 +183,8 @@ def testAllOptionalFieldsMissing(self) -> None: def testExternalBrowser(self) -> None: """Tests that using external browser authentication is correctly passed on.""" - self._minimal_login_file = tempfile.NamedTemporaryFile(suffix=".json") - self._minimal_login_file.write( + minimal_login_file = tempfile.NamedTemporaryFile(suffix=".json") + minimal_login_file.write( bytes( """ [connections] @@ -202,8 +195,8 @@ def testExternalBrowser(self) -> None: "utf-8", ) ) - self._minimal_login_file.flush() - connection_params._DEFAULT_CONNECTION_FILE = self._minimal_login_file.name + minimal_login_file.flush() + connection_params._DEFAULT_CONNECTION_FILE = minimal_login_file.name params = connection_params.SnowflakeLoginOptions() expected = { "account": "snowflake", @@ -212,6 +205,120 @@ def testExternalBrowser(self) -> None: } self.assertEqual(params, expected) + def testUnencryptedPrivateKeyPath(self) -> None: + """Tests unencrypted private key path populates private key.""" + unencrypted_pem_private_key = self.genPrivateRsaKey() + private_key_path = tempfile.NamedTemporaryFile(suffix=".pk8") + private_key_path.write(unencrypted_pem_private_key) + private_key_path.flush() + minimal_login_file = tempfile.NamedTemporaryFile(suffix=".config") + minimal_login_file.write( + bytes( + """ + [connections] + accountname = 
"snowflake" + user = "admin" + private_key_path = "{private_key_path}" + """.format( + private_key_path=private_key_path.name + ), + "utf-8", + ) + ) + minimal_login_file.flush() + + connection_params._DEFAULT_CONNECTION_FILE = minimal_login_file.name + params = connection_params.SnowflakeLoginOptions() + + # Check private_key is set and not empty - aka deserialization worked + self.assertNotEqual(params["private_key"], "") + # No need to validate the value. So resetting it. + del params["private_key"] + + expected = { + "account": "snowflake", + "user": "admin", + "private_key_path": private_key_path.name, # We do not remove it, connect() does not use it + } + self.assertEqual(params, expected) + + def testUnencryptedPrivateKeyPathWithEmptyEnvPassword(self) -> None: + """Tests unencrypted private key path populates private key where empty env var for passphrase.""" + unencrypted_pem_private_key = self.genPrivateRsaKey() + private_key_path = tempfile.NamedTemporaryFile(suffix=".pk8") + private_key_path.write(unencrypted_pem_private_key) + private_key_path.flush() + minimal_login_file = tempfile.NamedTemporaryFile(suffix=".config") + minimal_login_file.write( + bytes( + """ + [connections] + accountname = "snowflake" + user = "admin" + private_key_path = "{private_key_path}" + """.format( + private_key_path=private_key_path.name + ), + "utf-8", + ) + ) + minimal_login_file.flush() + + connection_params._DEFAULT_CONNECTION_FILE = minimal_login_file.name + with absltest.mock.patch.dict(os.environ, {"SNOWFLAKE_PRIVATE_KEY_PASSPHRASE": ""}): + params = connection_params.SnowflakeLoginOptions() + + # Check private_key is set and not empty - aka deserialization worked + self.assertNotEqual(params["private_key"], "") + # No need to validate the value. So resetting it. 
+ del params["private_key"] + + expected = { + "account": "snowflake", + "user": "admin", + "private_key_path": private_key_path.name, # We do not remove it, connect() does not use it + } + self.assertEqual(params, expected) + + def testEncryptedPrivateKeyPath(self) -> None: + """Tests unencrypted private key path populates private key.""" + key_password = "foo" + unencrypted_pem_private_key = self.genPrivateRsaKey(bytes(key_password, "utf-8")) + private_key_path = tempfile.NamedTemporaryFile(suffix=".pk8") + private_key_path.write(unencrypted_pem_private_key) + private_key_path.flush() + minimal_login_file = tempfile.NamedTemporaryFile(suffix=".config") + minimal_login_file.write( + bytes( + """ + [connections] + accountname = "snowflake" + user = "admin" + private_key_path = "{private_key_path}" + """.format( + private_key_path=private_key_path.name + ), + "utf-8", + ) + ) + minimal_login_file.flush() + + connection_params._DEFAULT_CONNECTION_FILE = minimal_login_file.name + with absltest.mock.patch.dict(os.environ, {"SNOWFLAKE_PRIVATE_KEY_PASSPHRASE": key_password}): + params = connection_params.SnowflakeLoginOptions() + + # Check private_key is set and not empty - aka deserialization worked + self.assertNotEqual(params["private_key"], "") + # No need to validate the value. So resetting it. + del params["private_key"] + + expected = { + "account": "snowflake", + "user": "admin", + "private_key_path": private_key_path.name, # We do not remove it, connect() does not use it + } + self.assertEqual(params, expected) + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 8c131c53..bcb01834 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. 
-VERSION = "1.0.4" +VERSION = "1.0.5" diff --git a/tests/integ/snowflake/ml/_internal/BUILD.bazel b/tests/integ/snowflake/ml/_internal/BUILD.bazel index 45eee1ef..16bac168 100644 --- a/tests/integ/snowflake/ml/_internal/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/BUILD.bazel @@ -9,5 +9,4 @@ py_test( "//snowflake/ml/_internal:env", "//snowflake/ml/utils:connection_params" ], - tags = ["skip_merge_gates"], ) diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index 967fb054..4d69f9b6 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -6,9 +6,11 @@ py_library( srcs = ["warehouse_model_integ_test_utils.py"], deps = [ "//snowflake/ml/model:_deployer", + "//snowflake/ml/model:deploy_platforms", "//snowflake/ml/model:_model", "//snowflake/ml/model:type_hints", "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:test_env_utils", ], ) @@ -16,13 +18,15 @@ py_test( name = "warehouse_custom_model_integ_test", timeout = "long", srcs = ["warehouse_custom_model_integ_test.py"], - shard_count = 5, + shard_count = 6, deps = [ ":warehouse_model_integ_test_utils", + "//snowflake/ml/model:deploy_platforms", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:test_env_utils", ], ) @@ -30,7 +34,7 @@ py_test( name = "warehouse_pytorch_model_integ_test", timeout = "long", srcs = ["warehouse_pytorch_model_integ_test.py"], - shard_count = 4, + shard_count = 6, deps = [ ":warehouse_model_integ_test_utils", "//snowflake/ml/model:type_hints", @@ -38,6 +42,7 @@ py_test( "//snowflake/ml/model/_signatures:snowpark_handler", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:model_factory", ], ) @@ -45,7 +50,7 
@@ py_test( name = "warehouse_tensorflow_model_integ_test", timeout = "long", srcs = ["warehouse_tensorflow_model_integ_test.py"], - shard_count = 4, + shard_count = 6, deps = [ ":warehouse_model_integ_test_utils", "//snowflake/ml/model:type_hints", @@ -53,6 +58,7 @@ py_test( "//snowflake/ml/model/_signatures:tensorflow_handler", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:model_factory", ], ) @@ -60,7 +66,7 @@ py_test( name = "warehouse_sklearn_xgboost_model_integ_test", timeout = "long", srcs = ["warehouse_sklearn_xgboost_model_integ_test.py"], - shard_count = 3, + shard_count = 6, deps = [ ":warehouse_model_integ_test_utils", "//snowflake/ml/model:type_hints", @@ -73,7 +79,7 @@ py_test( name = "warehouse_snowml_model_integ_test", timeout = "long", srcs = ["warehouse_snowml_model_integ_test.py"], - shard_count = 2, + shard_count = 4, deps = [ ":warehouse_model_integ_test_utils", "//snowflake/ml/model:type_hints", @@ -104,7 +110,7 @@ py_test( name = "warehouse_mlflow_model_integ_test", timeout = "long", srcs = ["warehouse_mlflow_model_integ_test.py"], - shard_count = 2, + shard_count = 4, deps = [ ":warehouse_model_integ_test_utils", "//snowflake/ml/model:type_hints", @@ -126,5 +132,6 @@ py_test( "//snowflake/ml/model/_deploy_client/utils:constants", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:test_env_utils", ], ) diff --git a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py index 1b00bef1..effe2bdb 100644 --- a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py +++ b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py @@ -1,113 +1,121 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# -import uuid -from unittest import SkipTest -import pandas as pd -import pytest -import sklearn.base -import sklearn.datasets as datasets +# TODO[shchen], SNOW-889081, re-enable once server-side image build is supported. +# # +# # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# # +# import uuid +# from unittest import SkipTest +# +# import pandas as pd +# import pytest +# import sklearn.base +# import sklearn.datasets as datasets from absl.testing import absltest -from sklearn import neighbors - -from snowflake.ml.model import ( - _model as model_api, - custom_model, - type_hints as model_types, -) -from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_api -from snowflake.ml.model._deploy_client.utils import constants -from snowflake.ml.utils import connection_params -from snowflake.snowpark import Session -from tests.integ.snowflake.ml.test_utils import db_manager - -_IRIS = datasets.load_iris(as_frame=True) -_IRIS_X = _IRIS.data -_IRIS_Y = _IRIS.target - - -def _get_sklearn_model() -> "sklearn.base.BaseEstimator": - knn_model = neighbors.KNeighborsClassifier() - knn_model.fit(_IRIS_X, _IRIS_Y) - return knn_model - - -@pytest.mark.pip_incompatible -class DeploymentToSnowServiceIntegTest(absltest.TestCase): - _RUN_ID = uuid.uuid4().hex[:2] - # Upper is necessary for `db, schema and repo names for an image repo must be unquoted identifiers.` - TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper() - TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper() - TEST_STAGE = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "stage").upper() - TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper() - TEST_ROLE = "SYSADMIN" - TEST_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL" # PRE-CREATED - CONNECTION_NAME = "snowservice" # PRE-CREATED AND STORED IN KEY VAULT 
- - @classmethod - def setUpClass(cls) -> None: - try: - login_options = connection_params.SnowflakeLoginOptions(connection_name=cls.CONNECTION_NAME) - except KeyError: - raise SkipTest("SnowService connection parameters not present: skipping SnowServicesIntegTest.") - - cls._session = Session.builder.configs( - { - **login_options, - **{"database": cls.TEST_DB, "schema": cls.TEST_SCHEMA}, - } - ).create() - cls._db_manager = db_manager.DBManager(cls._session) - cls._db_manager.set_role(cls.TEST_ROLE) - cls._db_manager.create_stage(cls.TEST_STAGE, cls.TEST_SCHEMA, cls.TEST_DB, sse_encrypted=True) - cls._db_manager.create_image_repo(cls.TEST_IMAGE_REPO) - - @classmethod - def tearDownClass(cls) -> None: - cls._db_manager.drop_image_repo(cls.TEST_IMAGE_REPO) - # Dropping the db/schema will implicitly terminate the service function and snowservice as well. - cls._db_manager.drop_database(cls.TEST_DB) - cls._session.close() - - def setUp(self) -> None: - # Set up a unique id for each artifact, in addition to the class-level prefix. This is particularly useful when - # differentiating artifacts generated between different test cases, such as service function names. 
- self.uid = uuid.uuid4().hex[:4] - - def _save_model_to_stage(self, model: custom_model.CustomModel, sample_input: pd.DataFrame) -> str: - stage_path = f"@{self.TEST_STAGE}/{self.uid}/model.zip" - model_api.save_model( # type: ignore[call-overload] - name="model", - session=self._session, - model_stage_file_path=stage_path, - model=model, - sample_input=sample_input, - options={"embed_local_ml_library": True}, - ) - return stage_path - - def test_deployment_workflow(self) -> None: - model_stage_file_path = self._save_model_to_stage(model=_get_sklearn_model(), sample_input=_IRIS_X) - service_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( - self._RUN_ID, f"func_{self.uid}" - ) - deployment_options: model_types.SnowparkContainerServiceDeployOptions = { - "compute_pool": self.TEST_COMPUTE_POOL, - # image_repo is optional for user, pass in full image repo for test purposes only - "image_repo": self._db_manager.get_snowservice_image_repo( - subdomain=constants.DEV_IMAGE_REGISTRY_SUBDOMAIN, repo=self.TEST_IMAGE_REPO - ), - } - snowservice_api._deploy( - self._session, - model_id=uuid.uuid4().hex, - service_func_name=service_func_name, - model_zip_stage_path=model_stage_file_path, - deployment_stage_path=model_stage_file_path, # use the same stage for testing - **deployment_options, - ) - +# from sklearn import neighbors +# +# from snowflake.ml.model import ( +# _model as model_api, +# custom_model, +# type_hints as model_types, +# ) +# from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_api +# from snowflake.ml.model._deploy_client.utils import constants +# from snowflake.ml.utils import connection_params +# from snowflake.snowpark import Session +# from tests.integ.snowflake.ml.test_utils import db_manager +# +# _IRIS = datasets.load_iris(as_frame=True) +# _IRIS_X = _IRIS.data +# _IRIS_Y = _IRIS.target +# +# +# def _get_sklearn_model() -> "sklearn.base.BaseEstimator": +# knn_model = 
neighbors.KNeighborsClassifier() +# knn_model.fit(_IRIS_X, _IRIS_Y) +# return knn_model +# +# +# @pytest.mark.pip_incompatible +# class DeploymentToSnowServiceIntegTest(absltest.TestCase): +# _RUN_ID = uuid.uuid4().hex[:2] +# # Upper is necessary for `db, schema and repo names for an image repo must be unquoted identifiers.` +# TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper() +# TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper() +# TEST_STAGE = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "stage").upper() +# TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper() +# TEST_ROLE = "SYSADMIN" +# TEST_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL_STANDARD_2" # PRE-CREATED +# CONNECTION_NAME = "snowservice" # PRE-CREATED AND STORED IN KEY VAULT +# +# @classmethod +# def setUpClass(cls) -> None: +# try: +# login_options = connection_params.SnowflakeLoginOptions(connection_name=cls.CONNECTION_NAME) +# except KeyError: +# raise SkipTest("SnowService connection parameters not present: skipping SnowServicesIntegTest.") +# +# cls._session = Session.builder.configs( +# { +# **login_options, +# **{"database": cls.TEST_DB, "schema": cls.TEST_SCHEMA}, +# } +# ).create() +# cls._db_manager = db_manager.DBManager(cls._session) +# cls._db_manager.set_role(cls.TEST_ROLE) +# cls._db_manager.create_stage(cls.TEST_STAGE, cls.TEST_SCHEMA, cls.TEST_DB, sse_encrypted=True) +# cls._db_manager.create_image_repo(cls.TEST_IMAGE_REPO) +# cls._db_manager.cleanup_databases(expire_hours=6) +# +# @classmethod +# def tearDownClass(cls) -> None: +# cls._db_manager.drop_image_repo(cls.TEST_IMAGE_REPO) +# # Dropping the db/schema will implicitly terminate the service function and snowservice as well. 
+# cls._db_manager.drop_database(cls.TEST_DB) +# cls._session.close() +# +# def setUp(self) -> None: +# # Set up a unique id for each artifact, in addition to the class-level prefix. This is particularly useful +# when differentiating artifacts generated between different test cases, such as service function names. +# self.uid = uuid.uuid4().hex[:4] +# +# def _save_model_to_stage(self, model: custom_model.CustomModel, sample_input: pd.DataFrame) -> str: +# stage_path = f"@{self.TEST_STAGE}/{self.uid}/model.zip" +# model_api.save_model( # type: ignore[call-overload] +# name="model", +# session=self._session, +# model_stage_file_path=stage_path, +# model=model, +# sample_input=sample_input, +# options={"embed_local_ml_library": True}, +# ) +# return stage_path +# +# def test_deployment_workflow(self) -> None: +# model_stage_file_path = self._save_model_to_stage(model=_get_sklearn_model(), sample_input=_IRIS_X) +# service_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( +# self._RUN_ID, f"func_{self.uid}" +# ) +# deployment_options: model_types.SnowparkContainerServiceDeployOptions = { +# "compute_pool": self.TEST_COMPUTE_POOL, +# # image_repo is optional for user, pass in full image repo for test purposes only +# "image_repo": self._db_manager.get_snowservice_image_repo( +# subdomain=constants.DEV_IMAGE_REGISTRY_SUBDOMAIN, repo=self.TEST_IMAGE_REPO +# ), +# } +# snowservice_api._deploy( +# self._session, +# model_id=uuid.uuid4().hex, +# service_func_name=service_func_name, +# model_zip_stage_path=model_stage_file_path, +# deployment_stage_path=model_stage_file_path, # use the same stage for testing +# target_method="predict", +# **deployment_options, +# ) +# +# if __name__ == "__main__": absltest.main() diff --git a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py index 4077dfbd..a2fa1a5c 100644 --- a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py +++ 
b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py @@ -14,11 +14,12 @@ _deployer, _model as model_api, custom_model, + deploy_platforms, type_hints as model_types, ) from snowflake.ml.utils import connection_params from snowflake.snowpark import Session -from tests.integ.snowflake.ml.test_utils import db_manager +from tests.integ.snowflake.ml.test_utils import db_manager, test_env_utils class DemoModel(custom_model.CustomModel): @@ -85,7 +86,7 @@ def test_bad_model_deploy(self) -> None: session=self._session, name=function_name, model_dir_path=os.path.join(tmpdir, "custom_bad_model"), - platform=_deployer.TargetPlatform.WAREHOUSE, + platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions({"relax_version": False}), ) @@ -99,6 +100,9 @@ def test_custom_demo_model(self) -> None: name="custom_demo_model", model_dir_path=os.path.join(tmpdir, "custom_demo_model"), model=lm, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], sample_input=pd_df, metadata={"author": "halu", "version": "1"}, options=model_types.CustomModelSaveOption({"embed_local_ml_library": True}), @@ -111,11 +115,11 @@ def test_custom_demo_model(self) -> None: session=self._session, name=function_name, model_dir_path=os.path.join(tmpdir, "custom_demo_model"), - platform=_deployer.TargetPlatform.WAREHOUSE, + platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( { - "relax_version": True, + "relax_version": test_env_utils.is_in_pip_env(), "permanent_udf_stage_location": f"{self.full_qual_stage}/", # Test stage location validation } @@ -126,11 +130,11 @@ def test_custom_demo_model(self) -> None: session=self._session, name=function_name, model_dir_path=os.path.join(tmpdir, "custom_demo_model", ""), # Test sanitizing user path input. 
- platform=_deployer.TargetPlatform.WAREHOUSE, + platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( { - "relax_version": True, + "relax_version": test_env_utils.is_in_pip_env(), "permanent_udf_stage_location": f"@{self.full_qual_stage}/", } ), @@ -148,11 +152,11 @@ def test_custom_demo_model(self) -> None: session=self._session, name=function_name, model_dir_path=os.path.join(tmpdir, "custom_demo_model"), - platform=_deployer.TargetPlatform.WAREHOUSE, + platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( { - "relax_version": True, + "relax_version": test_env_utils.is_in_pip_env(), "permanent_udf_stage_location": f"@{self.full_qual_stage}/", } ), @@ -164,11 +168,11 @@ def test_custom_demo_model(self) -> None: session=self._session, name=function_name, model_dir_path=os.path.join(tmpdir, "custom_demo_model"), - platform=_deployer.TargetPlatform.WAREHOUSE, + platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( { - "relax_version": True, + "relax_version": test_env_utils.is_in_pip_env(), "permanent_udf_stage_location": f"@{self.full_qual_stage}/", "replace_udf": True, } diff --git a/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py index cb7d3c13..0cccfd65 100644 --- a/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py @@ -111,7 +111,7 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: 
warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -124,20 +124,20 @@ def base_test_case( deploy_params=deploy_params, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_async_model_composition( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: async def _test(self: "TestWarehouseCustomModelInteg") -> None: arr = np.random.randint(100, size=(10000, 3)) @@ -166,22 +166,22 @@ async def _test(self: "TestWarehouseCustomModelInteg") -> None: }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) asyncio.get_event_loop().run_until_complete(_test(self)) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": 
True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -202,20 +202,20 @@ def test_custom_demo_model_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_sp_quote( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModelSPQuote(custom_model.ModelContext()) 
arr = [[1, 2, 3], [4, 2, 5]] @@ -237,20 +237,20 @@ def test_custom_demo_model_sp_quote( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_sp_mix_1( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -272,20 +272,20 @@ def test_custom_demo_model_sp_mix_1( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, 
"test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_sp_mix_2( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -307,20 +307,20 @@ def test_custom_demo_model_sp_mix_2( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_array( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModelArray(custom_model.ModelContext()) arr = np.array([[1, 2, 3], [4, 2, 5]]) @@ -341,20 +341,20 @@ def test_custom_demo_model_array( }, model_in_stage=model_in_stage, 
permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_str( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) @@ -374,20 +374,20 @@ def test_custom_demo_model_str( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, 
"permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_array_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModelArray(custom_model.ModelContext()) arr = np.array([[1, 2, 3], [4, 2, 5]]) @@ -409,20 +409,20 @@ def test_custom_demo_model_array_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_str_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) @@ -443,20 +443,20 @@ def test_custom_demo_model_str_sp( }, 
model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_array_str( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModelArray(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) @@ -476,20 +476,20 @@ def test_custom_demo_model_array_str( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, 
"test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_with_input_no_keep_order( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = np.random.randint(100, size=(10000, 3)) @@ -509,20 +509,20 @@ def test_custom_demo_model_with_input_no_keep_order( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_demo_model_with_input( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = np.random.randint(100, size=(10000, 3)) @@ -547,20 +547,20 @@ def check_res(res: 
pd.DataFrame) -> Any: deploy_params={"predict": ({"output_with_input_features": True}, check_res)}, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_model_with_artifacts( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, "bias"), "w", encoding="utf-8") as f: @@ -586,20 +586,20 @@ def test_custom_model_with_artifacts( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + 
{"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_custom_model_bool_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, "bias"), "w", encoding="utf-8") as f: @@ -626,7 +626,7 @@ def test_custom_model_bool_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py index 81920f1a..f81e2d47 100644 --- a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py @@ -60,7 +60,7 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -73,20 +73,20 @@ def base_test_case( deploy_params=deploy_params, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": 
False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_mlflow_model_deploy_sklearn_df( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: db = datasets.load_diabetes(as_frame=True) X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) @@ -132,20 +132,20 @@ def test_mlflow_model_deploy_sklearn_df( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def 
test_mlflow_model_deploy_sklearn( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: db = datasets.load_diabetes() X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) @@ -193,7 +193,7 @@ def test_mlflow_model_deploy_sklearn( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py index 2662093f..42794415 100644 --- a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py +++ b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py @@ -5,13 +5,20 @@ import os import posixpath import tempfile +import unittest from typing import Any, Callable, Dict, Optional, Tuple, Union import pandas as pd +from packaging import version -from snowflake.ml.model import _deployer, _model as model_api, type_hints as model_types +from snowflake.ml.model import ( + _deployer, + _model as model_api, + deploy_platforms, + type_hints as model_types, +) from snowflake.snowpark import DataFrame as SnowparkDataFrame -from tests.integ.snowflake.ml.test_utils import db_manager +from tests.integ.snowflake.ml.test_utils import db_manager, test_env_utils def base_test_case( @@ -25,13 +32,25 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: with tempfile.TemporaryDirectory() as tmpdir: version_args: Dict[str, Any] = {} tmp_stage = db._session.get_session_stage() - if 
test_released_library: - actual_name = f"{name}_v_released" + conda_dependencies = [ + test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-snowpark-python") + ] + # We only test when the test is added before the current version available in the server. + snowml_req_str = test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-ml-python") + + if test_released_version: + if version.parse(test_released_version) <= version.parse(snowml_req_str.split("==")[-1]): + actual_name = f"{name}_v_released" + conda_dependencies.append(snowml_req_str) + else: + raise unittest.SkipTest( + f"Skip test on released version {test_released_version} which has not been available yet." + ) else: actual_name = f"{name}_v_current" version_args["options"] = {"embed_local_ml_library": True} @@ -49,15 +68,13 @@ def base_test_case( name=actual_name, model=model, sample_input=sample_input, + conda_dependencies=conda_dependencies, metadata={"author": "halu", "version": "1"}, **location_args, **version_args, ) for target_method, (additional_deploy_options, check_func) in deploy_params.items(): - deploy_version_args = {} - if test_released_library: - deploy_version_args = {"disable_local_conda_resolver": True} if permanent_deploy: permanent_deploy_args = {"permanent_udf_stage_location": f"@{full_qual_stage}/"} else: @@ -70,13 +87,12 @@ def base_test_case( deploy_info = _deployer.deploy( name=function_name, **location_args, - platform=_deployer.TargetPlatform.WAREHOUSE, + platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method=target_method, options={ - "relax_version": True, + "relax_version": test_env_utils.is_in_pip_env(), **permanent_deploy_args, # type: ignore[arg-type] **additional_deploy_options, - **deploy_version_args, }, # type: ignore[call-overload] ) diff --git a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py index b9cebf24..8af2a7b3 100644 
--- a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py @@ -3,9 +3,8 @@ # import uuid -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union -import numpy as np import pandas as pd import torch from absl.testing import absltest, parameterized @@ -15,41 +14,7 @@ from snowflake.ml.utils import connection_params from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils -from tests.integ.snowflake.ml.test_utils import db_manager - - -class TorchModel(torch.nn.Module): - def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: - super().__init__() - self.model = torch.nn.Sequential( - torch.nn.Linear(n_input, n_hidden, dtype=dtype), - torch.nn.ReLU(), - torch.nn.Linear(n_hidden, n_out, dtype=dtype), - torch.nn.Sigmoid(), - ) - - def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: - return [self.model(tensors[0])] - - -def _prepare_torch_model( - dtype: torch.dtype = torch.float32, -) -> Tuple[torch.nn.Module, List[torch.Tensor], List[torch.Tensor]]: - n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 - x = np.random.rand(batch_size, n_input) - data_x = [torch.from_numpy(x).to(dtype=dtype)] - data_y = [(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype)] - - model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) - loss_function = torch.nn.MSELoss() - optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) - for _epoch in range(100): - pred_y = model(data_x) - loss = loss_function(pred_y[0], data_y[0]) - optimizer.zero_grad() - loss.backward() - optimizer.step() - return model, data_x, data_y +from tests.integ.snowflake.ml.test_utils import db_manager, model_factory class 
TestWarehousePytorchModelINteg(parameterized.TestCase): @@ -93,7 +58,7 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -106,22 +71,22 @@ def base_test_case( deploy_params=deploy_params, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_pytorch_tensor_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_torch_model() + model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model() x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) y_pred = model.forward(data_x)[0].detach() @@ -133,29 +98,29 @@ def test_pytorch_tensor_as_sample( 
deploy_params={ "forward": ( {}, - lambda res: torch.testing.assert_close( # type:ignore[attr-defined] + lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False ), ), }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_pytorch_df_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_torch_model(torch.float64) + model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) y_pred = model.forward(data_x)[0].detach() @@ -167,29 +132,29 @@ def test_pytorch_df_as_sample( deploy_params={ "forward": ( {}, - lambda res: torch.testing.assert_close( # type:ignore[attr-defined] + lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred ), ), }, model_in_stage=model_in_stage, 
permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_pytorch_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_torch_model(torch.float64) + model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) x_df.columns = ["col_0"] y_pred = model.forward(data_x)[0].detach() @@ -203,7 +168,7 @@ def test_pytorch_sp( deploy_params={ "forward": ( {}, - lambda res: torch.testing.assert_close( # type:ignore[attr-defined] + lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) )[0], @@ -213,22 +178,22 @@ def test_pytorch_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - 
{"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_torchscript_tensor_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_torch_model() + model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model() x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) model_script = torch.jit.script(model) # type:ignore[attr-defined] y_pred = model_script.forward(data_x)[0].detach() @@ -241,29 +206,29 @@ def test_torchscript_tensor_as_sample( deploy_params={ "forward": ( {}, - lambda res: torch.testing.assert_close( # type:ignore[attr-defined] + lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False ), ), }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": 
True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_torchscript_df_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_torch_model(torch.float64) + model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) model_script = torch.jit.script(model) # type:ignore[attr-defined] y_pred = model_script.forward(data_x)[0].detach() @@ -276,29 +241,29 @@ def test_torchscript_df_as_sample( deploy_params={ "forward": ( {}, - lambda res: torch.testing.assert_close( # type:ignore[attr-defined] + lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred ), ), }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, 
"permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_torchscript_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_torch_model(torch.float64) + model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) x_df.columns = ["col_0"] model_script = torch.jit.script(model) # type:ignore[attr-defined] @@ -313,7 +278,7 @@ def test_torchscript_sp( deploy_params={ "forward": ( {}, - lambda res: torch.testing.assert_close( # type:ignore[attr-defined] + lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) )[0], @@ -323,7 +288,7 @@ def test_torchscript_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py index 067f300a..a98f59eb 100644 --- a/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py @@ -59,7 +59,7 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: 
Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -72,20 +72,20 @@ def base_test_case( deploy_params=deploy_params, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_skl_model_deploy( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) # LogisticRegression is for classfication task, such as iris @@ -104,20 +104,20 @@ def test_skl_model_deploy( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": 
False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_skl_model_proba_deploy( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) model = ensemble.RandomForestClassifier(random_state=42) @@ -139,20 +139,20 @@ def test_skl_model_proba_deploy( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_skl_multiple_output_model_proba_deploy( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: 
Optional[str] = None, ) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) target2 = np.random.randint(0, 6, size=iris_y.shape) @@ -179,20 +179,20 @@ def test_skl_multiple_output_model_proba_deploy( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_xgb( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: cal_data = datasets.load_breast_cancer(as_frame=True) cal_X = cal_data.data @@ -215,20 +215,20 @@ def test_xgb( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, 
"test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, ) def test_xgb_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: cal_data = datasets.load_breast_cancer(as_frame=True) cal_data_sp_df = self._session.create_dataframe(cal_data.frame) @@ -253,7 +253,84 @@ def test_xgb_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, + ) + def test_xgb_booster( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + cal_data = datasets.load_breast_cancer(as_frame=True) + cal_X = cal_data.data + cal_y = cal_data.target + cal_X_train, cal_X_test, cal_y_train, cal_y_test = model_selection.train_test_split(cal_X, cal_y) + params = dict(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, objective="binary:logistic") + regressor = xgboost.train(params, xgboost.DMatrix(data=cal_X_train, label=cal_y_train)) + y_pred = regressor.predict(xgboost.DMatrix(data=cal_X_test)) + self.base_test_case( + 
name="xgb_booster", + model=regressor, + sample_input=cal_X_test, + test_input=cal_X_test, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose(res.values, np.expand_dims(y_pred, axis=1)), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, + ) + def test_xgb_booster_sp( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + cal_data = datasets.load_breast_cancer(as_frame=True) + cal_data_sp_df = self._session.create_dataframe(cal_data.frame) + cal_data_sp_df_train, cal_data_sp_df_test = tuple(cal_data_sp_df.random_split([0.25, 0.75], seed=2568)) + cal_data_pd_df_train = cal_data_sp_df_train.to_pandas() + params = dict(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, objective="binary:logistic") + regressor = xgboost.train( + params, + xgboost.DMatrix(data=cal_data_pd_df_train.drop(columns=["target"]), label=cal_data_pd_df_train["target"]), + ) + cal_data_sp_df_test_X = cal_data_sp_df_test.drop('"target"') + y_pred = regressor.predict(xgboost.DMatrix(data=cal_data_sp_df_test_X.to_pandas())) + self.base_test_case( + name="xgb_booster_sp", + model=regressor, + sample_input=cal_data_sp_df_train.drop('"target"'), + test_input=cal_data_sp_df_test_X, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose( + res.to_pandas().values, + np.expand_dims(y_pred, axis=1), + ), + ), + }, + model_in_stage=model_in_stage, + 
permanent_deploy=permanent_deploy, + test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py index 2f9c2a95..c0cc7379 100644 --- a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py @@ -62,7 +62,7 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -75,21 +75,21 @@ def base_test_case( deploy_params=deploy_params, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @pytest.mark.pip_incompatible @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, ) def test_snowml_model_deploy_snowml_sklearn( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] 
= False, + test_released_version: Optional[str] = None, ) -> None: iris_X = datasets.load_iris(as_frame=True).frame iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] @@ -116,21 +116,21 @@ def test_snowml_model_deploy_snowml_sklearn( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @pytest.mark.pip_incompatible @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, ) def test_snowml_model_deploy_xgboost( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: iris_X = datasets.load_iris(as_frame=True).frame iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] @@ -157,21 +157,21 @@ def test_snowml_model_deploy_xgboost( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @pytest.mark.pip_incompatible @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - 
{"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, ) def test_snowml_model_deploy_lightgbm( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: iris_X = datasets.load_iris(as_frame=True).frame iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] @@ -198,7 +198,7 @@ def test_snowml_model_deploy_lightgbm( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py index 08699931..3e16cb28 100644 --- a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py @@ -15,7 +15,7 @@ from snowflake.ml.utils import connection_params from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils -from tests.integ.snowflake.ml.test_utils import db_manager +from tests.integ.snowflake.ml.test_utils import db_manager, model_factory class SimpleModule(tf.Module): @@ -42,25 +42,6 @@ def call(self, tensors: 
List[tf.Tensor]) -> List[tf.Tensor]: return [x] -def _prepare_keras_model( - dtype: tf.dtypes.DType = tf.float32, -) -> Tuple[tf.keras.Model, List[tf.Tensor], List[tf.Tensor]]: - n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 - x = np.random.rand(batch_size, n_input) - data_x = [tf.convert_to_tensor(x, dtype=dtype)] - raw_data_y = tf.random.uniform((batch_size, 1)) - raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) - data_y = [tf.cast(raw_data_y, dtype=dtype)] - - def loss_fn(y_true: List[tf.Tensor], y_pred: List[tf.Tensor]) -> tf.Tensor: - return tf.keras.losses.mse(y_true[0], y_pred[0]) - - model = KerasModel(n_hidden, n_out) - model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=loss_fn) - model.fit(data_x, data_y, batch_size=batch_size, epochs=100) - return model, data_x, data_y - - class TestWarehouseTensorflowModelInteg(parameterized.TestCase): @classmethod def setUpClass(self) -> None: @@ -102,7 +83,7 @@ def base_test_case( deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -115,20 +96,20 @@ def base_test_case( deploy_params=deploy_params, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, 
"permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_tf_tensor_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") data_x = [tf.constant([[5.0], [10.0]])] @@ -151,20 +132,20 @@ def test_tf_tensor_as_sample( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_tf_df_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") data_x = [tf.constant([[5.0], [10.0]])] @@ -187,20 +168,20 @@ 
def test_tf_df_as_sample( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_tf_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") data_x = [tf.constant([[5.0], [10.0]])] @@ -231,22 +212,22 @@ def test_tf_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": 
None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_keras_tensor_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_keras_model() + model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) y_pred = model.predict(data_x)[0] @@ -267,22 +248,22 @@ def test_keras_tensor_as_sample( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_keras_df_as_sample( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_keras_model() + model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() x_df 
= tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) y_pred = model.predict(data_x)[0] @@ -303,22 +284,22 @@ def test_keras_df_as_sample( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, - # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, - # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, + {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, + {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, ) def test_keras_sp( self, model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, - test_released_library: Optional[bool] = False, + test_released_version: Optional[str] = None, ) -> None: - model, data_x, data_y = _prepare_keras_model() + model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) x_df.columns = ["col_0"] y_pred = model.predict(data_x)[0] @@ -347,7 +328,7 @@ def test_keras_sp( }, model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, - test_released_library=test_released_library, + test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_monitor.py b/tests/integ/snowflake/ml/modeling/metrics/test_monitor.py index 1773713d..0746e72b 100644 --- 
a/tests/integ/snowflake/ml/modeling/metrics/test_monitor.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_monitor.py @@ -2,14 +2,42 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # +import math import unittest +import numpy as np from absl.testing import absltest from snowflake import snowpark from snowflake.ml.utils import connection_params +def rel_entropy(x: float, y: float) -> float: + if np.isnan(x) or np.isnan(y): + return np.NAN + elif x > 0 and y > 0: + return x * math.log2(x / y) + elif x == 0 and y >= 0: + return 0 + else: + return np.inf + + +# This is the official JS algorithm +def JS_helper(p: list, q: list) -> float: + p = np.asarray(p) + q = np.asarray(q) + m = (p + q) / 2.0 + tmp = np.column_stack((p, m)) + left = np.array([rel_entropy(x, y) for x, y in tmp]) + tmp = np.column_stack((q, m)) + right = np.array([rel_entropy(x, y) for x, y in tmp]) + left_sum = np.sum(left) + right_sum = np.sum(right) + js = left_sum + right_sum + return np.sqrt(js / 2.0) + + @unittest.skip("not PrPr") class MonitorTest(absltest.TestCase): """Test Covariance matrix.""" @@ -36,12 +64,14 @@ def test_compare_udfs(self) -> None: return_type=snowpark.types.IntegerType(), input_types=[snowpark.types.IntegerType(), snowpark.types.IntegerType()], name="add1", + replace=True, ) self._session.udf.register( lambda x, y: x + y + 1, return_type=snowpark.types.IntegerType(), input_types=[snowpark.types.IntegerType(), snowpark.types.IntegerType()], name="add2", + replace=True, ) res = monitor.compare_udfs_outputs("add1", "add2", inputDf) pdf = res.to_pandas() @@ -87,6 +117,46 @@ def test_get_basic_stats(self) -> None: assert d1["MIN"] == -2 and d2["MIN"] == -5 assert d1["MAX"] == 100 and d2["MAX"] == 98 + def test_jensenshannon(self) -> None: + from snowflake.ml.modeling.metrics import monitor + + df1 = self._session.create_dataframe( + [ + snowpark.Row(-3), + snowpark.Row(-2), + snowpark.Row(8), + snowpark.Row(100), + ], + schema=["col1"], + ) + 
+ df2 = self._session.create_dataframe( + [ + snowpark.Row(-2), + snowpark.Row(8), + snowpark.Row(100), + snowpark.Row(140), + ], + schema=["col2"], + ) + + df3 = self._session.create_dataframe( + [ + snowpark.Row(-3), + snowpark.Row(-2), + snowpark.Row(8), + snowpark.Row(8), + snowpark.Row(8), + snowpark.Row(100), + ], + schema=["col1"], + ) + + js = monitor.jensenshannon(df1, "col1", df2, "col2") + assert abs(js - JS_helper([0.125, 0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25, 0.125])) <= 1e-5 + js = monitor.jensenshannon(df1, "col1", df3, "col1") + assert abs(js - JS_helper([0.25, 0.25, 0.25, 0.25], [1.0 / 6, 1.0 / 6, 0.5, 1.0 / 6])) <= 1e-5 + if __name__ == "__main__": absltest.main() diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py index c7417925..cb439855 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py @@ -6,6 +6,7 @@ import pickle import sys import tempfile +from typing import List from unittest import TestCase import cloudpickle @@ -14,8 +15,8 @@ from absl.testing.absltest import main from sklearn.preprocessing import LabelEncoder as SklearnLabelEncoder -from snowflake.ml.modeling.preprocessing import ( - LabelEncoder, # type: ignore[attr-defined] +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] + LabelEncoder, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session @@ -39,7 +40,7 @@ class LabelEncoderTest(TestCase): def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - self._to_be_deleted_files = [] + self._to_be_deleted_files: List[str] = [] def tearDown(self) -> None: self._session.close() diff --git 
a/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py index fa90a2c3..ebe86086 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py @@ -1640,9 +1640,9 @@ def test_fit_snowpark_transform_everydtypes(self) -> None: ("X", np.uint8), ("Y", np.float64), ("Z", np.str_), - ("A", np.bool8), - # ("B", np.bytes0), - ("C", np.object0), + ("A", np.bool_), + ("B", np.bytes_), + ("C", np.object_), ], ) pd_df = pd.DataFrame(x) @@ -1722,6 +1722,28 @@ def test_get_output_cols_sparse(self) -> None: out_cols = ohe.transform(snow_df).columns self.assertCountEqual(ohe.get_output_cols(), out_cols) + def test_column_insensitivity(self) -> None: + # UCI_BANK_MARKETING_20COLUMNS + snow_df = self._session.sql( + """SELECT *, IFF(Y = 'yes', 1.0, 0.0) as LABEL + FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS + LIMIT 1000""" + ).drop("Y") + cols = [ + "AGE", + "CAMPAIGN", + "CONTACT", + "DAY_OF_WEEK", + "EDUCATION", + "JOB", + "MONTH", + "DURATION", + ] + lower_cols = [c.lower() for c in cols] + + ohe = OneHotEncoder(input_cols=lower_cols, output_cols=cols, sparse=False).fit(snow_df) + ohe.transform(snow_df) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py index c3cfd309..893a961e 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py @@ -17,8 +17,8 @@ from absl.testing.absltest import main from sklearn.preprocessing import RobustScaler as SklearnRobustScaler -from snowflake.ml.modeling.preprocessing import ( - RobustScaler, # type: ignore[attr-defined] +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] + 
RobustScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py index 1f1522a0..0f13ad60 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py @@ -15,8 +15,8 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import StandardScaler as SklearnStandardScaler -from snowflake.ml.modeling.preprocessing import ( - StandardScaler, # type: ignore[attr-defined] +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] + StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/registry/BUILD.bazel b/tests/integ/snowflake/ml/registry/BUILD.bazel index 8f7eb64e..843269f9 100644 --- a/tests/integ/snowflake/ml/registry/BUILD.bazel +++ b/tests/integ/snowflake/ml/registry/BUILD.bazel @@ -19,17 +19,33 @@ py_test( "//tests/integ/snowflake/ml/test_utils:model_factory", "//snowflake/ml/registry:model_registry", "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:test_env_utils", ], ) py_test( - name = "model_registry_integ_test_with_snowservice", - timeout = "eternal", # 3600s - srcs = ["model_registry_integ_test_with_snowservice.py"], + name = "model_registry_integ_test_snowservice_base", + srcs = ["model_registry_integ_test_snowservice_base.py"], deps = [ "//tests/integ/snowflake/ml/test_utils:db_manager", "//tests/integ/snowflake/ml/test_utils:model_factory", + "//tests/integ/snowflake/ml/test_utils:test_env_utils", + "//snowflake/ml/model:deploy_platforms", "//snowflake/ml/registry:model_registry", "//snowflake/ml/utils:connection_params", ], ) + +py_test( + name = 
"model_registry_integ_test_snowservice", + timeout = "eternal", # 3600s + srcs = ["model_registry_integ_test_snowservice.py"], + deps = [":model_registry_integ_test_snowservice_base"], +) + +py_test( + name = "model_registry_integ_test_snowservice_merge_gate", + timeout = "eternal", # 3600s + srcs = ["model_registry_integ_test_snowservice_merge_gate.py"], + deps = [":model_registry_integ_test_snowservice_base"], +) diff --git a/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py index 724ccb1d..a4c122c9 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py @@ -7,7 +7,7 @@ from absl.testing import absltest, parameterized -from snowflake.ml.registry import model_registry +from snowflake.ml.registry import _schema, model_registry from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.test_utils import db_manager @@ -44,6 +44,11 @@ class TestModelRegistryBasicInteg(parameterized.TestCase): def setUpClass(cls) -> None: """Creates Snowpark and Snowflake environments for testing.""" cls._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + cls._database = cls._session.get_current_database() + cls._schema = cls._session.get_current_schema() + assert cls._database is not None + assert cls._schema is not None + cls._db_manager = db_manager.DBManager(cls._session) cls._db_manager.cleanup_databases() @@ -59,6 +64,12 @@ def setUpClass(cls) -> None: _PRE_CREATED_DB_AND_SCHEMA_NAME_LOWER, ) + # restore the session to use the original database and schema + cls._session.use_database(cls._database) + cls._session.use_schema(cls._schema) + assert cls._database == cls._session.get_current_database() + assert cls._schema == cls._session.get_current_schema() + @classmethod def tearDownClass(cls) -> 
None: cls._db_manager.drop_database(_PRE_CREATED_DB_NAME_UPPER, if_exists=True) @@ -69,6 +80,11 @@ def tearDownClass(cls) -> None: cls._db_manager.drop_database(_CUSTOM_NEW_DB_NAME_LOWER, if_exists=True) cls._session.close() + def _validate_restore_db_and_schema(self) -> None: + """Validate that the database and schema are restored after creating registry.""" + self.assertEqual(self._database, self._session.get_current_database()) + self.assertEqual(self._schema, self._session.get_current_schema()) + @parameterized.parameters( # type: ignore[misc] {"database_name": _PRE_CREATED_DB_NAME_UPPER, "schema_name": None}, { @@ -121,6 +137,7 @@ def test_create_and_drop_model_registry(self, database_name: str, schema_name: O session=self._session, database_name=database_name, schema_name=schema_name ) self.assertTrue(create_result) + self._validate_restore_db_and_schema() # Test create again, should be non-op create_result = model_registry.create_model_registry( @@ -128,24 +145,44 @@ def test_create_and_drop_model_registry(self, database_name: str, schema_name: O ) self.assertTrue(create_result) + self._validate_restore_db_and_schema() + _ = model_registry.ModelRegistry( session=self._session, database_name=database_name, schema_name=schema_name ) self._db_manager.drop_schema(schema_name, database_name) self.assertTrue(self._db_manager.assert_schema_existence(schema_name, database_name, exists=False)) + self._validate_restore_db_and_schema() else: create_result = model_registry.create_model_registry(session=self._session, database_name=database_name) self.assertTrue(create_result) + self._validate_restore_db_and_schema() # Test create again, should be non-op create_result = model_registry.create_model_registry(session=self._session, database_name=database_name) self.assertTrue(create_result) + self._validate_restore_db_and_schema() _ = model_registry.ModelRegistry(session=self._session, database_name=database_name) self._db_manager.drop_database(database_name) 
self.assertTrue(self._db_manager.assert_database_existence(database_name, exists=False)) + self._validate_restore_db_and_schema() + + def test_add_new_registry_table_column_without_allowlist(self) -> None: + broken_registry = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "registry_broken") + try: + model_registry.create_model_registry(session=self._session, database_name=broken_registry) + except Exception as e: + self._db_manager.drop_database(broken_registry) + raise Exception(f"Test failed with exception:{e}") + + _schema._REGISTRY_TABLE_SCHEMA["new_column"] = "VARCHAR" + with self.assertRaisesRegex(TypeError, "Registry table:.* doesn't have required column:.*"): + model_registry.ModelRegistry(session=self._session, database_name=broken_registry) + + _schema._REGISTRY_TABLE_SCHEMA.pop("new_column") if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py index e1bd5e1d..64325431 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py @@ -15,7 +15,11 @@ from snowflake.ml.registry import model_registry from snowflake.ml.utils import connection_params from snowflake.snowpark import Session -from tests.integ.snowflake.ml.test_utils import db_manager, model_factory +from tests.integ.snowflake.ml.test_utils import ( + db_manager, + model_factory, + test_env_utils, +) class TestModelRegistryInteg(absltest.TestCase): @@ -62,6 +66,9 @@ def test_basic_workflow(self) -> None: model_version=model_version, model=model, tags=model_tags, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], sample_input_data=test_features, options={"embed_local_ml_library": True}, ) @@ -74,12 +81,19 @@ def test_basic_workflow(self) -> None: model_version=model_version, model=model, tags={"stage": 
"testing", "classifier_type": "svm.SVC"}, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], sample_input_data=test_features, options={"embed_local_ml_library": True}, ) model_ref = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version) + # Test getting model name and model version + self.assertEqual(model_ref.get_name(), model_name) + self.assertEqual(model_ref.get_version(), model_version) + # Test metrics test_accuracy = metrics.accuracy_score(test_labels, local_prediction) @@ -183,7 +197,7 @@ def test_basic_workflow(self) -> None: deployment_name=permanent_deployment_name, target_method="predict", permanent=True, - options={"relax_version": True}, + options={"relax_version": test_env_utils.is_in_pip_env()}, ) remote_prediction_perm = model_ref.predict(permanent_deployment_name, test_features) np.testing.assert_allclose(remote_prediction_perm.to_numpy(), np.expand_dims(local_prediction, axis=1)) @@ -193,7 +207,7 @@ def test_basic_workflow(self) -> None: deployment_name=custom_permanent_deployment_name, target_method="predict_proba", permanent=True, - options={"permanent_udf_stage_location": self.perm_stage, "relax_version": True}, + options={"permanent_udf_stage_location": self.perm_stage, "relax_version": test_env_utils.is_in_pip_env()}, ) remote_prediction_proba_perm = model_ref.predict(custom_permanent_deployment_name, test_features) np.testing.assert_allclose(remote_prediction_proba_perm.to_numpy(), local_prediction_proba) @@ -211,6 +225,13 @@ def test_basic_workflow(self) -> None: self.assertEqual(filtered_model_deployment_list["MODEL_VERSION"][0], second=model_version) self.assertEqual(filtered_model_deployment_list["STAGE_PATH"][0], second=self.perm_stage) + self.assertEqual( + self._session.sql( + f"SHOW USER FUNCTIONS LIKE '%{custom_permanent_deployment_name}' IN DATABASE \"{self.registry_name}\";" + ).count(), + 1, + ) + 
model_ref.delete_deployment(deployment_name=custom_permanent_deployment_name) # type: ignore[attr-defined] model_deployment_list = model_ref.list_deployments().to_pandas() # type: ignore[attr-defined] @@ -219,13 +240,20 @@ def test_basic_workflow(self) -> None: self.assertEqual(model_deployment_list["MODEL_VERSION"][0], second=model_version) self.assertEqual(model_deployment_list["DEPLOYMENT_NAME"][0], second=permanent_deployment_name) + self.assertEqual( + self._session.sql( + f"SHOW USER FUNCTIONS LIKE '%{custom_permanent_deployment_name}' IN DATABASE \"{self.registry_name}\";" + ).count(), + 0, + ) + # Test temp deployment temp_deployment_name = f"{model_name}_{model_version}_temp_deploy" model_ref.deploy( # type: ignore[attr-defined] deployment_name=temp_deployment_name, target_method="predict", permanent=False, - options={"relax_version": True}, + options={"relax_version": test_env_utils.is_in_pip_env()}, ) remote_prediction_temp = model_ref.predict(temp_deployment_name, test_features) np.testing.assert_allclose(remote_prediction_temp.to_numpy(), np.expand_dims(local_prediction, axis=1)) @@ -244,7 +272,7 @@ def test_snowml_model(self) -> None: model_name = "snowml_xgb_classifier" model_version = self.run_id - model, test_features = model_factory.ModelFactory.prepare_snowml_model() + model, test_features = model_factory.ModelFactory.prepare_snowml_model_xgb() local_prediction = model.predict(test_features) local_prediction_proba = model.predict_proba(test_features) @@ -253,6 +281,9 @@ def test_snowml_model(self) -> None: model_name=model_name, model_version=model_version, model=model, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], options={"embed_local_ml_library": True}, ) @@ -299,6 +330,9 @@ def test_snowml_pipeline(self) -> None: model_name=model_name, model_version=model_version, model=model, + conda_dependencies=[ + 
test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], options={"embed_local_ml_library": True}, ) diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice.py new file mode 100644 index 00000000..021d165f --- /dev/null +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice.py @@ -0,0 +1,145 @@ +# TODO[shchen], SNOW-889081, re-enable once server-side image build is supported. +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# +# +# import functools +# import tempfile +# import uuid +# +# import numpy as np +# import pandas as pd +# import pytest +# import torch +from absl.testing import absltest + +# +# from snowflake.ml.model import deploy_platforms +# from snowflake.ml.model._signatures import pytorch_handler, tensorflow_handler +# from tests.integ.snowflake.ml.registry.model_registry_integ_test_snowservice_base import ( +# TestModelRegistryIntegSnowServiceBase, +# ) +# from tests.integ.snowflake.ml.test_utils import model_factory +# +# +# class TestModelRegistryIntegWithSnowServiceDeployment(TestModelRegistryIntegSnowServiceBase): +# @pytest.mark.pip_incompatible +# def test_sklearn_deployment_with_snowml_conda(self) -> None: +# self._test_snowservice_deployment( +# model_name="test_sklearn_model_with_snowml_conda", +# model_version=uuid.uuid4().hex, +# prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, +# embed_local_ml_library=False, +# conda_dependencies=["snowflake-ml-python==1.0.2"], +# prediction_assert_fn=lambda local_prediction, remote_prediction: np.testing.assert_allclose( +# remote_prediction.to_numpy(), np.expand_dims(local_prediction, axis=1) +# ), +# deployment_options={ +# "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, +# "target_method": "predict", +# "options": { +# "compute_pool": 
self._TEST_CPU_COMPUTE_POOL, +# "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), +# "num_workers": 1, +# }, +# }, +# ) +# +# @pytest.mark.pip_incompatible +# def test_sklearn_deployment_with_local_source_code(self) -> None: +# self._test_snowservice_deployment( +# model_name="test_sklearn_model_with_local_source_code", +# model_version=uuid.uuid4().hex, +# prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, +# prediction_assert_fn=lambda local_prediction, remote_prediction: np.testing.assert_allclose( +# remote_prediction.to_numpy(), np.expand_dims(local_prediction, axis=1) +# ), +# deployment_options={ +# "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, +# "target_method": "predict", +# "options": { +# "compute_pool": self._TEST_CPU_COMPUTE_POOL, +# "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), +# }, +# }, +# ) +# +# @pytest.mark.pip_incompatible +# def test_huggingface_custom_model_deployment(self) -> None: +# with tempfile.TemporaryDirectory() as tmpdir: +# self._test_snowservice_deployment( +# model_name="gpt2_model_gpu", +# model_version=uuid.uuid4().hex, +# conda_dependencies=["pytorch", "transformers"], +# prepare_model_and_feature_fn=functools.partial( +# model_factory.ModelFactory.prepare_gpt2_model, +# local_cache_dir=tmpdir, +# ), +# prediction_assert_fn=lambda local_prediction, remote_prediction: pd.testing.assert_frame_equal( +# remote_prediction, local_prediction, check_dtype=False +# ), +# deployment_options={ +# "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, +# "target_method": "predict", +# "options": { +# "compute_pool": self._TEST_CPU_COMPUTE_POOL, +# "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), +# "num_workers": 1, +# }, +# }, +# ) +# +# @pytest.mark.pip_incompatible +# def test_torch_model_deployment_with_gpu(self) -> None: +# 
self._test_snowservice_deployment( +# model_name="torch_model", +# model_version=uuid.uuid4().hex, +# prepare_model_and_feature_fn=functools.partial( +# model_factory.ModelFactory.prepare_torch_model, force_remote_gpu_inference=True +# ), +# conda_dependencies=[ +# "pytorch-nightly::pytorch", +# "pytorch-nightly::pytorch-cuda==12.1", +# "nvidia::cuda==12.1.*", +# ], +# prediction_assert_fn=lambda local_prediction, remote_prediction: torch.testing.assert_close( +# pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(remote_prediction)[0], +# local_prediction[0], +# check_dtype=False, +# ), +# deployment_options={ +# "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, +# "target_method": "forward", +# "options": { +# "compute_pool": self._TEST_GPU_COMPUTE_POOL, +# "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), +# "num_workers": 1, +# "use_gpu": True, +# }, +# }, +# ) +# +# @pytest.mark.pip_incompatible +# def test_keras_model_deployment(self) -> None: +# self._test_snowservice_deployment( +# model_name="keras_model", +# model_version=uuid.uuid4().hex, +# prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_keras_model, +# prediction_assert_fn=lambda local_prediction, remote_prediction: np.testing.assert_allclose( +# tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(remote_prediction)[0].numpy(), +# local_prediction[0], +# atol=1e-6, +# ), +# deployment_options={ +# "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, +# "target_method": "predict", +# "options": { +# "compute_pool": self._TEST_CPU_COMPUTE_POOL, +# "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), +# }, +# }, +# ) +# +# +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py new file mode 100644 
index 00000000..e0c52155 --- /dev/null +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py @@ -0,0 +1,132 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import uuid +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from unittest import SkipTest + +import pandas as pd +from absl.testing import absltest, parameterized + +from snowflake.ml.model import model_signature +from snowflake.ml.registry import model_registry +from snowflake.ml.utils import connection_params +from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session +from tests.integ.snowflake.ml.test_utils import ( + db_manager, + model_factory, + test_env_utils, +) + + +class TestModelRegistryIntegSnowServiceBase(parameterized.TestCase): + _SNOWSERVICE_CONNECTION_NAME = "snowservice" + _TEST_CPU_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL_STANDARD_2" + _TEST_GPU_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL_GPU_3" + _RUN_ID = uuid.uuid4().hex[:2] + _TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper() + _TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper() + _TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper() + _TEST_ROLE = "SYSADMIN" + + @classmethod + def setUpClass(cls) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + try: + login_options = connection_params.SnowflakeLoginOptions(connection_name=cls._SNOWSERVICE_CONNECTION_NAME) + except KeyError: + raise SkipTest( + "SnowService connection parameters not present: skipping " + "TestModelRegistryIntegWithSnowServiceDeployment." 
+ ) + + cls._session = Session.builder.configs( + { + **login_options, + **{"database": cls._TEST_DB, "schema": cls._TEST_SCHEMA}, + } + ).create() + + cls._db_manager = db_manager.DBManager(cls._session) + cls._db_manager.set_role(cls._TEST_ROLE) + cls._db_manager.cleanup_databases(expire_hours=6) + model_registry.create_model_registry( + session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA + ) + cls.registry = model_registry.ModelRegistry( + session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA + ) + cls._db_manager.create_image_repo(cls._TEST_IMAGE_REPO) + + @classmethod + def tearDownClass(cls) -> None: + cls._db_manager.drop_image_repo(cls._TEST_IMAGE_REPO) + cls._db_manager.drop_database(cls._TEST_DB) + cls._session.close() + + def _test_snowservice_deployment( + self, + model_name: str, + model_version: str, + prepare_model_and_feature_fn: Callable[[], Tuple[Any, Any]], + deployment_options: Dict[str, Any], + prediction_assert_fn: Callable[[Any, Union[pd.DataFrame, SnowparkDataFrame]], Any], + pip_requirements: Optional[List[str]] = None, + conda_dependencies: Optional[List[str]] = None, + embed_local_ml_library: Optional[bool] = True, + ): + + model, test_features, *_ = prepare_model_and_feature_fn() + target_method = deployment_options["target_method"] + + if hasattr(model, "predict_with_device"): + local_prediction = model.predict_with_device(test_features, model_factory.DEVICE.CPU) + else: + local_prediction = getattr(model, target_method)(test_features) + + # In test, latest snowpark version might not be in conda channel yet, which can cause image build to fail. + # Instead we rely on snowpark version on information.schema table. Note that this will not affect end user + # as by the time they use it, the latest snowpark should be available in conda already. 
+ conda_dependencies = conda_dependencies or [] + conda_dependencies.append( + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ) + + self.registry.log_model( + model_name=model_name, + model_version=model_version, + model=model, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + signatures={target_method: model_signature.infer_signature(test_features, local_prediction)}, + options={"embed_local_ml_library": embed_local_ml_library}, + ) + + model_ref = model_registry.ModelReference( + registry=self.registry, model_name=model_name, model_version=model_version + ) + + deployment_name = f"{model_name}_{model_version}_deployment" + deployment_options["deployment_name"] = deployment_name + model_ref.deploy(**deployment_options) + + remote_prediction = model_ref.predict(deployment_name, test_features) + prediction_assert_fn(local_prediction, remote_prediction) + + model_deployment_list = model_ref.list_deployments().to_pandas() # type: ignore[attr-defined] + self.assertEqual(model_deployment_list.shape[0], 1) + self.assertEqual(model_deployment_list["MODEL_NAME"][0], model_name) + self.assertEqual(model_deployment_list["MODEL_VERSION"][0], model_version) + self.assertEqual(model_deployment_list["DEPLOYMENT_NAME"][0], deployment_name) + + model_ref.delete_deployment(deployment_name=deployment_name) # type: ignore[attr-defined] + self.assertEqual(model_ref.list_deployments().to_pandas().shape[0], 0) # type: ignore[attr-defined] + + self.assertEqual(self.registry.list_models().to_pandas().shape[0], 1) + self.registry.delete_model(model_name=model_name, model_version=model_version, delete_artifact=True) + self.assertEqual(self.registry.list_models().to_pandas().shape[0], 0) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py 
b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py new file mode 100644 index 00000000..3f29f0c9 --- /dev/null +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import uuid + +import pandas as pd +import pytest +from absl.testing import absltest + +from snowflake.ml.model import deploy_platforms +from tests.integ.snowflake.ml.registry.model_registry_integ_test_snowservice_base import ( + TestModelRegistryIntegSnowServiceBase, +) +from tests.integ.snowflake.ml.test_utils import model_factory + + +class TestModelRegistryIntegWithSnowServiceDeployment(TestModelRegistryIntegSnowServiceBase): + @pytest.mark.pip_incompatible + def test_snowml_model_deployment_xgboost(self) -> None: + self._test_snowservice_deployment( + model_name="xgboost_model", + model_version=uuid.uuid4().hex, + prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_xgb, + prediction_assert_fn=lambda local_prediction, remote_prediction: pd.testing.assert_frame_equal( + remote_prediction, local_prediction, check_dtype=False + ), + deployment_options={ + "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, + "target_method": "predict", + "options": { + "compute_pool": self._TEST_CPU_COMPUTE_POOL, + "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + }, + }, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py deleted file mode 100644 index 9f00b38a..00000000 --- a/tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py +++ /dev/null @@ -1,224 +0,0 @@ -# -# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
-# -import functools -import tempfile -import uuid -from typing import Any, Callable, Dict, List, Optional, Tuple -from unittest import SkipTest - -import numpy as np -import pandas as pd -import pytest -from absl.testing import absltest, parameterized - -from snowflake.ml.model import _deployer -from snowflake.ml.registry import model_registry -from snowflake.ml.utils import connection_params -from snowflake.snowpark import Session -from tests.integ.snowflake.ml.test_utils import db_manager, model_factory - - -class TestModelRegistryIntegWithSnowServiceDeployment(parameterized.TestCase): - _SNOWSERVICE_CONNECTION_NAME = "snowservice" - _TEST_CPU_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL" - _TEST_GPU_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL_GPU_3" - _RUN_ID = uuid.uuid4().hex[:2] - _TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper() - _TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper() - _TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper() - _TEST_ROLE = "SYSADMIN" - _TEST_WAREHOUSE = "SNOW_ML_XSMALL" - - @classmethod - def setUpClass(cls) -> None: - """Creates Snowpark and Snowflake environments for testing.""" - try: - login_options = connection_params.SnowflakeLoginOptions(connection_name=cls._SNOWSERVICE_CONNECTION_NAME) - except KeyError: - raise SkipTest( - "SnowService connection parameters not present: skipping " - "TestModelRegistryIntegWithSnowServiceDeployment." 
- ) - - cls._session = Session.builder.configs( - { - **login_options, - **{"database": cls._TEST_DB, "schema": cls._TEST_SCHEMA}, - } - ).create() - - cls._db_manager = db_manager.DBManager(cls._session) - cls._db_manager.set_role(cls._TEST_ROLE) - cls._db_manager.set_warehouse(cls._TEST_WAREHOUSE) - model_registry.create_model_registry( - session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA - ) - cls.registry = model_registry.ModelRegistry( - session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA - ) - cls._db_manager.create_image_repo(cls._TEST_IMAGE_REPO) - - @classmethod - def tearDownClass(cls) -> None: - cls._db_manager.drop_image_repo(cls._TEST_IMAGE_REPO) - cls._db_manager.drop_database(cls._TEST_DB) - cls._session.close() - - def _test_snowservice_deployment( - self, - model_name: str, - model_version: str, - prepare_model_and_feature_fn: Callable[[], Tuple[Any, Any]], - deployment_options: Dict[str, Any], - conda_dependencies: Optional[List[str]] = None, - embed_local_ml_library: Optional[bool] = True, - ): - - model, test_features, *_ = prepare_model_and_feature_fn() - - self.registry.log_model( - model_name=model_name, - model_version=model_version, - model=model, - conda_dependencies=conda_dependencies, - sample_input_data=test_features, - options={"embed_local_ml_library": embed_local_ml_library}, - ) - - model_ref = model_registry.ModelReference( - registry=self.registry, model_name=model_name, model_version=model_version - ) - - deployment_name = f"{model_name}_{model_version}_deployment" - deployment_options["deployment_name"] = deployment_name - model_ref.deploy(**deployment_options) - target_method = deployment_options["target_method"] - local_prediction = getattr(model, target_method)(test_features) - remote_prediction = model_ref.predict(deployment_name, test_features) - - if isinstance(local_prediction, np.ndarray): - np.testing.assert_allclose(remote_prediction.to_numpy(), 
np.expand_dims(local_prediction, axis=1)) - else: - pd.testing.assert_frame_equal(remote_prediction, local_prediction, check_dtype=False) - - model_deployment_list = model_ref.list_deployments().to_pandas() # type: ignore[attr-defined] - self.assertEqual(model_deployment_list.shape[0], 1) - self.assertEqual(model_deployment_list["MODEL_NAME"][0], model_name) - self.assertEqual(model_deployment_list["MODEL_VERSION"][0], model_version) - self.assertEqual(model_deployment_list["DEPLOYMENT_NAME"][0], deployment_name) - - model_ref.delete_deployment(deployment_name=deployment_name) # type: ignore[attr-defined] - self.assertEqual(model_ref.list_deployments().to_pandas().shape[0], 0) # type: ignore[attr-defined] - - self.assertEqual(self.registry.list_models().to_pandas().shape[0], 1) - self.registry.delete_model(model_name=model_name, model_version=model_version, delete_artifact=True) - self.assertEqual(self.registry.list_models().to_pandas().shape[0], 0) - - # TODO: doesnt work, Mismatched elements: 10 / 100 (10%). could be due to version mismatch? 
- @pytest.mark.pip_incompatible - def test_sklearn_deployment_with_snowml_conda(self) -> None: - self._test_snowservice_deployment( - model_name="test_sklearn_model", - model_version=uuid.uuid4().hex, - prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, - embed_local_ml_library=False, - conda_dependencies=["snowflake-ml-python==1.0.2"], - deployment_options={ - "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, - "target_method": "predict", - "options": { - "compute_pool": self._TEST_CPU_COMPUTE_POOL, - "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), - }, - }, - ) - - @pytest.mark.pip_incompatible - def test_sklearn_deployment_with_local_source_code(self) -> None: - self._test_snowservice_deployment( - model_name="test_sklearn_model", - model_version=uuid.uuid4().hex, - prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, - deployment_options={ - "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, - "target_method": "predict", - "options": { - "compute_pool": self._TEST_CPU_COMPUTE_POOL, - "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), - }, - }, - ) - - @pytest.mark.pip_incompatible - def test_sklearn_deployment(self) -> None: - self._test_snowservice_deployment( - model_name="test_sklearn_model", - model_version=uuid.uuid4().hex, - prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, - deployment_options={ - "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, - "target_method": "predict", - "options": { - "compute_pool": self._TEST_CPU_COMPUTE_POOL, - "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), - }, - }, - ) - - @pytest.mark.pip_incompatible - def test_huggingface_deployment(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - self._test_snowservice_deployment( - model_name="gpt2_model_gpu", - 
model_version=uuid.uuid4().hex, - conda_dependencies=["pytorch", "transformers"], - prepare_model_and_feature_fn=functools.partial( - model_factory.ModelFactory.prepare_gpt2_model, local_cache_dir=tmpdir - ), - deployment_options={ - "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, - "target_method": "predict", - "options": { - "compute_pool": self._TEST_GPU_COMPUTE_POOL, - "use_gpu": True, - "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), - }, - }, - ) - - @pytest.mark.pip_incompatible - def test_snowml_model_deployment_logistic_with_sourcecode_embedded_in_model(self) -> None: - self._test_snowservice_deployment( - model_name="snowml", - model_version=uuid.uuid4().hex, - prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_logistic, - deployment_options={ - "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, - "target_method": "predict", - "options": { - "compute_pool": self._TEST_GPU_COMPUTE_POOL, - "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), - }, - }, - ) - - # - # TODO[schen], SNOW-861613, investigate xgboost model prediction hanging issue when run with Gunicorn --preload - # def test_snowml_model_deployment_xgboost(self) -> None: - # self._test_snowservice_deployment( - # model_name="snowml", - # model_version=uuid.uuid4().hex, - # prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model, - # deployment_options={ - # "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, - # "target_method": "predict", - # "options": { - # "compute_pool": self._TEST_GPU_COMPUTE_POOL, - # } - # }, - # ) - - -if __name__ == "__main__": - absltest.main() diff --git a/tests/integ/snowflake/ml/test_utils/BUILD.bazel b/tests/integ/snowflake/ml/test_utils/BUILD.bazel index 117ac691..eadfcc29 100644 --- a/tests/integ/snowflake/ml/test_utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/test_utils/BUILD.bazel @@ -24,3 +24,13 
@@ py_library( "//snowflake/ml/modeling/xgboost:xgb_classifier", ] ) + +py_library( + name = "test_env_utils", + testonly = True, + srcs = ["test_env_utils.py"], + deps = [ + "//snowflake/ml/_internal/utils:query_result_checker", + "//snowflake/ml/_internal:env", + ], +) diff --git a/tests/integ/snowflake/ml/test_utils/db_manager.py b/tests/integ/snowflake/ml/test_utils/db_manager.py index 29054192..fffb027a 100644 --- a/tests/integ/snowflake/ml/test_utils/db_manager.py +++ b/tests/integ/snowflake/ml/test_utils/db_manager.py @@ -54,10 +54,10 @@ def drop_database(self, db_name: str, if_exists: bool = False) -> None: if_exists_sql = " IF EXISTS" if if_exists else "" self._session.sql(f"DROP DATABASE{if_exists_sql} {actual_db_name}").collect() - def cleanup_databases(self, expire_days: int = 3) -> None: + def cleanup_databases(self, expire_hours: int = 72) -> None: databases_df = self.show_databases(f"{_COMMON_PREFIX}%") stale_databases = databases_df.filter( - f"\"created_on\" < dateadd('day', {-expire_days}, current_timestamp())" + f"\"created_on\" < dateadd('hour', {-expire_hours}, current_timestamp())" ).collect() for stale_db in stale_databases: self.drop_database(stale_db.name, if_exists=True) diff --git a/tests/integ/snowflake/ml/test_utils/model_factory.py b/tests/integ/snowflake/ml/test_utils/model_factory.py index 632f5a90..159fa982 100644 --- a/tests/integ/snowflake/ml/test_utils/model_factory.py +++ b/tests/integ/snowflake/ml/test_utils/model_factory.py @@ -1,22 +1,34 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# - +from enum import Enum from typing import List, Tuple import numpy as np import numpy.typing as npt import pandas as pd +import tensorflow as tf +import torch from sklearn import datasets, svm from snowflake.ml.model import custom_model -from snowflake.ml.modeling.linear_model import LogisticRegression -from snowflake.ml.modeling.pipeline import Pipeline -from snowflake.ml.modeling.preprocessing import MinMaxScaler, OneHotEncoder -from snowflake.ml.modeling.xgboost import XGBClassifier +from snowflake.ml.modeling.linear_model import ( # type: ignore[attr-defined] + LogisticRegression, +) +from snowflake.ml.modeling.pipeline import Pipeline # type: ignore[attr-defined] +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] + MinMaxScaler, + OneHotEncoder, +) +from snowflake.ml.modeling.xgboost import XGBClassifier # type: ignore[attr-defined] from snowflake.snowpark import DataFrame, Session +class DEVICE(Enum): + CUDA = "cuda" + CPU = "cpu" + + class ModelFactory: @staticmethod def prepare_sklearn_model() -> Tuple[svm.SVC, npt.ArrayLike, npt.ArrayLike]: @@ -42,7 +54,7 @@ def one_vs_all(dataset: npt.NDArray[np.float64], digit: int) -> List[bool]: return clf, test_features, test_labels @staticmethod - def prepare_snowml_model() -> Tuple[XGBClassifier, pd.DataFrame]: + def prepare_snowml_model_xgb() -> Tuple[XGBClassifier, pd.DataFrame]: iris = datasets.load_iris() df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] @@ -158,3 +170,110 @@ def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: test_data = pd.DataFrame(["Hello, how are you?", "Once upon a time"]) return gpt2_model, test_data + + @staticmethod + def prepare_torch_model( + dtype: torch.dtype = torch.float32, force_remote_gpu_inference: bool = False + ) -> Tuple[torch.nn.Module, List[torch.Tensor], List[torch.Tensor]]: + class 
TorchModel(torch.nn.Module): + def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: + super().__init__() + self.model = torch.nn.Sequential( + torch.nn.Linear(n_input, n_hidden, dtype=dtype), + torch.nn.ReLU(), + torch.nn.Linear(n_hidden, n_out, dtype=dtype), + torch.nn.Sigmoid(), + ) + + def forward_training(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: + return [self.model(tensors[0])] + + def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: + device = DEVICE.CUDA if force_remote_gpu_inference else DEVICE.CPU + return self.predict_with_device(tensors, device) + + def predict_with_device(self, tensors: List[torch.Tensor], device: DEVICE) -> List[torch.Tensor]: + self.model.eval() + self.model.to(device.value) + with torch.no_grad(): + tensors = [tensor.to(device.value) for tensor in tensors] + return [self.model(tensors[0])] + + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + data_x = [torch.from_numpy(x).to(dtype=dtype)] + data_y = [(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype)] + + model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) + loss_function = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + for _epoch in range(100): + pred_y = model.forward_training(data_x) + loss = loss_function(pred_y[0], data_y[0]) + optimizer.zero_grad() + loss.backward() + optimizer.step() + return model, data_x, data_y + + def prepare_jittable_torch_model( + dtype: torch.dtype = torch.float32, force_remote_gpu_inference: bool = False + ) -> Tuple[torch.nn.Module, List[torch.Tensor], List[torch.Tensor]]: + class TorchModel(torch.nn.Module): + def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: + super().__init__() + self.model = torch.nn.Sequential( + torch.nn.Linear(n_input, n_hidden, dtype=dtype), + torch.nn.ReLU(), + 
torch.nn.Linear(n_hidden, n_out, dtype=dtype), + torch.nn.Sigmoid(), + ) + + def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: + return [self.model(tensors[0])] + + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + data_x = [torch.from_numpy(x).to(dtype=dtype)] + data_y = [(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype)] + + model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) + loss_function = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + for _epoch in range(100): + pred_y = model(data_x) + loss = loss_function(pred_y[0], data_y[0]) + optimizer.zero_grad() + loss.backward() + optimizer.step() + return model, data_x, data_y + + @staticmethod + def prepare_keras_model( + dtype: tf.dtypes.DType = tf.float32, + ) -> Tuple[tf.keras.Model, List[tf.Tensor], List[tf.Tensor]]: + class KerasModel(tf.keras.Model): + def __init__(self, n_hidden: int, n_out: int) -> None: + super().__init__() + self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") + self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") + + def call(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: + input = tensors[0] + x = self.fc_1(input) + x = self.fc_2(x) + return [x] + + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + data_x = [tf.convert_to_tensor(x, dtype=dtype)] + raw_data_y = tf.random.uniform((batch_size, 1)) + raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) + data_y = [tf.cast(raw_data_y, dtype=dtype)] + + def loss_fn(y_true: List[tf.Tensor], y_pred: List[tf.Tensor]) -> tf.Tensor: + return tf.keras.losses.mse(y_true[0], y_pred[0]) + + model = KerasModel(n_hidden, n_out) + model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=loss_fn) + model.fit(data_x, data_y, batch_size=batch_size, 
epochs=100) + return model, data_x, data_y diff --git a/tests/integ/snowflake/ml/test_utils/test_env_utils.py b/tests/integ/snowflake/ml/test_utils/test_env_utils.py new file mode 100644 index 00000000..4b846027 --- /dev/null +++ b/tests/integ/snowflake/ml/test_utils/test_env_utils.py @@ -0,0 +1,59 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import functools +import importlib +import textwrap + +from packaging import version + +import snowflake.connector +from snowflake.ml._internal import env +from snowflake.ml._internal.utils import query_result_checker +from snowflake.snowpark import session + + +@functools.lru_cache +def is_in_pip_env() -> bool: + try: + importlib.import_module("conda") + return False + except ModuleNotFoundError: + return True + + +@functools.lru_cache +def get_latest_package_versions_in_server( + session: session.Session, package_name: str, python_version: str = env.PYTHON_VERSION +) -> str: + parsed_python_version = version.Version(python_version) + sql = textwrap.dedent( + f""" + SELECT PACKAGE_NAME, VERSION + FROM information_schema.packages + WHERE package_name = '{package_name}' + AND language = 'python' + AND runtime_version = '{parsed_python_version.major}.{parsed_python_version.minor}'; + """ + ) + + version_list = [] + try: + result = ( + query_result_checker.SqlResultValidator( + session=session, + query=sql, + ) + .has_column("VERSION") + .has_dimensions(expected_rows=None, expected_cols=2) + .validate() + ) + for row in result: + req_ver = version.parse(row["VERSION"]) + version_list.append(req_ver) + except snowflake.connector.DataError: + return package_name + if len(version_list) == 0: + return package_name + return f"{package_name}=={max(version_list)}" diff --git a/third_party/rules_conda/conda.bzl b/third_party/rules_conda/conda.bzl index 3e85b21d..3af945a7 100644 --- a/third_party/rules_conda/conda.bzl +++ b/third_party/rules_conda/conda.bzl @@ -1,4 +1,4 @@ -load(":utils.bzl", 
"CONDA_EXT_MAP", "EXECUTE_TIMEOUT", "INSTALLER_SCRIPT_EXT_MAP", "execute_waitable_windows", "get_arch", "get_os", "get_path_envar", "windowsify") +load(":utils.bzl", "CONDA_EXT_MAP", "EXECUTE_TIMEOUT", "INSTALLER_SCRIPT_EXT_MAP", "ENV_VAR_SEPARATOR_MAP", "PYTHON_EXT_MAP", "execute_waitable_windows", "get_arch", "get_os", "get_path_envar", "windowsify") # libmamba version LIBMAMBA_SOLVER_VERSION = "23.1.0" @@ -79,20 +79,26 @@ def _install_conda(rctx, installer): path_envar = get_path_envar(rctx) + args = [rctx.path(installer)] + installer_flags + # Strip environment variables when installing conda to make sure # any activated conda environment would not affect the installer. # Also, since the installer writes to $HOME which breaks hermecity, we make it # write to the repo dir. - args = ["env", "-i", "HOME={}".format(rctx.attr.conda_dir), "PATH={}".format(path_envar), rctx.path(installer)] + installer_flags + install_conda_env = { + "HOME": rctx.attr.conda_dir, + "PATH": path_envar + } # execute installer with flags adjusted to OS if os == "Windows": + install_conda_env.update({"CONDA_DLL_SEARCH_MODIFICATION_ENABLE": ""}) # TODO: fix always returning 0 # it seems that either miniconda installer returns 0 even on failure or the wrapper does something wrong # also stdout and stderr are always empty - result = execute_waitable_windows(rctx, args, quiet = rctx.attr.quiet, environment = {"CONDA_DLL_SEARCH_MODIFICATION_ENABLE": ""}, timeout = rctx.attr.timeout) + result = execute_waitable_windows(rctx, args, quiet = rctx.attr.quiet, environment = install_conda_env, timeout = rctx.attr.timeout) else: - result = rctx.execute(args, quiet = rctx.attr.quiet, timeout = rctx.attr.timeout) + result = rctx.execute(args, quiet = rctx.attr.quiet, environment = install_conda_env, timeout = rctx.attr.timeout) if result.return_code: fail("Failure installing conda.\nstdout: {}\nstderr: {}".format(result.stdout, result.stderr)) @@ -103,17 +109,21 @@ def _install_conda(rctx, installer): # 
Therefore, we expose the python interpreter (thus the right bin/ path) so that later invocations # of the conda command can setup the right PATH env. conda_entrypoint = rctx.path("{}/condabin/conda{}".format(rctx.attr.conda_dir, CONDA_EXT_MAP[os])) - conda_base_python = rctx.path("{}/bin/python".format(rctx.attr.conda_dir)) + python_executable = "python{}".format(PYTHON_EXT_MAP[os]) + interpreter_path = python_executable if os == "Windows" else "bin/{}".format(python_executable) + python = rctx.path("{}/{}".format( + rctx.attr.conda_dir, + interpreter_path, + )) + additional_paths = [str(rctx.path(python).dirname)] + if os == "Windows": + additional_paths = additional_paths + [str(rctx.path("{}/Library/bin".format(rctx.attr.conda_dir)))] + additional_paths = additional_paths + [path_envar] + actual_environment = {"HOME": rctx.attr.conda_dir, "PATH": ENV_VAR_SEPARATOR_MAP[os].join(additional_paths)} # install mamba solver. install_mamba_result = rctx.execute( [ - # strip all environment variables except for PATH. - # this is to prevent the conda_entrypoint from recognizing the conda environment (if any) - # where bazel was invoked. 
- "env", - "-i", - "PATH={}:{}".format(conda_base_python.dirname, path_envar), conda_entrypoint, "install", "-n", @@ -122,15 +132,18 @@ def _install_conda(rctx, installer): "-y", ], quiet = rctx.attr.quiet, + working_directory = rctx.attr.conda_dir, + environment = actual_environment, timeout = rctx.attr.timeout, ) + if install_mamba_result.return_code: print("Failure installing conda-libmamba-solver.") print("stdout: \n", result.stdout) print("stderr: \n", result.stderr) fail("Failure installing conda-libmamba-solver.") - return conda_entrypoint, conda_base_python + return conda_entrypoint, python # create BUILD file with exposed conda binary def _create_conda_build_file(rctx, conda_entrypoint, conda_base_python): diff --git a/third_party/rules_conda/env.bzl b/third_party/rules_conda/env.bzl index 8203a272..be3629d7 100644 --- a/third_party/rules_conda/env.bzl +++ b/third_party/rules_conda/env.bzl @@ -1,4 +1,4 @@ -load(":utils.bzl", "CONDA_EXT_MAP", "EXECUTE_TIMEOUT", "PYTHON_EXT_MAP", "get_os", "get_path_envar") +load(":utils.bzl", "CONDA_EXT_MAP", "ENV_VAR_SEPARATOR_MAP", "EXECUTE_TIMEOUT", "PYTHON_EXT_MAP", "get_os", "get_path_envar") # This excluded violating file is from arrow-cpp 10.0.1 in Snowflake Anaconda Channel. 
@@ -23,25 +23,34 @@ filegroup( def _conda_cmd(rctx, conda_args, environment = {}): path_envar = get_path_envar(rctx) + os = get_os(rctx) + python_executable = "python{}".format(PYTHON_EXT_MAP[os]) + interpreter_path = python_executable if os == "Windows" else "bin/{}".format(python_executable) conda_entrypoint = Label("@{}//:{}/condabin/conda{}".format( rctx.attr.conda_repo, rctx.attr.conda_dir, - CONDA_EXT_MAP[get_os(rctx)], + CONDA_EXT_MAP[os], )) - python = Label("@{}//:{}/bin/python".format( + python = Label("@{}//:{}/{}".format( rctx.attr.conda_repo, rctx.attr.conda_dir, + interpreter_path, )) - actual_environment = {"PATH": "{}:{}".format(rctx.path(python).dirname, path_envar)} + + additional_paths = [str(rctx.path(python).dirname)] + if os == "Windows": + additional_paths = additional_paths + [str(rctx.path("{}/Library/bin".format(rctx.attr.conda_repo, rctx.attr.conda_dir)))] + additional_paths = additional_paths + [path_envar] + actual_environment = {"PATH": ENV_VAR_SEPARATOR_MAP[os].join(additional_paths)} actual_environment.update(environment) - environment_args = ["{}={}".format(env, val) for env, val in actual_environment.items()] return rctx.execute( # all environment variables are stripped. PATH and environment variables passed in are # added above. this is to prevent the conda_entrypoint from recognizing the conda # environment (if any) where bazel was invoked. 
- ["env", "-i"] + environment_args + [conda_entrypoint] + conda_args, + [conda_entrypoint] + conda_args, quiet = rctx.attr.quiet, + environment = actual_environment, timeout = rctx.attr.timeout, ) diff --git a/third_party/rules_conda/utils.bzl b/third_party/rules_conda/utils.bzl index 471c6d3f..077b8637 100644 --- a/third_party/rules_conda/utils.bzl +++ b/third_party/rules_conda/utils.bzl @@ -16,6 +16,12 @@ PYTHON_EXT_MAP = { "Linux": "", } +ENV_VAR_SEPARATOR_MAP = { + "Windows": ";", + "MacOSX": ":", + "Linux": ":", +} + EXECUTE_TIMEOUT = 3600 def get_os(rctx): @@ -90,9 +96,27 @@ def execute_waitable_windows(rctx, args, environment = {}, tmp_script = "tmp.bat def windowsify(path): return str(path).replace("/", "\\") +PATH_SCRIPT = """ +@echo off +call echo %PATH% +set "EXITCODE=%ERRORLEVEL%" +if "%OS%"=="Windows_NT" ( endlocal & exit /b "%EXITCODE%" ) +exit /b "%EXITCODE%"" +""" + # Returns a clean PATH environment variable sufficient for conda installer and commands. def get_path_envar(rctx): - getconf_result = rctx.execute(["getconf", "PATH"]) + os = get_os(rctx) + if os == "Windows": + tmp_script = "tmp.bat" + rctx.file( + tmp_script, + content = PATH_SCRIPT, + ) + getconf_result = rctx.execute([rctx.path(tmp_script)]) + rctx.delete(tmp_script) + else: + getconf_result = rctx.execute(["getconf", "PATH"]) if getconf_result.return_code: fail("Unable to get PATH.\nstderr: {}".format(getconf_result.stderr)) return getconf_result.stdout.strip() diff --git a/third_party/rules_python_description_content_type.patch b/third_party/rules_python_description_content_type.patch deleted file mode 100644 index 9e71403d..00000000 --- a/third_party/rules_python_description_content_type.patch +++ /dev/null @@ -1,127 +0,0 @@ -From ea505469e55f50ab59b0253bfedbef27063bd903 Mon Sep 17 00:00:00 2001 -From: Zhuo Peng -Date: Thu, 15 Jun 2023 14:14:58 -0700 -Subject: [PATCH] description_content_type - -add summary ---- - python/packaging.bzl | 14 ++++++++++++++ - tools/wheelmaker.py | 16 
++++++++++++++++ - 2 files changed, 30 insertions(+) - -diff --git python/packaging.bzl python/packaging.bzl -index 19b5894..e7e92e8 100644 ---- python/packaging.bzl -+++ python/packaging.bzl -@@ -221,6 +221,12 @@ def _py_wheel_impl(ctx): - args.add("--description_file", description_file) - other_inputs.append(description_file) - -+ if ctx.attr.summary: -+ args.add("--summary", ctx.attr.summary) -+ -+ if ctx.attr.description_content_type: -+ args.add("--description_content_type", ctx.attr.description_content_type) -+ - ctx.actions.run( - inputs = depset(direct = other_inputs, transitive = [inputs_to_package]), - outputs = [outfile, name_file], -@@ -352,6 +358,10 @@ _other_attrs = { - "classifiers": attr.string_list( - doc = "A list of strings describing the categories for the package. For valid classifiers see https://pypi.org/classifiers", - ), -+ "description_content_type": attr.string( -+ doc = "The type of contents in description_file. See https://packaging.python.org/en/latest/specifications/core-metadata/#description-content-type", -+ default = "", -+ ), - "description_file": attr.label( - doc = "A file containing text describing the package in a single line.", - allow_single_file = True, -@@ -377,6 +387,10 @@ _other_attrs = { - default = [], - doc = "path prefixes to strip from files added to the generated package", - ), -+ "summary": attr.string( -+ doc = "A one-line summary of what the package does", -+ default = "", -+ ), - } - - py_wheel = rule( -diff --git tools/wheelmaker.py tools/wheelmaker.py -index fb8e37b..6a90062 100644 ---- tools/wheelmaker.py -+++ tools/wheelmaker.py -@@ -171,10 +171,12 @@ Root-Is-Purelib: {} - self, - extra_headers, - description, -+ description_content_type, - classifiers, - python_requires, - requires, - extra_requires, -+ summary, - ): - """Write METADATA file to the distribution.""" - # https://www.python.org/dev/peps/pep-0566/ -@@ -183,11 +185,15 @@ Root-Is-Purelib: {} - metadata.append("Metadata-Version: 2.1") - 
metadata.append("Name: %s" % self._name) - metadata.append("Version: %s" % self._version) -+ if description_content_type: -+ metadata.append("Description-Content-Type: %s" % description_content_type) - metadata.extend(extra_headers) - for classifier in classifiers: - metadata.append("Classifier: %s" % classifier) - if python_requires: - metadata.append("Requires-Python: %s" % python_requires) -+ if summary: -+ metadata.append("Summary: %s" % summary) - for requirement in requires: - metadata.append("Requires-Dist: %s" % requirement) - -@@ -323,9 +329,15 @@ def parse_args() -> argparse.Namespace: - wheel_group.add_argument( - "--python_requires", help="Version of python that the wheel will work with" - ) -+ wheel_group.add_argument( -+ "--summary", help="A one-line summary of what the package does" -+ ) - wheel_group.add_argument( - "--description_file", help="Path to the file with package description" - ) -+ wheel_group.add_argument( -+ "--description_content_type", help="Content type of the package description" -+ ) - wheel_group.add_argument( - "--entry_points_file", - help="Path to a correctly-formatted entry_points.txt file", -@@ -429,6 +441,7 @@ def main() -> None: - arguments.description_file, "rt", encoding="utf-8" - ) as description_file: - description = description_file.read() -+ description_content_type = arguments.description_content_type - - extra_requires = collections.defaultdict(list) - if arguments.extra_requires: -@@ -439,14 +452,17 @@ def main() -> None: - python_requires = arguments.python_requires or "" - requires = arguments.requires or [] - extra_headers = arguments.header or [] -+ summary = arguments.summary - - maker.add_metadata( - extra_headers=extra_headers, - description=description, -+ description_content_type=description_content_type, - classifiers=classifiers, - python_requires=python_requires, - requires=requires, - extra_requires=extra_requires, -+ summary=summary, - ) - - if arguments.entry_points_file: --- -2.39.2 (Apple Git-143)