From b938743b2b7534c97b56ba5be9b926184533ad7d Mon Sep 17 00:00:00 2001 From: Kiran Dama <69480841+sfc-gh-kdama@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:21:23 -0800 Subject: [PATCH] Release snowflake-ml-python: 1.0.12 Co-authored-by: Snowflake Authors --- .bazelrc | 6 + .pre-commit-config.yaml | 5 + CHANGELOG.md | 26 + WORKSPACE | 16 +- bazel/environments/BUILD.bazel | 1 + bazel/environments/conda-env-snowflake.yml | 1 + bazel/environments/conda-env.yml | 2 + bazel/environments/conda-gpu-env.yml | 68 ++ bazel/environments/fetch_conda_env_config.bzl | 4 + bazel/platforms/BUILD.bazel | 31 +- bazel/py_rules.bzl | 38 +- bazel/requirements/BUILD.bazel | 5 + .../parse_and_generate_requirements.py | 25 +- ci/RunBazelAction.sh | 20 +- ci/build_and_run_tests.sh | 19 +- ci/conda_recipe/meta.yaml | 5 +- ci/get_excluded_tests.sh | 15 +- ci/skip_continuous_run_targets | 2 + ci/skip_merge_gate_targets | 1 + ...red_targets => skip_type_checking_targets} | 1 - ci/type_check.sh | 12 +- requirements.txt | 6 +- requirements.yml | 16 +- snowflake/ml/_internal/BUILD.bazel | 6 + snowflake/ml/_internal/env_utils.py | 326 ++++--- snowflake/ml/_internal/env_utils_test.py | 800 +++++------------- snowflake/ml/_internal/file_utils.py | 48 ++ snowflake/ml/_internal/file_utils_test.py | 2 +- snowflake/ml/_internal/migrator_utils.py | 3 + snowflake/ml/_internal/utils/BUILD.bazel | 31 + snowflake/ml/_internal/utils/identifier.py | 5 +- .../_internal/utils/log_stream_processor.py | 30 + .../utils/log_stream_processor_test.py | 61 ++ .../ml/_internal/utils/sql_identifier.py | 82 ++ .../ml/_internal/utils/sql_identifier_test.py | 47 + snowflake/ml/feature_store/BUILD.bazel | 1 + .../_internal/scripts/upload_test_datasets.py | 68 +- snowflake/ml/feature_store/entity.py | 7 +- snowflake/ml/feature_store/feature_store.py | 327 +++---- snowflake/ml/feature_store/feature_view.py | 138 ++- .../Time_Series_Feature_Demo.ipynb | 609 ++----------- .../feature_store_case_sensitivity_test.py | 11 +- .../feature_store/tests/feature_store_test.py | 81 +- snowflake/ml/model/BUILD.bazel | 122 +-- snowflake/ml/model/_api.py | 544 ++++++++++++ .../_deploy_client/image_builds/BUILD.bazel | 6 +- .../image_builds/docker_context.py | 20 +- .../image_builds/docker_context_test.py | 24 +- .../image_builds/inference_server/BUILD.bazel | 23 +- .../image_builds/inference_server/gpu_test.py | 12 + .../image_builds/inference_server/main.py | 47 +- .../inference_server/main_test.py | 6 +- .../inference_server/main_vllm_test.py | 129 +++ .../image_builds/server_image_builder.py | 13 +- .../templates/dockerfile_template | 10 +- .../templates/image_build_job_spec_template | 2 +- .../test_fixtures/dockerfile_test_fixture | 9 +- .../dockerfile_test_fixture_with_CUDA | 9 +- .../dockerfile_test_fixture_with_model | 12 +- .../_deploy_client/snowservice/BUILD.bazel | 2 +- .../_deploy_client/snowservice/deploy.py | 113 ++- .../snowservice/deploy_options.py | 3 + .../_deploy_client/snowservice/deploy_test.py | 27 +- .../ml/model/_deploy_client/utils/BUILD.bazel | 1 + .../model/_deploy_client/utils/constants.py | 2 +- .../utils/image_registry_client.py | 2 + .../utils/snowservice_client.py | 83 +- .../utils/snowservice_client_test.py | 85 +- .../_deploy_client/warehouse/BUILD.bazel | 4 +- .../model/_deploy_client/warehouse/deploy.py | 58 +- .../_deploy_client/warehouse/deploy_test.py | 159 ++-- .../warehouse/infer_template.py | 96 ++- snowflake/ml/model/_deployer.py | 305 ------- snowflake/ml/model/_env.py | 151 ---- snowflake/ml/model/_env_test.py | 142 
---- snowflake/ml/model/_handlers/_base.py | 87 -- snowflake/ml/model/_handlers/llm.py | 178 ---- snowflake/ml/model/_handlers/mlflow.py | 318 ------- snowflake/ml/model/_model.py | 496 ----------- snowflake/ml/model/_model_meta.py | 505 ----------- snowflake/ml/model/_model_meta_test.py | 297 ------- snowflake/ml/model/_model_test.py | 346 -------- snowflake/ml/model/_module_model/BUILD.bazel | 28 + .../_module_model/module_manifest/BUILD.bazel | 8 + .../module_manifest/module_manifest.py | 2 + .../_module_model/module_method/BUILD.bazel | 28 + .../fixtures/handler_fixture_1.py_fixture | 78 ++ .../fixtures/handler_fixture_2.py_fixture | 78 ++ .../module_method/handler_generator.py | 43 + .../module_method/handler_generator_test.py | 46 + .../module_method/infer_handler.py_template | 78 ++ .../module_method/module_method.py | 2 + .../ml/model/_module_model/module_model.py | 137 +++ .../model/_module_model/module_model_test.py | 57 ++ .../_module_model/module_runtime/BUILD.bazel | 8 + .../module_runtime/module_runtime.py | 2 + snowflake/ml/model/_packager/BUILD.bazel | 68 ++ .../ml/model/_packager/model_env/BUILD.bazel | 23 + .../ml/model/_packager/model_env/model_env.py | 392 +++++++++ .../_packager/model_env/model_env_test.py | 781 +++++++++++++++++ .../model_handler.py} | 33 +- .../ml/model/_packager/model_handler_test.py | 62 ++ .../model_handlers}/BUILD.bazel | 80 +- .../model/_packager/model_handlers/_base.py | 161 ++++ .../model/_packager/model_handlers/_utils.py | 57 ++ .../model_handlers}/custom.py | 98 ++- .../model_handlers}/huggingface_pipeline.py | 169 ++-- .../ml/model/_packager/model_handlers/llm.py | 276 ++++++ .../model/_packager/model_handlers/mlflow.py | 233 +++++ .../model_handlers}/pytorch.py | 89 +- .../model_handlers}/sklearn.py | 88 +- .../model_handlers}/snowmlmodel.py | 105 +-- .../model_handlers}/tensorflow.py | 91 +- .../model_handlers}/torchscript.py | 89 +- .../model_handlers}/xgboost.py | 115 +-- .../model_handlers_migrator/BUILD.bazel | 24 + .../model_handlers_migrator/base_migrator.py | 36 + .../base_migrator_test.py | 68 ++ .../model_handlers_test}/BUILD.bazel | 18 +- .../model_handlers_test}/custom_test.py | 109 ++- .../huggingface_pipeline_test.py | 124 +-- .../model_handlers_test}/mlflow_test.py | 143 ++-- .../model_handlers_test}/pytorch_test.py | 95 ++- .../model_handlers_test}/sklearn_test.py | 99 ++- .../model_handlers_test}/snowmlmodel_test.py | 138 +-- .../model_handlers_test}/tensorflow_test.py | 116 ++- .../model_handlers_test}/torchscript_test.py | 50 +- .../model_handlers_test}/xgboost_test.py | 97 ++- .../ml/model/_packager/model_meta/BUILD.bazel | 74 ++ .../_packager/model_meta/model_blob_meta.py | 48 ++ .../model/_packager/model_meta/model_meta.py | 357 ++++++++ .../_packager/model_meta/model_meta_schema.py | 70 ++ .../model_meta/model_meta_schema_test.py | 21 + .../_packager/model_meta/model_meta_test.py | 293 +++++++ .../_packager/model_meta_migrator/BUILD.bazel | 55 ++ .../model_meta_migrator/base_migrator.py | 33 + .../model_meta_migrator/base_migrator_test.py | 54 ++ .../model_meta_migrator/migrator_plans.py | 21 + .../migrator_plans_test.py | 48 ++ .../model_meta_migrator/migrator_v1.py | 48 ++ .../model_meta_migrator/migrator_v1_test.py | 132 +++ .../ml/model/_packager/model_packager.py | 149 ++++ .../ml/model/_packager/model_packager_test.py | 260 ++++++ snowflake/ml/model/models/llm.py | 63 +- snowflake/ml/model/type_hints.py | 28 +- .../modeling/_internal/snowpark_handlers.py | 169 ++-- .../ml/modeling/metrics/classification.py | 
19 + .../_internal/_grid_search_cv.py | 30 +- .../_internal/_randomized_search_cv.py | 30 +- .../modeling/preprocessing/ordinal_encoder.py | 176 ++-- snowflake/ml/registry/BUILD.bazel | 3 +- snowflake/ml/registry/model_registry.py | 88 +- snowflake/ml/registry/model_registry_test.py | 17 +- ...t to Snowpark Container Service Demo.ipynb | 125 ++- .../notebooks/Finetune_Registry.ipynb | 497 +++++++++-- snowflake/ml/requirements.bzl | 61 +- snowflake/ml/test_utils/BUILD.bazel | 6 + snowflake/ml/test_utils/test_env_utils.py | 11 + snowflake/ml/utils/connection_params.py | 2 +- snowflake/ml/version.bzl | 2 +- .../integ/snowflake/ml/_internal/BUILD.bazel | 3 +- .../ml/_internal/grid_search_integ_test.py | 59 +- .../_internal/randomized_search_integ_test.py | 50 +- .../image_registry_client_integ_test.py | 18 +- tests/integ/snowflake/ml/model/BUILD.bazel | 54 +- .../deployment_to_snowservice_integ_test.py | 30 +- .../ml/model/model_badcase_integ_test.py | 33 +- .../ml/model/spcs_llm_model_integ_test.py | 175 ++-- .../warehouse_custom_model_integ_test.py | 50 +- ...e_huggingface_pipeline_model_integ_test.py | 56 +- .../warehouse_mlflow_model_integ_test.py | 10 +- .../model/warehouse_model_compat_v1_test.py | 689 +++++++++++++++ .../model/warehouse_model_integ_test_utils.py | 30 +- .../warehouse_pytorch_model_integ_test.py | 26 +- ...ehouse_sklearn_xgboost_model_integ_test.py | 30 +- .../warehouse_snowml_model_integ_test.py | 14 +- .../warehouse_tensorflow_model_integ_test.py | 26 +- .../ml/modeling/framework/BUILD.bazel | 3 + .../snowflake/ml/modeling/framework/utils.py | 22 +- .../modeling/metrics/accuracy_score_test.py | 8 +- .../modeling/metrics/confusion_matrix_test.py | 8 +- .../metrics/d2_absolute_error_score_test.py | 8 +- .../modeling/metrics/d2_pinball_score_test.py | 10 +- .../metrics/explained_variance_score_test.py | 10 +- .../ml/modeling/metrics/f1_score_test.py | 16 +- .../ml/modeling/metrics/fbeta_score_test.py | 18 +- .../ml/modeling/metrics/log_loss_test.py | 12 +- .../metrics/mean_absolute_error_test.py | 8 +- .../mean_absolute_percentage_error_test.py | 8 +- .../metrics/mean_squared_error_test.py | 10 +- .../ml/modeling/metrics/metrics_utils_test.py | 4 +- .../metrics/precision_recall_curve_test.py | 8 +- .../precision_recall_fscore_support_test.py | 20 +- .../modeling/metrics/precision_score_test.py | 16 +- .../ml/modeling/metrics/recall_score_test.py | 16 +- .../ml/modeling/metrics/roc_auc_score_test.py | 14 +- .../ml/modeling/metrics/roc_curve_test.py | 14 +- .../preprocessing/k_bins_discretizer_test.py | 8 +- .../preprocessing/ordinal_encoder_test.py | 50 +- tests/integ/snowflake/ml/registry/BUILD.bazel | 1 + .../ml/registry/model_registry_compat_test.py | 62 ++ ...el_registry_snowservice_integ_test_base.py | 22 +- .../integ/snowflake/ml/test_utils/BUILD.bazel | 1 + .../ml/test_utils/common_test_base.py | 7 +- .../ml/test_utils/spcs_integ_test_base.py | 43 +- tests/pytest.ini | 1 + third_party/rules_python/BUILD.bazel | 3 + third_party/rules_python/packaging.patch | 12 + 208 files changed, 10498 insertions(+), 6466 deletions(-) create mode 100755 bazel/environments/conda-gpu-env.yml create mode 100644 ci/skip_continuous_run_targets rename ci/{type_ignored_targets => skip_type_checking_targets} (92%) create mode 100644 snowflake/ml/_internal/migrator_utils.py create mode 100644 snowflake/ml/_internal/utils/log_stream_processor.py create mode 100644 snowflake/ml/_internal/utils/log_stream_processor_test.py create mode 100644 snowflake/ml/_internal/utils/sql_identifier.py 
create mode 100644 snowflake/ml/_internal/utils/sql_identifier_test.py create mode 100644 snowflake/ml/model/_api.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/inference_server/gpu_test.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/inference_server/main_vllm_test.py delete mode 100644 snowflake/ml/model/_deployer.py delete mode 100644 snowflake/ml/model/_env.py delete mode 100644 snowflake/ml/model/_env_test.py delete mode 100644 snowflake/ml/model/_handlers/_base.py delete mode 100644 snowflake/ml/model/_handlers/llm.py delete mode 100644 snowflake/ml/model/_handlers/mlflow.py delete mode 100644 snowflake/ml/model/_model.py delete mode 100644 snowflake/ml/model/_model_meta.py delete mode 100644 snowflake/ml/model/_model_meta_test.py delete mode 100644 snowflake/ml/model/_model_test.py create mode 100644 snowflake/ml/model/_module_model/BUILD.bazel create mode 100644 snowflake/ml/model/_module_model/module_manifest/BUILD.bazel create mode 100644 snowflake/ml/model/_module_model/module_manifest/module_manifest.py create mode 100644 snowflake/ml/model/_module_model/module_method/BUILD.bazel create mode 100644 snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_1.py_fixture create mode 100644 snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_2.py_fixture create mode 100644 snowflake/ml/model/_module_model/module_method/handler_generator.py create mode 100644 snowflake/ml/model/_module_model/module_method/handler_generator_test.py create mode 100644 snowflake/ml/model/_module_model/module_method/infer_handler.py_template create mode 100644 snowflake/ml/model/_module_model/module_method/module_method.py create mode 100644 snowflake/ml/model/_module_model/module_model.py create mode 100644 snowflake/ml/model/_module_model/module_model_test.py create mode 100644 snowflake/ml/model/_module_model/module_runtime/BUILD.bazel create mode 100644 snowflake/ml/model/_module_model/module_runtime/module_runtime.py create mode 100644 snowflake/ml/model/_packager/BUILD.bazel create mode 100644 snowflake/ml/model/_packager/model_env/BUILD.bazel create mode 100644 snowflake/ml/model/_packager/model_env/model_env.py create mode 100644 snowflake/ml/model/_packager/model_env/model_env_test.py rename snowflake/ml/model/{_model_handler.py => _packager/model_handler.py} (63%) create mode 100644 snowflake/ml/model/_packager/model_handler_test.py rename snowflake/ml/model/{_handlers => _packager/model_handlers}/BUILD.bazel (55%) create mode 100644 snowflake/ml/model/_packager/model_handlers/_base.py create mode 100644 snowflake/ml/model/_packager/model_handlers/_utils.py rename snowflake/ml/model/{_handlers => _packager/model_handlers}/custom.py (66%) rename snowflake/ml/model/{_handlers => _packager/model_handlers}/huggingface_pipeline.py (77%) create mode 100644 snowflake/ml/model/_packager/model_handlers/llm.py create mode 100644 snowflake/ml/model/_packager/model_handlers/mlflow.py rename snowflake/ml/model/{_handlers => _packager/model_handlers}/pytorch.py (74%) rename snowflake/ml/model/{_handlers => _packager/model_handlers}/sklearn.py (74%) rename snowflake/ml/model/{_handlers => _packager/model_handlers}/snowmlmodel.py (70%) rename snowflake/ml/model/{_handlers => _packager/model_handlers}/tensorflow.py (75%) rename snowflake/ml/model/{_handlers => _packager/model_handlers}/torchscript.py (74%) rename snowflake/ml/model/{_handlers => _packager/model_handlers}/xgboost.py (67%) create mode 100644 
snowflake/ml/model/_packager/model_handlers_migrator/BUILD.bazel create mode 100644 snowflake/ml/model/_packager/model_handlers_migrator/base_migrator.py create mode 100644 snowflake/ml/model/_packager/model_handlers_migrator/base_migrator_test.py rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/BUILD.bazel (79%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/custom_test.py (69%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/huggingface_pipeline_test.py (90%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/mlflow_test.py (67%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/pytorch_test.py (63%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/sklearn_test.py (58%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/snowmlmodel_test.py (54%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/tensorflow_test.py (62%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/torchscript_test.py (68%) rename snowflake/ml/model/{_handlers_test => _packager/model_handlers_test}/xgboost_test.py (54%) create mode 100644 snowflake/ml/model/_packager/model_meta/BUILD.bazel create mode 100644 snowflake/ml/model/_packager/model_meta/model_blob_meta.py create mode 100644 snowflake/ml/model/_packager/model_meta/model_meta.py create mode 100644 snowflake/ml/model/_packager/model_meta/model_meta_schema.py create mode 100644 snowflake/ml/model/_packager/model_meta/model_meta_schema_test.py create mode 100644 snowflake/ml/model/_packager/model_meta/model_meta_test.py create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/BUILD.bazel create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/base_migrator.py create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/base_migrator_test.py create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/migrator_plans_test.py create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/migrator_v1.py create mode 100644 snowflake/ml/model/_packager/model_meta_migrator/migrator_v1_test.py create mode 100644 snowflake/ml/model/_packager/model_packager.py create mode 100644 snowflake/ml/model/_packager/model_packager_test.py create mode 100644 snowflake/ml/test_utils/test_env_utils.py create mode 100644 tests/integ/snowflake/ml/model/warehouse_model_compat_v1_test.py create mode 100644 third_party/rules_python/BUILD.bazel create mode 100644 third_party/rules_python/packaging.patch diff --git a/.bazelrc b/.bazelrc index 01f7ddac..f4ea3bf0 100644 --- a/.bazelrc +++ b/.bazelrc @@ -15,6 +15,8 @@ build:_build --platforms //bazel/platforms:snowflake_conda_env --host_platform / build:_sf_only --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=sf_only build:_extended --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=extended build:_extended_oss --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=extended_oss +build:_extended_gpu_oss --platforms //bazel/platforms:extended_conda_gpu_env --host_platform //bazel/platforms:extended_conda_gpu_env 
--repo_env=BAZEL_CONDA_ENV_NAME=extended_gpu_oss + # Public definitions @@ -37,6 +39,7 @@ run:pre_build --config=_build --config=py3.8 # Config to run type check build:typecheck --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended --config=py3.8 build:typecheck_oss --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended_oss --config=py3.8 +build:typecheck_gpu_oss --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended_gpu_oss --config=py3.8 # Config to build the doc build:docs --config=_sf_only --config=py3.8 @@ -49,3 +52,6 @@ cquery:extended --config=_extended test:extended_oss --config=_extended_oss run:extended_oss --config=_extended_oss cquery:extended_oss --config=_extended_oss +test:extended_gpu_oss --config=_extended_gpu_oss +run:extended_gpu_oss --config=_extended_gpu_oss +cquery:extended_gpu_oss --config=_extended_gpu_oss diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f1f2a6ba..4dc0e237 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -62,6 +62,7 @@ repos: rev: v0.36.0 hooks: - id: markdownlint-fix + language_version: 16.20.2 - repo: https://github.com/keith/pre-commit-buildifier rev: 6.0.0 hooks: @@ -69,6 +70,10 @@ repos: args: - --warnings=all files: \.(bzl|bazel|sky)$ + exclude: > + (?x)^( + snowflake/ml/requirements.bzl + )$ # - id: buildifier-lint # args: *args - repo: https://github.com/crate-ci/typos # config: _typos.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b8d8869..2632a62c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,30 @@ # Release History +## 1.0.12 + +### Bug Fixes + +- Model Registry: Fix a regression where container logging was not shown during model deployment to SPCS. +- Model Development: Enhance the column capacity of OrdinalEncoder. +- Model Registry: Fix unbound `batch_size` error when deploying a model other than Hugging Face Pipeline + and LLM with GPU on SPCS. + +### Behavior Changes + +- Model Registry: Raise an early error when deploying to SPCS with a db/schema name that starts with an underscore. +- Model Registry: The `conda-forge` channel is now automatically added to channel lists when deploying to SPCS. +- Model Registry: `relax_version` will no longer strip all version specifiers; instead, it relaxes an `==x.y.z` specifier to + `>=x.y,<(x+1)`. +- Model Registry: A Python version with a different patch level but the same major and minor version will no longer result in a warning when loading + the model via Model Registry, and will be considered usable when deploying to SPCS. +- Model Registry: When logging a `snowflake.ml.model.models.huggingface_pipeline.HuggingFacePipelineModel` object, + versions of locally installed libraries won't be picked up as model dependencies; instead, a set of pre- + defined dependencies is used to improve the user experience. + +### New Features + +- Model Registry: Enable best-effort SPCS job/service log streaming when the logging level is set to INFO. + ## 1.0.11 ### New Features @@ -17,6 +42,7 @@ - Model Development: Fix support for XGBoost and LightGBM models using SKLearn Grid Search and Randomized Search model selectors. - Model Development: DecimalType is now supported as a DataType. - Model Development: Fix metrics compatibility with Snowpark Dataframes that use Snowflake identifiers +- Model Registry: Resolve `delete_deployment` not deleting the SPCS service in certain cases.
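For readers of these release notes, the `relax_version` behavior change can be illustrated with a minimal, self-contained sketch using `packaging`. `relax_pinned_specifier` is a hypothetical helper name used only for illustration, not the library's API; it mirrors the described `==x.y.z` to `>=x.y,<(x+1)` relaxation.

```python
# Illustrative sketch of the relax_version behavior change (not the library's exact code).
from packaging import requirements, specifiers, version

def relax_pinned_specifier(req_str: str) -> requirements.Requirement:
    req = requirements.Requirement(req_str)
    relaxed = []
    for spec in req.specifier:
        if spec.operator == "==" and "*" not in spec.version:
            # Relax an exact pin ==x.y.z to >=x.y,<(x+1).
            v = version.parse(spec.version)
            relaxed.append(f">={v.major}.{v.minor}")
            relaxed.append(f"<{v.major + 1}")
        else:
            # Wildcard pins (==1.0.*) and non-== specifiers are kept as-is.
            relaxed.append(str(spec))
    req.specifier = specifiers.SpecifierSet(",".join(relaxed))
    return req

print(relax_pinned_specifier("xgboost==1.7.3"))  # e.g. xgboost<2,>=1.7
```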
## 1.0.10 diff --git a/WORKSPACE b/WORKSPACE index cd3722ea..83e0e3ba 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,6 +1,5 @@ workspace(name = "SnowML") -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_jar") http_jar( @@ -24,14 +23,19 @@ load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace") bazel_skylib_workspace() -# Latest @ 2023-06-20 -# Replace with released version once newer version released. -git_repository( +http_archive( name = "rules_python", - commit = "0d59fcf561f6d2c4705924bc17c151fb4b998841", - remote = "https://github.com/bazelbuild/rules_python.git", + patch_args = ["-p1"], + patches = ["@//third_party/rules_python:packaging.patch"], + sha256 = "9d04041ac92a0985e344235f5d946f71ac543f1b1565f2cdbc9a2aaee8adf55b", + strip_prefix = "rules_python-0.26.0", + url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz", ) +load("@rules_python//python:repositories.bzl", "py_repositories") + +py_repositories() + load("//third_party/rules_conda:defs.bzl", "conda_create", "load_conda", "register_toolchain") http_archive( diff --git a/bazel/environments/BUILD.bazel b/bazel/environments/BUILD.bazel index 9ddc0d99..0291b0cb 100644 --- a/bazel/environments/BUILD.bazel +++ b/bazel/environments/BUILD.bazel @@ -2,4 +2,5 @@ exports_files([ "conda-env-snowflake.yml", "conda-env-build.yml", "conda-env.yml", + "conda-gpu-env.yml", ]) diff --git a/bazel/environments/conda-env-snowflake.yml b/bazel/environments/conda-env-snowflake.yml index ed30250f..3e644689 100644 --- a/bazel/environments/conda-env-snowflake.yml +++ b/bazel/environments/conda-env-snowflake.yml @@ -19,6 +19,7 @@ dependencies: - flask==2.1.3 - fsspec==2023.3.0 - httpx==0.23.0 + - importlib_resources==5.1.4 - inflection==0.5.1 - joblib==1.1.1 - jsonschema==3.2.0 diff --git a/bazel/environments/conda-env.yml b/bazel/environments/conda-env.yml index 9277ffd3..30024989 100644 --- a/bazel/environments/conda-env.yml +++ b/bazel/environments/conda-env.yml @@ -24,6 +24,7 @@ dependencies: - flask==2.1.3 - fsspec==2023.3.0 - httpx==0.23.0 + - importlib_resources==5.1.4 - inflection==0.5.1 - joblib==1.1.1 - jsonschema==3.2.0 @@ -62,3 +63,4 @@ dependencies: - pip: - --extra-index-url https://pypi.org/simple - peft==0.5.0 + - vllm==0.2.1.post1 diff --git a/bazel/environments/conda-gpu-env.yml b/bazel/environments/conda-gpu-env.yml new file mode 100755 index 00000000..833979ff --- /dev/null +++ b/bazel/environments/conda-gpu-env.yml @@ -0,0 +1,68 @@ +--- +# DO NOT EDIT! 
+# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' + +channels: + - https://repo.anaconda.com/pkgs/snowflake + - nodefaults +dependencies: + - absl-py==1.3.0 + - aiohttp==3.8.3 + - anyio==3.5.0 + - boto3==1.24.28 + - cachetools==4.2.2 + - cloudpickle==2.0.0 + - conda-forge::accelerate==0.22.0 + - conda-forge::mypy==1.5.1 + - conda-forge::starlette==0.27.0 + - conda-forge::types-PyYAML==6.0.12 + - conda-forge::types-cachetools==4.2.2 + - conda-libmamba-solver==23.7.0 + - coverage==6.3.2 + - cryptography==39.0.1 + - flask-cors==3.0.10 + - flask==2.1.3 + - fsspec==2023.3.0 + - httpx==0.23.0 + - importlib_resources==5.1.4 + - inflection==0.5.1 + - joblib==1.1.1 + - jsonschema==3.2.0 + - lightgbm==3.3.5 + - mlflow==2.3.1 + - moto==4.0.11 + - networkx==2.8.4 + - numpy==1.24.3 + - nvidia::cuda==11.7.* + - packaging==23.0 + - pandas==1.5.3 + - protobuf==3.20.3 + - pytest==7.4.0 + - pytimeparse==1.1.8 + - pytorch::pytorch-cuda==11.7.* + - pytorch::pytorch==2.0.1 + - pyyaml==6.0 + - requests==2.29.0 + - ruamel.yaml==0.17.21 + - s3fs==2023.3.0 + - scikit-learn==1.3.0 + - scipy==1.9.3 + - sentencepiece==0.1.99 + - shap==0.42.1 + - snowflake-connector-python==3.2.0 + - snowflake-snowpark-python==1.6.1 + - sphinx==5.0.2 + - sqlparse==0.4.4 + - tensorflow==2.10.0 + - tokenizers==0.13.2 + - torchdata==0.6.1 + - transformers==4.32.1 + - types-protobuf==4.23.0.1 + - types-requests==2.30.0.0 + - typing-extensions==4.5.0 + - xgboost==1.7.3 + - pip + - pip: + - --extra-index-url https://pypi.org/simple + - peft==0.5.0 + - vllm==0.2.1.post1 diff --git a/bazel/environments/fetch_conda_env_config.bzl b/bazel/environments/fetch_conda_env_config.bzl index edd1cb0c..1f51f903 100644 --- a/bazel/environments/fetch_conda_env_config.bzl +++ b/bazel/environments/fetch_conda_env_config.bzl @@ -16,6 +16,10 @@ def _fetch_conda_env_config_impl(rctx): "compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"], "environment": "@//bazel/environments:conda-env.yml", }, + "extended_gpu_oss": { + "compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels", "@SnowML//bazel/platforms:has_gpu"], + "environment": "@//bazel/environments:conda-gpu-env.yml", + }, # `extended_oss` is the extended env for OSS repo which is a strict subset of `extended`. # It's intended for development without dev VPN. 
"extended_oss": { diff --git a/bazel/platforms/BUILD.bazel b/bazel/platforms/BUILD.bazel index 892d9db6..df2eb24d 100644 --- a/bazel/platforms/BUILD.bazel +++ b/bazel/platforms/BUILD.bazel @@ -2,6 +2,8 @@ package(default_visibility = ["//visibility:public"]) constraint_setting(name = "conda_env") +constraint_setting(name = "gpu_env") + constraint_value( name = "snowflake_conda_channel", constraint_setting = ":conda_env", @@ -12,14 +14,39 @@ constraint_value( constraint_setting = ":conda_env", ) +constraint_value( + name = "has_gpu", + constraint_setting = ":gpu_env", +) + +constraint_value( + name = "no_gpu", + constraint_setting = ":gpu_env", +) + platform( name = "snowflake_conda_env", - constraint_values = [":snowflake_conda_channel"], + constraint_values = [ + ":snowflake_conda_channel", + ":no_gpu", + ], parents = ["@local_config_platform//:host"], ) platform( name = "extended_conda_env", - constraint_values = [":extended_conda_channels"], + constraint_values = [ + ":extended_conda_channels", + ":no_gpu", + ], + parents = ["@local_config_platform//:host"], +) + +platform( + name = "extended_conda_gpu_env", + constraint_values = [ + ":extended_conda_channels", + ":has_gpu", + ], parents = ["@local_config_platform//:host"], ) diff --git a/bazel/py_rules.bzl b/bazel/py_rules.bzl index b833ead0..72d6bd1a 100644 --- a/bazel/py_rules.bzl +++ b/bazel/py_rules.bzl @@ -18,6 +18,9 @@ Overriding default implementation of py_{binary|library|test} to add additional //bazel/platforms:snowflake_conda_env - None of the target can be built with the host platform (@local_config_platform//:host). However that platform is not the default platform (see .bazelrc). +5. Similarly, a boolean attribute "require_gpu" is available in all the wrapped rules. + The value of this flag affects which platform and toolchain could be running the target as above. + Only toolchain supporting GPU would be able to run target tagged with "require_gpu=True" ### Setup ```python @@ -44,33 +47,52 @@ def py_genrule(**attrs): _COMPATIBLE_WITH_SNOWPARK_TAG = "wheel_compatible_with_snowpark" -def _add_target_compatibility_labels(compatible_with_snowpark, attrs): +def _add_target_compatibility_labels(compatible_with_snowpark, require_gpu, attrs): + if compatible_with_snowpark and require_gpu: + fail("`require_gpu` is not compatible with snowpark.!") if compatible_with_snowpark: attrs["target_compatible_with"] = select({ "//bazel/platforms:extended_conda_channels": [], "//bazel/platforms:snowflake_conda_channel": [], "//conditions:default": ["@platforms//:incompatible"], + }) + select({ + "//bazel/platforms:has_gpu": [], + "//bazel/platforms:no_gpu": [], + "//conditions:default": ["@platforms//:incompatible"], + }) + elif require_gpu: + attrs["target_compatible_with"] = select({ + "//bazel/platforms:extended_conda_channels": [], + "//conditions:default": ["@platforms//:incompatible"], + }) + select({ + "//bazel/platforms:has_gpu": [], + "//conditions:default": ["@platforms//:incompatible"], }) else: attrs["target_compatible_with"] = select({ "//bazel/platforms:extended_conda_channels": [], "//conditions:default": ["@platforms//:incompatible"], + }) + select({ + "//bazel/platforms:has_gpu": [], + "//bazel/platforms:no_gpu": [], + "//conditions:default": ["@platforms//:incompatible"], }) -def py_binary(compatible_with_snowpark = True, **attrs): +def py_binary(compatible_with_snowpark = True, require_gpu = False, **attrs): """Modified version of core py_binary to add check for experimental dependencies. 
See the Bazel core [py_binary](https://docs.bazel.build/versions/master/be/python.html#py_binary) documentation. Args: compatible_with_snowpark: see file-level document. + require_gpu: see file-level document. **attrs: Rule attributes """ if not check_for_tests_dependencies(native.package_name(), attrs): fail("A target in src cannot depend on packages in tests!") if not check_for_experimental_dependencies(native.package_name(), attrs): fail("Non Experimental Target cannot depend on experimental library!") - _add_target_compatibility_labels(compatible_with_snowpark, attrs) + _add_target_compatibility_labels(compatible_with_snowpark, require_gpu, attrs) # Disable bazel's behavior to add __init__.py files to modules by default. This causes import errors. Context: # * https://bazel.build/reference/be/python#py_test.legacy_create_init @@ -82,7 +104,7 @@ def py_binary(compatible_with_snowpark = True, **attrs): }) native_py_binary(**attrs) -def py_library(compatible_with_snowpark = True, **attrs): +def py_library(compatible_with_snowpark = True, require_gpu = False, **attrs): """Modified version of core py_library to add additional imports and check for experimental dependencies. See the Bazel core [py_library](https://docs.bazel.build/versions/master/be/python.html#py_library) documentation. @@ -103,23 +125,25 @@ def py_library(compatible_with_snowpark = True, **attrs): Args: compatible_with_snowpark: see file-level document. + require_gpu: see file-level document. **attrs: Rule attributes """ if not check_for_tests_dependencies(native.package_name(), attrs): fail("A target in src cannot depend on packages in tests!") if not check_for_experimental_dependencies(native.package_name(), attrs): fail("Non Experimental Target cannot depend on experimental library!") - _add_target_compatibility_labels(compatible_with_snowpark, attrs) + _add_target_compatibility_labels(compatible_with_snowpark, require_gpu, attrs) native_py_library(**attrs) -def py_test(compatible_with_snowpark = True, **attrs): +def py_test(compatible_with_snowpark = True, require_gpu = False, **attrs): """Modified version of core py_binary to add check for experimental dependencies. See the Bazel core [py_test](https://docs.bazel.build/versions/master/be/python.html#py_test) documentation. Args: compatible_with_snowpark: see file-level document. + require_gpu: see file-level document. **attrs: Rule attributes """ if not check_for_test_name(native.package_name(), attrs): @@ -128,7 +152,7 @@ def py_test(compatible_with_snowpark = True, **attrs): fail("A target in src cannot depend on packages in tests!") if not check_for_experimental_dependencies(native.package_name(), attrs): fail("Non Experimental Target cannot depend on experimental library!") - _add_target_compatibility_labels(compatible_with_snowpark, attrs) + _add_target_compatibility_labels(compatible_with_snowpark, require_gpu, attrs) # Disable bazel's behavior to add __init__.py files to modules by default. This causes import errors. 
Context: # * https://bazel.build/reference/be/python#py_test.legacy_create_init diff --git a/bazel/requirements/BUILD.bazel b/bazel/requirements/BUILD.bazel index 002d9796..1c8acc9f 100644 --- a/bazel/requirements/BUILD.bazel +++ b/bazel/requirements/BUILD.bazel @@ -38,6 +38,11 @@ _GENERATED_REQUIREMENTS_FILES = { "generated": "conda-env.yml", "target": "//bazel/environments:conda-env.yml", }, + "conda_gpu_env_yml": { + "cmd": "--mode dev_gpu_version --format conda_env", + "generated": "conda-gpu-env.yml", + "target": "//bazel/environments:conda-gpu-env.yml", + }, "conda_meta": { "cmd": "--mode version_requirements --format conda_meta --version " + VERSION, "generated": "meta.yaml", diff --git a/bazel/requirements/parse_and_generate_requirements.py b/bazel/requirements/parse_and_generate_requirements.py index fca25bd7..87a87dc6 100644 --- a/bazel/requirements/parse_and_generate_requirements.py +++ b/bazel/requirements/parse_and_generate_requirements.py @@ -327,8 +327,22 @@ def generate_requirements( duplicates = {item for item, count in counter.items() if count > 1} if duplicates and duplicates != {"snowflake-snowpark-python"}: raise ValueError(f"Duplicate Requirements: {duplicates}") - channels_to_use = [SNOWFLAKE_CONDA_CHANNEL, "nodefaults"] + + if mode == "dev_gpu_version": + pytorch_req = next(filter(lambda req: get_req_name(req, "conda") == "pytorch", requirements), None) + if pytorch_req: + pytorch_req["from_channel"] = "pytorch" + # TODO(halu): Central place for supported CUDA version. + # To integrate with cuda util. + cuda_req = RequirementInfo( + name_conda="cuda", + dev_version_conda="11.7.*", + from_channel="nvidia", + ) + pytorch_cuda_req = RequirementInfo(name_conda="pytorch-cuda", dev_version="11.7.*", from_channel="pytorch") + requirements.extend([cuda_req, pytorch_cuda_req]) + snowflake_only_env = list( sorted( filter( @@ -375,7 +389,7 @@ def generate_requirements( ) sys.stdout.writelines(results) elif (mode, format) == ("dev_version", "python"): - sys.stdout.writelines(f"REQUIREMENTS = {repr(snowflake_only_env)}\n") + sys.stdout.write(f"REQUIREMENTS = {json.dumps(snowflake_only_env, indent=4)}\n") elif (mode, format) == ("version_requirements", "bzl"): extras_requirements = list(filter(lambda req_info: filter_by_extras(req_info, True, False), requirements)) extras_results: MutableMapping[str, Sequence[str]] = {} @@ -414,7 +428,7 @@ def generate_requirements( ) sys.stdout.write( "EXTRA_REQUIREMENTS = {extra_requirements}\n\nREQUIREMENTS = {requirements}\n".format( - extra_requirements=json.dumps(extras_results), requirements=json.dumps(results) + extra_requirements=json.dumps(extras_results, indent=4), requirements=json.dumps(results, indent=4) ) ) elif (mode, format) == ("version_requirements", "python"): @@ -424,7 +438,7 @@ def generate_requirements( ) ) sys.stdout.writelines(f"REQUIREMENTS = {repr(results)}\n") - elif (mode, format) == ("dev_version", "conda_env"): + elif (mode, format) == ("dev_version", "conda_env") or (mode, format) == ("dev_gpu_version", "conda_env"): env_result = { "channels": channels_to_use, "dependencies": snowflake_only_env if snowflake_channel_only else extended_env, @@ -471,7 +485,7 @@ def main() -> None: parser.add_argument( "--mode", type=str, - choices=["dev_version", "version_requirements", "version_requirements_extras", "validate"], + choices=["dev_version", "dev_gpu_version", "version_requirements", "version_requirements_extras", "validate"], help="Define the mode when specifying the requirements.", required=True, ) @@ -498,6 +512,7 @@ 
def main() -> None: ("version_requirements", "bzl", False), # wheel rule requirements ("version_requirements", "python", False), # model deployment core dependencies list ("dev_version", "conda_env", False), # dev conda-env.yml file + ("dev_gpu_version", "conda_env", False), # dev conda-gpu-env.yml file ("dev_version", "conda_env", True), # dev conda-env-snowflake.yml file ("version_requirements", "conda_meta", False), # conda build recipe metadata file ] diff --git a/ci/RunBazelAction.sh b/ci/RunBazelAction.sh index 435b1844..743dbffe 100755 --- a/ci/RunBazelAction.sh +++ b/ci/RunBazelAction.sh @@ -83,8 +83,7 @@ fi working_dir=$(mktemp -d "/tmp/tmp_XXXXX") trap 'rm -rf "${working_dir}"' EXIT -tag_filter="--test_tag_filters=-perf_test" -coverage_tag_filter="--test_tag_filters=-perf_test,-sproc_test" +tag_filter="--test_tag_filters=" cache_test_results="--cache_test_results=no" case "${mode}" in @@ -92,13 +91,12 @@ merge_gate) affected_targets_file="${working_dir}/affected_targets" ./bazel/get_affected_targets.sh -b "${bazel}" -f "${affected_targets_file}" - tag_filter="--test_tag_filters=-autogen,-perf_test" - coverage_tag_filter="--test_tag_filters=-autogen,-perf_test,-sproc_test" + tag_filter="--test_tag_filters=-autogen" - query_expr='kind(".*_test rule", rdeps(//... - //snowflake/ml/experimental/... - set('"$("${coverage_report_file}" + lcov -a "${sf_only_coverage_report_file}" -a "${extended_coverage_report_file}" -o "${coverage_report_file}" + + if [[ "${mode}" = "local_unittest" || "${mode}" = "local_all" ]]; then + cp -f "${coverage_report_file}" ".coverage.dat" + fi genhtml --prefix "$(pwd)" --output html_coverage_report "${coverage_report_file}" fi diff --git a/ci/build_and_run_tests.sh b/ci/build_and_run_tests.sh index a9757113..78524baf 100755 --- a/ci/build_and_run_tests.sh +++ b/ci/build_and_run_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash # Usage -# build_and_run_tests.sh [-b ] [--env pip|conda] [--mode merge_gate|continuous_run|release] [--with-snowpark] +# build_and_run_tests.sh [-b ] [--env pip|conda] [--mode merge_gate|continuous_run|release] [--with-snowpark] [--report ] # # Args # workspace: path to the workspace, SnowML code should be in snowml directory. @@ -14,6 +14,7 @@ # continuous_run (default): run all tests except auto-generated tests. (For nightly run.) # release: run all tests including auto-generated tests. (For releasing) # with-snowpark: Build and test with snowpark in snowpark-python directory in the workspace. +# report: Path to xml test report # # Action # - Copy the integration tests from workspace folder and execute them in testing Python env using pytest. 
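As context for the new `dev_gpu_version` mode above, here is a rough sketch of the GPU additions it effectively makes to the generated conda environment (channel-qualified CUDA pins as seen in `conda-gpu-env.yml`). `add_gpu_deps` is an illustrative helper under stated assumptions, not the script's actual code.

```python
# Illustrative only: mirrors the GPU additions the dev_gpu_version mode makes
# to the generated conda environment (channels/pins taken from conda-gpu-env.yml).
from typing import Any, Dict, List

def add_gpu_deps(env: Dict[str, Any], cuda_version: str = "11.7.*") -> Dict[str, Any]:
    deps: List[Any] = env.setdefault("dependencies", [])
    # CUDA toolkit comes from the nvidia channel, pytorch-cuda from the pytorch channel.
    deps.append(f"nvidia::cuda=={cuda_version}")
    deps.append(f"pytorch::pytorch-cuda=={cuda_version}")
    # Re-home pytorch itself to the pytorch channel so the CUDA build is selected.
    env["dependencies"] = [
        f"pytorch::{d}" if isinstance(d, str) and d.startswith("pytorch==") else d for d in deps
    ]
    return env

env = {"channels": ["https://repo.anaconda.com/pkgs/snowflake", "nodefaults"],
       "dependencies": ["pytorch==2.0.1", "xgboost==1.7.3"]}
print(add_gpu_deps(env)["dependencies"])
# ['pytorch::pytorch==2.0.1', 'xgboost==1.7.3', 'nvidia::cuda==11.7.*', 'pytorch::pytorch-cuda==11.7.*']
```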
@@ -27,7 +28,7 @@ PROG=$0 help() { local exit_code=$1 - echo "Usage: ${PROG} [-b ] [--env pip|conda] [--mode merge_gate|continuous_run|release] [--with-snowpark]" + echo "Usage: ${PROG} [-b ] [--env pip|conda] [--mode merge_gate|continuous_run|release] [--with-snowpark] [--report ]" exit "${exit_code}" } @@ -39,6 +40,7 @@ MODE="continuous_run" SNOWML_DIR="snowml" SNOWPARK_DIR="snowpark-python" IS_NT=false +JUNIT_REPORT_PATH="" while (($#)); do case $1 in @@ -65,6 +67,10 @@ while (($#)); do help 1 fi ;; + --report) + shift + JUNIT_REPORT_PATH=$1 + ;; -h | --help) help 0 ;; @@ -243,6 +249,9 @@ COMMON_PYTEST_FLAG=() COMMON_PYTEST_FLAG+=(--strict-markers) # Strict the pytest markers to avoid typo in markers COMMON_PYTEST_FLAG+=(--import-mode=append) COMMON_PYTEST_FLAG+=(-n logical) +if [[ -n "${JUNIT_REPORT_PATH}" ]]; then + COMMON_PYTEST_FLAG+=(--junitxml "${JUNIT_REPORT_PATH}") +fi if [ "${ENV}" = "pip" ]; then # Copy wheel package @@ -255,7 +264,7 @@ if [ "${ENV}" = "pip" ]; then # otherwise it will fail in dependency resolution. python3.8 -m pip install --upgrade pip python3.8 -m pip list - python3.8 -m pip install "snowflake_ml_python-${VERSION}-py3-none-any.whl[all]" pytest-xdist[psutil] -r "${WORKSPACE}/${SNOWML_DIR}/requirements.txt" --no-cache-dir --force-reinstall + python3.8 -m pip install "snowflake_ml_python-${VERSION}-py3-none-any.whl[all]" "pytest-xdist[psutil]==2.5.0" -r "${WORKSPACE}/${SNOWML_DIR}/requirements.txt" --no-cache-dir --force-reinstall if [ "${WITH_SNOWPARK}" = true ]; then cp "$(find "${WORKSPACE}" -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" "${TEMP_TEST_DIR}" python3.8 -m pip install "$(find . -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" --no-deps --force-reinstall @@ -275,12 +284,12 @@ else conda clean --all --force-pkgs-dirs -y # Create testing env - conda create -y -p testenv -c "${WORKSPACE}/conda-bld" -c "https://repo.anaconda.com/pkgs/snowflake/" --override-channels "python=3.8" snowflake-ml-python pytest-xdist psutil inflection "${OPTIONAL_REQUIREMENTS[@]}" + conda create -y -p testenv -c "${WORKSPACE}/conda-bld" -c "https://repo.anaconda.com/pkgs/snowflake/" --override-channels "python=3.8" snowflake-ml-python "py==1.9.0" "pytest-xdist==2.5.0" psutil inflection "${OPTIONAL_REQUIREMENTS[@]}" conda list -p testenv # Run integration tests set +e - TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/integ/ + TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python -m pytest "${COMMON_PYTEST_FLAG[@]}" -m "not conda_incompatible" tests/integ/ TEST_RETCODE=$? 
set -e diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 0da5ff39..8ab59e33 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -17,7 +17,7 @@ build: noarch: python package: name: snowflake-ml-python - version: 1.0.11 + version: 1.0.12 requirements: build: - python @@ -29,6 +29,7 @@ requirements: - cachetools>=3.1.1,<5 - cloudpickle>=2.0.0 - fsspec>=2022.11,<2024 + - importlib_resources>=5.1.4, <6 - numpy>=1.23,<2 - packaging>=20.9,<24 - pandas>=1.0.0,<2 @@ -52,6 +53,6 @@ requirements: - tensorflow>=2.9,<3,!=2.12.0 - tokenizers>=0.10,<1 - torchdata>=0.4,<1 - - transformers>=4.29.2,<5 + - transformers>=4.32.1,<5 source: path: ../../ diff --git a/ci/get_excluded_tests.sh b/ci/get_excluded_tests.sh index f8c6ff1d..64c74e1b 100755 --- a/ci/get_excluded_tests.sh +++ b/ci/get_excluded_tests.sh @@ -71,7 +71,7 @@ if [[ $mode = "unused" || $mode = "all" ]]; then # -- Begin of Query Rules Heredoc -- cat >"${unused_test_rule_file}" <"${unaffected_test_rule_file}" <"${excluded_test_source_rule_file}" +# -- Begin of Query Rules Heredoc -- +cat >"${excluded_test_source_rule_file}" <"${working_dir}/type_checked_targets_query" + "$("${working_dir}/type_checked_targets_query" "${bazel}" query --query_file="${working_dir}/type_checked_targets_query" >"${working_dir}/type_checked_targets" echo "Type checking the following targets:" "$(<"${working_dir}/type_checked_targets")" diff --git a/requirements.txt b/requirements.txt index e005708e..29d37aac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ flask-cors==3.0.10 flask==2.1.3 fsspec[http]==2023.3.0 httpx==0.23.0 +importlib_resources==5.1.4 inflection==0.5.1 joblib==1.1.1 jsonschema==3.2.0 @@ -42,13 +43,14 @@ sphinx==5.0.2 sqlparse==0.4.4 starlette==0.27.0 tensorflow==2.13.1 -tokenizers==0.13.2 +tokenizers==0.14.1 torch==2.0.1 torchdata==0.6.1 -transformers==4.32.1 +transformers==4.35.0 types-PyYAML==6.0.12 types-cachetools==4.2.2 types-protobuf==4.23.0.1 types-requests==2.30.0.0 typing-extensions==4.5.0 +vllm==0.2.1.post1 xgboost==1.7.3 diff --git a/requirements.yml b/requirements.yml index f34af4af..e2d9f390 100644 --- a/requirements.yml +++ b/requirements.yml @@ -108,6 +108,9 @@ version_requirements: '>=2022.11,<2024' - name: httpx dev_version: 0.23.0 +- name: importlib_resources + dev_version: 5.1.4 + version_requirements: '>=5.1.4, <6' - name: inflection dev_version: 0.5.1 tags: @@ -221,7 +224,8 @@ requirements_extra_tags: - tensorflow - name: tokenizers - dev_version: 0.13.2 + dev_version_conda: 0.13.2 + dev_version_pypi: 0.14.1 version_requirements: '>=0.10,<1' requirements_extra_tags: - transformers @@ -231,8 +235,9 @@ requirements_extra_tags: - torch - name: transformers - dev_version: 4.32.1 - version_requirements: '>=4.29.2,<5' + dev_version_conda: 4.32.1 + dev_version_pypi: 4.35.0 + version_requirements: '>=4.32.1,<5' requirements_extra_tags: - transformers - name: types-requests @@ -270,3 +275,8 @@ version_requirements_pypi: '>=0.5.0,<1' requirements_extra_tags: - llm +- name_pypi: vllm + dev_version_pypi: 0.2.1.post1 + version_requirements_pypi: '>=0.2.1.post1,<1' + requirements_extra_tags: + - llm diff --git a/snowflake/ml/_internal/BUILD.bazel b/snowflake/ml/_internal/BUILD.bazel index fcdd26f5..a1139433 100644 --- a/snowflake/ml/_internal/BUILD.bazel +++ b/snowflake/ml/_internal/BUILD.bazel @@ -41,6 +41,7 @@ py_library( srcs = ["env_utils.py"], deps = [ ":env", + "//snowflake/ml/_internal/exceptions", "//snowflake/ml/_internal/utils:query_result_checker", ], ) @@ -74,3 +75,8 
@@ py_test( "//snowflake/ml/_internal/exceptions", ], ) + +py_library( + name = "migrator_utils", + srcs = ["migrator_utils.py"], +) diff --git a/snowflake/ml/_internal/env_utils.py b/snowflake/ml/_internal/env_utils.py index e25d22a8..d78806b5 100644 --- a/snowflake/ml/_internal/env_utils.py +++ b/snowflake/ml/_internal/env_utils.py @@ -1,24 +1,32 @@ import collections import copy +import pathlib import re import textwrap import warnings from importlib import metadata as importlib_metadata -from typing import DefaultDict, Dict, List, Optional, Tuple +from typing import Any, DefaultDict, Dict, List, Optional, Tuple +import yaml from packaging import requirements, specifiers, utils as packaging_utils, version import snowflake.connector from snowflake.ml._internal import env as snowml_env +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) from snowflake.ml._internal.utils import query_result_checker from snowflake.snowpark import session -_SNOWML_PKG_NAME = "snowflake-ml-python" +_SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake" +_NODEFAULTS = "nodefaults" _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION: Optional[bool] = None _SNOWFLAKE_CONDA_PACKAGE_CACHE: Dict[str, List[version.Version]] = {} DEFAULT_CHANNEL_NAME = "" SNOWML_SPROC_ENV = "IN_SNOWML_SPROC" +SNOWPARK_ML_PKG_NAME = "snowflake-ml-python" def _validate_pip_requirement_string(req_str: str) -> requirements.Requirement: @@ -185,8 +193,8 @@ def get_local_installed_version_of_pip_package(pip_req: requirements.Requirement """Get the local installed version of a given pip package requirement. If the package is locally installed, and the local version meet the specifier of the requirements, return a new requirement specifier that pins the version. - If the local version does not meet the specifier of the requirements, a warn will be omitted and returns a new - requirement specifier that pins the version. + If the local version does not meet the specifier of the requirements, a warn will be omitted and returns + the original package requirement. If the package is not locally installed or not found, the original package requirement is returned. Args: @@ -199,7 +207,7 @@ def get_local_installed_version_of_pip_package(pip_req: requirements.Requirement local_dist = importlib_metadata.distribution(pip_req.name) local_dist_version = local_dist.version except importlib_metadata.PackageNotFoundError: - if pip_req.name == _SNOWML_PKG_NAME: + if pip_req.name == SNOWPARK_ML_PKG_NAME: local_dist_version = snowml_env.VERSION else: return pip_req @@ -207,23 +215,41 @@ def get_local_installed_version_of_pip_package(pip_req: requirements.Requirement new_pip_req.specifier = specifiers.SpecifierSet(specifiers=f"=={local_dist_version}") if not pip_req.specifier.contains(local_dist_version): warnings.warn( - f"Package requirement {str(pip_req)} specified, while version {local_dist_version} is installed.", + f"Package requirement {str(pip_req)} specified, while version {local_dist_version} is installed. " + "Local version will be ignored to conform to package requirement.", category=UserWarning, ) + return pip_req return new_pip_req def relax_requirement_version(req: requirements.Requirement) -> requirements.Requirement: - """Remove version specifier from a requirement. + """Relax version specifier from a requirement. It detects any ==x.y.z in specifiers and replaced with + >=x.y, <(x+1) Args: req: The requirement that version specifier to be removed. 
Returns: - A new requirement object without version specifier while others kept. + A new requirement object after relaxations. """ new_req = copy.deepcopy(req) - new_req.specifier = specifiers.SpecifierSet() + relaxed_specifier_set = set() + for spec in new_req.specifier._specs: + if spec.operator != "==": + relaxed_specifier_set.add(spec) + continue + pinned_version = None + try: + pinned_version = version.parse(spec.version) + except version.InvalidVersion: + # For the case that the version string has * like 1.2.* + relaxed_specifier_set.add(spec) + continue + assert pinned_version is not None + relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}.{pinned_version.minor}")) + relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major + 1}")) + new_req.specifier._specs = frozenset(relaxed_specifier_set) return new_req @@ -249,9 +275,6 @@ def validate_requirements_in_snowflake_conda_channel( reqs: List of requirement specifiers. python_version: A string of python version where model is run. - Raises: - ValueError: Raised when the specifier cannot be supported when creating UDF. - Returns: A list of pinned latest version that available in Snowflake anaconda channel and meet the version specifier. """ @@ -309,8 +332,6 @@ def validate_requirements_in_snowflake_conda_channel( except snowflake.connector.DataError: return None for req in reqs: - if len(req.specifier) > 1 or any(spec.operator != "==" for spec in req.specifier): - raise ValueError("At most 1 version specifier using == operator is supported without local conda resolver.") available_versions = list(req.specifier.filter(set(_SNOWFLAKE_CONDA_PACKAGE_CACHE.get(req.name, [])))) if not available_versions: return None @@ -319,13 +340,149 @@ def validate_requirements_in_snowflake_conda_channel( return sorted(ret_list) +def save_conda_env_file( + path: pathlib.Path, + conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], + python_version: str, +) -> None: + """Generate conda.yml file given a dict of dependencies after validation. + The channels part of conda.yml file will contains Snowflake Anaconda Channel, nodefaults and all channel names + in keys of the dict, ordered by the number of the packages which belongs to. + The dependencies part of conda.yml file will contains requirements specifications. If the requirements is in the + value list whose key is DEFAULT_CHANNEL_NAME, then the channel won't be specified explicitly. Otherwise, it will be + specified explicitly via {channel}::{requirement} format. + + Args: + path: Path to the conda.yml file. + conda_chan_deps: Dict of conda dependencies after validated. + python_version: A string 'major.minor' showing python version relate to model. + """ + assert path.suffix in [".yml", ".yaml"], "Conda environment file should have extension of yml or yaml." + path.parent.mkdir(parents=True, exist_ok=True) + env: Dict[str, Any] = dict() + env["name"] = "snow-env" + # Get all channels in the dependencies, ordered by the number of the packages which belongs to and put into + # channels section. 
+ channels = list(dict(sorted(conda_chan_deps.items(), key=lambda item: len(item[1]), reverse=True)).keys()) + if DEFAULT_CHANNEL_NAME in channels: + channels.remove(DEFAULT_CHANNEL_NAME) + env["channels"] = [_SNOWFLAKE_CONDA_CHANNEL_URL] + channels + [_NODEFAULTS] + env["dependencies"] = [f"python=={python_version}.*"] + for chan, reqs in conda_chan_deps.items(): + env["dependencies"].extend( + [f"{chan}::{str(req)}" if chan != DEFAULT_CHANNEL_NAME else str(req) for req in reqs] + ) + + with open(path, "w", encoding="utf-8") as f: + yaml.safe_dump(env, stream=f, default_flow_style=False) + + +def save_requirements_file(path: pathlib.Path, pip_deps: List[requirements.Requirement]) -> None: + """Generate Python requirements.txt file in the given directory path. + + Args: + path: Path to the requirements.txt file. + pip_deps: List of dependencies string after validated. + """ + assert path.suffix in [".txt"], "PIP requirement file should have extension of txt." + path.parent.mkdir(parents=True, exist_ok=True) + requirements = "\n".join(map(str, pip_deps)) + with open(path, "w", encoding="utf-8") as out: + out.write(requirements) + + +def load_conda_env_file( + path: pathlib.Path, +) -> Tuple[DefaultDict[str, List[requirements.Requirement]], Optional[List[requirements.Requirement]], Optional[str]]: + """Read conda.yml file to get a dict of dependencies after validation. + The channels part of conda.yml file will be processed with following rules: + 1. If it is Snowflake Anaconda Channel, ignore as it is default. + 2. If it is nodefaults channel (meta-channel), ignore as we don't need it. + 3. If it is a channel where its name does not appear in the dependencies section, added into the dict as key where + value is an empty list. + 4. If it is a channel where its name appears in the dependencies section as {channel}::{requirements}, remove it + as when parsing the requirements part it will be added into the dict as a key. + + The dependencies part of conda.yml file will be processed as follows. + If the requirements has no channel specified, it will be stored in the bucket of DEFAULT_CHANNEL_NAME + If the requirements is specified explicitly via {channel}::{requirement} format, it will be stored into + corresponding bucket in the dict. + If the requirement is a dict whose key is "pip", it will be parsed as pip requirements. + If the requirement has the name "python", it will be parsed as python version requirement. + + Args: + path: Path to conda.yml. + + Raises: + ValueError: Raised when the requirement has an unsupported or unknown type. + + Returns: + A tuple of Dict of conda dependencies after validated, optional pip requirements if exist + and a string 'major.minor.patchlevel' of python version. 
+ """ + with open(path, encoding="utf-8") as f: + env = yaml.safe_load(stream=f) + + assert isinstance(env, dict) + + deps = [] + pip_deps = [] + + python_version = None + + channels = env.get("channels", []) + if _SNOWFLAKE_CONDA_CHANNEL_URL in channels: + channels.remove(_SNOWFLAKE_CONDA_CHANNEL_URL) + if _NODEFAULTS in channels: + channels.remove(_NODEFAULTS) + + for dep in env["dependencies"]: + if isinstance(dep, str): + ver = parse_python_version_string(dep) + # ver is None: not python + # ver is "": python w/o specifier + # ver is str: python w/ specifier + if ver: + python_version = ver + elif ver is None: + deps.append(dep) + elif isinstance(dep, dict) and "pip" in dep: + pip_deps.extend(dep["pip"]) + else: + raise ValueError(f"Unable to parse the conda.yml file, confronting unsupported type of requirement {dep}") + + conda_dep_dict = validate_conda_dependency_string_list(deps) + pip_deps_list = validate_pip_requirement_string_list(pip_deps) + + for channel in channels: + if channel not in conda_dep_dict: + conda_dep_dict[channel] = [] + + return conda_dep_dict, pip_deps_list if pip_deps_list else None, python_version + + +def load_requirements_file(path: pathlib.Path) -> List[requirements.Requirement]: + """Load Python requirements.txt file from the given directory path. + + Args: + path: Path to the requirements.txt file. + + Returns: + List of dependencies string after validated. + """ + with open(path, encoding="utf-8") as f: + reqs = f.readlines() + + return validate_pip_requirement_string_list(reqs) + + # We have to use re to support MLFlow generated python string, which use = rather than == -PYTHON_VERSION_PATTERN = re.compile(r"python(?:(?P=|==|>|<|>=|<=|~=|===)(?P\d(?:\.\d+)+))?") +PYTHON_VERSION_PATTERN = re.compile(r"python(?:(?P=|==|>|<|>=|<=|~=|===)(?P\d(?:\.\d+)+))?(\.*)?") def parse_python_version_string(dep: str) -> Optional[str]: if dep.startswith("python"): - m = PYTHON_VERSION_PATTERN.search(dep) + m = PYTHON_VERSION_PATTERN.match(dep) if m is None: return None op = m.group("op") @@ -340,6 +497,33 @@ def parse_python_version_string(dep: str) -> Optional[str]: return None +def validate_py_runtime_version(provided_py_version_str: str) -> None: + """Validate the provided python version string with python version in current runtime. + If the major or minor is different, errors out. + + Args: + provided_py_version_str: the provided python version string. + + Raises: + SnowflakeMLException: Raised when the provided python version has different major or minor. + """ + if provided_py_version_str != snowml_env.PYTHON_VERSION: + provided_py_version = version.parse(provided_py_version_str) + current_py_version = version.parse(snowml_env.PYTHON_VERSION) + if ( + provided_py_version.major != current_py_version.major + or provided_py_version.minor != current_py_version.minor + ): + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.LOCAL_ENVIRONMENT_ERROR, + original_exception=RuntimeError( + f"Unable to load model which is saved with Python {provided_py_version_str} " + f"while current Python version is {snowml_env.PYTHON_VERSION}. " + "To load model metadata only, set meta_only to True." 
+ ), + ) + + def _find_conda_dep_spec( conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], pkg_name: str ) -> Optional[Tuple[str, requirements.Requirement]]: @@ -355,15 +539,13 @@ def _find_pip_req_spec(pip_reqs: List[requirements.Requirement], pkg_name: str) return spec -def _find_dep_spec( +def find_dep_spec( conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], pip_reqs: List[requirements.Requirement], conda_pkg_name: str, pip_pkg_name: Optional[str] = None, remove_spec: bool = False, -) -> Tuple[ - DefaultDict[str, List[requirements.Requirement]], List[requirements.Requirement], Optional[requirements.Requirement] -]: +) -> Optional[requirements.Requirement]: if pip_pkg_name is None: pip_pkg_name = conda_pkg_name spec_conda = _find_conda_dep_spec(conda_chan_deps, conda_pkg_name) @@ -371,109 +553,11 @@ def _find_dep_spec( channel, spec = spec_conda if remove_spec: conda_chan_deps[channel].remove(spec) - return conda_chan_deps, pip_reqs, spec + return spec else: spec_pip = _find_pip_req_spec(pip_reqs, pip_pkg_name) if spec_pip: if remove_spec: pip_reqs.remove(spec_pip) - return conda_chan_deps, pip_reqs, spec_pip - return conda_chan_deps, pip_reqs, None - - -def generate_env_for_cuda( - conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], - pip_reqs: List[requirements.Requirement], - cuda_version: str, -) -> Tuple[DefaultDict[str, List[requirements.Requirement]], List[requirements.Requirement]]: - conda_chan_deps_cuda = copy.deepcopy(conda_chan_deps) - pip_reqs_cuda = copy.deepcopy(pip_reqs) - - cuda_version_obj = version.parse(cuda_version) - cuda_version_spec_str = f"{cuda_version_obj.major}.{cuda_version_obj.minor}.*" - - try: - append_conda_dependency( - conda_chan_deps_cuda, - ("nvidia", requirements.Requirement(f"cuda=={cuda_version_spec_str}")), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - conda_chan_deps_cuda, pip_reqs_cuda, xgboost_spec = _find_dep_spec( - conda_chan_deps_cuda, pip_reqs, conda_pkg_name="xgboost", remove_spec=True - ) - if xgboost_spec: - xgboost_spec.name = "py-xgboost-gpu" - try: - append_conda_dependency( - conda_chan_deps_cuda, - ("conda-forge", xgboost_spec), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - conda_chan_deps_cuda, pip_reqs_cuda, pytorch_spec = _find_dep_spec( - conda_chan_deps_cuda, pip_reqs, conda_pkg_name="pytorch", pip_pkg_name="torch", remove_spec=True - ) - if pytorch_spec: - pytorch_spec.name = "pytorch" - try: - append_conda_dependency( - conda_chan_deps_cuda, - ("pytorch", pytorch_spec), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - try: - append_conda_dependency( - conda_chan_deps_cuda, - p_chan_dep=("pytorch", requirements.Requirement(f"pytorch-cuda=={cuda_version_spec_str}")), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - conda_chan_deps_cuda, pip_reqs_cuda, tf_spec = _find_dep_spec( - conda_chan_deps_cuda, pip_reqs, conda_pkg_name="tensorflow", remove_spec=True - ) - if tf_spec: - tf_spec.name = "tensorflow-gpu" - try: - append_conda_dependency( - conda_chan_deps_cuda, - ("conda-forge", tf_spec), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - conda_chan_deps_cuda, pip_reqs_cuda, transformers_spec = _find_dep_spec( - conda_chan_deps_cuda, pip_reqs, conda_pkg_name="transformers", remove_spec=False - ) - if transformers_spec: - try: - 
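The refactor above changes the helper's contract: the renamed, now-public find_dep_spec returns only the matched requirement instead of a (conda, pip, spec) tuple, and with remove_spec=True it drops the match from the passed-in collections in place. A hedged sketch of that usage (the package names here are arbitrary examples):

import collections

from packaging import requirements

from snowflake.ml._internal import env_utils

conda_reqs = collections.defaultdict(
    list, {env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost==1.7.3")]}
)
pip_reqs = [requirements.Requirement("torch==2.0.1")]

# Lookup only: the inputs are left untouched.
spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="xgboost")
assert spec == requirements.Requirement("xgboost==1.7.3")

# With remove_spec=True the matching requirement is removed from the passed-in
# collection (here the pip list, found via the alternate pip package name).
spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="pytorch", pip_pkg_name="torch", remove_spec=True)
assert spec == requirements.Requirement("torch==2.0.1")
assert pip_reqs == []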
append_conda_dependency( - conda_chan_deps_cuda, - ("conda-forge", requirements.Requirement("accelerate>=0.22.0")), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - # Required by bitsandbytes - try: - append_conda_dependency( - conda_chan_deps_cuda, - (DEFAULT_CHANNEL_NAME, get_local_installed_version_of_pip_package(requirements.Requirement("scipy"))), - ) - except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): - pass - - try: - append_requirement_list( - pip_reqs_cuda, - requirements.Requirement("bitsandbytes>=0.41.0"), - ) - except DuplicateDependencyError: - pass - - return conda_chan_deps_cuda, pip_reqs_cuda + return spec_pip + return None diff --git a/snowflake/ml/_internal/env_utils_test.py b/snowflake/ml/_internal/env_utils_test.py index 42279708..44249633 100644 --- a/snowflake/ml/_internal/env_utils_test.py +++ b/snowflake/ml/_internal/env_utils_test.py @@ -1,9 +1,13 @@ import collections +import copy +import pathlib import platform +import tempfile import textwrap from importlib import metadata as importlib_metadata from typing import DefaultDict, List, cast +import yaml from absl.testing import absltest from packaging import requirements, specifiers @@ -239,13 +243,13 @@ def test_get_local_installed_version_of_pip_package(self) -> None: r = requirements.Requirement(f"pip!={importlib_metadata.version('pip')}") self.assertEqual( - requirements.Requirement(f"pip=={importlib_metadata.version('pip')}"), + requirements.Requirement(f"pip!={importlib_metadata.version('pip')}"), env_utils.get_local_installed_version_of_pip_package(r), ) - r = requirements.Requirement(env_utils._SNOWML_PKG_NAME) + r = requirements.Requirement(env_utils.SNOWPARK_ML_PKG_NAME) self.assertEqual( - requirements.Requirement(f"{env_utils._SNOWML_PKG_NAME}=={snowml_env.VERSION}"), + requirements.Requirement(f"{env_utils.SNOWPARK_ML_PKG_NAME}=={snowml_env.VERSION}"), env_utils.get_local_installed_version_of_pip_package(r), ) @@ -262,13 +266,29 @@ def test_get_local_installed_version_of_pip_package(self) -> None: def test_relax_requirement_version(self) -> None: r = requirements.Requirement("python-package==1.0.1") - self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package")) + self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package>=1.0,<2")) + + r = requirements.Requirement("python-package==1.0.*") + self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package==1.0.*")) + + r = requirements.Requirement("python-package==1.0") + self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package>=1.0,<2")) + + r = requirements.Requirement("python-package==1.*") + self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package==1.*")) + + r = requirements.Requirement("python-package==1") + self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package>=1.0,<2")) r = requirements.Requirement("python-package==1.0.1, !=1.0.2") - self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package")) + self.assertEqual( + env_utils.relax_requirement_version(r), requirements.Requirement("python-package>=1.0,<2,!=1.0.2") + ) r = requirements.Requirement("python-package[extra]==1.0.1") - self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package[extra]")) + 
self.assertEqual( + env_utils.relax_requirement_version(r), requirements.Requirement("python-package[extra]>=1.0,<2") + ) r = requirements.Requirement("python-package") self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package")) @@ -438,19 +458,22 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: ["xgboost==1.7.3"], ) - with self.assertRaises(ValueError): + self.assertListEqual( env_utils.validate_requirements_in_snowflake_conda_channel( session=c_session, - reqs=[requirements.Requirement("xgboost<1.7")], + reqs=[requirements.Requirement("xgboost>=1.7,<1.8")], python_version=snowml_env.PYTHON_VERSION, - ) + ), + ["xgboost<1.8,>=1.7"], + ) - with self.assertRaises(ValueError): + self.assertIsNone( env_utils.validate_requirements_in_snowflake_conda_channel( session=c_session, reqs=[requirements.Requirement("xgboost==1.7.1, ==1.7.3")], python_version=snowml_env.PYTHON_VERSION, ) + ) # clear cache env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -581,6 +604,7 @@ def test_parse_python_version_string(self) -> None: self.assertIsNone(env_utils.parse_python_version_string("not_python")) self.assertEqual(env_utils.parse_python_version_string("python"), "") self.assertEqual(env_utils.parse_python_version_string("python==3.8.13"), "3.8.13") + self.assertEqual(env_utils.parse_python_version_string("python==3.8.*"), "3.8") self.assertEqual(env_utils.parse_python_version_string("python=3.11"), "3.11") with self.assertRaises(ValueError): env_utils.parse_python_version_string("python<=3.11") @@ -630,12 +654,13 @@ def test_find_dep_spec(self) -> None: pip_reqs = [requirements.Requirement("pip_package==1.0.0")] - conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( - conda_reqs, pip_reqs, conda_pkg_name="somepackage" - ) + original_conda_reqs = copy.deepcopy(conda_reqs) + original_pip_reqs = copy.deepcopy(pip_reqs) - self.assertDictEqual(conda_reqs_result, conda_reqs) - self.assertListEqual(pip_reqs_result, pip_reqs) + spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="somepackage") + + self.assertDictEqual(original_conda_reqs, conda_reqs) + self.assertListEqual(original_pip_reqs, pip_reqs) self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) conda_reqs = collections.defaultdict( @@ -648,12 +673,13 @@ def test_find_dep_spec(self) -> None: pip_reqs = [requirements.Requirement("pip_package==1.0.0")] - conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( - conda_reqs, pip_reqs, conda_pkg_name="pip_package" - ) + original_conda_reqs = copy.deepcopy(conda_reqs) + original_pip_reqs = copy.deepcopy(pip_reqs) + + spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="pip_package") - self.assertDictEqual(conda_reqs_result, conda_reqs) - self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertDictEqual(original_conda_reqs, conda_reqs) + self.assertListEqual(original_pip_reqs, pip_reqs) self.assertEqual(spec, requirements.Requirement("pip_package==1.0.0")) conda_reqs = collections.defaultdict( @@ -666,12 +692,13 @@ def test_find_dep_spec(self) -> None: pip_reqs = [requirements.Requirement("pip_package==1.0.0")] - conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( - conda_reqs, pip_reqs, conda_pkg_name="somepackage", pip_pkg_name="pip_package" - ) + original_conda_reqs = copy.deepcopy(conda_reqs) + original_pip_reqs = copy.deepcopy(pip_reqs) + + spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="somepackage", 
pip_pkg_name="pip_package") - self.assertDictEqual(conda_reqs_result, conda_reqs) - self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertDictEqual(original_conda_reqs, conda_reqs) + self.assertListEqual(original_pip_reqs, pip_reqs) self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) conda_reqs = collections.defaultdict( @@ -684,12 +711,12 @@ def test_find_dep_spec(self) -> None: pip_reqs = [requirements.Requirement("pip_package==1.0.0")] - conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( - conda_reqs, pip_reqs, conda_pkg_name="somepackage", remove_spec=True - ) + original_pip_reqs = copy.deepcopy(pip_reqs) + + spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="somepackage", remove_spec=True) self.assertDictEqual( - conda_reqs_result, + conda_reqs, collections.defaultdict( list, { @@ -698,7 +725,7 @@ def test_find_dep_spec(self) -> None: }, ), ) - self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertListEqual(pip_reqs, original_pip_reqs) self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) conda_reqs = collections.defaultdict( @@ -708,15 +735,14 @@ def test_find_dep_spec(self) -> None: "another_channel": [requirements.Requirement("another_package==1.0.0")], }, ) + original_conda_reqs = copy.deepcopy(conda_reqs) pip_reqs = [requirements.Requirement("pip_package==1.0.0")] - conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( - conda_reqs, pip_reqs, conda_pkg_name="pip_package", remove_spec=True - ) + spec = env_utils.find_dep_spec(conda_reqs, pip_reqs, conda_pkg_name="pip_package", remove_spec=True) - self.assertDictEqual(conda_reqs_result, conda_reqs) - self.assertListEqual(pip_reqs_result, []) + self.assertDictEqual(conda_reqs, original_conda_reqs) + self.assertListEqual(pip_reqs, []) self.assertEqual(spec, requirements.Requirement("pip_package==1.0.0")) conda_reqs = collections.defaultdict( @@ -729,12 +755,12 @@ def test_find_dep_spec(self) -> None: pip_reqs = [requirements.Requirement("pip_package==1.0.0")] - conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + spec = env_utils.find_dep_spec( conda_reqs, pip_reqs, conda_pkg_name="somepackage", pip_pkg_name="pip_package", remove_spec=True ) self.assertDictEqual( - conda_reqs_result, + conda_reqs, collections.defaultdict( list, { @@ -743,545 +769,173 @@ def test_find_dep_spec(self) -> None: }, ), ) - self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertListEqual(pip_reqs, pip_reqs) self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) - def test_generate_conda_env_for_cuda(self) -> None: - conda_reqs: DefaultDict[str, List[requirements.Requirement]] = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], - "another_channel": [requirements.Requirement("another_package==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("somepackage==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - "another_channel": [requirements.Requirement("another_package==1.0.0")], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], - "another_channel": [requirements.Requirement("another_package==1.0.0")], - }, - ) - - conda_reqs_result, _ = 
env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("somepackage==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - "another_channel": [requirements.Requirement("another_package==1.0.0")], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.8.*"), - ], - "another_channel": [requirements.Requirement("another_package==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("somepackage==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.8.*"), - ], - "another_channel": [requirements.Requirement("another_package==1.0.0")], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "pytorch": [ - requirements.Requirement("pytorch==1.0.0"), - requirements.Requirement("pytorch-cuda==11.7.*"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch>=1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "pytorch": [ - requirements.Requirement("pytorch>=1.0.0"), - requirements.Requirement("pytorch-cuda==11.7.*"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch>=1.0.0")], - "pytorch": [ - requirements.Requirement("pytorch-cuda==11.8.*"), - ], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "pytorch": [ - requirements.Requirement("pytorch-cuda==11.8.*"), - requirements.Requirement("pytorch>=1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch>=1.0.0")], - "pytorch": [ - requirements.Requirement("pytorch>=1.1.0"), - requirements.Requirement("pytorch-cuda==11.8.*"), - ], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "pytorch": [ - requirements.Requirement("pytorch>=1.1.0"), - requirements.Requirement("pytorch-cuda==11.8.*"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - "conda-forge": [requirements.Requirement("pytorch==1.0.0")], - }, - ) - - 
conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - "conda-forge": [], - "pytorch": [ - requirements.Requirement("pytorch==1.0.0"), - requirements.Requirement("pytorch-cuda==11.7.*"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( - collections.defaultdict( - list, - ), - [requirements.Requirement("torch==1.0.0")], - cuda_version="11.7", - ) - - self.assertDictEqual( - conda_reqs_result, - { - "pytorch": [ - requirements.Requirement("pytorch==1.0.0"), - requirements.Requirement("pytorch-cuda==11.7.*"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - self.assertListEqual(pip_reqs_result, []) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("tensorflow==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "conda-forge": [ - requirements.Requirement("tensorflow-gpu==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("tensorflow>=1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "conda-forge": [ - requirements.Requirement("tensorflow-gpu>=1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("tensorflow>=1.0.0")], - "conda-forge": [ - requirements.Requirement("tensorflow-gpu>=1.1.0"), - ], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "conda-forge": [ - requirements.Requirement("tensorflow-gpu>=1.1.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - "conda-forge": [requirements.Requirement("tensorflow==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - "conda-forge": [ - requirements.Requirement("tensorflow-gpu==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( - collections.defaultdict( - list, - ), - [requirements.Requirement("tensorflow==1.0.0")], - cuda_version="11.7", - ) - - self.assertDictEqual( - conda_reqs_result, - { - "conda-forge": [ - requirements.Requirement("tensorflow-gpu==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - self.assertListEqual(pip_reqs_result, []) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, 
[], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "conda-forge": [ - requirements.Requirement("py-xgboost-gpu==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost>=1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "conda-forge": [ - requirements.Requirement("py-xgboost-gpu>=1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost>=1.0.0")], - "conda-forge": [ - requirements.Requirement("py-xgboost-gpu>=1.1.0"), - ], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [], - "conda-forge": [ - requirements.Requirement("py-xgboost-gpu>=1.1.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs = collections.defaultdict( - list, - { - "conda-forge": [requirements.Requirement("xgboost==1.0.0")], - }, - ) - - conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - "conda-forge": [ - requirements.Requirement("py-xgboost-gpu==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( - collections.defaultdict( - list, - ), - [requirements.Requirement("xgboost==1.0.0")], - cuda_version="11.7", - ) - - self.assertDictEqual( - conda_reqs_result, - { - "conda-forge": [ - requirements.Requirement("py-xgboost-gpu==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - self.assertListEqual(pip_reqs_result, []) - - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("transformers==1.0.0"), - requirements.Requirement("pytorch==1.0.0"), - ], - }, - ) - - conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("transformers==1.0.0"), - env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy")), - ], - "pytorch": [ - requirements.Requirement("pytorch==1.0.0"), - requirements.Requirement("pytorch-cuda==11.7.*"), - ], - "conda-forge": [ - requirements.Requirement("accelerate>=0.22.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - self.assertListEqual(pip_reqs_result, [requirements.Requirement("bitsandbytes>=0.41.0")]) - conda_reqs = collections.defaultdict( - list, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("transformers==1.0.0"), - requirements.Requirement("scipy==1.0.0"), - ], - "conda-forge": [ - requirements.Requirement("accelerate==1.0.0"), - ], - }, - ) - conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( - conda_reqs, 
[requirements.Requirement("bitsandbytes==1.0.0")], cuda_version="11.7" - ) - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - requirements.Requirement("transformers==1.0.0"), - requirements.Requirement("scipy==1.0.0"), - ], - "conda-forge": [ - requirements.Requirement("accelerate==1.0.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - self.assertListEqual(pip_reqs_result, [requirements.Requirement("bitsandbytes==1.0.0")]) - - conda_reqs = collections.defaultdict( - list, - { - "conda-forge": [requirements.Requirement("transformers==1.0.0")], - }, - ) - - conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") - - self.assertDictEqual( - conda_reqs_result, - { - env_utils.DEFAULT_CHANNEL_NAME: [ - env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy")), - ], - "conda-forge": [ - requirements.Requirement("transformers==1.0.0"), - requirements.Requirement("accelerate>=0.22.0"), - ], - "nvidia": [ - requirements.Requirement(requirement_string="cuda==11.7.*"), - ], - }, - ) - - self.assertListEqual(pip_reqs_result, [requirements.Requirement("bitsandbytes>=0.41.0")]) +class EnvFileTest(absltest.TestCase): + def test_conda_env_file(self) -> None: + cd: DefaultDict[str, List[requirements.Requirement]] + + with tempfile.TemporaryDirectory() as tmpdir: + cd = collections.defaultdict(list) + env_file_path = pathlib.Path(tmpdir, "conda.yml") + env_utils.save_conda_env_file(env_file_path, cd, python_version="3.8") + loaded_cd, _, _ = env_utils.load_conda_env_file(env_file_path) + self.assertEqual(cd, loaded_cd) + + with tempfile.TemporaryDirectory() as tmpdir: + cd = collections.defaultdict(list) + cd[env_utils.DEFAULT_CHANNEL_NAME] = [requirements.Requirement("numpy")] + env_file_path = pathlib.Path(tmpdir, "conda.yml") + env_utils.save_conda_env_file(env_file_path, cd, python_version="3.8") + loaded_cd, _, _ = env_utils.load_conda_env_file(env_file_path) + self.assertEqual(cd, loaded_cd) + + with tempfile.TemporaryDirectory() as tmpdir: + cd = collections.defaultdict(list) + cd[env_utils.DEFAULT_CHANNEL_NAME] = [requirements.Requirement("numpy>=1.22.4")] + env_file_path = pathlib.Path(tmpdir, "conda.yml") + env_utils.save_conda_env_file(env_file_path, cd, python_version="3.8") + loaded_cd, _, _ = env_utils.load_conda_env_file(env_file_path) + self.assertEqual(cd, loaded_cd) + + with tempfile.TemporaryDirectory() as tmpdir: + cd = collections.defaultdict(list) + cd.update( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + } + ) + env_file_path = pathlib.Path(tmpdir, "conda.yml") + env_utils.save_conda_env_file(env_file_path, cd, python_version="3.8") + loaded_cd, _, _ = env_utils.load_conda_env_file(env_file_path) + self.assertEqual(cd, loaded_cd) + + with tempfile.TemporaryDirectory() as tmpdir: + cd = collections.defaultdict(list) + cd.update( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "apple": [], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + } + ) + env_file_path = pathlib.Path(tmpdir, "conda.yml") + env_utils.save_conda_env_file(env_file_path, cd, python_version="3.8") + with open(env_file_path, encoding="utf-8") as f: + written_yaml = yaml.safe_load(f) + self.assertDictEqual( + written_yaml, + { + "name": "snow-env", + "channels": 
["https://repo.anaconda.com/pkgs/snowflake", "conda-forge", "apple", "nodefaults"], + "dependencies": [ + "python==3.8.*", + "numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + ], + }, + ) + loaded_cd, pip_reqs, _ = env_utils.load_conda_env_file(env_file_path) + self.assertEqual(cd, loaded_cd) + self.assertIsNone(pip_reqs) + + with tempfile.TemporaryDirectory() as tmpdir: + env_file_path = pathlib.Path(tmpdir, "conda.yml") + with open(env_file_path, "w", encoding="utf-8") as f: + yaml.safe_dump( + stream=f, + data={ + "dependencies": [ + f"python=={snowml_env.PYTHON_VERSION}", + "::numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + {"pip": ["python-package==2.3.0"]}, + ], + }, + ) + loaded_cd, pip_reqs, python_ver = env_utils.load_conda_env_file(env_file_path) + self.assertEqual( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + }, + loaded_cd, + ) + self.assertListEqual(pip_reqs, [requirements.Requirement("python-package==2.3.0")]) + self.assertEqual(python_ver, snowml_env.PYTHON_VERSION) + + with tempfile.TemporaryDirectory() as tmpdir: + env_file_path = pathlib.Path(tmpdir, "conda.yml") + with open(env_file_path, "w", encoding="utf-8") as f: + yaml.safe_dump( + stream=f, + data={ + "name": "snow-env", + "channels": ["https://repo.anaconda.com/pkgs/snowflake", "nodefaults"], + "dependencies": [ + f"python=={snowml_env.PYTHON_VERSION}", + "::numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + {"pip": ["python-package==2.3.0"]}, + ], + }, + ) + loaded_cd, pip_reqs, python_ver = env_utils.load_conda_env_file(env_file_path) + self.assertEqual( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + }, + loaded_cd, + ) + self.assertListEqual(pip_reqs, [requirements.Requirement("python-package==2.3.0")]) + self.assertEqual(python_ver, snowml_env.PYTHON_VERSION) + + with tempfile.TemporaryDirectory() as tmpdir: + env_file_path = pathlib.Path(tmpdir, "conda.yml") + with open(env_file_path, "w", encoding="utf-8") as f: + yaml.safe_dump( + stream=f, + data={ + "name": "snow-env", + "channels": ["https://repo.anaconda.com/pkgs/snowflake", "apple", "nodefaults"], + "dependencies": [ + "python=3.8", + "::numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + {"pip": ["python-package"]}, + ], + }, + ) + loaded_cd, pip_reqs, python_ver = env_utils.load_conda_env_file(env_file_path) + self.assertEqual( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + "apple": [], + }, + loaded_cd, + ) + self.assertListEqual(pip_reqs, [requirements.Requirement("python-package")]) + self.assertEqual(python_ver, "3.8") + + def test_generate_requirements_file(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + rl: List[requirements.Requirement] = [] + pip_file_path = pathlib.Path(tmpdir, "requirements.txt") + env_utils.save_requirements_file(pip_file_path, rl) + loaded_rl = env_utils.load_requirements_file(pip_file_path) + self.assertEqual(rl, loaded_rl) + + with tempfile.TemporaryDirectory() as tmpdir: + rl = [requirements.Requirement("python-package==1.0.1")] + pip_file_path = pathlib.Path(tmpdir, "requirements.txt") + env_utils.save_requirements_file(pip_file_path, rl) + loaded_rl = env_utils.load_requirements_file(pip_file_path) + self.assertEqual(rl, loaded_rl) if __name__ == "__main__": diff --git a/snowflake/ml/_internal/file_utils.py 
b/snowflake/ml/_internal/file_utils.py index 1f9717cb..afd12a71 100644 --- a/snowflake/ml/_internal/file_utils.py +++ b/snowflake/ml/_internal/file_utils.py @@ -358,3 +358,51 @@ def _get_unzipped_dir() -> str: extract_root = _get_unzipped_dir() snowml_file_path = os.path.relpath(file_path, snowml_start_path) return os.path.join(extract_root, *(snowml_file_path.split("/"))) + + +def upload_directory_to_stage( + session: snowpark.Session, local_path: pathlib.Path, stage_path: pathlib.PurePosixPath +) -> None: + """Upload a local folder recursively to a stage and keep the structure. + + Args: + session: Snowpark Session. + local_path: Local path to upload. + stage_path: Base path in the stage. + """ + file_operation = snowpark.FileOperation(session=session) + + for root, _, filenames in os.walk(local_path): + root_path = pathlib.Path(root) + for filename in filenames: + local_file_path = root_path / filename + stage_dir_path = ( + stage_path / pathlib.PurePosixPath(local_file_path.relative_to(local_path).as_posix()).parent + ) + file_operation.put( + str(local_file_path), + str(stage_dir_path), + auto_compress=False, + overwrite=False, + ) + + +def download_directory_from_stage( + session: snowpark.Session, stage_path: pathlib.PurePosixPath, local_path: pathlib.Path +) -> None: + """Upload a folder in stage recursively to a folder in local and keep the structure. + + Args: + session: Snowpark Session. + stage_path: Stage path to download from. + local_path: Local path as the base of destination. + """ + file_operation = file_operation = snowpark.FileOperation(session=session) + file_list = [ + pathlib.PurePosixPath(stage_path.parts[0], *pathlib.PurePosixPath(row.name).parts[1:]) + for row in session.sql(f"ls {stage_path}").collect() + ] + for stage_file_path in file_list: + local_file_dir = local_path / stage_file_path.relative_to(stage_path).parent + local_file_dir.mkdir(parents=True, exist_ok=True) + file_operation.get(str(stage_file_path), str(local_file_dir)) diff --git a/snowflake/ml/_internal/file_utils_test.py b/snowflake/ml/_internal/file_utils_test.py index d1b4d1fd..c2ef2bc7 100644 --- a/snowflake/ml/_internal/file_utils_test.py +++ b/snowflake/ml/_internal/file_utils_test.py @@ -189,7 +189,7 @@ def _populate_tmpdir(tmpdir: str) -> None: f.flush() os.mkdir(os.path.join(tmpdir, "env")) - with open(os.path.join(tmpdir, "env", "conda.yaml"), "w", encoding="utf-8") as f: + with open(os.path.join(tmpdir, "env", "conda.yml"), "w", encoding="utf-8") as f: f.write("python==3.8.13") f.flush() diff --git a/snowflake/ml/_internal/migrator_utils.py b/snowflake/ml/_internal/migrator_utils.py new file mode 100644 index 00000000..2802b882 --- /dev/null +++ b/snowflake/ml/_internal/migrator_utils.py @@ -0,0 +1,3 @@ +class UnableToUpgradeError(Exception): + def __init__(self, last_supported_version: str) -> None: + self.last_supported_version = last_supported_version diff --git a/snowflake/ml/_internal/utils/BUILD.bazel b/snowflake/ml/_internal/utils/BUILD.bazel index 85b50775..f2bdfb12 100644 --- a/snowflake/ml/_internal/utils/BUILD.bazel +++ b/snowflake/ml/_internal/utils/BUILD.bazel @@ -180,3 +180,34 @@ py_test( "//snowflake/ml/test_utils:mock_session", ], ) + +py_library( + name = "sql_identifier", + srcs = ["sql_identifier.py"], + deps = [ + "//snowflake/ml/_internal/utils:identifier", + ], +) + +py_test( + name = "sql_identifier_test", + srcs = ["sql_identifier_test.py"], + deps = [ + ":sql_identifier", + "//snowflake/ml/test_utils:mock_data_frame", + "//snowflake/ml/test_utils:mock_session", 
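A hedged usage sketch for the two new stage helpers added to file_utils above, assuming an authenticated Snowpark session and an already-created stage; the stage name and local directories are placeholders, not values from this patch:

import pathlib

from snowflake.ml._internal import file_utils
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session

session = Session.builder.configs(SnowflakeLoginOptions()).create()

# Placeholder stage path -- adjust to your environment.
stage_dir = pathlib.PurePosixPath("@MY_STAGE/model_artifacts")

# Mirror a local folder into the stage, keeping its directory structure...
file_utils.upload_directory_to_stage(session, local_path=pathlib.Path("./model_artifacts"), stage_path=stage_dir)

# ...and pull it back down into another local folder.
file_utils.download_directory_from_stage(session, stage_path=stage_dir, local_path=pathlib.Path("./restored"))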
+ ], +) + +py_library( + name = "log_stream_processor", + srcs = ["log_stream_processor.py"], +) + +py_test( + name = "log_stream_processor_test", + srcs = ["log_stream_processor_test.py"], + deps = [ + ":log_stream_processor", + ], +) diff --git a/snowflake/ml/_internal/utils/identifier.py b/snowflake/ml/_internal/utils/identifier.py index 18186ee7..baff014b 100644 --- a/snowflake/ml/_internal/utils/identifier.py +++ b/snowflake/ml/_internal/utils/identifier.py @@ -305,7 +305,10 @@ def resolve_identifier(id: str) -> str: Resolved identifier """ if _is_quoted(id): - return id + if UNQUOTED_CASE_SENSITIVE_RE.match(id[1:-1]): + return id[1:-1] + else: + return id else: return id.upper() diff --git a/snowflake/ml/_internal/utils/log_stream_processor.py b/snowflake/ml/_internal/utils/log_stream_processor.py new file mode 100644 index 00000000..9c3f8547 --- /dev/null +++ b/snowflake/ml/_internal/utils/log_stream_processor.py @@ -0,0 +1,30 @@ +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + + +class LogStreamProcessor: + def __init__(self) -> None: + self.last_line_seen = 0 + + def process_new_logs(self, job_logs: Optional[str], *, log_level: int = logging.INFO) -> None: + if not job_logs: + return + log_entries = job_logs.split("\n") + start_index = self.last_line_seen + log_length = len(log_entries) + for i in range(start_index, log_length): + log_entry = log_entries[i] + if log_level == logging.DEBUG: + logger.debug(log_entry) + elif log_level == logging.INFO: + logger.info(log_entry) + elif log_level == logging.WARNING: + logger.warning(log_entry) + elif log_level == logging.ERROR: + logger.error(log_entry) + elif log_level == logging.CRITICAL: + logger.critical(log_entry) + + self.last_line_seen = log_length diff --git a/snowflake/ml/_internal/utils/log_stream_processor_test.py b/snowflake/ml/_internal/utils/log_stream_processor_test.py new file mode 100644 index 00000000..5a4a984f --- /dev/null +++ b/snowflake/ml/_internal/utils/log_stream_processor_test.py @@ -0,0 +1,61 @@ +import logging +from io import StringIO + +from absl.testing import absltest + +from snowflake.ml._internal.utils import log_stream_processor + + +class LogStreamProcessorTest(absltest.TestCase): + def setUp(self) -> None: + self.log_stream = StringIO() + self.log_handler = logging.StreamHandler(self.log_stream) + self.log_handler.setLevel(logging.INFO) + self.log_handler.setFormatter(logging.Formatter("%(message)s")) + logging.getLogger().addHandler(self.log_handler) + + def tearDown(self) -> None: + logging.getLogger().removeHandler(self.log_handler) + self.log_stream.close() + logging.shutdown() + + def reset_log_stream(self) -> None: + # Clear the log stream + self.log_stream.truncate(0) + self.log_stream.seek(0) + + def test_only_new_log_is_shown(self) -> None: + lsp = log_stream_processor.LogStreamProcessor() + log1 = "TIMESTAMP1: HI 1" + log2 = "TIMESTAMP1: HI 1 \n TIMESTAMP2: HI 2" + log3 = "TIMESTAMP1: HI 1 \n TIMESTAMP2: HI 2 \n TIMESTAMP3: HI 3" + log4 = "TIMESTAMP1: HI 1 \n TIMESTAMP2: HI 2 \n TIMESTAMP3: HI 3" + + lsp.process_new_logs(log1) + self.assertEqual("TIMESTAMP1: HI 1", self.log_stream.getvalue().strip()) + + self.reset_log_stream() + + lsp.process_new_logs(log2) + self.assertEqual("TIMESTAMP2: HI 2", self.log_stream.getvalue().strip()) + + self.reset_log_stream() + + lsp.process_new_logs(log3) + self.assertEqual("TIMESTAMP3: HI 3", self.log_stream.getvalue().strip()) + + self.reset_log_stream() + + # No new log returned + lsp.process_new_logs(log4) + 
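A brief usage sketch for the new LogStreamProcessor, matching what the surrounding test exercises: each call receives the cumulative log text and only lines beyond last_line_seen are emitted (illustrative only):

import logging

from snowflake.ml._internal.utils import log_stream_processor

logging.basicConfig(level=logging.INFO, format="%(message)s")

lsp = log_stream_processor.LogStreamProcessor()
lsp.process_new_logs("step 1 done")                # emits "step 1 done"
lsp.process_new_logs("step 1 done\nstep 2 done")   # emits only "step 2 done"
lsp.process_new_logs("step 1 done\nstep 2 done")   # nothing new to emit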
self.assertEqual("", self.log_stream.getvalue().strip()) + + self.reset_log_stream() + + # Process empty log + lsp.process_new_logs(None) + self.assertEqual("", self.log_stream.getvalue().strip()) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/_internal/utils/sql_identifier.py b/snowflake/ml/_internal/utils/sql_identifier.py new file mode 100644 index 00000000..0d895c3c --- /dev/null +++ b/snowflake/ml/_internal/utils/sql_identifier.py @@ -0,0 +1,82 @@ +from typing import List + +from snowflake.ml._internal.utils import identifier + + +class SqlIdentifier(str): + """Represents an identifier in SQL. An identifier has 3 states: + 1. User input: this is the raw input string to initializer. + 2. identifier(): this is the state that ready input to SQL. + 3. resolved(): this is the state how the identifier stored in database. + + For example: + 1. user input -> 2. identifier() -> 3. resolved() + SqlIdentifier('abc', True) ABC ABC + SqlIdentifier('"abc"', True) "abc" abc + SqlIdentifier('abc', False) "abc" abc + """ + + def __new__(cls, name: str, quotes_to_preserve_case: bool = True) -> "SqlIdentifier": + """Create new instance of sql identifier. + Refer to here for more details: https://docs.snowflake.com/en/sql-reference/identifiers-syntax + + Args: + name: A string name. + quotes_to_preserve_case: If true, then double quotes are needed to preserve case. This is the default + mode. When it's false, case are preserved automatically. For instance, This happens when you trying + to construct SqlIdentifier from result of SQL queries. + + Raises: + ValueError: input name is not a valid identifier. + + Returns: + Returns new instance created. + """ + # TODO (wezhou) add stronger validation to recognize a valid snowflake identifier. + if not name: + raise ValueError(f"name:`{name}` is not a valid identifier.") + if quotes_to_preserve_case: + return super().__new__(cls, identifier.resolve_identifier(name)) + else: + return super().__new__(cls, identifier.get_inferred_name(name)) + + def __init__(self, name: str, quotes_to_preserve_case: bool = True) -> None: + """Initialize sql identifier. + + Args: + name: A string name. + quotes_to_preserve_case: If true then double quotes are needed to preserve case-sensitivity. + Otherwise, case-sensivitity are preserved automatically. + """ + super().__init__() + + def identifier(self) -> str: + """Get the identifier value. This is how the string looks like input to SQL. + + Returns: + An identifier string. + """ + return str(self) + + def resolved(self) -> str: + """Get a resolved string after applying identifier requirement rules. This is how the identifier stored + in database. + + Returns: + A resolved string. 
+ """ + return identifier.get_unescaped_names(str(self)) + + def __eq__(self, other: object) -> bool: + if isinstance(other, SqlIdentifier): + return self.resolved() == other.resolved() + if isinstance(other, str): + return str(self) == other + return False + + def __hash__(self) -> int: + return super().__hash__() + + +def to_sql_identifiers(list_of_str: List[str], quotes_to_preserve_case: bool = True) -> List[SqlIdentifier]: + return [SqlIdentifier(val, quotes_to_preserve_case) for val in list_of_str] diff --git a/snowflake/ml/_internal/utils/sql_identifier_test.py b/snowflake/ml/_internal/utils/sql_identifier_test.py new file mode 100644 index 00000000..e63fb254 --- /dev/null +++ b/snowflake/ml/_internal/utils/sql_identifier_test.py @@ -0,0 +1,47 @@ +from absl.testing import absltest + +from snowflake.ml._internal.utils.sql_identifier import SqlIdentifier + + +class SqlIdentifierTest(absltest.TestCase): + def test_sql_identifier(self) -> None: + id = SqlIdentifier("abc", quotes_to_preserve_case=True) + self.assertEqual(id.identifier(), "ABC") + self.assertEqual(id.resolved(), "ABC") + + id = SqlIdentifier('"abc"', quotes_to_preserve_case=True) + self.assertEqual(id.identifier(), '"abc"') + self.assertEqual(id.resolved(), "abc") + + id = SqlIdentifier("abc", quotes_to_preserve_case=False) + self.assertEqual(id.identifier(), '"abc"') + self.assertEqual(id.resolved(), "abc") + + id = SqlIdentifier("ABC", quotes_to_preserve_case=False) + self.assertEqual(id.identifier(), "ABC") + self.assertEqual(id.resolved(), "ABC") + + def test_sql_identifier_equality(self) -> None: + id_1 = SqlIdentifier("abc", quotes_to_preserve_case=True) + id_2 = SqlIdentifier("ABC", quotes_to_preserve_case=True) + self.assertEqual(id_1, id_2) + + id_1 = SqlIdentifier('"ABC"', quotes_to_preserve_case=True) + id_2 = SqlIdentifier("ABC", quotes_to_preserve_case=True) + self.assertEqual(id_1, id_2) + + id_1 = SqlIdentifier("abc", quotes_to_preserve_case=False) + id_2 = SqlIdentifier('"abc"', quotes_to_preserve_case=True) + self.assertEqual(id_1, id_2) + + id_1 = SqlIdentifier("abc", quotes_to_preserve_case=False) + id_2 = SqlIdentifier("abc", quotes_to_preserve_case=False) + self.assertEqual(id_1, id_2) + + id_1 = SqlIdentifier("ABC", quotes_to_preserve_case=False) + id_2 = SqlIdentifier("abc", quotes_to_preserve_case=False) + self.assertNotEqual(id_1, id_2) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/feature_store/BUILD.bazel b/snowflake/ml/feature_store/BUILD.bazel index 0432db58..88cf42bd 100644 --- a/snowflake/ml/feature_store/BUILD.bazel +++ b/snowflake/ml/feature_store/BUILD.bazel @@ -31,6 +31,7 @@ py_library( "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/_internal/utils:query_result_checker", + "//snowflake/ml/_internal/utils:sql_identifier", "//snowflake/ml/dataset", ], ) diff --git a/snowflake/ml/feature_store/_internal/scripts/upload_test_datasets.py b/snowflake/ml/feature_store/_internal/scripts/upload_test_datasets.py index 0aa0eeb2..d11fee6c 100644 --- a/snowflake/ml/feature_store/_internal/scripts/upload_test_datasets.py +++ b/snowflake/ml/feature_store/_internal/scripts/upload_test_datasets.py @@ -4,30 +4,68 @@ from snowflake.ml._internal.utils import identifier from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session +from snowflake.snowpark.types import FloatType, IntegerType, StructField, StructType -PARQUET_FILE_NAME = "yellow_tripdata_2016-01.parquet" +# TODO these 
global parameters should be shared with those defined in feature_store/tests/common_utils.py +FS_INTEG_TEST_DB = "SNOWML_FEATURE_STORE_TEST_DB" +FS_INTEG_TEST_DATASET_SCHEMA = "TEST_DATASET" +FS_INTEG_TEST_YELLOW_TRIP_DATA = "yellow_tripdata_2016_01" +FS_INTEG_TEST_WINE_QUALITY_DATA = "wine_quality_data" -PARQUET_FILE_LOCAL_PATH = f"file://~/Downloads/{PARQUET_FILE_NAME}" +TRIPDATA_NAME = "yellow_tripdata_2016-01.parquet" +WINEDATA_NAME = "winequality-red.csv" +FILE_LOCAL_PATH = "file://~/Downloads/" -def get_destination_table_name(original_file_name: str) -> str: - return original_file_name.split(".")[0].replace("-", "_").upper() - -if __name__ == "__main__": - sess = Session.builder.configs(SnowflakeLoginOptions()).create() - current_db = "SNOWML_FEATURE_STORE_TEST_DB" - current_schema = "TEST_DATASET" - - sess.file.put(PARQUET_FILE_LOCAL_PATH, sess.get_session_stage()) - df = sess.read.parquet(f"{sess.get_session_stage()}/{PARQUET_FILE_NAME}") +def create_tripdata(sess: Session, overwrite_mode: str) -> None: + sess.file.put(f"{FILE_LOCAL_PATH}/{TRIPDATA_NAME}", sess.get_session_stage()) + df = sess.read.parquet(f"{sess.get_session_stage()}/{TRIPDATA_NAME}") for old_col_name in df.columns: df = df.with_column_renamed(old_col_name, identifier.get_unescaped_names(old_col_name)) - table_name = get_destination_table_name(PARQUET_FILE_NAME) - full_table_name = f"{current_db}.{current_schema}.{table_name}" - df.write.mode("ignore").save_as_table(full_table_name) + full_table_name = f"{FS_INTEG_TEST_DB}.{FS_INTEG_TEST_DATASET_SCHEMA}.{FS_INTEG_TEST_YELLOW_TRIP_DATA}" + df.write.mode(overwrite_mode).save_as_table(full_table_name) + rows_count = sess.sql(f"SELECT COUNT(*) FROM {full_table_name}").collect()[0][0] + + print(f"{full_table_name} has total {rows_count} rows.") + + +def create_winedata(sess: Session, overwrite_mode: str) -> None: + sess.file.put(f"{FILE_LOCAL_PATH}/{WINEDATA_NAME}", sess.get_session_stage()) + input_schema = StructType( + [ + StructField("fixed_acidity", FloatType()), + StructField("volatile_acidity", FloatType()), + StructField("citric_acid", FloatType()), + StructField("residual_sugar", FloatType()), + StructField("chlorides", FloatType()), + StructField("free_sulfur_dioxide", IntegerType()), + StructField("total_sulfur_dioxide", IntegerType()), + StructField("density", FloatType()), + StructField("pH", FloatType()), + StructField("sulphates", FloatType()), + StructField("alcohol", FloatType()), + StructField("quality", IntegerType()), + ] + ) + + full_table_name = f"{FS_INTEG_TEST_DB}.{FS_INTEG_TEST_DATASET_SCHEMA}.{FS_INTEG_TEST_WINE_QUALITY_DATA}" + df = ( + sess.read.options({"field_delimiter": ",", "skip_header": 1}) + .schema(input_schema) + .csv(f"{sess.get_session_stage()}/{WINEDATA_NAME}") + ) + df.write.mode(overwrite_mode).save_as_table(full_table_name) rows_count = sess.sql(f"SELECT COUNT(*) FROM {full_table_name}").collect()[0][0] print(f"{full_table_name} has total {rows_count} rows.") + + +if __name__ == "__main__": + sess = Session.builder.configs(SnowflakeLoginOptions()).create() + + create_tripdata(sess, "overwrite") + create_winedata(sess, "overwrite") + print("Script completes successfully.") diff --git a/snowflake/ml/feature_store/entity.py b/snowflake/ml/feature_store/entity.py index dbeb7837..40088fdb 100644 --- a/snowflake/ml/feature_store/entity.py +++ b/snowflake/ml/feature_store/entity.py @@ -1,6 +1,10 @@ from typing import List from snowflake.ml._internal.utils.identifier import get_unescaped_names +from 
snowflake.ml._internal.utils.sql_identifier import ( + SqlIdentifier, + to_sql_identifiers, +) ENTITY_NAME_LENGTH_LIMIT = 32 FEATURE_VIEW_ENTITY_TAG_DELIMITER = "," @@ -25,8 +29,9 @@ def __init__(self, name: str, join_keys: List[str], desc: str = "") -> None: join_keys: join keys associated with a FeatureView, used for feature retrieval. desc: description of the Entity. """ + self.name: str = name - self.join_keys: List[str] = join_keys + self.join_keys: List[SqlIdentifier] = to_sql_identifiers(join_keys) self.desc = desc self._validate() diff --git a/snowflake/ml/feature_store/feature_store.py b/snowflake/ml/feature_store/feature_store.py index 3916abea..657a3877 100644 --- a/snowflake/ml/feature_store/feature_store.py +++ b/snowflake/ml/feature_store/feature_store.py @@ -2,6 +2,7 @@ import json import logging import re +import warnings from dataclasses import dataclass from enum import Enum from typing import Dict, List, Optional, Tuple, Union, cast @@ -15,6 +16,10 @@ exceptions as snowml_exceptions, ) from snowflake.ml._internal.utils import identifier, query_result_checker as qrc +from snowflake.ml._internal.utils.sql_identifier import ( + SqlIdentifier, + to_sql_identifiers, +) from snowflake.ml.dataset.dataset import Dataset, FeatureStoreMetadata from snowflake.ml.feature_store.entity import ( ENTITY_JOIN_KEY_DELIMITER, @@ -75,9 +80,9 @@ class CreationMode(Enum): @dataclass(frozen=True) class _FeatureStoreConfig: - database: str - schema: str - default_warehouse: str + database: SqlIdentifier + schema: SqlIdentifier + default_warehouse: SqlIdentifier @property def full_schema_path(self) -> str: @@ -123,12 +128,16 @@ def __init__( SnowflakeMLException: [RuntimeError] Failed to find resources. SnowflakeMLException: [RuntimeError] Failed to create feature store. """ + database = SqlIdentifier(database) + name = SqlIdentifier(name) + default_warehouse = SqlIdentifier(default_warehouse) + self._telemetry_stmp = telemetry.get_function_usage_statement_params(PROJECT) self._session: Session = session self._config = _FeatureStoreConfig( - database=identifier.resolve_identifier(database), - schema=identifier.resolve_identifier(name), - default_warehouse=identifier.resolve_identifier(default_warehouse), + database=database, + schema=name, + default_warehouse=default_warehouse, ) # A dict from object name to tuple of search space and object domain. # search space used in query "SHOW LIKE IN " @@ -245,13 +254,13 @@ def register_feature_view( E.g. * * * * * UTC NOTE: If refresh_freq is not provided, then FeatureView will be registered as View on Snowflake backend and there won't be extra storage cost. - warehouse: warehouse to run the compute for the registered FeatureView, if not provided default_warehouse - specified in the FeatureStore will be used. + warehouse: warehouse to run the compute for the registered FeatureView, if not provided + default_warehouse specified in the FeatureStore will be used. block: Specify whether the FeatureView backend materialization should be blocking or not. If blocking then the API will wait until the initial FeatureView data is generated. Returns: - FeatureView object with version and status populated. + A materialized FeatureView object. Raises: SnowflakeMLException: [ValueError] FeatureView is already registered, or duplicate name and version @@ -260,6 +269,9 @@ def register_feature_view( SnowflakeMLException: [RuntimeError] Failed to create dynamic table, task, or view. SnowflakeMLException: [RuntimeError] Failed to find resources. 
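Because the database, schema, and warehouse names are now wrapped in SqlIdentifier, quoted and unquoted spellings normalize consistently before being spliced into SQL. A quick sketch of that normalization, using the SqlIdentifier semantics introduced earlier in this patch:

from snowflake.ml._internal.utils.sql_identifier import SqlIdentifier

# Unquoted names fold to upper case; quoted lower-case names keep their exact case.
assert SqlIdentifier("my_db").identifier() == "MY_DB"
assert SqlIdentifier('"my_db"').identifier() == '"my_db"'
assert SqlIdentifier('"my_db"').resolved() == "my_db"

# Equality compares resolved() values, so equivalent spellings match.
assert SqlIdentifier("FS_WH") == SqlIdentifier('"FS_WH"')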
""" + if warehouse is not None: + warehouse = SqlIdentifier(warehouse) + if feature_view.status != FeatureViewStatus.DRAFT: raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.OBJECT_ALREADY_EXISTS, @@ -277,7 +289,7 @@ def register_feature_view( original_exception=ValueError(f"Entity {e.name} has not been registered."), ) - feature_view_name = self._get_feature_view_name(feature_view.name, version) + feature_view_name = FeatureView._get_physical_name(feature_view.name, version) dynamic_table_results = self._find_object("DYNAMIC TABLES", feature_view_name) view_results = self._find_object("VIEWS", feature_view_name) if len(dynamic_table_results) > 0 or len(view_results) > 0: @@ -290,12 +302,11 @@ def register_feature_view( ) fully_qualified_name = self._get_fully_qualified_name(feature_view_name) - - entities = FEATURE_VIEW_ENTITY_TAG_DELIMITER.join( - [identifier.get_unescaped_names(e.name) for e in feature_view.entities] - ) + entities = FEATURE_VIEW_ENTITY_TAG_DELIMITER.join([e.name for e in feature_view.entities]) timestamp_col = ( - feature_view.timestamp_col if feature_view.timestamp_col is not None else TIMESTAMP_COL_PLACEHOLDER + feature_view.timestamp_col + if feature_view.timestamp_col is not None + else SqlIdentifier(TIMESTAMP_COL_PLACEHOLDER) ) def create_col_desc(col: StructField) -> str: @@ -306,69 +317,20 @@ def create_col_desc(col: StructField) -> str: column_descs = ", ".join([f"{create_col_desc(col)}" for col in feature_view.output_schema.fields]) if refresh_freq is not None: - schedule_task = False - if refresh_freq != "DOWNSTREAM" and timeparse(refresh_freq) is None: - cron_expr = refresh_freq - refresh_freq = "DOWNSTREAM" - schedule_task = True - + schedule_task = refresh_freq != "DOWNSTREAM" and timeparse(refresh_freq) is None target_warehouse = self._config.default_warehouse if warehouse is None else warehouse - target_warehouse = identifier.strip_wrapping_quotes(identifier.resolve_identifier(target_warehouse)) - - # TODO: cluster by join keys once DT supports that - try: - query = f"""CREATE DYNAMIC TABLE {fully_qualified_name} ({column_descs}) - TARGET_LAG = '{refresh_freq}' - COMMENT = '{feature_view.desc}' - TAG ( - {self._get_fully_qualified_name(FEATURE_VIEW_ENTITY_TAG)} = '{entities}', - {self._get_fully_qualified_name(FEATURE_VIEW_TS_COL_TAG)} = '{timestamp_col}', - {self._get_fully_qualified_name(FEATURE_STORE_OBJECT_TAG)} = '' - ) - WAREHOUSE = "{target_warehouse}" - AS {feature_view.query} - """ - self._session.sql(query).collect(statement_params=self._telemetry_stmp) - - self._session.sql(f"ALTER DYNAMIC TABLE {fully_qualified_name} REFRESH").collect( - block=block, statement_params=self._telemetry_stmp - ) - - if schedule_task: - self._session.sql( - f"""CREATE TASK {fully_qualified_name} - WAREHOUSE = "{target_warehouse}" - SCHEDULE = 'USING CRON {cron_expr}' - AS ALTER DYNAMIC TABLE {fully_qualified_name} REFRESH - """ - ).collect(statement_params=self._telemetry_stmp) - self._session.sql( - f""" - ALTER TASK {fully_qualified_name} - SET TAG {self._get_fully_qualified_name(FEATURE_STORE_OBJECT_TAG)} = '' - """ - ).collect(statement_params=self._telemetry_stmp) - self._session.sql(f"ALTER TASK {fully_qualified_name} RESUME").collect( - statement_params=self._telemetry_stmp - ) - except Exception as e: - self._session.sql(f"DROP DYNAMIC TABLE IF EXISTS {fully_qualified_name}").collect( - statement_params=self._telemetry_stmp - ) - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INTERNAL_SNOWPARK_ERROR, - 
original_exception=RuntimeError( - f"Create dynamic table [\n{query}\n] or task {fully_qualified_name} failed: {e}." - ), - ) from e - - feature_view._version = version - feature_view._database = self._config.database - feature_view._schema = self._config.schema - feature_view._status = self._get_feature_view_status(feature_view) - feature_view._refresh_freq = refresh_freq - feature_view._warehouse = target_warehouse - + self._create_dynamic_table( + feature_view_name, + feature_view, + fully_qualified_name, + column_descs, + entities, + schedule_task, + refresh_freq, + target_warehouse, + timestamp_col, + block, + ) else: try: query = f"""CREATE VIEW {fully_qualified_name} ({column_descs}) @@ -387,13 +349,8 @@ def create_col_desc(col: StructField) -> str: original_exception=RuntimeError(f"Create view {fully_qualified_name} [\n{query}\n] failed: {e}"), ) from e - feature_view._version = version - feature_view._database = self._config.database - feature_view._schema = self._config.schema - feature_view._status = FeatureViewStatus.STATIC - - logger.info(f"Registered FeatureView {feature_view.name} with version {feature_view.version}.") - return feature_view + logger.info(f"Registered FeatureView {feature_view.name} with version {version}.") + return self.get_feature_view(feature_view.name, version) # type: ignore[no-any-return] @telemetry.send_api_usage_telemetry(project=PROJECT) @snowpark_utils.private_preview(version="1.0.8") @@ -416,8 +373,7 @@ def read_feature_view(self, feature_view: FeatureView) -> DataFrame: original_exception=ValueError(f"FeatureView {feature_view.name} has not been registered."), ) - fv_name = self._get_feature_view_name(feature_view.name, feature_view.version) - return self._session.sql(f"SELECT * FROM {self._get_fully_qualified_name(fv_name)}") + return self._session.sql(f"SELECT * FROM {feature_view.fully_qualified_name()}") @telemetry.send_api_usage_telemetry(project=PROJECT) @snowpark_utils.private_preview(version="1.0.8") @@ -426,7 +382,7 @@ def list_feature_views( entity_name: Optional[str] = None, feature_view_name: Optional[str] = None, as_dataframe: bool = True, - ) -> Union[DataFrame, List[FeatureView]]: + ) -> Union[Optional[DataFrame], List[FeatureView]]: """ List FeatureViews in the FeatureStore. If entity_name is specified, FeatureViews associated with that Entity will be listed. @@ -449,12 +405,11 @@ def list_feature_views( fvs.append(self._compose_feature_view(row)) if as_dataframe: - values = [] - schema = None + result = None for fv in fvs: - values.append(list(fv._to_dict().values())) - schema = [x.lstrip("_") for x in list(fv._to_dict().keys())] if schema is None else schema - return self._session.create_dataframe(values, schema=schema) + fv_df = fv.to_df(self._session) + result = fv_df if result is None else result.union(fv_df) # type: ignore[attr-defined] + return result else: return fvs @@ -475,7 +430,7 @@ def get_feature_view(self, name: str, version: str) -> FeatureView: SnowflakeMLException: [ValueError] FeatureView with name and version is not found, or incurred exception when reconstructing the FeatureView object. 
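A hedged sketch of the updated list_feature_views contract: with as_dataframe=True the result is now a union of per-FeatureView DataFrames (or None when nothing matches), while as_dataframe=False returns FeatureView objects. It assumes fs is an existing FeatureStore; the entity name is a placeholder:

for fv in fs.list_feature_views(as_dataframe=False):
    print(fv.name, fv.version, fv.status)

df = fs.list_feature_views(entity_name="CUSTOMER", as_dataframe=True)
if df is not None:  # None when no FeatureView matches
    df.show()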
""" - fv_name = self._get_feature_view_name(name, version) + fv_name = FeatureView._get_physical_name(name, version) results = self._get_backend_representations(fv_name) if len(results) != 1: raise snowml_exceptions.SnowflakeMLException( @@ -651,9 +606,7 @@ def delete_feature_view(self, feature_view: FeatureView) -> None: original_exception=ValueError(f"FeatureView {feature_view.name} has not been registered."), ) - fully_qualified_name = self._get_fully_qualified_name( - self._get_feature_view_name(feature_view.name, feature_view.version) - ) + fully_qualified_name = feature_view.fully_qualified_name() if feature_view.status == FeatureViewStatus.STATIC: self._session.sql(f"DROP VIEW IF EXISTS {fully_qualified_name}").collect( statement_params=self._telemetry_stmp @@ -750,7 +703,7 @@ def get_entity(self, name: str) -> Entity: ) ) WHERE TAG_NAME LIKE '{unesc_full_entity_tag_name}' - AND TAG_DATABASE = '{identifier.get_unescaped_names(self._config.database)}' + AND TAG_DATABASE = '{self._config.database.resolved()}' """, self._telemetry_stmp, ) @@ -840,6 +793,9 @@ def retrieve_feature_values( Raises: ValueError: if features is empty. """ + if spine_timestamp_col is not None: + spine_timestamp_col = SqlIdentifier(spine_timestamp_col) + if len(features) == 0: raise ValueError("features cannot be empty") if isinstance(features[0], str): @@ -876,8 +832,9 @@ def generate_dataset( the provided table. Note result dataset will be a snowflake clone of registered table. New data can append on same registered table and previously generated dataset won't be affected. Default result table name will be a concatenation of materialized_table name and current timestamp. - spine_timestamp_col: Name of timestamp column in spine_df that will be used to join time-series features. - If spine_timestamp_col is not none, the input features also must have timestamp_col. + spine_timestamp_col: Name of timestamp column in spine_df that will be used to join + time-series features. If spine_timestamp_col is not none, the input features also must have + timestamp_col. spine_label_cols: Name of column(s) in spine_df that contains labels. exclude_columns: Column names to exclude from the result dataframe. The underlying storage will still contain the columns. @@ -898,6 +855,12 @@ def generate_dataset( SnowflakeMLException: [RuntimeError] Failed to create clone from table. SnowflakeMLException: [RuntimeError] Failed to find resources. 
""" + if spine_timestamp_col is not None: + spine_timestamp_col = SqlIdentifier(spine_timestamp_col) + if spine_label_cols is not None: + spine_label_cols = to_sql_identifiers(spine_label_cols) # type: ignore[assignment] + if exclude_columns is not None: + exclude_columns = to_sql_identifiers(exclude_columns) # type: ignore[assignment] allowed_save_mode = {"errorifexists", "merge"} if save_mode.lower() not in allowed_save_mode: @@ -954,9 +917,9 @@ def generate_dataset( result_df = self._session.sql(f"SELECT * FROM {snapshot_table}") if exclude_columns is not None: - dataset_cols = identifier.get_unescaped_names(result_df.columns) + dataset_cols = to_sql_identifiers(result_df.columns) for col in exclude_columns: - if identifier.get_unescaped_names(col) not in dataset_cols: + if col not in dataset_cols: raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.INVALID_ARGUMENT, original_exception=ValueError( @@ -998,7 +961,7 @@ def clear(self) -> None: f""" SELECT * FROM {self._config.database}.INFORMATION_SCHEMA.SCHEMATA - WHERE SCHEMA_NAME = '{self._config.schema}' + WHERE SCHEMA_NAME = '{self._config.schema.resolved()}' """ ).collect() if len(result) == 0: @@ -1030,12 +993,85 @@ def clear(self) -> None: ) from e logger.info(f"Feature store {self._config.full_schema_path} has been cleared.") + def _create_dynamic_table( + self, + feature_view_name: str, + feature_view: FeatureView, + fully_qualified_name: str, + column_descs: str, + entities: str, + schedule_task: bool, + refresh_freq: str, + warehouse: SqlIdentifier, + timestamp_col: SqlIdentifier, + block: bool, + ) -> None: + # TODO: cluster by join keys once DT supports that + try: + query = f"""CREATE DYNAMIC TABLE {fully_qualified_name} ({column_descs}) + TARGET_LAG = '{'DOWNSTREAM' if schedule_task else refresh_freq}' + COMMENT = '{feature_view.desc}' + TAG ( + {self._get_fully_qualified_name(FEATURE_VIEW_ENTITY_TAG)} = '{entities}', + {self._get_fully_qualified_name(FEATURE_VIEW_TS_COL_TAG)} = '{timestamp_col}', + {self._get_fully_qualified_name(FEATURE_STORE_OBJECT_TAG)} = '' + ) + WAREHOUSE = {warehouse} + AS {feature_view.query} + """ + self._session.sql(query).collect(statement_params=self._telemetry_stmp) + self._session.sql(f"ALTER DYNAMIC TABLE {fully_qualified_name} REFRESH").collect( + block=block, statement_params=self._telemetry_stmp + ) + + if schedule_task: + self._session.sql( + f"""CREATE TASK {fully_qualified_name} + WAREHOUSE = {warehouse} + SCHEDULE = 'USING CRON {refresh_freq}' + AS ALTER DYNAMIC TABLE {fully_qualified_name} REFRESH + """ + ).collect(statement_params=self._telemetry_stmp) + self._session.sql( + f""" + ALTER TASK {fully_qualified_name} + SET TAG {self._get_fully_qualified_name(FEATURE_STORE_OBJECT_TAG)} = '' + """ + ).collect(statement_params=self._telemetry_stmp) + self._session.sql(f"ALTER TASK {fully_qualified_name} RESUME").collect( + statement_params=self._telemetry_stmp + ) + except Exception as e: + self._session.sql(f"DROP DYNAMIC TABLE IF EXISTS {fully_qualified_name}").collect( + statement_params=self._telemetry_stmp + ) + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_SNOWPARK_ERROR, + original_exception=RuntimeError( + f"Create dynamic table [\n{query}\n] or task {fully_qualified_name} failed: {e}." 
+ ), + ) from e + + found_dts = self._find_object("DYNAMIC TABLES", feature_view_name) + if len(found_dts) != 1: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=ValueError(f"Can not find dynamic table: `{feature_view_name}`."), + ) + if found_dts[0]["refresh_mode"] != "INCREMENTAL": + warnings.warn( + f"Dynamic table: `{fully_qualified_name}` will not refresh in INCREMENTAL mode. " + + "It will likely incurr bigger computation cost. " + + f"The reason is: {found_dts[0]['refresh_mode_reason']}", + category=UserWarning, + ) + def _dump_dataset( self, df: DataFrame, table_name: str, - join_keys: List[str], - spine_timestamp_col: Optional[str] = None, + join_keys: List[SqlIdentifier], + spine_timestamp_col: Optional[SqlIdentifier] = None, ) -> None: if len(df.queries["queries"]) != 1: raise snowml_exceptions.SnowflakeMLException( @@ -1100,8 +1136,8 @@ def _join_features( self, spine_df: DataFrame, features: List[Union[FeatureView, FeatureViewSlice]], - spine_timestamp_col: Optional[str], - ) -> Tuple[DataFrame, List[str]]: + spine_timestamp_col: Optional[SqlIdentifier], + ) -> Tuple[DataFrame, List[SqlIdentifier]]: if len(spine_df.queries["queries"]) != 1: raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.INVALID_ARGUMENT, @@ -1119,8 +1155,7 @@ def _join_features( ) for e in f.entities: for k in e.join_keys: - k = identifier.get_unescaped_names(k) - if k not in identifier.get_unescaped_names(spine_df.columns): + if k not in to_sql_identifiers(spine_df.columns): raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.INVALID_ARGUMENT, original_exception=ValueError( @@ -1141,7 +1176,7 @@ def _join_features( join_keys = [k for e in f.entities for k in e.join_keys] join_keys_str = ", ".join(join_keys) assert f.version is not None - join_table_name = self._get_fully_qualified_name(self._get_feature_view_name(f.name, f.version)) + join_table_name = f.fully_qualified_name() if spine_timestamp_col is not None and f.timestamp_col is not None: if _ENABLE_ASOF_JOIN: @@ -1188,25 +1223,25 @@ def _composed_union_window_join_query( self, layer: int, s_query: str, - s_ts_col: str, + s_ts_col: SqlIdentifier, f_df: DataFrame, f_table_name: str, - f_cols: List[str], - f_ts_col: str, - join_keys: List[str], + f_cols: List[SqlIdentifier], + f_ts_col: SqlIdentifier, + join_keys: List[SqlIdentifier], ) -> str: s_df = self._session.sql(s_query) - s_only_cols = [col for col in s_df.columns if col not in identifier.get_unescaped_names([*join_keys, s_ts_col])] - f_only_cols = [col for col in f_df.columns if col not in identifier.get_unescaped_names([*join_keys, f_ts_col])] + s_only_cols = [col for col in to_sql_identifiers(s_df.columns) if col not in [*join_keys, s_ts_col]] + f_only_cols = [col for col in to_sql_identifiers(f_df.columns) if col not in [*join_keys, f_ts_col]] join_keys_str = ", ".join(join_keys) - temp_prefix = "_fs_temp_" + temp_prefix = "_FS_TEMP_" - def join_cols(cols: List[str], end_comma: bool, rename: bool, prefix: str = "") -> str: + def join_cols(cols: List[SqlIdentifier], end_comma: bool, rename: bool, prefix: str = "") -> str: if not cols: return "" - cols = [f"{prefix}{col}" for col in cols] + cols = [f"{prefix}{col}" for col in cols] # type: ignore[misc] if rename: - cols = [f"{col} AS {col[len(temp_prefix):]}" for col in cols] + cols = [f"{col} AS {col.replace(temp_prefix, '')}" for col in cols] # type: ignore[misc] line_end = "," if end_comma else "" return ", ".join(cols) + line_end @@ -1259,7 +1294,7 
@@ def join_cols(cols: List[str], end_comma: bool, rename: bool, prefix: str = "") )""" # Part 4: join original spine table with window table - prefix_f_only_cols = [f"{temp_prefix}{name}" for name in f_only_cols] + prefix_f_only_cols = to_sql_identifiers([f"{temp_prefix}{name.resolved()}" for name in f_only_cols], False) last_select = f""" SELECT {join_keys_str}, @@ -1276,9 +1311,6 @@ def join_cols(cols: List[str], end_comma: bool, rename: bool, prefix: str = "") return complete_query - def _get_feature_view_name(self, raw_name: str, version: str) -> str: - return identifier.concat_names([raw_name, FEATURE_VIEW_NAME_DELIMITER, version]) - def _get_entity_name(self, raw_name: str) -> str: return identifier.concat_names([ENTITY_TAG_PREFIX, raw_name]) @@ -1292,15 +1324,14 @@ def _get_backend_representations(self, object_name_pattern: str) -> List[Row]: return dynamic_table_results + view_results def _update_feature_view_status(self, feature_view: FeatureView, operation: str) -> FeatureView: + assert operation in ["RESUME", "SUSPEND"], f"Operation: {operation} not supported" if feature_view.status == FeatureViewStatus.DRAFT or feature_view.version is None: raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.NOT_FOUND, original_exception=ValueError(f"FeatureView {feature_view.name} has not been registered."), ) - fully_qualified_name = self._get_fully_qualified_name( - self._get_feature_view_name(feature_view.name, feature_view.version) - ) + fully_qualified_name = feature_view.fully_qualified_name() try: self._session.sql(f"ALTER DYNAMIC TABLE {fully_qualified_name} {operation}").collect( statement_params=self._telemetry_stmp @@ -1314,26 +1345,10 @@ def _update_feature_view_status(self, feature_view: FeatureView, operation: str) original_exception=RuntimeError(f"Failed to update feature view {fully_qualified_name}'s status: {e}"), ) from e - feature_view._status = self._get_feature_view_status(feature_view) + feature_view._status = self.get_feature_view(feature_view.name, feature_view.version).status logger.info(f"Successfully {operation} FeatureView {feature_view.name} with version {feature_view.version}.") return feature_view - def _get_feature_view_status(self, feature_view: FeatureView) -> FeatureViewStatus: - fv_name = self._get_feature_view_name( - feature_view.name, - feature_view.version if feature_view.version is not None else "", - ) - results = self._get_backend_representations(fv_name) - if len(results) != 1: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.NOT_FOUND, - original_exception=ValueError( - f"Failed to get status for {feature_view.name} with version {feature_view.version}: {results}" - ), - ) - - return FeatureViewStatus(results[0]["scheduling_state"]) - def _find_feature_views(self, entity_name: str, feature_view_name: Optional[str]) -> List[FeatureView]: if not self._validate_entity_exists(entity_name): return [] @@ -1370,7 +1385,7 @@ def _find_feature_views(self, entity_name: str, feature_view_name: Optional[str] ) from e outputs = [] for r in results: - if identifier.get_unescaped_names(entity_name) in r["TAG_VALUE"]: + if identifier.get_unescaped_names(entity_name) == identifier.get_unescaped_names(r["TAG_VALUE"]): fv_name, version = r["OBJECT_NAME"].split(FEATURE_VIEW_NAME_DELIMITER) if feature_view_name is not None: if fv_name == identifier.get_unescaped_names(feature_view_name): @@ -1383,6 +1398,8 @@ def _find_feature_views(self, entity_name: str, feature_view_name: Optional[str] def _compose_feature_view(self, row: Row) 
-> FeatureView: name, version = row["name"].split(FEATURE_VIEW_NAME_DELIMITER) + name = identifier.get_inferred_name(name) + version = identifier.get_inferred_name(version) m = re.match(DT_QUERY_PATTERN, row["text"]) if m is not None: @@ -1402,11 +1419,13 @@ def _compose_feature_view(self, row: Row) -> FeatureView: desc=desc, version=version, status=FeatureViewStatus(row["scheduling_state"]), - feature_descs=self._fetch_column_descs("DYNAMIC TABLE", row["name"]), + feature_descs=self._fetch_column_descs("DYNAMIC TABLE", identifier.get_inferred_name(row["name"])), refresh_freq=m.group("refresh_freq"), - database=self._config.database, - schema=self._config.schema, + database=self._config.database.identifier(), + schema=self._config.schema.identifier(), warehouse=m.group("warehouse"), + refresh_mode=row["refresh_mode"], + refresh_mode_reason=row["refresh_mode_reason"], ) return fv @@ -1428,11 +1447,13 @@ def _compose_feature_view(self, row: Row) -> FeatureView: desc=desc, version=version, status=FeatureViewStatus.STATIC, - feature_descs=self._fetch_column_descs("VIEW", row["name"]), + feature_descs=self._fetch_column_descs("VIEW", identifier.get_inferred_name(row["name"])), refresh_freq=None, - database=self._config.database, - schema=self._config.schema, + database=self._config.database.identifier(), + schema=self._config.schema.identifier(), warehouse=None, + refresh_mode=None, + refresh_mode_reason=None, ) return fv @@ -1472,6 +1493,10 @@ def _find_object(self, object_type: str, object_name_pattern: str) -> List[Row]: Returns: Return a list of rows round. """ + # TODO (wezhou) change type of object_name_pattern to SqlIdentifier. + if isinstance(object_name_pattern, SqlIdentifier): + object_name_pattern = object_name_pattern.identifier() + if object_name_pattern == "": return [] @@ -1508,7 +1533,7 @@ def _find_object(self, object_type: str, object_name_pattern: str) -> List[Row]: ) ) WHERE TAG_NAME = '{FEATURE_STORE_OBJECT_TAG}' - AND TAG_SCHEMA = '{identifier.get_unescaped_names(self._config.schema)}' + AND TAG_SCHEMA = '{self._config.schema.resolved()}' """ for row in all_rows ] diff --git a/snowflake/ml/feature_store/feature_view.py b/snowflake/ml/feature_store/feature_view.py index 05116c1c..badd5ab7 100644 --- a/snowflake/ml/feature_store/feature_view.py +++ b/snowflake/ml/feature_store/feature_view.py @@ -6,9 +6,10 @@ from enum import Enum from typing import Dict, List, Optional -from snowflake.ml._internal.utils.identifier import ( - get_inferred_name, - get_unescaped_names, +from snowflake.ml._internal.utils.identifier import concat_names, get_unescaped_names +from snowflake.ml._internal.utils.sql_identifier import ( + SqlIdentifier, + to_sql_identifiers, ) from snowflake.ml.feature_store.entity import Entity from snowflake.snowpark import DataFrame, Session @@ -35,7 +36,7 @@ class FeatureViewStatus(Enum): @dataclass(frozen=True) class FeatureViewSlice: feature_view_ref: FeatureView - names: List[str] + names: List[SqlIdentifier] def __repr__(self) -> str: states = (f"{k}={v}" for k, v in vars(self).items()) @@ -45,10 +46,7 @@ def __eq__(self, other: object) -> bool: if not isinstance(other, FeatureViewSlice): return False - return ( - get_unescaped_names(self.names) == get_unescaped_names(other.names) - and self.feature_view_ref == other.feature_view_ref - ) + return self.names == other.names and self.feature_view_ref == other.feature_view_ref def to_json(self) -> str: fvs_dict = { @@ -89,22 +87,30 @@ def __init__( entities: entities that the FeatureView is associated with. 
feature_df: Snowpark DataFrame containing data source and all feature feature_df logics. Final projection of the DataFrame should contain feature names, join keys and timestamp(if applicable). - timestamp_col: name of the timestamp column for point-in-time lookup when consuming the feature values. + timestamp_col: name of the timestamp column for point-in-time lookup when consuming the + feature values. desc: description of the FeatureView. """ + self._name: str = name self._entities: List[Entity] = entities self._feature_df: DataFrame = feature_df - self._timestamp_col: Optional[str] = timestamp_col if timestamp_col is not None else None + self._timestamp_col: Optional[SqlIdentifier] = ( + SqlIdentifier(timestamp_col) if timestamp_col is not None else None + ) self._desc: str = desc self._query: str = self._get_query() self._version: Optional[str] = None self._status: FeatureViewStatus = FeatureViewStatus.DRAFT - self._feature_desc: OrderedDict[str, Optional[str]] = OrderedDict((f, None) for f in self._get_feature_names()) + self._feature_desc: OrderedDict[SqlIdentifier, Optional[str]] = OrderedDict( + (f, None) for f in self._get_feature_names() + ) self._refresh_freq: Optional[str] = None - self._database: Optional[str] = None - self._schema: Optional[str] = None - self._warehouse: Optional[str] = None + self._database: Optional[SqlIdentifier] = None + self._schema: Optional[SqlIdentifier] = None + self._warehouse: Optional[SqlIdentifier] = None + self._refresh_mode: Optional[str] = None + self._refresh_mode_reason: Optional[str] = None self._validate() def slice(self, names: List[str]) -> FeatureViewSlice: @@ -120,27 +126,36 @@ def slice(self, names: List[str]) -> FeatureViewSlice: Raises: ValueError: if selected feature names is not found in the FeatureView. """ + res = [] for name in names: - name = get_unescaped_names(name) + name = SqlIdentifier(name) if name not in self.feature_names: raise ValueError(f"Feature name {name} not found in FeatureView {self.name}.") res.append(name) return FeatureViewSlice(self, res) - def fully_qualified_name(self) -> str: - """ - Returns the fully qualified name for the FeatureView in Snowflake storage. + def physical_name(self) -> str: + """Returns the physical name for this feature in Snowflake. Returns: - fully qualified name string + Physical name string. Raises: RuntimeError: if the FeatureView is not materialized. """ if self.status == FeatureViewStatus.DRAFT: raise RuntimeError(f"FeatureView {self.name} has not been materialized.") - return f"{self._database}.{self._schema}.{self.name}{FEATURE_VIEW_NAME_DELIMITER}{self.version}" + return FeatureView._get_physical_name(self.name, self.version) + + def fully_qualified_name(self) -> str: + """Returns the fully qualified name (..) for the + FeatureView in Snowflake. + + Returns: + fully qualified name string. + """ + return f"{self._database}.{self._schema}.{self.physical_name()}" def attach_feature_desc(self, descs: Dict[str, str]) -> FeatureView: """ @@ -156,7 +171,7 @@ def attach_feature_desc(self, descs: Dict[str, str]) -> FeatureView: ValueError: if feature name is not found in the FeatureView. 
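        Example (a short sketch reusing names from the demo notebook; with this change the keys
        go through SqlIdentifier, so an unquoted lower-case key such as "count_trip_2_hr" is
        expected to match the stored upper-case feature name):

            dropoff_fv = FeatureView(
                name="trip_dropoff_time_series_features",
                entities=[trip_dropoff],
                feature_df=dropoff_df,
                timestamp_col="ts",
            )
            # Attach human-readable descriptions to individual features.
            dropoff_fv = dropoff_fv.attach_feature_desc({
                "count_trip_2_hr": "number of trips ending in the zone over the trailing 2 hours",
                "COUNT_TRIP_5_HR": "number of trips ending in the zone over the trailing 5 hours",
            })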
""" for f, d in descs.items(): - f = get_unescaped_names(f) + f = SqlIdentifier(f) if f not in self._feature_desc: raise ValueError( f"Feature name {f} is not found in FeatureView {self.name}, " @@ -178,7 +193,7 @@ def feature_df(self) -> DataFrame: return self._feature_df @property - def timestamp_col(self) -> Optional[str]: + def timestamp_col(self) -> Optional[SqlIdentifier]: return self._timestamp_col @property @@ -198,33 +213,44 @@ def status(self) -> FeatureViewStatus: return self._status @property - def feature_names(self) -> List[str]: + def feature_names(self) -> List[SqlIdentifier]: return list(self._feature_desc.keys()) @property def feature_descs(self) -> Dict[str, Optional[str]]: - return dict(self._feature_desc) + new_dict = {} + for k, v in self._feature_desc.items(): + new_dict[k.identifier()] = v + return new_dict @property def refresh_freq(self) -> Optional[str]: return self._refresh_freq @property - def database(self) -> Optional[str]: + def database(self) -> Optional[SqlIdentifier]: return self._database @property - def schema(self) -> Optional[str]: + def schema(self) -> Optional[SqlIdentifier]: return self._schema @property - def warehouse(self) -> Optional[str]: + def warehouse(self) -> Optional[SqlIdentifier]: return self._warehouse @property def output_schema(self) -> StructType: return self._feature_df.schema + @property + def refresh_mode(self) -> Optional[str]: + return self._refresh_mode + + @property + def refresh_mode_reason(self) -> Optional[str]: + return self._refresh_mode_reason + def _get_query(self) -> str: if len(self._feature_df.queries["queries"]) != 1: raise ValueError( @@ -240,29 +266,30 @@ def _validate(self) -> None: f"FeatureView name `{self._name}` contains invalid character `{FEATURE_VIEW_NAME_DELIMITER}`." 
) - unescaped_df_cols = get_unescaped_names(self._feature_df.columns) + unescaped_df_cols = to_sql_identifiers(self._feature_df.columns) for e in self._entities: - for k in get_unescaped_names(e.join_keys): + for k in e.join_keys: if k not in unescaped_df_cols: raise ValueError( f"join_key {k} in Entity {e.name} is not found in input dataframe: {unescaped_df_cols}" ) if self._timestamp_col is not None: - ts_col = get_unescaped_names(self._timestamp_col) - if ts_col == TIMESTAMP_COL_PLACEHOLDER: + ts_col = self._timestamp_col + if ts_col == SqlIdentifier(TIMESTAMP_COL_PLACEHOLDER): raise ValueError(f"Invalid timestamp_col name, cannot be {TIMESTAMP_COL_PLACEHOLDER}.") - if ts_col not in get_unescaped_names(self._feature_df.columns): + if ts_col not in to_sql_identifiers(self._feature_df.columns): raise ValueError(f"timestamp_col {ts_col} is not found in input dataframe.") - col_type = self._feature_df.schema[get_inferred_name(ts_col)].datatype + col_type = self._feature_df.schema[ts_col].datatype if not isinstance(col_type, (DateType, TimeType, TimestampType, _NumericType)): raise ValueError(f"Invalid data type for timestamp_col {ts_col}: {col_type}.") - def _get_feature_names(self) -> List[str]: - join_keys = [k for e in self._entities for k in get_unescaped_names(e.join_keys)] - ts_col = [get_unescaped_names(self._timestamp_col)] if self._timestamp_col is not None else [] - return [c for c in get_unescaped_names(self._feature_df.columns) if c not in join_keys + ts_col] + def _get_feature_names(self) -> List[SqlIdentifier]: + join_keys = [k for e in self._entities for k in e.join_keys] + ts_col = [self._timestamp_col] if self._timestamp_col is not None else [] + feature_names = to_sql_identifiers(self._feature_df.columns, False) + return [c for c in feature_names if c not in join_keys + ts_col] def __repr__(self) -> str: states = (f"{k}={v}" for k, v in vars(self).items()) @@ -275,7 +302,7 @@ def __eq__(self, other: object) -> bool: return ( get_unescaped_names(self.name) == get_unescaped_names(other.name) and get_unescaped_names(self.version) == get_unescaped_names(other.version) - and get_unescaped_names(self.timestamp_col) == get_unescaped_names(other.timestamp_col) + and self.timestamp_col == other.timestamp_col and self.entities == other.entities and self.desc == other.desc and self.feature_descs == other.feature_descs @@ -283,7 +310,10 @@ def __eq__(self, other: object) -> bool: and self.query == other.query and self.refresh_freq == other.refresh_freq and str(self.status) == str(other.status) + and self.database == other.database and self.warehouse == other.warehouse + and self.refresh_mode == other.refresh_mode + and self.refresh_mode_reason == other.refresh_mode_reason ) def _to_dict(self) -> Dict[str, str]: @@ -292,8 +322,18 @@ def _to_dict(self) -> Dict[str, str]: fv_dict.pop("_feature_df") fv_dict["_entities"] = [e.__dict__ for e in self._entities] fv_dict["_status"] = str(self._status) + fv_dict["_database"] = str(self._database) if self._database is not None else None + fv_dict["_schema"] = str(self._schema) if self._schema is not None else None + fv_dict["_warehouse"] = str(self._warehouse) if self._warehouse is not None else None return fv_dict + def to_df(self, session: Session) -> DataFrame: + values = list(self._to_dict().values()) + schema = [x.lstrip("_") for x in list(self._to_dict().keys())] + values.append(self.physical_name()) + schema.append("physical_name") + return session.create_dataframe([values], schema=schema) + def to_json(self) -> str: state_dict = 
self._to_dict() state_dict[FEATURE_OBJ_TYPE] = self.__class__.__name__ @@ -318,6 +358,18 @@ def from_json(cls, json_str: str, session: Session) -> FeatureView: database=json_dict["_database"], schema=json_dict["_schema"], warehouse=json_dict["_warehouse"], + refresh_mode=json_dict["_refresh_mode"], + refresh_mode_reason=json_dict["_refresh_mode_reason"], + ) + + @staticmethod + def _get_physical_name(fv_name: Optional[str], fv_version: Optional[str]) -> str: + return concat_names( + [ + fv_name if fv_name is not None else "", + FEATURE_VIEW_NAME_DELIMITER, + fv_version if fv_version is not None else "", + ] ) @staticmethod @@ -334,6 +386,8 @@ def _construct_feature_view( database: Optional[str], schema: Optional[str], warehouse: Optional[str], + refresh_mode: Optional[str], + refresh_mode_reason: Optional[str], ) -> FeatureView: fv = FeatureView( name=name, @@ -345,8 +399,10 @@ def _construct_feature_view( fv._version = version fv._status = status fv._refresh_freq = refresh_freq - fv._database = database - fv._schema = schema - fv._warehouse = warehouse + fv._database = SqlIdentifier(database) if database is not None else None + fv._schema = SqlIdentifier(schema) if schema is not None else None + fv._warehouse = SqlIdentifier(warehouse) if warehouse is not None else None + fv._refresh_mode = refresh_mode + fv._refresh_mode_reason = refresh_mode_reason fv.attach_feature_desc(feature_descs) return fv diff --git a/snowflake/ml/feature_store/notebooks/internal_demo/Time_Series_Feature_Demo.ipynb b/snowflake/ml/feature_store/notebooks/internal_demo/Time_Series_Feature_Demo.ipynb index 1ec13727..8777e5b6 100644 --- a/snowflake/ml/feature_store/notebooks/internal_demo/Time_Series_Feature_Demo.ipynb +++ b/snowflake/ml/feature_store/notebooks/internal_demo/Time_Series_Feature_Demo.ipynb @@ -51,31 +51,10 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "da1a922d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "# Scale cell width with the browser window to accommodate .show() commands for wider tables.\n", "from IPython.display import display, HTML\n", @@ -95,31 +74,10 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": null, "id": "11935b50", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "import os\n", @@ -144,14 +102,15 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 2, "id": "671378ae", "metadata": {}, "outputs": [], "source": [ + "import os\n", + "conda_env = os.environ['CONDA_DEFAULT_ENV']\n", "import sys\n", - "\n", - "sys.path.insert(0, '/tmp/snowml')" + "sys.path.append(f'/opt/homebrew/anaconda3/envs/{conda_env}/lib/python3.8/site-packages')" ] }, { @@ -164,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 3, "id": "f39a3f77", "metadata": {}, "outputs": [], @@ -180,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": null, "id": "e665bd41", "metadata": {}, "outputs": [], @@ -190,32 +149,10 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": null, "id": "75bfcfd1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-------------------------------------------------------------------------------------------------------------------------------------\n", - "|\"TRIP_DISTANCE\" |\"FARE_AMOUNT\" |\"PASSENGER_COUNT\" |\"PULOCATIONID\" |\"DOLOCATIONID\" |\"PICKUP_TS\" |\"DROPOFF_TS\" |\n", - "-------------------------------------------------------------------------------------------------------------------------------------\n", - "|3.2 |14.0 |1 |48 |262 |2016-01-01 00:12:22 |2016-01-01 00:29:14 |\n", - "|1.0 |9.5 |2 |162 |48 |2016-01-01 00:41:31 |2016-01-01 00:55:10 |\n", - "|0.9 |6.0 |1 |246 |90 |2016-01-01 00:53:37 |2016-01-01 00:59:57 |\n", - "|0.8 |5.0 |1 |170 |162 |2016-01-01 00:13:28 |2016-01-01 00:18:07 |\n", - "|1.8 |11.0 |1 |161 |140 |2016-01-01 00:33:04 |2016-01-01 00:47:14 |\n", - "|2.3 |11.0 |1 |141 |137 |2016-01-01 00:49:47 |2016-01-01 01:04:44 |\n", - "|13.8 |43.0 |1 |100 |53 |2016-01-01 00:41:58 |2016-01-01 01:22:06 |\n", - "|3.46 |20.0 |5 |48 |79 |2016-01-01 00:25:28 |2016-01-01 00:55:46 |\n", - "|0.83 |5.5 |4 |79 |107 |2016-01-01 00:56:57 |2016-01-01 01:02:24 |\n", - "|0.87 |7.0 |1 |164 |164 |2016-01-01 00:10:08 |2016-01-01 00:23:05 |\n", - "-------------------------------------------------------------------------------------------------------------------------------------\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "source_df = session.table(\"SNOWML_FEATURE_STORE_TEST_DB.TEST_DATASET.yellow_tripdata_2016_01\")\n", "\n", @@ -246,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": null, "id": "6c37a635", "metadata": {}, "outputs": [], @@ -275,64 +212,10 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": null, "id": "70609920", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " NAME JOIN_KEYS DESC\n", - "0 TRIP_DROPOFF DOLOCATIONID \n", - "1 TRIP_PICKUP PULOCATIONID " - ] - }, - "execution_count": 118, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "trip_pickup = Entity(name=\"trip_pickup\", join_keys=[\"PULOCATIONID\"])\n", "trip_dropoff = Entity(name=\"trip_dropoff\", join_keys=[\"DOLOCATIONID\"])\n", @@ -368,19 +251,10 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": null, "id": "995b4bcd", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:snowflake.snowpark.session:The version of package 'numpy' in the local environment is 1.24.4, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.\n", - "WARNING:snowflake.snowpark.session:Package 'pytimeparse' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.\n" - ] - } - ], + "outputs": [], "source": [ "@F.pandas_udf(\n", " name=\"vec_window_end\",\n", @@ -415,47 +289,10 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "id": "7d0c4339", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "----------------------------------------------------------------------------------\n", - "|\"PULOCATIONID\" |\"TS\" |\"MEAN_FARE_2_HR\" |\"MEAN_FARE_5_HR\" |\n", - "----------------------------------------------------------------------------------\n", - "|49 |2016-01-01 00:15:00 |7.0 |7.0 |\n", - "|49 |2016-01-01 00:30:00 |9.3 |9.3 |\n", - "|49 |2016-01-01 00:45:00 |9.409090909090908 |9.409090909090908 |\n", - "|49 |2016-01-01 01:00:00 |12.296296296296296 |12.296296296296296 |\n", - "|49 |2016-01-01 01:15:00 |13.540816326530612 |13.540816326530612 |\n", - "|49 |2016-01-01 01:30:00 |13.27027027027027 |13.27027027027027 |\n", - "|49 |2016-01-01 01:45:00 |13.145 |13.145 |\n", - "|49 |2016-01-01 02:00:00 |13.007936507936508 |13.007936507936508 |\n", - "|49 |2016-01-01 02:15:00 |13.00326797385621 |12.925806451612903 |\n", - "|49 |2016-01-01 02:30:00 |13.258064516129032 |13.154450261780104 |\n", - "----------------------------------------------------------------------------------\n", - "\n", - "--------------------------------------------------------------------------------\n", - "|\"DOLOCATIONID\" |\"TS\" |\"COUNT_TRIP_2_HR\" |\"COUNT_TRIP_5_HR\" |\n", - "--------------------------------------------------------------------------------\n", - "|255 |2016-01-01 00:15:00 |2 |2 |\n", - "|255 |2016-01-01 00:30:00 |24 |24 |\n", - "|255 |2016-01-01 00:45:00 |70 |70 |\n", - "|255 |2016-01-01 01:00:00 |118 |118 |\n", - "|255 |2016-01-01 01:15:00 |162 |162 |\n", - "|255 |2016-01-01 01:30:00 |221 |221 |\n", - "|255 |2016-01-01 01:45:00 |272 |272 |\n", - "|255 |2016-01-01 02:00:00 |333 |333 |\n", - "|255 |2016-01-01 02:15:00 |398 |400 |\n", - "|255 |2016-01-01 02:30:00 |441 |465 |\n", - "--------------------------------------------------------------------------------\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from snowflake.snowpark import Window\n", "from snowflake.snowpark.functions import col\n", @@ -533,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": null, "id": "f0cd2075", "metadata": {}, "outputs": [], @@ -544,21 +381,10 @@ }, { "cell_type": "code", - 
"execution_count": 123, + "execution_count": null, "id": "d8960b0e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "FeatureView(_name=TRIP_DROPOFF_TIME_SERIES_FEATURES, _entities=[Entity(name=TRIP_DROPOFF, join_keys=['DOLOCATIONID'], desc=)], _feature_df=, _timestamp_col=TS, _desc=, _query=SELECT \"DOLOCATIONID\", \"WINDOW_END\" AS \"TS\", sum(\"TRIP_COUNT_1_HR\") OVER (PARTITION BY \"DOLOCATIONID\" ORDER BY \"WINDOW_END\" DESC NULLS LAST ROWS BETWEEN CURRENT ROW AND 7 FOLLOWING ) AS \"COUNT_TRIP_2_HR\", sum(\"TRIP_COUNT_1_HR\") OVER (PARTITION BY \"DOLOCATIONID\" ORDER BY \"WINDOW_END\" DESC NULLS LAST ROWS BETWEEN CURRENT ROW AND 19 FOLLOWING ) AS \"COUNT_TRIP_5_HR\" FROM ( SELECT \"DOLOCATIONID\", \"WINDOW_END\", sum(\"FARE_AMOUNT\") AS \"FARE_SUM_1_HR\", count(1) AS \"TRIP_COUNT_1_HR\" FROM ( SELECT \"TRIP_DISTANCE\", \"FARE_AMOUNT\", \"PASSENGER_COUNT\", \"PULOCATIONID\", \"DOLOCATIONID\", \"PICKUP_TS\", \"DROPOFF_TS\", vec_window_end(\"DROPOFF_TS\", '15m') AS \"WINDOW_END\" FROM ( SELECT * FROM ( SELECT \"TRIP_DISTANCE\", \"FARE_AMOUNT\", \"PASSENGER_COUNT\", \"PULOCATIONID\", \"DOLOCATIONID\", CAST ((\"TPEP_PICKUP_DATETIME\" / 1000000 :: INT) AS TIMESTAMP) AS \"PICKUP_TS\", CAST ((\"TPEP_DROPOFF_DATETIME\" / 1000000 :: INT) AS TIMESTAMP) AS \"DROPOFF_TS\" FROM SNOWML_FEATURE_STORE_TEST_DB.TEST_DATASET.yellow_tripdata_2016_01) WHERE DROPOFF_TS >= '2016-01-01 00:00:00' AND DROPOFF_TS < '2016-01-03 00:00:00')) GROUP BY \"DOLOCATIONID\", \"WINDOW_END\"), _version=V1, _status=FeatureViewStatus.RUNNING, _feature_desc=OrderedDict([('COUNT_TRIP_2_HR', None), ('COUNT_TRIP_5_HR', None)]), _refresh_freq=1 minute, _database=FS_TIME_SERIES_EXAMPLE, _schema=AWESOME_FS, _warehouse=PUBLIC)" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dropoff_fv = FeatureView(name=\"trip_dropoff_time_series_features\", entities=[trip_dropoff], feature_df=dropoff_df, timestamp_col=\"ts\")\n", "fs.register_feature_view(feature_view=dropoff_fv, version=\"v1\", refresh_freq=\"1 minute\", block=True)" @@ -577,26 +403,10 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": null, "id": "bc93de79", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---------------------------------------------------------------------------------------------------------------------------------\n", - "|\"NAME\" |\"VERSION\" |\"ENTITIES\" |\"FEATURE_DESC\" |\n", - "---------------------------------------------------------------------------------------------------------------------------------\n", - "|TRIP_PICKUP_TIME_SERIES_FEATURES |V1 |[ |{ |\n", - "| | | \"{\\\"name\\\": \\\"TRIP_PICKUP\\\", \\\"join_keys\\\": [... 
| \"MEAN_FARE_2_HR\": null, |\n", - "| | |] | \"MEAN_FARE_5_HR\": null |\n", - "| | | |} |\n", - "---------------------------------------------------------------------------------------------------------------------------------\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "fs.list_feature_views(entity_name=\"trip_pickup\").select([\"NAME\", \"VERSION\", \"ENTITIES\", \"FEATURE_DESC\"]).show()" ] @@ -612,43 +422,10 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": null, "id": "a4e3376c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-----------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "|\"PULOCATIONID\" |\"DOLOCATIONID\" |\"PICKUP_TS\" |\"FARE_AMOUNT\" |\"MEAN_FARE_2_HR\" |\"MEAN_FARE_5_HR\" |\"COUNT_TRIP_2_HR\" |\"COUNT_TRIP_5_HR\" |\n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "|132 |219 |2016-01-01 00:08:08 |20.0 |NULL |NULL |NULL |NULL |\n", - "|132 |219 |2016-01-01 00:28:28 |14.0 |37.342592592592595 |37.342592592592595 |NULL |NULL |\n", - "|164 |219 |2016-01-01 00:29:09 |55.0 |12.699115044247788 |12.699115044247788 |NULL |NULL |\n", - "|163 |219 |2016-01-01 00:31:08 |52.0 |12.61353711790393 |12.61353711790393 |1 |1 |\n", - "|132 |219 |2016-01-01 00:31:38 |18.5 |37.40517857142857 |37.40517857142857 |1 |1 |\n", - "|132 |219 |2016-01-01 00:35:37 |13.5 |37.40517857142857 |37.40517857142857 |1 |1 |\n", - "|132 |219 |2016-01-01 00:40:44 |12.5 |37.40517857142857 |37.40517857142857 |1 |1 |\n", - "|114 |219 |2016-01-01 00:59:52 |52.0 |12.618020304568528 |12.618020304568528 |4 |4 |\n", - "|170 |219 |2016-01-01 01:08:24 |46.5 |11.720843672456576 |11.720843672456576 |6 |6 |\n", - "|41 |219 |2016-01-01 01:45:59 |55.5 |12.363905325443787 |12.363905325443787 |9 |9 |\n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "{'queries': ['SELECT * FROM FS_TIME_SERIES_EXAMPLE.AWESOME_FS.yellow_tripdata_2016_01_training_data_2023_09_20_13_31_23'],\n", - " 'post_actions': []}" - ] - }, - "execution_count": 125, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spine_df = source_df.select([\"PULOCATIONID\", \"DOLOCATIONID\", \"PICKUP_TS\", \"FARE_AMOUNT\"])\n", "training_data = fs.generate_dataset(\n", @@ -665,110 +442,10 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": null, "id": "6bced5e5", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " PULOCATIONID DOLOCATIONID MEAN_FARE_2_HR MEAN_FARE_5_HR \\\n", - "72133 211 68 9.953704 10.302292 \n", - "549762 79 225 10.022312 10.379619 \n", - "129708 261 211 14.340517 14.800613 \n", - "108349 100 161 14.896985 14.896985 \n", - "474535 236 262 10.602582 10.602582 \n", - "\n", - " COUNT_TRIP_2_HR COUNT_TRIP_5_HR \n", - "72133 564.0 1008.0 \n", - "549762 13.0 38.0 \n", - "129708 200.0 259.0 \n", - "108349 305.0 305.0 \n", - "474535 563.0 563.0 " - ] - }, - "execution_count": 126, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", @@ -783,19 +460,10 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": null, "id": "8f0e6902", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "31.74351768372038 %\n", - "Mean squared error: 90.17\n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import make_pipeline\n", @@ -823,21 +491,12 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": null, "id": "c57a81e2", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:absl:The database \"my_cool_registry\" already exists. Skipping creation.\n", - "WARNING:absl:The schema \"my_cool_registry\"._SYSTEM_MODEL_REGISTRY_SCHEMA already exists. Skipping creation.\n" - ] - } - ], + "outputs": [], "source": [ - "from snowflake.ml.registry import model_registry\n", + "from snowflake.ml.registry import model_registry, artifact\n", "import time\n", "\n", "registry = model_registry.ModelRegistry(session=session, database_name=\"my_cool_registry\", create_if_not_exists=True)" @@ -845,19 +504,25 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": null, + "id": "4caab287", + "metadata": {}, + "outputs": [], + "source": [ + "artifact_ref = registry.log_artifact(\n", + " artifact_type=artifact.ArtifactType.DATASET,\n", + " artifact_name=\"MY_COOL_DATASET\",\n", + " artifact_spec=training_data.to_json(),\n", + " artifact_version=\"V1\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "a935926a", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/anaconda3/envs/fs_demo/lib/python3.8/site-packages/snowflake/ml/model/model_signature.py:52: UserWarning: The sample input has 653271 rows, thus a truncation happened before inferring signature. This might cause inaccurate signature inference. If that happens, consider specifying signature manually.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "model_name = f\"my_model_{time.time()}\"\n", "\n", @@ -865,7 +530,7 @@ " model_name=model_name,\n", " model_version=\"v1\",\n", " model=estimator,\n", - " dataset=training_data,\n", + " artifacts=[artifact_ref],\n", ")" ] }, @@ -880,19 +545,10 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": null, "id": "999a633d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/anaconda3/envs/fs_demo/lib/python3.8/site-packages/snowflake/snowpark/session.py:1833: UserWarning: Pandas Dataframe has non-standard index of type which will not be written. 
Consider changing the index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s)\n", - " success, nchunks, nrows, ci_output = write_pandas(\n" - ] - } - ], + "outputs": [], "source": [ "# Prepare some source prediction data\n", "pred_df = training_pd.sample(3, random_state=996)[['PULOCATIONID', 'DOLOCATIONID', 'PICKUP_TS']]\n", @@ -902,38 +558,32 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": null, "id": "0a18a5ea", "metadata": {}, "outputs": [], "source": [ "# Enrich source prediction data with features\n", - "registered_training_data = registry.get_dataset(\n", - " model_name=\"my_trained_model\", \n", - " model_version=\"v1\",\n", - ")\n", + "from snowflake.ml.dataset.dataset import Dataset\n", + "\n", + "registered_artifact = registry.get_artifact(\n", + " artifact_ref.name, \n", + " artifact_ref.version)\n", + "registered_dataset = Dataset.from_json(registered_artifact._spec, session)\n", "\n", "enriched_df = fs.retrieve_feature_values(\n", " spine_df=pred_df, \n", - " features=registered_training_data.load_features(), \n", + " features=registered_dataset.load_features(), \n", " spine_timestamp_col='PICKUP_TS'\n", ").drop(['PICKUP_TS']).to_pandas()" ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": null, "id": "3bd545ee", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[11.75684447 8.77725855 12.42049179]\n" - ] - } - ], + "outputs": [], "source": [ "model_ref = model_registry.ModelReference(\n", " registry=registry, \n", @@ -957,118 +607,10 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "id": "d45ba589", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:snowflake.snowpark.session:The version of package 'numpy' in the local environment is 1.24.4, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.\n", - "WARNING:snowflake.snowpark.session:Package 'pytimeparse' is not installed in the local environment. 
Your UDF might not work when the package is installed on the server but not on your local environment.\n", - "ERROR:snowflake.snowpark._internal.server_connection:Failed to execute query [queryID: 01af19db-0406-b1b7-000c-a90273b47663] \n", - "CREATE OR REPLACE \n", - "TEMPORARY FUNCTION window_end(arg1 TIMESTAMP,arg2 STRING)\n", - "RETURNS TIMESTAMP\n", - "LANGUAGE PYTHON \n", - "RUNTIME_VERSION=3.8\n", - "\n", - "PACKAGES=('numpy','pandas','pytimeparse','cloudpickle==2.0.0')\n", - "\n", - "\n", - "HANDLER='compute'\n", - "\n", - "AS $$\n", - "import pickle\n", - "\n", - "func = pickle.loads(bytes.fromhex('8005955b030000000000008c17636c6f75647069636b6c652e636c6f75647069636b6c65948c0d5f6275696c74696e5f747970659493948c0a4c616d6264615479706594859452942868028c08436f6465547970659485945294284b024b004b004b074b054b434376640164006c007d02640164006c017d03640164026c026d037d0401007c047c016401190083017d057c0564006b087242740464037c01640119009b009d02830182017c007c02a0056404a10118007c02a00664056406a1021a007c051a007c0514007c0517007d067c036a077c06640664078d02530094284e4b008c0974696d6570617273659485948c1643616e6e6f7420706172736520696e74657276616c20948c13313937302d30312d30315430303a30303a3030944b018c0173948c04756e69749485947494288c056e756d7079948c0670616e646173948c15707974696d6570617273652e74696d65706172736594680a8c0a56616c75654572726f72948c0a6461746574696d653634948c0b74696d6564656c74613634948c0b746f5f6461746574696d65947494288c0178948c08696e74657276616c948c026e70948c02706494680a8c0a74696d655f736c696365948c0974696d655f736c6f749474948c4e2f7661722f666f6c646572732f67792f7738777931727931356c6a31326e6e37776c7130356a743030303030676e2f542f6970796b65726e656c5f39313232332f323334313939343137392e7079948c167665635f77696e646f775f656e645f636f6d70757465944b0a4310000a080108010c020c01080112012601942929749452947d94288c0b5f5f7061636b6167655f5f944e8c085f5f6e616d655f5f948c085f5f6d61696e5f5f94754e4e4e749452948c1c636c6f75647069636b6c652e636c6f75647069636b6c655f66617374948c125f66756e6374696f6e5f7365747374617465949394682b7d947d9428682868228c0c5f5f7175616c6e616d655f5f9468228c0f5f5f616e6e6f746174696f6e735f5f947d948c0e5f5f6b7764656661756c74735f5f944e8c0c5f5f64656661756c74735f5f944e8c0a5f5f6d6f64756c655f5f9468298c075f5f646f635f5f944e8c0b5f5f636c6f737572655f5f944e8c175f636c6f75647069636b6c655f7375626d6f64756c6573945d948c0b5f5f676c6f62616c735f5f947d947586948652302e'))\n", - "# The following comment contains the source code generated by snowpark-python for explanatory purposes.\n", - "# @F.pandas_udf(\n", - "# name=udf_name,\n", - "# replace=True,\n", - "# packages=[\"numpy\", \"pandas\", \"pytimeparse\"],\n", - "# session=session,\n", - "# )\n", - "# def vec_window_end_compute(\n", - "# x: T.PandasSeries[datetime.datetime],\n", - "# interval: T.PandasSeries[str],\n", - "# ) -> T.PandasSeries[datetime.datetime]:\n", - "# import numpy as np\n", - "# import pandas as pd\n", - "# from pytimeparse.timeparse import timeparse\n", - "#\n", - "# time_slice = timeparse(interval[0])\n", - "# if time_slice is None:\n", - "# raise ValueError(f\"Cannot parse interval {interval[0]}\")\n", - "# time_slot = (x - np.datetime64('1970-01-01T00:00:00')) // np.timedelta64(1, 's') // time_slice * time_slice + time_slice\n", - "# return pd.to_datetime(time_slot, unit='s')\n", - "#\n", - "# func = vec_window_end_compute\n", - "\n", - "\n", - "\n", - "from threading import RLock\n", - "\n", - "lock = RLock()\n", - "\n", - "class InvokedFlag:\n", - " def __init__(self):\n", - " self.invoked = False\n", - "\n", - "def lock_function_once(f, flag):\n", - " def 
wrapper(*args, **kwargs):\n",
-    "        ...  # [elided: body of the generated run-once wrapper and the vectorized compute() helper]\n",
-    "$$\n",
-    "\n",
-    "002002 (42710): SQL compilation error:\n",
-    "Object 'WINDOW_END(ARG1 TIMESTAMP_NTZ, ARG2 VARCHAR):TIMESTAMP_NTZ(9)' already exists.\n"
-   ]
-  },
-  {
-   "ename": "SnowparkSQLException",
-   "evalue": "(1304): 01af19db-0406-b1b7-000c-a90273b47663: 002002 (42710): SQL compilation error:\nObject 'WINDOW_END(ARG1 TIMESTAMP_NTZ, ARG2 VARCHAR):TIMESTAMP_NTZ(9)' already exists.",
-   "output_type": "error",
-   "traceback": [
-    "[elided: ANSI-escaped SnowparkSQLException traceback through UDFRegistration.register -> _do_register_udf -> create_python_udf_or_sp -> Session._run_query -> SnowflakeCursor.execute]"
- ] - } - ], + "outputs": [], "source": [ "from snowflake.snowpark import Session\n", "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", @@ -1117,25 +659,10 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": null, "id": "67a5a484", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Row(WINDOW_END(TS, '15M')=datetime.datetime(2023, 1, 31, 1, 15)),\n", - " Row(WINDOW_END(TS, '15M')=datetime.datetime(2023, 1, 31, 1, 15)),\n", - " Row(WINDOW_END(TS, '15M')=datetime.datetime(2023, 1, 31, 1, 30)),\n", - " Row(WINDOW_END(TS, '15M')=datetime.datetime(2023, 1, 31, 1, 30)),\n", - " Row(WINDOW_END(TS, '15M')=datetime.datetime(2023, 1, 31, 1, 30))]" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "session.sql(\"select window_end(ts, '15m') from foobar\").collect()" ] @@ -1157,7 +684,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py b/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py index 723fec56..014e338f 100644 --- a/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py +++ b/snowflake/ml/feature_store/tests/feature_store_case_sensitivity_test.py @@ -11,6 +11,7 @@ from snowflake.ml._internal.utils import identifier from snowflake.ml._internal.utils.identifier import resolve_identifier +from snowflake.ml._internal.utils.sql_identifier import SqlIdentifier from snowflake.ml.feature_store import ( # type: ignore[attr-defined] CreationMode, Entity, @@ -254,17 +255,17 @@ def test_join_keys_and_ts_col(self, equi_names: List[str], diff_names: List[str] retrieved_e = fs.get_entity("MY_COOL_ENTITY") self.assertEqual(len(retrieved_e.join_keys), 1) - self.assertEqual(retrieved_e.join_keys[0], test_name) + self.assertEqual(retrieved_e.join_keys[0], SqlIdentifier(test_name)) self.assertEqual(len(fv_1.entities), 1) self.assertEqual(len(fv_1.entities[0].join_keys), 1) - self.assertEqual(fv_1.entities[0].join_keys[0], test_name) + self.assertEqual(fv_1.entities[0].join_keys[0], SqlIdentifier(test_name)) fv_2 = fs.get_feature_view("MY_FV", "V1") self.assertEqual(len(fv_2.entities), 1) self.assertEqual(len(fv_2.entities[0].join_keys), 1) - self.assertEqual(fv_2.entities[0].join_keys[0], test_name) - self.assertEqual(fv_2.timestamp_col, test_name) + self.assertEqual(fv_2.entities[0].join_keys[0], SqlIdentifier(test_name)) + self.assertEqual(fv_2.timestamp_col, SqlIdentifier(test_name)) fs.delete_feature_view(fv_2) fs.delete_entity("MY_COOL_ENTITY") @@ -322,7 +323,7 @@ def test_feature_view_names_and_versions_combination( fv_name = diff_full_name[0] version = diff_full_name[1] fv = FeatureView(name=fv_name, entities=[e], feature_df=df) - fs.register_feature_view(fv, version, block=True) + fv = fs.register_feature_view(fv, version, block=True) fs.read_feature_view(fv) self.assertEqual(len(fs.list_feature_views(as_dataframe=False)), len(diff_full_names) + 1) diff --git a/snowflake/ml/feature_store/tests/feature_store_test.py b/snowflake/ml/feature_store/tests/feature_store_test.py index 0161d0bc..994121da 100644 --- a/snowflake/ml/feature_store/tests/feature_store_test.py +++ b/snowflake/ml/feature_store/tests/feature_store_test.py @@ -1,4 +1,4 @@ -from typing import Optional, cast +from typing import Optional from uuid import uuid4 from absl.testing import 
absltest @@ -27,7 +27,8 @@ FEATURE_VIEW_TS_COL_TAG, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions -from snowflake.snowpark import DataFrame, Session, exceptions as snowpark_exceptions +from snowflake.snowpark import Session, exceptions as snowpark_exceptions +from snowflake.snowpark.functions import call_udf, udf class FeatureStoreTest(absltest.TestCase): @@ -196,7 +197,7 @@ def test_create_and_delete_entities(self) -> None: actual_df=fs.list_entities().to_pandas(), target_data={ "NAME": ["aD", "PRODUCT", "USER"], - "JOIN_KEYS": ["aid", "pid,cid", "uid"], + "JOIN_KEYS": ["AID", "PID,CID", "UID"], "DESC": ["", "", ""], }, sort_cols=["NAME"], @@ -217,7 +218,7 @@ def test_create_and_delete_entities(self) -> None: actual_df=fs.list_entities().to_pandas(), target_data={ "NAME": ["PRODUCT", "USER"], - "JOIN_KEYS": ["pid,cid", "uid"], + "JOIN_KEYS": ["PID,CID", "UID"], "DESC": ["", ""], }, sort_cols=["NAME"], @@ -254,7 +255,7 @@ def test_retrieve_entity(self) -> None: actual_df=fs.list_entities().to_pandas(), target_data={ "NAME": ["FOO", "BAR"], - "JOIN_KEYS": ["a,b", "c"], + "JOIN_KEYS": ["A,B", "C"], "DESC": ["my foo", ""], }, sort_cols=["NAME"], @@ -450,7 +451,7 @@ def test_create_and_delete_feature_views(self) -> None: feature_df=self._session.sql(sql0), desc="my_fv1", ) - fs.register_feature_view( + fv1 = fs.register_feature_view( feature_view=fv1, version="FIRST", refresh_freq="5 minutes", @@ -586,7 +587,7 @@ def test_register_with_cron_expr(self) -> None: fv = fs.get_feature_view("my_fv", "v1") self.assertEqual(my_fv, fv) - task_name = fs._get_feature_view_name(fv.name, fv.version) + task_name = fv.physical_name() res = self._session.sql(f"SHOW TASKS LIKE '{task_name}' IN SCHEMA {fs._config.full_schema_path}").collect() self.assertEqual(len(res), 1) self.assertEqual(res[0]["state"], "started") @@ -800,7 +801,9 @@ def test_merge_features(self) -> None: fv3 = fs.register_feature_view(feature_view=fv3, version="v1", refresh_freq="DOWNSTREAM", block=True) merged_fv = fs.merge_features(features=[fv1, fv2, fv3], name="merged_fv") - fs.register_feature_view(feature_view=merged_fv, version="v1", refresh_freq="DOWNSTREAM", block=True) + merged_fv = fs.register_feature_view( + feature_view=merged_fv, version="v1", refresh_freq="DOWNSTREAM", block=True + ) df = fs.read_feature_view(merged_fv) compare_dataframe( @@ -831,7 +834,9 @@ def test_merge_feature_view_slice(self) -> None: fv2 = fs.register_feature_view(feature_view=fv2, version="v1", refresh_freq="DOWNSTREAM", block=True) merged_fv = fs.merge_features(features=[fv1, fv2.slice(["title"])], name="merged_fv") - fs.register_feature_view(feature_view=merged_fv, version="v1", refresh_freq="DOWNSTREAM", block=True) + merged_fv = fs.register_feature_view( + feature_view=merged_fv, version="v1", refresh_freq="DOWNSTREAM", block=True + ) df = fs.read_feature_view(merged_fv) compare_dataframe( @@ -852,7 +857,7 @@ def test_merge_feature_view_slice(self) -> None: fv4 = fs.register_feature_view(feature_view=fv4, version="v1", refresh_freq="DOWNSTREAM", block=True) merged_fv_2 = fs.merge_features(features=[fv3.slice(["title"]), fv4], name="merged_fv_2") - fs.register_feature_view( + merged_fv_2 = fs.register_feature_view( feature_view=merged_fv_2, version="v1", refresh_freq="DOWNSTREAM", @@ -890,7 +895,7 @@ def test_merge_feature_view_slice(self) -> None: fv6 = fs.register_feature_view(feature_view=fv6, version="v1", refresh_freq="DOWNSTREAM", block=True) merged_fv_3 = fs.merge_features(features=[fv5, fv6.slice(["title"])], 
name="merged_fv_3") - fs.register_feature_view( + merged_fv_3 = fs.register_feature_view( feature_view=merged_fv_3, version="v1", refresh_freq="DOWNSTREAM", @@ -932,7 +937,29 @@ def test_list_feature_views(self) -> None: [fv1], ) - self.assertEqual(len(cast(DataFrame, fs.list_feature_views()).collect()), 2) + df = fs.list_feature_views() + self.assertListEqual( + df.columns, + [ + "NAME", + "ENTITIES", + "TIMESTAMP_COL", + "DESC", + "QUERY", + "VERSION", + "STATUS", + "FEATURE_DESC", + "REFRESH_FREQ", + "DATABASE", + "SCHEMA", + "WAREHOUSE", + "REFRESH_MODE", + "REFRESH_MODE_REASON", + "PHYSICAL_NAME", + ], + ) + result = df.collect() + self.assertEqual(len(result), 2) def test_list_feature_views_system_error(self) -> None: fs = self._create_feature_store() @@ -1144,7 +1171,7 @@ def test_generate_dataset(self) -> None: ) # invalid columns in exclude_columns should fail - with self.assertRaisesRegex(ValueError, "foo in exclude_columns not exists in.*"): + with self.assertRaisesRegex(ValueError, "FOO in exclude_columns not exists in.*"): fs.generate_dataset( spine_df=spine_df, features=[fv1, fv2], @@ -1177,7 +1204,7 @@ def test_clear_feature_store_in_existing_schema(self) -> None: fs.register_entity(e) sql = f"SELECT name, id FROM {self._mock_table}" fv = FeatureView(name="fv", entities=[e], feature_df=self._session.sql(sql)) - fs.register_feature_view( + fv = fs.register_feature_view( feature_view=fv, version="v1", refresh_freq="* * * * * America/Los_Angeles", block=True ) @@ -1224,6 +1251,32 @@ def check_fs_objects(expected_count: int) -> None: result = self._session.sql(f"SHOW TAGS LIKE 'my_tag' IN SCHEMA {full_schema_path}").collect() self.assertEqual(len(result), 1) + def test_dynamic_table_full_refresh_warning(self) -> None: + temp_stage_name = "test_dynamic_table_full_refresh_warning_stage" + self._session.sql(f"CREATE OR REPLACE STAGE {temp_stage_name}").collect() + + udf_name = f"{FS_INTEG_TEST_DB}.{FS_INTEG_TEST_DATASET_SCHEMA}.minus_one" + + @udf( # type: ignore[misc, arg-type] + name=udf_name, + session=self._session, + is_permanent=True, + stage_location=f"@{temp_stage_name}", + replace=True, + ) + def minus_one(x: int) -> int: + return x - 1 + + fs = self._create_feature_store() + entity = Entity("foo", ["name"]) + fs.register_entity(entity) + + df = self._session.table(self._mock_table).select(call_udf(udf_name, "id").alias("uid"), "name") + fv = FeatureView(name="fv", entities=[entity], feature_df=df) + + with self.assertWarnsRegex(UserWarning, "Dynamic table: `.*` will not refresh in INCREMENTAL mode"): + fs.register_feature_view(feature_view=fv, version="V1", refresh_freq="1h") + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index 600c0004..b8220dc9 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -1,25 +1,7 @@ -load("//bazel:py_rules.bzl", "py_genrule", "py_library", "py_test") +load("//bazel:py_rules.bzl", "py_library", "py_test") package(default_visibility = ["//visibility:public"]) -GEN_CORE_REQ_CMD = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode version_requirements --format python --filter_by_tag deployment_core > $@" - -py_genrule( - name = "gen_core_requirements", - srcs = [ - "//:requirements.yml", - "//bazel/requirements:requirements.schema.json", - ], - outs = ["_core_requirements.py"], - cmd = GEN_CORE_REQ_CMD, - tools = 
["//bazel/requirements:parse_and_generate_requirements"], -) - -py_library( - name = "_core_requirements", - srcs = [":gen_core_requirements"], -) - py_library( name = "type_hints", srcs = ["type_hints.py"], @@ -57,16 +39,6 @@ py_library( ], ) -py_library( - name = "_env", - srcs = ["_env.py"], - deps = [ - "//snowflake/ml/_internal:env", - "//snowflake/ml/_internal:env_utils", - "//snowflake/ml/_internal/exceptions", - ], -) - py_library( name = "custom_model", srcs = ["custom_model.py"], @@ -76,25 +48,9 @@ py_library( ) py_library( - name = "_model_meta", - srcs = ["_model_meta.py"], - deps = [ - ":_core_requirements", - ":_env", - ":model_signature", - ":type_hints", - "//snowflake/ml/_internal:env", - "//snowflake/ml/_internal:env_utils", - "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/model/_signatures:snowpark_handler", - ], -) - -py_library( - name = "_deployer", - srcs = ["_deployer.py"], + name = "_api", + srcs = ["_api.py"], deps = [ - ":_model", ":deploy_platforms", ":model_signature", ":type_hints", @@ -103,46 +59,11 @@ py_library( "//snowflake/ml/model/_deploy_client/snowservice:deploy", "//snowflake/ml/model/_deploy_client/warehouse:deploy", "//snowflake/ml/model/_deploy_client/warehouse:infer_template", + "//snowflake/ml/model/_module_model:module_model", "//snowflake/ml/model/_signatures:snowpark_handler", ], ) -py_library( - name = "_model_handler", - srcs = ["_model_handler.py"], - deps = [ - ":type_hints", - "//snowflake/ml/model/_handlers:_base", - ], -) - -py_library( - name = "_model", - srcs = ["_model.py"], - deps = [ - ":_env", - ":_model_handler", - ":_model_meta", - ":custom_model", - ":model_signature", - ":type_hints", - "//snowflake/ml/_internal:env_utils", - "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/_internal/exceptions", - "//snowflake/ml/model/_handlers:custom", - "//snowflake/ml/model/_handlers:huggingface_pipeline", - "//snowflake/ml/model/_handlers:llm", - "//snowflake/ml/model/_handlers:mlflow", - "//snowflake/ml/model/_handlers:pytorch", - "//snowflake/ml/model/_handlers:sklearn", - "//snowflake/ml/model/_handlers:snowmlmodel", - "//snowflake/ml/model/_handlers:tensorflow", - "//snowflake/ml/model/_handlers:torchscript", - "//snowflake/ml/model/_handlers:xgboost", - "//snowflake/ml/modeling/framework", - ], -) - py_test( name = "custom_model_test", srcs = ["custom_model_test.py"], @@ -151,15 +72,6 @@ py_test( ], ) -py_test( - name = "_env_test", - srcs = ["_env_test.py"], - deps = [ - ":_env", - "//snowflake/ml/_internal:env", - ], -) - py_test( name = "model_signature_test", srcs = ["model_signature_test.py"], @@ -168,29 +80,3 @@ py_test( "//snowflake/ml/test_utils:exception_utils", ], ) - -py_test( - name = "_model_meta_test", - srcs = ["_model_meta_test.py"], - deps = [ - ":_model_meta", - ":model_signature", - "//snowflake/ml/_internal:env_utils", - ], -) - -py_test( - name = "_model_test", - srcs = ["_model_test.py"], - deps = [ - ":_model", - ":custom_model", - ":model_signature", - "//snowflake/ml/_internal:env", - "//snowflake/ml/_internal:env_utils", - "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/modeling/linear_model:linear_regression", - "//snowflake/ml/test_utils:exception_utils", - "//snowflake/ml/test_utils:mock_session", - ], -) diff --git a/snowflake/ml/model/_api.py b/snowflake/ml/model/_api.py new file mode 100644 index 00000000..78c83a06 --- /dev/null +++ b/snowflake/ml/model/_api.py @@ -0,0 +1,544 @@ +from types import ModuleType +from typing import Any, Dict, List, Literal, Optional, Union, 
cast, overload + +import pandas as pd + +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) +from snowflake.ml._internal.utils import identifier +from snowflake.ml.model import ( + deploy_platforms, + model_signature, + type_hints as model_types, +) +from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_deploy +from snowflake.ml.model._deploy_client.utils import constants as snowservice_constants +from snowflake.ml.model._deploy_client.warehouse import ( + deploy as warehouse_deploy, + infer_template, +) +from snowflake.ml.model._module_model import module_model +from snowflake.ml.model._signatures import snowpark_handler +from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session, functions as F + + +@overload +def save_model( + *, + name: str, + model: model_types.SupportedNoSignatureRequirementsModelType, + session: Session, + stage_path: str, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> module_model.ModuleModel: + """Save a model that does not require a signature as module model to a stage path. + + Args: + name: Name of the model. + model: Model object. + session: Snowpark connection session. + stage_path: Path to the stage where module model will be saved. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + """ + ... + + +@overload +def save_model( + *, + name: str, + model: model_types.SupportedRequireSignatureModelType, + session: Session, + stage_path: str, + signatures: Dict[str, model_signature.ModelSignature], + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> module_model.ModuleModel: + """Save a model that requires a external signature with user provided signatures as module model to a stage path. + + Args: + name: Name of the model. + model: Model object. + session: Snowpark connection session. + stage_path: Path to the stage where module model will be saved. + signatures: Model data signatures for inputs and output for every target methods. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. 
It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + """ + ... + + +@overload +def save_model( + *, + name: str, + model: model_types.SupportedRequireSignatureModelType, + session: Session, + stage_path: str, + sample_input: model_types.SupportedDataType, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> module_model.ModuleModel: + """Save a model that requires a external signature as module model to a stage path with signature inferred from a + sample_input_data. + + Args: + name: Name of the model. + model: Model object. + session: Snowpark connection session. + stage_path: Path to the stage where module model will be saved. + sample_input: Sample input data to infer the model signatures from. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + """ + ... + + +def save_model( + *, + name: str, + model: model_types.SupportedModelType, + session: Session, + stage_path: str, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + sample_input: Optional[model_types.SupportedDataType] = None, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> module_model.ModuleModel: + """Save the model. + + Args: + name: Name of the model. + model: Model object. + session: Snowpark connection session. + stage_path: Path to the stage where module model will be saved. + signatures: Model data signatures for inputs and output for every target methods. 
If it is None, sample_input + would be used to infer the signatures if it is a local (non-SnowML modeling model). + If not None, sample_input should not be specified. Defaults to None. + sample_input: Sample input data to infer the model signatures from. If it is None, signatures must be specified + if it is a local (non-SnowML modeling model). If not None, signatures should not be specified. + Defaults to None. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + + Returns: + Module Model + """ + m = module_model.ModuleModel(session=session, stage_path=stage_path) + m.save( + name=name, + model=model, + signatures=signatures, + sample_input=sample_input, + metadata=metadata, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + python_version=python_version, + ext_modules=ext_modules, + code_paths=code_paths, + options=options, + ) + return m + + +@overload +def load_model(*, session: Session, stage_path: str) -> module_model.ModuleModel: + """Load the model into memory from a zip file in the stage. + + Args: + session: Snowflake connection session. + stage_path: Path to the stage where module model will be loaded from. + """ + ... + + +@overload +def load_model(*, session: Session, stage_path: str, meta_only: Literal[False]) -> module_model.ModuleModel: + """Load the model into memory from a zip file in the stage. + + Args: + session: Snowflake connection session. + stage_path: Path to the stage where module model will be loaded from. + meta_only: Flag to indicate that if only load metadata. + """ + ... + + +@overload +def load_model(*, session: Session, stage_path: str, meta_only: Literal[True]) -> module_model.ModuleModel: + """Load the model into memory from a zip file in the stage with metadata only. + + Args: + session: Snowflake connection session. + stage_path: Path to the stage where module model will be loaded from. + meta_only: Flag to indicate that if only load metadata. + """ + ... + + +def load_model( + *, + session: Session, + stage_path: str, + meta_only: bool = False, +) -> module_model.ModuleModel: + """Load the model into memory from directory or a zip file in the stage. + + Args: + session: Snowflake connection session. Must be specified when specifying model_stage_file_path. + Exclusive with model_dir_path. + stage_path: Path to the stage where module model will be loaded from. + meta_only: Flag to indicate that if only load metadata. + + Returns: + Loaded module model. 
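+
+    Example (illustrative sketch; the stage path is a placeholder, mirroring how deploy() below uses the loaded
+    metadata):
+
+        m = load_model(session=session, stage_path="@my_stage/my_model/model.zip", meta_only=True)
+        signatures = m.packager.meta.signatures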
+ """ + m = module_model.ModuleModel(session=session, stage_path=stage_path) + m.load(meta_only=meta_only) + return m + + +@overload +def deploy( + session: Session, + *, + name: str, + platform: deploy_platforms.TargetPlatform, + target_method: Optional[str], + stage_path: str, + options: Optional[model_types.DeployOptions], +) -> Optional[model_types.Deployment]: + """Create a deployment from a model in a zip file in a stage and deploy it to remote platform. + + Args: + session: Snowpark Connection Session. + name: Name of the deployment for the model. + platform: Target platform to deploy the model. + target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in + the model. + stage_path: Path to the stage where module model will be deployed. + options: Additional options when deploying the model. + Each target platform will have their own specifications of options. + """ + ... + + +@overload +def deploy( + session: Session, + *, + model_id: str, + name: str, + platform: deploy_platforms.TargetPlatform, + target_method: Optional[str], + stage_path: str, + deployment_stage_path: str, + options: Optional[model_types.DeployOptions], +) -> Optional[model_types.Deployment]: + """Create a deployment from a model in a local directory and deploy it to remote platform. + + Args: + session: Snowpark Connection Session. + model_id: Internal model ID string. + name: Name of the deployment for the model. + platform: Target platform to deploy the model. + target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in + the model. + stage_path: Path to the stage where module model will be deployed. + deployment_stage_path: Path to stage containing snowpark container service deployment artifacts. + options: Additional options when deploying the model. + Each target platform will have their own specifications of options. + """ + ... + + +def deploy( + session: Session, + *, + name: str, + platform: deploy_platforms.TargetPlatform, + stage_path: str, + target_method: Optional[str] = None, + deployment_stage_path: Optional[str] = None, + model_id: Optional[str] = None, + options: Optional[model_types.DeployOptions], +) -> Optional[model_types.Deployment]: + """Create a deployment from a model and deploy it to remote platform. + + Args: + session: Snowpark Connection Session. + model_id: Internal model ID string. + name: Name of the deployment for the model. + platform: Target platform to deploy the model. + target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in + the model. + stage_path: Path to the stage where module model will be deployed. + deployment_stage_path: Path to stage containing deployment artifacts. + options: Additional options when deploying the model. + Each target platform will have their own specifications of options. + + Raises: + SnowflakeMLException: Raised when target platform is unsupported. + SnowflakeMLException: Raised when target method does not exist in model. + + Returns: + The deployment information. 
+ """ + + info = None + + if not options: + options = {} + + m = load_model(session=session, stage_path=stage_path, meta_only=True) + assert m.packager.meta + + if target_method is None: + if len(m.packager.meta.signatures.keys()) == 1: + target_method = list(m.packager.meta.signatures.keys())[0] + else: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + "Only when the model has 1 target methods can target_method be omitted when deploying." + ), + ) + + details: model_types.DeployDetails = {} + if platform == deploy_platforms.TargetPlatform.WAREHOUSE: + warehouse_deploy._deploy_to_warehouse( + session=session, + model_stage_file_path=m.model_stage_path, + model_meta=m.packager.meta, + udf_name=name, + target_method=target_method, + **options, + ) + + elif platform == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES: + options = cast(model_types.SnowparkContainerServiceDeployOptions, options) + assert model_id, "Require 'model_id' for Snowpark container service deployment" + assert m.model_stage_path, "Require 'model_stage_file_path' for Snowpark container service deployment" + assert deployment_stage_path, "Require 'deployment_stage_path' for Snowpark container service deployment" + if snowservice_constants.COMPUTE_POOL not in options: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + "Missing 'compute_pool' in options field for Snowpark container service deployment" + ), + ) + + details = snowservice_deploy._deploy( + session=session, + model_id=model_id, + model_meta=m.packager.meta, + service_func_name=name, + model_zip_stage_path=m.model_stage_path, + deployment_stage_path=deployment_stage_path, + target_method=target_method, + **options, + ) + + else: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_TYPE, + original_exception=ValueError(f"Unsupported target Platform: {platform}"), + ) + signature = m.packager.meta.signatures.get(target_method, None) + if not signature: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError(f"Target method {target_method} does not exist in model."), + ) + info = model_types.Deployment( + name=name, platform=platform, target_method=target_method, signature=signature, options=options, details=details + ) + return info + + +@overload +def predict( + session: Session, + *, + deployment: model_types.Deployment, + X: model_types.SupportedLocalDataType, + statement_params: Optional[Dict[str, Any]] = None, +) -> pd.DataFrame: + """Execute batch inference of a model remotely on local data. Can be any supported data type. Return a local + Pandas Dataframe. + + Args: + session: Snowpark Connection Session. + deployment: The deployment info to use for predict. + X: The input data. + statement_params: Statement Parameters for telemetry. + """ + ... + + +@overload +def predict( + session: Session, + *, + deployment: model_types.Deployment, + X: SnowparkDataFrame, + statement_params: Optional[Dict[str, Any]] = None, +) -> SnowparkDataFrame: + """Execute batch inference of a model remotely on a Snowpark DataFrame. Return a Snowpark DataFrame. + + Args: + session: Snowpark Connection Session. + deployment: The deployment info to use for predict. + X: The input Snowpark dataframe. + statement_params: Statement Parameters for telemetry. + """ + ... 
+ + +def predict( + session: Session, + *, + deployment: model_types.Deployment, + X: Union[model_types.SupportedDataType, SnowparkDataFrame], + statement_params: Optional[Dict[str, Any]] = None, +) -> Union[pd.DataFrame, SnowparkDataFrame]: + """Execute batch inference of a model remotely. + + Args: + session: Snowpark Connection Session. + deployment: The deployment info to use for predict. + X: The input dataframe. + statement_params: Statement Parameters for telemetry. + + Returns: + The output dataframe. + """ + + # Get options + INTERMEDIATE_OBJ_NAME = "tmp_result" + sig = deployment["signature"] + + # Validate and prepare input + if not isinstance(X, SnowparkDataFrame): + keep_order = True + output_with_input_features = False + df = model_signature._convert_and_validate_local_data(X, sig.inputs) + s_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(session, df, keep_order=keep_order) + else: + keep_order = False + output_with_input_features = True + model_signature._validate_snowpark_data(X, sig.inputs) + s_df = X + + if statement_params: + if s_df._statement_params is not None: + s_df._statement_params.update(statement_params) + else: + s_df._statement_params = statement_params # type: ignore[assignment] + + # Infer and get intermediate result + input_cols = [] + for col_name in s_df.columns: + literal_col_name = identifier.get_unescaped_names(col_name) + input_cols.extend( + [ + F.lit(literal_col_name), + F.col(col_name), + ] + ) + + # TODO[shchen]: SNOW-870032, For SnowService, external function name cannot be double quoted, else it results in + # external function no found. + udf_name = deployment["name"] + output_obj = F.call_udf(udf_name, F.object_construct(*input_cols)) + + if output_with_input_features: + df_res = s_df.with_column(INTERMEDIATE_OBJ_NAME, output_obj) + else: + df_res = s_df.select(output_obj.alias(INTERMEDIATE_OBJ_NAME)) + + if keep_order: + df_res = df_res.order_by( + F.col(INTERMEDIATE_OBJ_NAME)[infer_template._KEEP_ORDER_COL_NAME], + ascending=True, + ) + + # Prepare the output + output_cols = [] + for output_feature in sig.outputs: + output_cols.append(F.col(INTERMEDIATE_OBJ_NAME)[output_feature.name].astype(output_feature.as_snowpark_type())) + + df_res = df_res.with_columns( + [identifier.get_inferred_name(output_feature.name) for output_feature in sig.outputs], + output_cols, + ).drop(INTERMEDIATE_OBJ_NAME) + + # Get final result + if not isinstance(X, SnowparkDataFrame): + return snowpark_handler.SnowparkDataFrameHandler.convert_to_df(df_res, features=sig.outputs) + else: + return df_res diff --git a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel index 005ae455..37fb5129 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel @@ -16,7 +16,7 @@ py_library( "//snowflake/ml/_internal/exceptions", "//snowflake/ml/_internal/utils:query_result_checker", "//snowflake/ml/_internal/utils:spcs_image_registry", - "//snowflake/ml/model:_model_meta", + "//snowflake/ml/model/_packager/model_meta", ], ) @@ -50,8 +50,8 @@ py_library( deps = [ "//snowflake/ml/_internal:file_utils", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model/_deploy_client/utils:constants", + "//snowflake/ml/model/_packager/model_meta", ], ) @@ -88,7 +88,7 @@ py_test( ], deps = [ ":docker_context", - "//snowflake/ml/model:_model", + "//snowflake/ml/model:_api", ], ) diff --git 
a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py index 3e9c2924..9d52e4a1 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -5,12 +5,10 @@ from abc import ABC from typing import Optional -from packaging import version - from snowflake.ml._internal import file_utils from snowflake.ml._internal.utils import identifier -from snowflake.ml.model import _model_meta from snowflake.ml.model._deploy_client.utils import constants +from snowflake.ml.model._packager.model_meta import model_meta from snowflake.snowpark import FileOperation, Session @@ -22,7 +20,7 @@ class DockerContext(ABC): def __init__( self, context_dir: str, - model_meta: _model_meta.ModelMetadata, + model_meta: model_meta.ModelMetadata, session: Optional[Session] = None, model_zip_stage_path: Optional[str] = None, ) -> None: @@ -61,7 +59,7 @@ def _copy_model_env_dependency_to_docker_context(self) -> None: """ Convert model dependencies to files from model metadata. """ - self.model_meta.save_model_metadata(self.context_dir) + self.model_meta.save(self.context_dir) def _generate_docker_file(self) -> None: """ @@ -71,15 +69,9 @@ def _generate_docker_file(self) -> None: docker_file_template = file_utils.resolve_zip_import_path( os.path.join(os.path.dirname(__file__), "templates/dockerfile_template") ) - if self.model_meta.cuda_version: - cuda_version_parsed = version.parse(self.model_meta.cuda_version) - cuda_version_str = f"{cuda_version_parsed.major}.{cuda_version_parsed.minor}" - else: - cuda_version_str = "" if self.model_zip_stage_path is not None: norm_stage_path = posixpath.normpath(identifier.remove_prefix(self.model_zip_stage_path, "@")) - absolute_path = os.path.join("/", norm_stage_path) assert self.session fop = FileOperation(self.session) # The explicit download here is inefficient but a compromise. @@ -90,8 +82,8 @@ def _generate_docker_file(self) -> None: get_res_list = fop.get(stage_location=self.model_zip_stage_path, target_directory=self.context_dir) assert len(get_res_list) == 1, f"Single zip file should be returned, but got {len(get_res_list)} files." local_zip_file_path = os.path.basename(get_res_list[0].file) - copy_model_statement = f"COPY {local_zip_file_path} {absolute_path}" - extra_env_statement = f"ENV MODEL_ZIP_STAGE_PATH={absolute_path}" + copy_model_statement = f"COPY {local_zip_file_path} ./{norm_stage_path}" + extra_env_statement = f"ENV MODEL_ZIP_STAGE_PATH={norm_stage_path}" else: copy_model_statement = "" extra_env_statement = "" @@ -113,7 +105,7 @@ def _generate_docker_file(self) -> None: # Instead of omitting this ENV var when no CUDA required, we explicitly set it to empty to override # as no CUDA is detected thus it won't be affected by the existence of CUDA in base image. 
# https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html - "cuda_override_env": cuda_version_str, + "cuda_override_env": self.model_meta.env.cuda_version if self.model_meta.env.cuda_version else "", "copy_model_statement": copy_model_statement, "extra_env_statement": extra_env_statement, } diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py index 110da3a4..22d7e0f9 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py @@ -9,9 +9,9 @@ from absl.testing import absltest from sklearn import neighbors -from snowflake.ml.model import _model as model_api from snowflake.ml.model._deploy_client.image_builds import docker_context from snowflake.ml.model._deploy_client.utils import constants +from snowflake.ml.model._packager import model_packager from snowflake.snowpark import FileOperation, GetResult, Session _IRIS = datasets.load_iris(as_frame=True) @@ -30,12 +30,14 @@ def setUp(self) -> None: self.context_dir = tempfile.mkdtemp() self.model_dir = tempfile.mkdtemp() - self.model_meta = model_api._save( + self.packager = model_packager.ModelPackager(self.model_dir) + self.packager.save( name="model", - local_dir_path=self.model_dir, model=_get_sklearn_model(), sample_input=_IRIS_X, ) + assert self.packager.meta + self.model_meta = self.packager.meta self.docker_context = docker_context.DockerContext(self.context_dir, model_meta=self.model_meta) @@ -83,14 +85,16 @@ def setUp(self) -> None: self.context_dir = tempfile.mkdtemp() self.model_dir = tempfile.mkdtemp() - self.model_meta = model_api._save( + self.packager = model_packager.ModelPackager(self.model_dir) + self.packager.save( name="model", - local_dir_path=self.model_dir, model=_get_sklearn_model(), sample_input=_IRIS_X, ) + assert self.packager.meta + self.model_meta = self.packager.meta - self.model_meta.cuda_version = "11.7.1" + self.model_meta.env.cuda_version = "11.7.1" self.docker_context = docker_context.DockerContext(self.context_dir, model_meta=self.model_meta) @@ -140,14 +144,16 @@ def setUp(self) -> None: self.context_dir = tempfile.mkdtemp() self.model_dir = tempfile.mkdtemp() - self.model_meta = model_api._save( + self.packager = model_packager.ModelPackager(self.model_dir) + self.packager.save( name="model", - local_dir_path=self.model_dir, model=_get_sklearn_model(), sample_input=_IRIS_X, ) + assert self.packager.meta + self.model_meta = self.packager.meta - self.model_meta.cuda_version = "11.7.1" + self.model_meta.env.cuda_version = "11.7.1" self.mock_session = absltest.mock.MagicMock(spec=Session) self.model_zip_stage_path = "@model_repo/model.zip" diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel index f0ef420b..788c3488 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel @@ -11,7 +11,7 @@ py_library( srcs = ["main.py"], compatible_with_snowpark = False, deps = [ - "//snowflake/ml/model:_model", + "//snowflake/ml/model:_api", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", ], @@ -24,6 +24,25 @@ py_test( deps = [ ":main", "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/model:_model_meta", + "//snowflake/ml/model/_packager/model_meta", 
], ) + +py_test( + name = "main_vllm_test", + srcs = ["main_vllm_test.py"], + compatible_with_snowpark = False, + require_gpu = True, + deps = [ + ":main", + "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model/models:llm_model", + ], +) + +py_test( + name = "gpu_test", + srcs = ["gpu_test.py"], + compatible_with_snowpark = False, + require_gpu = True, +) diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/gpu_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/gpu_test.py new file mode 100644 index 00000000..4891e97c --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/gpu_test.py @@ -0,0 +1,12 @@ +from absl.testing import absltest + + +class GPUTest(absltest.TestCase): + def test_gpu(self): + import torch + + self.assertEqual(torch.cuda.is_available(), True) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py index 9846a3bd..eda162e6 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py @@ -75,8 +75,8 @@ def _run_setup() -> None: global TARGET_METHOD try: - MODEL_ZIP_STAGE_PATH = os.getenv("MODEL_ZIP_STAGE_PATH") - assert MODEL_ZIP_STAGE_PATH, "Missing environment variable MODEL_ZIP_STAGE_PATH" + model_zip_stage_path = os.getenv("MODEL_ZIP_STAGE_PATH") + assert model_zip_stage_path, "Missing environment variable MODEL_ZIP_STAGE_PATH" TARGET_METHOD = os.getenv("TARGET_METHOD") @@ -84,9 +84,6 @@ def _run_setup() -> None: _CONCURRENT_REQUESTS_MAX = int(_concurrent_requests_max_env) if _concurrent_requests_max_env else None - root_path = os.path.abspath(os.sep) - model_zip_stage_path = os.path.join(root_path, MODEL_ZIP_STAGE_PATH) - with tempfile.TemporaryDirectory() as tmp_dir: if zipfile.is_zipfile(model_zip_stage_path): extracted_dir = os.path.join(tmp_dir, "extracted_model_dir") @@ -99,23 +96,39 @@ def _run_setup() -> None: logger.info(f"Loading model from {extracted_dir} into memory") sys.path.insert(0, os.path.join(extracted_dir, _MODEL_CODE_DIR)) - from snowflake.ml.model import ( - _model as model_api, - type_hints as model_types, - ) - - # Backward for <= 1.0.5 - if hasattr(model_api, "_load_model_for_deploy"): - _LOADED_MODEL, _LOADED_META = model_api._load_model_for_deploy(extracted_dir) - else: - _LOADED_MODEL, _LOADED_META = model_api._load( - local_dir_path=extracted_dir, + from snowflake.ml.model import type_hints as model_types + + # TODO (Server-side Model Rollout): + # Keep try block only + try: + from snowflake.ml.model._packager import model_packager + + pk = model_packager.ModelPackager(extracted_dir) + pk.load( as_custom_model=True, meta_only=False, options=model_types.ModelLoadOption( {"use_gpu": cast(bool, os.environ.get("SNOWML_USE_GPU", False))} ), ) + _LOADED_MODEL = pk.model + _LOADED_META = pk.meta + except ImportError: + # Legacy model support + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + if hasattr(model_api, "_load_model_for_deploy"): + _LOADED_MODEL, _LOADED_META = model_api._load_model_for_deploy(extracted_dir) + else: + _LOADED_MODEL, meta_LOADED_META = model_api._load( + local_dir_path=extracted_dir, + as_custom_model=True, + options=model_types.ModelLoadOption( + {"use_gpu": cast(bool, os.environ.get("SNOWML_USE_GPU", False))} + ), + ) _MODEL_LOADING_STATE = 
_ModelLoadingState.SUCCEEDED logger.info("Successfully loaded model into memory") _MODEL_LOADING_EVENT.set() @@ -167,6 +180,7 @@ def _do_predict(input_json: Dict[str, List[List[object]]]) -> responses.JSONResp assert len(input_data) != 0 and not all(not row for row in input_data), "empty data" except Exception as e: error_message = f"Input data malformed: {str(e)}\n{traceback.format_exc()}" + logger.error(f"Failed request with error: {error_message}") return responses.JSONResponse({"error": error_message}, status_code=http.HTTPStatus.BAD_REQUEST) try: @@ -180,6 +194,7 @@ def _do_predict(input_json: Dict[str, List[List[object]]]) -> responses.JSONResp return responses.JSONResponse(response) except Exception as e: error_message = f"Prediction failed: {str(e)}\n{traceback.format_exc()}" + logger.error(f"Failed request with error: {error_message}") return responses.JSONResponse({"error": error_message}, status_code=http.HTTPStatus.BAD_REQUEST) diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py index e4e91798..d6c66844 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py @@ -10,7 +10,8 @@ from starlette import testclient from snowflake.ml._internal import file_utils -from snowflake.ml.model import _model as model_api, custom_model +from snowflake.ml.model import custom_model +from snowflake.ml.model._packager import model_packager class MainTest(absltest.TestCase): @@ -41,9 +42,8 @@ def predict(self, input: pd.DataFrame) -> pd.DataFrame: tmpdir = self.create_tempdir() tmpdir_for_zip = self.create_tempdir() zip_full_path = os.path.join(tmpdir_for_zip.full_path, "model.zip") - model_api._save( + model_packager.ModelPackager(tmpdir.full_path).save( name="test_model", - local_dir_path=tmpdir.full_path, model=model, sample_input=x, metadata={"author": "halu", "version": "1"}, diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_vllm_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_vllm_test.py new file mode 100644 index 00000000..43400f58 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_vllm_test.py @@ -0,0 +1,129 @@ +import contextlib +import http +import logging +import os +import tempfile +from typing import Any, Dict, List + +from absl.testing import absltest +from absl.testing.absltest import mock +from starlette import testclient + +from snowflake.ml._internal import file_utils +from snowflake.ml.model._packager import model_packager +from snowflake.ml.model.models import llm + +logger = logging.getLogger(__name__) + + +class MainVllmTest(absltest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.cache_dir = tempfile.TemporaryDirectory() + cls._original_hf_home = os.getenv("HF_HOME", None) + os.environ["HF_HOME"] = cls.cache_dir.name + + @classmethod + def tearDownClass(cls) -> None: + if cls._original_hf_home: + os.environ["HF_HOME"] = cls._original_hf_home + else: + del os.environ["HF_HOME"] + cls.cache_dir.cleanup() + + def setUp(self) -> None: + super().setUp() + + def setup_lora_model(self) -> str: + import peft + + ft_model = peft.AutoPeftModelForCausalLM.from_pretrained( # type: ignore[attr-defined] + "peft-internal-testing/opt-350m-lora", + device_map="auto", + ) + tmpdir = self.create_tempdir().full_path + ft_model.save_pretrained(tmpdir) + 
options = llm.LLMOptions( + max_batch_size=100, + ) + model = llm.LLM(tmpdir, options=options) + tmpdir = self.create_tempdir() + tmpdir_for_zip = self.create_tempdir() + zip_full_path = os.path.join(tmpdir_for_zip.full_path, "model.zip") + model_packager.ModelPackager(tmpdir.full_path).save( + name="test_model", + model=model, + metadata={"author": "halu", "version": "1"}, + ) + with file_utils.zip_file_or_directory_to_stream(tmpdir.full_path, leading_path=tmpdir.full_path) as zf: + zf.seek(0) + with open(zip_full_path, "wb") as f: + f.write(zf.getvalue()) + return zip_full_path + + def setup_pretrain_model(self) -> str: + options = llm.LLMOptions( + max_batch_size=100, + ) + model = llm.LLM("facebook/opt-350m", options=options) + tmpdir = self.create_tempdir() + tmpdir_for_zip = self.create_tempdir() + zip_full_path = os.path.join(tmpdir_for_zip.full_path, "model.zip") + model_packager.ModelPackager(tmpdir.full_path).save( + name="test_model", + model=model, + metadata={"author": "halu", "version": "1"}, + ) + with file_utils.zip_file_or_directory_to_stream(tmpdir.full_path, leading_path=tmpdir.full_path) as zf: + zf.seek(0) + with open(zip_full_path, "wb") as f: + f.write(zf.getvalue()) + return zip_full_path + + @contextlib.contextmanager + def common_helper(self, model_zip_path): # type: ignore[no-untyped-def] + with mock.patch.dict( + os.environ, + { + "TARGET_METHOD": "infer", + "MODEL_ZIP_STAGE_PATH": model_zip_path, + }, + ): + import main + + client = testclient.TestClient(main.app) + yield main, client + + def generate_data(self, dfl: List[str]) -> Dict[str, Any]: + res = [] + for i, v in enumerate(dfl): + res.append( + [ + i, + { + "_ID": i, + "input": v, + }, + ] + ) + return {"data": res} + + def test_happy_lora_case(self) -> None: + model_zip_path = self.setup_lora_model() + with self.common_helper(model_zip_path) as (_, client): + prompts = ["1+1=", "2+2="] + data = self.generate_data(prompts) + response = client.post("/predict", json=data) + self.assertEqual(response.status_code, http.HTTPStatus.OK) + + def test_happy_pretrain_case(self) -> None: + model_zip_path = self.setup_pretrain_model() + with self.common_helper(model_zip_path) as (_, client): + prompts = ["1+1=", "2+2="] + data = self.generate_data(prompts) + response = client.post("/predict", json=data) + self.assertEqual(response.status_code, http.HTTPStatus.OK) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py index 74a028b7..964fb9d3 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +++ b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py @@ -201,14 +201,5 @@ def _construct_and_upload_job_spec(self, base_image: str, kaniko_shell_script_st ) def _launch_kaniko_job(self, spec_stage_location: str) -> None: - logger.info("Submitting SPCS job for building docker image.") - job_id = self.client.create_job(compute_pool=self.compute_pool, spec_stage_location=spec_stage_location) - logger.info(f"Server image building SPCS job id is {job_id}.") - # Given image build can take a while, we set a generous timeout to be 1 hour. 
- self.client.block_until_resource_is_ready( - resource_name=job_id, - resource_type=constants.ResourceType.JOB, - container_name=constants.KANIKO_CONTAINER_NAME, - max_retries=240, - retry_interval_secs=15, - ) + logger.debug("Submitting job for building docker image with kaniko") + self.client.create_job(compute_pool=self.compute_pool, spec_stage_location=spec_stage_location) diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template index a448e284..cafb6d89 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template @@ -1,16 +1,19 @@ # Note that base image tag should not be 'latest' as it might cause false positive image cache hit. FROM ${base_image} as build -COPY ${model_env_folder}/conda.yaml conda.yaml +COPY ${model_env_folder}/conda.yml conda.yml COPY ${model_env_folder}/requirements.txt requirements.txt # Set MAMBA_DOCKERFILE_ACTIVATE=1 to activate the conda environment during build time. ARG MAMBA_DOCKERFILE_ACTIVATE=1 +# Bitsandbytes uses this ENVVAR to determine CUDA library location +ENV CONDA_PREFIX=/opt/conda + # The micromamba image comes with an empty environment named base. # CONDA_OVERRIDE_CUDA ref https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="${cuda_override_env}" \ - micromamba install -y -n base -f conda.yaml && \ + micromamba install -y -n base -f conda.yml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy @@ -19,10 +22,9 @@ COPY ${inference_server_dir} ./${inference_server_dir} COPY ${entrypoint_script} ./${entrypoint_script} ${copy_model_statement} -# Bitsandbytes uses this ENVVAR to determine CUDA library location -ENV CONDA_PREFIX=/opt/conda ${extra_env_statement} + USER root RUN if id mambauser >/dev/null 2>&1; then \ echo "mambauser already exists."; \ diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template b/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template index 9f8baa08..f3694df2 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template @@ -7,7 +7,7 @@ spec: args: - -c - >- - while [ ! -f "$script_path" ]; do sleep 1; done; + while [ ! 
-f "$script_path" ]; do echo "File not found: $script_path"; sleep 1; done; chmod +x $script_path; sh $script_path; volumeMounts: diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture index 2130d55b..49cef93d 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture @@ -1,18 +1,21 @@ FROM mambaorg/micromamba:1.4.3 as build -COPY env/conda.yaml conda.yaml +COPY env/conda.yml conda.yml COPY env/requirements.txt requirements.txt ARG MAMBA_DOCKERFILE_ACTIVATE=1 +ENV CONDA_PREFIX=/opt/conda RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="" \ - micromamba install -y -n base -f conda.yaml && \ + micromamba install -y -n base -f conda.yml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh -ENV CONDA_PREFIX=/opt/conda + + + USER root diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA index 29c67a59..c17d8329 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA @@ -1,18 +1,21 @@ FROM mambaorg/micromamba:1.4.3 as build -COPY env/conda.yaml conda.yaml +COPY env/conda.yml conda.yml COPY env/requirements.txt requirements.txt ARG MAMBA_DOCKERFILE_ACTIVATE=1 +ENV CONDA_PREFIX=/opt/conda RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="11.7" \ - micromamba install -y -n base -f conda.yaml && \ + micromamba install -y -n base -f conda.yml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh -ENV CONDA_PREFIX=/opt/conda + + + USER root diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model index f0719f73..1ace1d46 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_model @@ -1,20 +1,22 @@ FROM mambaorg/micromamba:1.4.3 as build -COPY env/conda.yaml conda.yaml +COPY env/conda.yml conda.yml COPY env/requirements.txt requirements.txt ARG MAMBA_DOCKERFILE_ACTIVATE=1 +ENV CONDA_PREFIX=/opt/conda RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="11.7" \ - micromamba install -y -n base -f conda.yaml && \ + micromamba install -y -n base -f conda.yml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt && \ micromamba clean -afy COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh -COPY model.zip /model_repo/model.zip -ENV CONDA_PREFIX=/opt/conda -ENV MODEL_ZIP_STAGE_PATH=/model_repo/model.zip +COPY model.zip ./model_repo/model.zip + +ENV 
MODEL_ZIP_STAGE_PATH=model_repo/model.zip + USER root RUN if id mambauser >/dev/null 2>&1; then \ diff --git a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel index 94d2e802..0084687e 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel @@ -23,13 +23,13 @@ py_library( ":instance_types", "//snowflake/ml/_internal/exceptions", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_deploy_client/image_builds:base_image_builder", "//snowflake/ml/model/_deploy_client/image_builds:client_image_builder", "//snowflake/ml/model/_deploy_client/image_builds:server_image_builder", "//snowflake/ml/model/_deploy_client/utils:image_registry_client", "//snowflake/ml/model/_deploy_client/utils:snowservice_client", + "//snowflake/ml/model/_packager/model_meta", ], ) diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy.py b/snowflake/ml/model/_deploy_client/snowservice/deploy.py index 919c8e7d..4482bd5c 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy.py @@ -6,18 +6,19 @@ import tempfile import time from abc import ABC -from typing import Any, Dict, Optional, Tuple, cast +from contextlib import contextmanager +from typing import Any, Dict, Generator, Optional, Tuple, cast import yaml from typing_extensions import Unpack -from snowflake.ml._internal import env_utils, file_utils +from snowflake.ml._internal import file_utils from snowflake.ml._internal.exceptions import ( error_codes, exceptions as snowml_exceptions, ) from snowflake.ml._internal.utils import identifier, query_result_checker -from snowflake.ml.model import _model_meta, type_hints +from snowflake.ml.model import type_hints from snowflake.ml.model._deploy_client.image_builds import ( base_image_builder, client_image_builder, @@ -30,16 +31,41 @@ image_registry_client, snowservice_client, ) +from snowflake.ml.model._packager.model_meta import model_meta, model_meta_schema from snowflake.snowpark import Session logger = logging.getLogger(__name__) +@contextmanager +def _debug_aware_tmp_directory(debug_dir: Optional[str] = None) -> Generator[str, None, None]: + """Debug-aware directory provider. + + Args: + debug_dir: A folder for deploymement context. 
+ + Yields: + A directory path to write deployment artifacts + """ + create_temp = False + if debug_dir: + directory_path = debug_dir + else: + temp_dir_context = tempfile.TemporaryDirectory() + directory_path = temp_dir_context.name + create_temp = True + try: + yield directory_path + finally: + if create_temp: + temp_dir_context.cleanup() + + def _deploy( session: Session, *, model_id: str, - model_meta: _model_meta.ModelMetadata, + model_meta: model_meta.ModelMetadata, service_func_name: str, model_zip_stage_path: str, deployment_stage_path: str, @@ -117,31 +143,25 @@ def _deploy( options = deploy_options.SnowServiceDeployOptions.from_dict(cast(Dict[str, Any], kwargs)) model_meta_deploy = copy.deepcopy(model_meta) + # Set conda-forge as backup channel for SPCS deployment + if "conda-forge" not in model_meta_deploy.env._conda_dependencies: + model_meta_deploy.env._conda_dependencies["conda-forge"] = [] if options.use_gpu: # Make mypy happy assert options.num_gpus is not None - if model_meta.cuda_version is None: + if model_meta_deploy.env.cuda_version is None: raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.INVALID_ARGUMENT, original_exception=ValueError( "You are requesting GPUs for models that do not use a GPU or does not have CUDA version set." ), ) - if model_meta.cuda_version: - ( - model_meta_deploy._conda_dependencies, - model_meta_deploy._pip_requirements, - ) = env_utils.generate_env_for_cuda( - model_meta._conda_dependencies, model_meta._pip_requirements, model_meta.cuda_version - ) + if model_meta.env.cuda_version: + model_meta_deploy.env.generate_env_for_cuda() else: # If user does not need GPU, we set this copies cuda_version to None, thus when Image builder gets a # not-None cuda_version, it gets to know that GPU is required. 
- model_meta_deploy._cuda_version = None - - # Set conda-forge as backup channel for SPCS deployment - if "conda-forge" not in model_meta_deploy._conda_dependencies: - model_meta_deploy._conda_dependencies["conda-forge"] = [] + model_meta_deploy.env._cuda_version = None _validate_compute_pool(session, options=options) @@ -267,7 +287,7 @@ def __init__( self, session: Session, model_id: str, - model_meta: _model_meta.ModelMetadata, + model_meta: model_meta.ModelMetadata, service_func_name: str, model_zip_stage_path: str, deployment_stage_path: str, @@ -300,6 +320,10 @@ def __init__( self._service_name = identifier.get_schema_level_object_identifier(db, schema, f"service_{model_id}") # Spec file and future deployment related artifacts will be stored under {stage}/models/{model_id} self._model_artifact_stage_location = posixpath.join(deployment_stage_path, "models", self.id) + self.debug_dir: Optional[str] = None + if self.options.debug_mode: + self.debug_dir = tempfile.mkdtemp() + logger.warning(f"Debug model is enabled, deployment artifacts will be available in {self.debug_dir}") def deploy(self) -> type_hints.SnowparkContainerServiceDeployDetails: """ @@ -313,7 +337,7 @@ def deploy(self) -> type_hints.SnowparkContainerServiceDeployDetails: full_image_name = self.options.prebuilt_snowflake_image (service_spec, service_function_sql) = self._deploy_workflow(self.options.prebuilt_snowflake_image) else: - with tempfile.TemporaryDirectory() as context_dir: + with _debug_aware_tmp_directory(debug_dir=self.debug_dir) as context_dir: extra_kwargs = {} if self.options.model_in_image: extra_kwargs = { @@ -340,7 +364,7 @@ def deploy(self) -> type_hints.SnowparkContainerServiceDeployDetails: else: logger.warning( "Building the Docker image and deploying to Snowpark Container Service. " - "This process may take a few minutes." + "This process may take anywhere from a few minutes to a longer period for GPU-based models." ) start = time.time() self._build_and_upload_image( @@ -392,7 +416,7 @@ def _get_full_image_name(self, image_repo: str, context_dir: str) -> str: # issue because model dependency is also captured in the model env/ folder, which will be hashed. The aim is to # reuse the same Docker image even if the user logs a similar model without new dependencies. docker_context_dir_hash = file_utils.hash_directory( - context_dir, ignore_hidden=True, excluded_files=[_model_meta.ModelMetadata.MODEL_METADATA_FILE] + context_dir, ignore_hidden=True, excluded_files=[model_meta.MODEL_METADATA_FILE] ) # By default, we associate a 'latest' tag with each of our created images for easy existence checking. # Additional tags are added for readability. @@ -440,20 +464,22 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> str: os.path.join(os.path.dirname(__file__), "templates/service_spec_template") ) - with tempfile.TemporaryDirectory() as tempdir: - spec_file_path = os.path.join(tempdir, f"{constants.SERVICE_SPEC}.yaml") + with _debug_aware_tmp_directory(self.debug_dir) as dir_path: + spec_file_path = os.path.join(dir_path, f"{constants.SERVICE_SPEC}.yaml") with open(spec_template_path, encoding="utf-8") as template, open( spec_file_path, "w+", encoding="utf-8" ) as spec_file: assert self.model_zip_stage_path.startswith("@") norm_stage_path = posixpath.normpath(identifier.remove_prefix(self.model_zip_stage_path, "@")) + # Ensure model stage path has root prefix as stage mount will it mount it to root. 
+ absolute_model_stage_path = os.path.join("/", norm_stage_path) (db, schema, stage, path) = identifier.parse_schema_level_object_identifier(norm_stage_path) substitutes = { "image": image, "predict_endpoint_name": constants.PREDICT, "model_stage": identifier.get_schema_level_object_identifier(db, schema, stage), - "model_zip_stage_path": norm_stage_path, + "model_zip_stage_path": absolute_model_stage_path, "inference_server_container_name": constants.INFERENCE_SERVER_CONTAINER, "target_method": self.target_method, "num_workers": self.options.num_workers, @@ -475,7 +501,7 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> str: # Make LLM use case sequential if any( - model_blob_meta.model_type == "huggingface_pipeline" + model_blob_meta.model_type == "huggingface_pipeline" or model_blob_meta.model_type == "llm" for model_blob_meta in self.model_meta.models.values() ): container["env"]["_CONCURRENT_REQUESTS_MAX"] = 1 @@ -496,6 +522,27 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> str: ) return spec_file_yaml_string + def _get_max_batch_rows(self) -> Optional[int]: + # To avoid too large batch in HF LLM case + max_batch_rows = None + if self.options.use_gpu: + for model_blob_meta in self.model_meta.models.values(): + batch_size = None + if model_blob_meta.model_type == "huggingface_pipeline": + model_blob_options_hf = cast( + model_meta_schema.HuggingFacePipelineModelBlobOptions, model_blob_meta.options + ) + batch_size = model_blob_options_hf["batch_size"] + if model_blob_meta.model_type == "llm": + model_blob_options_llm = cast(model_meta_schema.LLMModelBlobOptions, model_blob_meta.options) + batch_size = model_blob_options_llm["batch_size"] + if batch_size: + if max_batch_rows is None: + max_batch_rows = batch_size + else: + max_batch_rows = min(batch_size, max_batch_rows) + return max_batch_rows + def _deploy_workflow(self, image: str) -> Tuple[str, str]: """This function handles workflow deployment to SnowService with the given image. @@ -518,23 +565,17 @@ def _deploy_workflow(self, image: str) -> Tuple[str, str]: min_instances=self.options.min_instances, max_instances=self.options.max_instances, ) + logger.info(f"Wait for service {self._service_name} to become ready...") client.block_until_resource_is_ready( resource_name=self._service_name, resource_type=constants.ResourceType.SERVICE ) - - # To avoid too large batch in HF LLM case - max_batch_rows = None - if self.options.use_gpu: - for model_blob_meta in self.model_meta.models.values(): - if model_blob_meta.model_type == "huggingface_pipeline": - max_batch_rows = int(model_blob_meta.options.get("batch_size", 1)) - if model_blob_meta.model_type == "llm": - max_batch_rows = int(model_blob_meta.options.get("batch_size", 1)) + logger.info(f"Service {self._service_name} is ready. Creating service function...") service_function_sql = client.create_or_replace_service_function( service_func_name=self.service_func_name, service_name=self._service_name, endpoint_name=constants.PREDICT, - max_batch_rows=max_batch_rows, + max_batch_rows=self._get_max_batch_rows(), ) - return (service_spec_string, service_function_sql) + logger.info(f"Service function {self.service_func_name} is created. 
Deployment completed successfully!") + return service_spec_string, service_function_sql diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py index 197413bd..0cbe700d 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py @@ -25,6 +25,7 @@ def __init__( enable_remote_image_build: Optional[bool] = True, force_image_build: Optional[bool] = False, model_in_image: Optional[bool] = False, + debug_mode: Optional[bool] = False, ) -> None: """Initialization @@ -51,6 +52,7 @@ def __init__( system will automatically check whether a previously built image can be reused model_in_image: When set to True, image would container full model weights. The default if False, which means image without model weights and we do stage mount to access weights. + debug_mode: When set to True, deployment artifacts will be persisted in a local temp directory. """ self.compute_pool = compute_pool @@ -63,6 +65,7 @@ def __init__( self.enable_remote_image_build = enable_remote_image_build self.force_image_build = force_image_build self.model_in_image = model_in_image + self.debug_mode = debug_mode if self.num_workers is None and self.use_gpu: logger.info("num_workers has been defaulted to 1 when using GPU.") diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py index c84d9025..d9380525 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py @@ -49,7 +49,7 @@ def _get_mocked_compute_pool_res( ] ) - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock) -> None: m_deployment = m_deployment_class.return_value @@ -86,7 +86,7 @@ def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock, m_model_ ) m_deployment.deploy.assert_called_once() - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_not_ready_compute_pool( self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock @@ -114,7 +114,7 @@ def test_deploy_with_not_ready_compute_pool( m_deployment_class.assert_not_called() - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_compute_pool_in_suspended_state_with_auto_resume( self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock @@ -143,7 +143,7 @@ def test_deploy_with_compute_pool_in_suspended_state_with_auto_resume( 
m_deployment.deploy.assert_called_once() - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_empty_model_id( self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock @@ -163,7 +163,7 @@ def test_deploy_with_empty_model_id( m_deployment_class.assert_not_called() - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_missing_required_options( self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock @@ -185,7 +185,7 @@ def test_deploy_with_missing_required_options( ) m_deployment_class.assert_not_called() - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_over_requested_gpus( self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock @@ -213,13 +213,13 @@ def test_deploy_with_over_requested_gpus( ) m_deployment_class.assert_not_called() - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_over_requested_gpus_no_cuda( self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock ) -> None: m_model_meta = m_model_meta_class.return_value - m_model_meta.cuda_version = None + m_model_meta.env.cuda_version = None with exception_utils.assert_snowml_exceptions( self, expected_original_error_type=ValueError, @@ -227,9 +227,7 @@ def test_deploy_with_over_requested_gpus_no_cuda( ): self.m_session.add_mock_sql( query=f"DESC COMPUTE POOL {self.options['compute_pool']}", - result=mock_data_frame.MockDataFrame( - [row.Row(name="MY_GPU_POOL", state="IDLE", min_nodes=1, max_nodes=1, instance_family="GPU_7")] - ), + result=self._get_mocked_compute_pool_res(instance_family="GPU_7"), ) _deploy( session=cast(session.Session, self.m_session), @@ -245,7 +243,7 @@ def test_deploy_with_over_requested_gpus_no_cuda( m_deployment_class.assert_not_called() @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.copy.deepcopy") # type: ignore[misc] - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore[misc] def test_deploy_with_gpu_validation_and_unknown_instance_type( self, m_deployment_class: mock.MagicMock, m_model_meta_class: 
mock.MagicMock, m_deepcopy_func: mock.MagicMock @@ -368,7 +366,7 @@ def test_get_or_create_image_repo(self, m_snowservice_client_class: mock.MagicMo class SnowServiceDeploymentTestCase(absltest.TestCase): - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore[misc] + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.model_meta.ModelMetadata") # type: ignore[misc] def setUp(self, m_model_meta_class: mock.MagicMock) -> None: super().setUp() self.m_session = cast(session.Session, mock_session.MockSession(conn=None, test_case=self)) @@ -430,7 +428,8 @@ def test_deploy(self, m_image_exists_class: mock.MagicMock, m_add_tag_class: moc [ ( "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Building the Docker image " - "and deploying to Snowpark Container Service. This process may take a few minutes." + "and deploying to Snowpark Container Service. This process may take anywhere from a few " + "minutes to a longer period for GPU-based models." ), ( f"WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Image successfully built! " diff --git a/snowflake/ml/model/_deploy_client/utils/BUILD.bazel b/snowflake/ml/model/_deploy_client/utils/BUILD.bazel index fcf8ce1c..d7548e7b 100644 --- a/snowflake/ml/model/_deploy_client/utils/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/utils/BUILD.bazel @@ -13,6 +13,7 @@ py_library( deps = [ ":constants", "//snowflake/ml/_internal/exceptions", + "//snowflake/ml/_internal/utils:log_stream_processor", "//snowflake/ml/_internal/utils:uri", ], ) diff --git a/snowflake/ml/model/_deploy_client/utils/constants.py b/snowflake/ml/model/_deploy_client/utils/constants.py index f727398e..bab32324 100644 --- a/snowflake/ml/model/_deploy_client/utils/constants.py +++ b/snowflake/ml/model/_deploy_client/utils/constants.py @@ -42,7 +42,7 @@ class ResourceStatus(Enum): PROD_IMAGE_REGISTRY_SUBDOMAIN = "registry" DEV_IMAGE_REGISTRY_SUBDOMAIN = "registry-dev" MODEL_ENV_FOLDER = "env" -CONDA_FILE = "conda.yaml" +CONDA_FILE = "conda.yml" IMAGE_BUILD_JOB_SPEC_TEMPLATE = "image_build_job_spec_template" KANIKO_SHELL_SCRIPT_TEMPLATE = "kaniko_shell_script_template" CONTEXT = "context" diff --git a/snowflake/ml/model/_deploy_client/utils/image_registry_client.py b/snowflake/ml/model/_deploy_client/utils/image_registry_client.py index 897d115d..097ec1d6 100644 --- a/snowflake/ml/model/_deploy_client/utils/image_registry_client.py +++ b/snowflake/ml/model/_deploy_client/utils/image_registry_client.py @@ -244,6 +244,7 @@ def copy_image( Returns: None """ + logger.info(f"Copying image from {source_image_with_digest} to {dest_image_with_tag}") if snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] logger.warning(f"Running inside Sproc. 
Please ensure image already exists at {dest_image_with_tag}") return None @@ -257,3 +258,4 @@ def copy_image( creds_manager=image_auth_manager.SnowflakeAuthManager(dest_image_with_tag.split("/")[0]), ) imagelib.copy_image(src_image=src_image, dest_image=dest_image, arch=arch, session=self.session) + logger.info("Image copy completed successfully") diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py index 606f24c5..6db99bca 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py @@ -7,7 +7,7 @@ error_codes, exceptions as snowml_exceptions, ) -from snowflake.ml._internal.utils import uri +from snowflake.ml._internal.utils import log_stream_processor, uri from snowflake.ml.model._deploy_client.utils import constants from snowflake.snowpark import Session @@ -60,18 +60,20 @@ def create_or_replace_service( MIN_INSTANCES={min_instances} MAX_INSTANCES={max_instances} """ + logger.info(f"Creating service {service_name}") logger.debug(f"Create service with SQL: \n {sql}") self.session.sql(sql).collect() - def create_job(self, compute_pool: str, spec_stage_location: str) -> str: - """Return the newly created Job ID. + def create_job(self, compute_pool: str, spec_stage_location: str) -> None: + """Execute the job creation SQL command. Note that the job creation is synchronous, hence we execute it in a + async way so that we can query the log in the meantime. + + Upon job failure, full job container log will be logged. Args: compute_pool: name of the compute pool spec_stage_location: path to the stage location where the spec is located at. - Returns: - job id in string format. """ stage, path = uri.get_stage_and_path(spec_stage_location) sql = f""" @@ -81,9 +83,16 @@ def create_job(self, compute_pool: str, spec_stage_location: str) -> str: SPEC = '{path}' """ logger.debug(f"Create job with SQL: \n {sql}") - self.session.sql(sql).collect() - job_id = self.session.sql("SELECT LAST_QUERY_ID() AS QUERY_ID").collect()[0]["QUERY_ID"] - return str(job_id) + cur = self.session._conn._conn.cursor() + cur.execute_async(sql) + job_id = cur._sfqid + self.block_until_resource_is_ready( + resource_name=str(job_id), + resource_type=constants.ResourceType.JOB, + container_name=constants.KANIKO_CONTAINER_NAME, + max_retries=240, + retry_interval_secs=15, + ) def _drop_service_if_exists(self, service_name: str) -> None: """Drop service if it already exists. @@ -160,19 +169,35 @@ def block_until_resource_is_ready( SnowflakeMLException: If the resource does not reach the ready/done state within the specified number of retries. """ - for attempt_idx in range(max_retries + 1): - status = self.get_resource_status(resource_name=resource_name, resource_type=resource_type) - if resource_type == constants.ResourceType.JOB and status == constants.ResourceStatus.DONE: - full_job_log = self.get_resource_log( + assert resource_type == constants.ResourceType.SERVICE or resource_type == constants.ResourceType.JOB + query_command = "" + if resource_type == constants.ResourceType.SERVICE: + query_command = f"CALL SYSTEM$GET_SERVICE_LOGS('{resource_name}', '0', '{container_name}')" + elif resource_type == constants.ResourceType.JOB: + query_command = f"CALL SYSTEM$GET_JOB_LOGS('{resource_name}', '{container_name}')" + logger.warning( + f"Best-effort log streaming from SPCS will be enabled when python logging level is set to INFO." 
+ f"Alternatively, you can also query the logs by running the query '{query_command}'" + ) + lsp = log_stream_processor.LogStreamProcessor() + + for attempt_idx in range(max_retries): + if logger.level <= logging.INFO: + resource_log = self.get_resource_log( resource_name=resource_name, resource_type=resource_type, container_name=container_name, ) - logger.debug(full_job_log) + lsp.process_new_logs(resource_log, log_level=logging.INFO) + + status = self.get_resource_status(resource_name=resource_name, resource_type=resource_type) + + if resource_type == constants.ResourceType.JOB and status == constants.ResourceStatus.DONE: return elif resource_type == constants.ResourceType.SERVICE and status == constants.ResourceStatus.READY: return - elif ( + + if ( status in [ constants.ResourceStatus.FAILED, @@ -180,20 +205,25 @@ def block_until_resource_is_ready( constants.ResourceStatus.INTERNAL_ERROR, constants.ResourceStatus.DELETING, ] - or attempt_idx >= max_retries + or attempt_idx >= max_retries - 1 ): + if logger.level > logging.INFO: + resource_log = self.get_resource_log( + resource_name=resource_name, + resource_type=resource_type, + container_name=container_name, + ) + # Show full error log when logging level is above INFO level. For INFO level and below, we already + # show the log through logStreamProcessor above. + logger.error(resource_log) + error_message = "failed" - if attempt_idx >= max_retries: + if attempt_idx >= max_retries - 1: error_message = "does not reach ready/done status" - error_log = self.get_resource_log( - resource_name=resource_name, resource_type=resource_type, container_name=container_name - ) + if resource_type == constants.ResourceType.SERVICE: self._drop_service_if_exists(service_name=resource_name) - if error_log: - logger.error(error_log) - raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.INTERNAL_SNOWPARK_CONTAINER_SERVICE_ERROR, original_exception=RuntimeError( @@ -253,11 +283,10 @@ def get_resource_status( status_func = constants.RESOURCE_TO_STATUS_FUNCTION_MAPPING[resource_type] try: row = self.session.sql(f"CALL {status_func}('{resource_name}');").collect() - except Exception as e: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INTERNAL_SNOWFLAKE_API_ERROR, - original_exception=RuntimeError(f"Error while querying the {resource_type} {resource_name} status."), - ) from e + except Exception: + # Silent fail as SPCS status call is not guaranteed to return in time. Will rely on caller to retry. 
+ return None + resource_metadata = json.loads(row[0][status_func])[0] logger.debug(f"Resource status metadata: {resource_metadata}") if resource_metadata and resource_metadata["status"]: diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py index 674b488e..e394c5c6 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py @@ -1,5 +1,5 @@ import json -from typing import cast +from typing import Optional, cast from absl.testing import absltest from absl.testing.absltest import mock @@ -50,31 +50,55 @@ def test_create_or_replace_service(self) -> None: spec_stage_location=m_spec_storgae_location, ) - def test_create_job(self) -> None: - m_compute_pool = "mock_compute_pool" - m_stage = "@mock_spec_stage" - m_stage_path = "a/hello.yaml" - m_spec_storgae_location = f"{m_stage}/{m_stage_path}" - expected_job_id = "abcd" - self.m_session.add_mock_sql( - query=f""" - EXECUTE SERVICE - IN COMPUTE POOL {m_compute_pool} - FROM {m_stage} - SPEC = '{m_stage_path}' - """, - result=mock_data_frame.MockDataFrame(collect_result=[]), - ) - row = snowpark.Row(**{"QUERY_ID": expected_job_id}) - self.m_session.add_mock_sql( - query="SELECT LAST_QUERY_ID() AS QUERY_ID", - result=mock_data_frame.MockDataFrame(collect_result=[row]), - ) - job_id = self.client.create_job( - compute_pool=m_compute_pool, - spec_stage_location=m_spec_storgae_location, - ) - self.assertEqual(job_id, expected_job_id) + def _add_mock_cursor_to_session(self, *, expected_job_id: Optional[str] = None) -> None: + mock_cursor = mock.Mock() + mock_cursor.execute_async.return_value = None + mock_cursor._sfqid = expected_job_id + + # Replace the cursor in the m_session with the mock_cursor + self.m_session._conn = mock.Mock() + self.m_session._conn._conn = mock.Mock() + self.m_session._conn._conn.cursor.return_value = mock_cursor + + def test_create_job_successfully(self) -> None: + with mock.patch.object(self.client, "get_resource_status", return_value=constants.ResourceStatus.DONE): + m_compute_pool = "mock_compute_pool" + m_stage = "@mock_spec_stage" + m_stage_path = "a/hello.yaml" + m_spec_storgae_location = f"{m_stage}/{m_stage_path}" + expected_job_id = "abcd" + self._add_mock_cursor_to_session(expected_job_id=expected_job_id) + self.client.create_job( + compute_pool=m_compute_pool, + spec_stage_location=m_spec_storgae_location, + ) + + def test_create_job_failed(self) -> None: + with self.assertLogs(level="INFO") as cm: + with mock.patch.object(self.client, "get_resource_status", return_value=constants.ResourceStatus.FAILED): + with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): + test_log = "Job fails because of xyz." 
+ m_compute_pool = "mock_compute_pool" + m_stage = "@mock_spec_stage" + m_stage_path = "a/hello.yaml" + m_spec_storgae_location = f"{m_stage}/{m_stage_path}" + expected_job_id = "abcd" + + self.m_session.add_mock_sql( + query=f"CALL SYSTEM$GET_JOB_LOGS('{expected_job_id}', '{constants.KANIKO_CONTAINER_NAME}')", + result=mock_data_frame.MockDataFrame( + collect_result=[snowpark.Row(**{"SYSTEM$GET_JOB_LOGS": test_log})] + ), + ) + + self._add_mock_cursor_to_session(expected_job_id=expected_job_id) + + self.client.create_job( + compute_pool=m_compute_pool, + spec_stage_location=m_spec_storgae_location, + ) + + self.assertTrue(cm.output, test_log) def test_create_service_function(self) -> None: m_service_func_name = "mock_service_func_name" @@ -217,6 +241,7 @@ def test_block_until_service_is_ready_happy_path(self) -> None: def test_block_until_service_is_ready_timeout(self) -> None: test_log = "service fails because of xyz." + self.m_session.add_mock_sql( query=f"CALL SYSTEM$GET_SERVICE_LOGS('{self.m_service_name}', '0'," f"'{constants.INFERENCE_SERVER_CONTAINER}')", @@ -224,6 +249,7 @@ def test_block_until_service_is_ready_timeout(self) -> None: collect_result=[snowpark.Row(**{"SYSTEM$GET_SERVICE_LOGS": test_log})] ), ) + self.m_session.add_mock_sql( query=f"DROP SERVICE IF EXISTS {self.m_service_name}", result=mock_data_frame.MockDataFrame(collect_result=[]), @@ -246,6 +272,13 @@ def test_block_until_service_is_ready_retries_and_ready(self) -> None: def test_block_until_service_is_ready_retries_and_fail(self) -> None: test_log = "service fails because of abc." + # First status call return None; first get_log is empty; second status call return failed state + self.m_session.add_mock_sql( + query=f"CALL SYSTEM$GET_SERVICE_LOGS('{self.m_service_name}', '0'," + f"'{constants.INFERENCE_SERVER_CONTAINER}')", + result=mock_data_frame.MockDataFrame(collect_result=[]), + ) + self.m_session.add_mock_sql( query=f"CALL SYSTEM$GET_SERVICE_LOGS('{self.m_service_name}', '0'," f"'{constants.INFERENCE_SERVER_CONTAINER}')", diff --git a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel index 743d0789..d5764e2c 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel @@ -16,8 +16,8 @@ py_library( "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", "//snowflake/ml/_internal/exceptions", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_meta", ], ) @@ -28,8 +28,8 @@ py_test( ":deploy", "//snowflake/ml/_internal:env", "//snowflake/ml/_internal:env_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager/model_meta", "//snowflake/ml/test_utils:exception_utils", "//snowflake/ml/test_utils:mock_data_frame", "//snowflake/ml/test_utils:mock_session", diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy.py b/snowflake/ml/model/_deploy_client/warehouse/deploy.py index 792a9c0b..eb62cc3c 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy.py @@ -1,3 +1,4 @@ +import copy import logging import posixpath import tempfile @@ -11,8 +12,9 @@ error_codes, exceptions as snowml_exceptions, ) -from snowflake.ml.model import _model_meta, type_hints as model_types +from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._deploy_client.warehouse 
import infer_template +from snowflake.ml.model._packager.model_meta import model_meta from snowflake.snowpark import session as snowpark_session, types as st logger = logging.getLogger(__name__) @@ -22,7 +24,7 @@ def _deploy_to_warehouse( session: snowpark_session.Session, *, model_stage_file_path: str, - model_meta: _model_meta.ModelMetadata, + model_meta: model_meta.ModelMetadata, udf_name: str, target_method: str, **kwargs: Unpack[model_types.WarehouseDeployOptions], @@ -126,14 +128,14 @@ def _write_UDF_py_file( model_stage_file_name=model_stage_file_name, _KEEP_ORDER_COL_NAME=infer_template._KEEP_ORDER_COL_NAME, target_method=target_method, - code_dir_name=_model_meta.ModelMetadata.MODEL_CODE_DIR, + code_dir_name=model_meta.MODEL_CODE_DIR, ) f.write(udf_code) f.flush() def _get_model_final_packages( - meta: _model_meta.ModelMetadata, + meta: model_meta.ModelMetadata, session: snowpark_session.Session, relax_version: Optional[bool] = False, ) -> List[str]: @@ -154,11 +156,8 @@ def _get_model_final_packages( """ final_packages = None if ( - any( - channel.lower() not in [env_utils.DEFAULT_CHANNEL_NAME, "snowflake"] - for channel in meta._conda_dependencies.keys() - ) - or meta.pip_requirements + any(channel.lower() not in [env_utils.DEFAULT_CHANNEL_NAME] for channel in meta.env._conda_dependencies.keys()) + or meta.env.pip_requirements ): raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.DEPENDENCY_VERSION_ERROR, @@ -167,33 +166,28 @@ def _get_model_final_packages( ), ) - deps = meta._conda_dependencies[env_utils.DEFAULT_CHANNEL_NAME] + if relax_version: + relaxed_env = copy.deepcopy(meta.env) + relaxed_env.relax_version() + required_packages = relaxed_env._conda_dependencies[env_utils.DEFAULT_CHANNEL_NAME] + else: + required_packages = meta.env._conda_dependencies[env_utils.DEFAULT_CHANNEL_NAME] final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( - session=session, - reqs=deps, - python_version=meta.python_version, + session, required_packages, python_version=meta.env.python_version ) - if final_packages is None and relax_version: - final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( - session=session, - reqs=list(map(env_utils.relax_requirement_version, deps)), - python_version=meta.python_version, - ) if final_packages is None: relax_version_info_str = "" if relax_version else "Try to set relax_version as True in the options. " - required_deps = list(map(env_utils.relax_requirement_version, deps)) if relax_version else deps - if final_packages is None: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.DEPENDENCY_VERSION_ERROR, - original_exception=RuntimeError( - "The model's dependency cannot fit into Snowflake Warehouse. " - + relax_version_info_str - + "Required packages are:\n" - + " ".join(map(lambda x: f'"{x}"', required_deps)) - + "\n Required Python version is: " - + meta.python_version - ), - ) + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.DEPENDENCY_VERSION_ERROR, + original_exception=RuntimeError( + "The model's dependencyies are not available in Snowflake Anaconda Channel. 
" + + relax_version_info_str + + "Required packages are:\n" + + " ".join(map(lambda x: f'"{x}"', required_packages)) + + "\n Required Python version is: " + + meta.env.python_version + ), + ) return final_packages diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py b/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py index a6a307f5..d0a75fd1 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py @@ -1,3 +1,4 @@ +import tempfile import textwrap from importlib import metadata as importlib_metadata from typing import Dict, List, cast @@ -6,8 +7,9 @@ from packaging import requirements from snowflake.ml._internal import env as snowml_env, env_utils -from snowflake.ml.model import _model_meta, model_signature +from snowflake.ml.model import model_signature from snowflake.ml.model._deploy_client.warehouse import deploy +from snowflake.ml.model._packager.model_meta import model_blob_meta, model_meta from snowflake.ml.test_utils import exception_utils, mock_data_frame, mock_session from snowflake.snowpark import row, session @@ -20,11 +22,15 @@ ) } +_DUMMY_BLOB = model_blob_meta.ModelBlobMeta( + name="model1", model_type="custom", path="mock_path", handler_version="version_0" +) + _BASIC_DEPENDENCIES_FINAL_PACKAGES = list( sorted( map( lambda x: env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x)), - _model_meta._BASIC_DEPENDENCIES + [env_utils._SNOWML_PKG_NAME], + model_meta._PACKAGING_CORE_DEPENDENCIES + [env_utils.SNOWPARK_ML_PKG_NAME], ), key=lambda x: x.name, ) @@ -53,9 +59,9 @@ def setUp(self) -> None: **{ basic_dep.name: [importlib_metadata.version(basic_dep.name)] for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES - if basic_dep.name != env_utils._SNOWML_PKG_NAME + if basic_dep.name != env_utils.SNOWPARK_ML_PKG_NAME }, - env_utils._SNOWML_PKG_NAME: [snowml_env.VERSION], + env_utils.SNOWPARK_ML_PKG_NAME: [snowml_env.VERSION], } ) @@ -84,73 +90,106 @@ def add_packages(self, packages_dicts: Dict[str, List[str]]) -> None: self.m_session.add_mock_sql(query=query, result=mock_data_frame.MockDataFrame(sql_result)) def test_get_model_final_packages(self) -> None: - env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - meta = _model_meta.ModelMetadata(name="model1", model_type="custom", signatures=_DUMMY_SIG) - c_session = cast(session.Session, self.m_session) + with tempfile.TemporaryDirectory() as tmpdir: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + c_session = cast(session.Session, self.m_session) - final_packages = deploy._get_model_final_packages(meta, c_session) - self.assertListEqual(final_packages, list(map(str, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) + final_packages = deploy._get_model_final_packages(meta, c_session) + self.assertListEqual(final_packages, list(map(str, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) def test_get_model_final_packages_no_relax(self) -> None: - env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] - ) - c_session = cast(session.Session, self.m_session) - with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): - deploy._get_model_final_packages(meta, c_session) + with tempfile.TemporaryDirectory() as 
tmpdir: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["pandas==1.0.*"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + c_session = cast(session.Session, self.m_session) + with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): + deploy._get_model_final_packages(meta, c_session) def test_get_model_final_packages_relax(self) -> None: - env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] - ) - c_session = cast(session.Session, self.m_session) - - final_packages = deploy._get_model_final_packages(meta, c_session, relax_version=True) - self.assertListEqual(final_packages, sorted(list(map(lambda x: x.name, _BASIC_DEPENDENCIES_FINAL_PACKAGES)))) + with tempfile.TemporaryDirectory() as tmpdir: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + c_session = cast(session.Session, self.m_session) + + final_packages = deploy._get_model_final_packages(meta, c_session, relax_version=True) + self.assertListEqual( + final_packages, + list(map(str, map(env_utils.relax_requirement_version, _BASIC_DEPENDENCIES_FINAL_PACKAGES))), + ) def test_get_model_final_packages_with_pip(self) -> None: - env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, pip_requirements=["python-package"] - ) - c_session = cast(session.Session, self.m_session) - with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): - deploy._get_model_final_packages(meta, c_session) + with tempfile.TemporaryDirectory() as tmpdir: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + pip_requirements=["python-package"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + c_session = cast(session.Session, self.m_session) + with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): + deploy._get_model_final_packages(meta, c_session) def test_get_model_final_packages_with_other_channel(self) -> None: - env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - meta = _model_meta.ModelMetadata( - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["conda-forge::python_package"], - ) - c_session = cast(session.Session, self.m_session) - with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): - deploy._get_model_final_packages(meta, c_session) + with tempfile.TemporaryDirectory() as tmpdir: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["conda-forge::python_package"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + c_session = cast(session.Session, self.m_session) + with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): + deploy._get_model_final_packages(meta, c_session) def 
test_get_model_final_packages_with_non_exist_package(self) -> None: - env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - d = { - **{ - basic_dep.name: [importlib_metadata.version(basic_dep.name)] - for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES - if basic_dep.name != env_utils._SNOWML_PKG_NAME - }, - env_utils._SNOWML_PKG_NAME: [snowml_env.VERSION], - } - d["python-package"] = [] - self.m_session = mock_session.MockSession(conn=None, test_case=self) - self.add_packages(d) - meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["python-package"] - ) - c_session = cast(session.Session, self.m_session) - - with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): - deploy._get_model_final_packages(meta, c_session) + with tempfile.TemporaryDirectory() as tmpdir: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + d = { + **{ + basic_dep.name: [importlib_metadata.version(basic_dep.name)] + for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES + if basic_dep.name != env_utils.SNOWPARK_ML_PKG_NAME + }, + env_utils.SNOWPARK_ML_PKG_NAME: [snowml_env.VERSION], + } + d["python-package"] = [] + self.m_session = mock_session.MockSession(conn=None, test_case=self) + self.add_packages(d) + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["python-package"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + c_session = cast(session.Session, self.m_session) + + with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): + deploy._get_model_final_packages(meta, c_session) if __name__ == "__main__": diff --git a/snowflake/ml/model/_deploy_client/warehouse/infer_template.py b/snowflake/ml/model/_deploy_client/warehouse/infer_template.py index ed4134a4..0e8c9f9c 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/infer_template.py +++ b/snowflake/ml/model/_deploy_client/warehouse/infer_template.py @@ -1,62 +1,94 @@ _KEEP_ORDER_COL_NAME = "_ID" _UDF_CODE_TEMPLATE = """ -import pandas as pd -import numpy as np -import sys -from _snowflake import vectorized -import os import fcntl +import functools +import inspect +import os +import sys import threading import zipfile +from types import TracebackType +from typing import Optional, Type + import anyio -import inspect +import pandas as pd +from _snowflake import vectorized + class FileLock: - def __enter__(self): - self._lock = threading.Lock() - self._lock.acquire() - self._fd = open('/tmp/lockfile.LOCK', 'w+') - fcntl.lockf(self._fd, fcntl.LOCK_EX) + def __enter__(self) -> None: + self._lock = threading.Lock() + self._lock.acquire() + self._fd = open("/tmp/lockfile.LOCK", "w+") + fcntl.lockf(self._fd, fcntl.LOCK_EX) + + def __exit__( + self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], traceback: Optional[TracebackType] + ) -> None: + self._fd.close() + self._lock.release() + - def __exit__(self, type, value, traceback): - self._fd.close() - self._lock.release() +# User-defined parameters +MODEL_FILE_NAME = "{model_stage_file_name}" +TARGET_METHOD = "{target_method}" +MAX_BATCH_SIZE = None + +# Retrieve the model IMPORT_DIRECTORY_NAME = "snowflake_import_directory" import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] -model_dir_name = os.path.splitext('{model_stage_file_name}')[0] -zip_model_path = os.path.join(import_dir, '{model_stage_file_name}') -extracted = '/tmp/models' +model_dir_name = 
os.path.splitext(MODEL_FILE_NAME)[0] +zip_model_path = os.path.join(import_dir, MODEL_FILE_NAME) +extracted = "/tmp/models" extracted_model_dir_path = os.path.join(extracted, model_dir_name) with FileLock(): if not os.path.isdir(extracted_model_dir_path): - with zipfile.ZipFile(zip_model_path, 'r') as myzip: + with zipfile.ZipFile(zip_model_path, "r") as myzip: myzip.extractall(extracted_model_dir_path) sys.path.insert(0, os.path.join(extracted_model_dir_path, "{code_dir_name}")) -from snowflake.ml.model import _model -# Backward for <= 1.0.5 -if hasattr(_model, "_load_model_for_deploy"): - model, meta = _model._load_model_for_deploy(extracted_model_dir_path) + +# Load the model +try: + from snowflake.ml.model._packager import model_packager + pk = model_packager.ModelPackager(extracted_model_dir_path) + pk.load(as_custom_model=True) + assert pk.model, "model is not loaded" + assert pk.meta, "model metadata is not loaded" + + model = pk.model + meta = pk.meta +except ImportError: + # Support Legacy model + from snowflake.ml.model import _model + # Backward for <= 1.0.5 + if hasattr(_model, "_load_model_for_deploy"): + model, meta = _model._load_model_for_deploy(extracted_model_dir_path) + else: + model, meta = _model._load(local_dir_path=extracted_model_dir_path, as_custom_model=True) + +# Determine the actual runner +func = getattr(model, TARGET_METHOD) +if inspect.iscoroutinefunction(func): + runner = functools.partial(anyio.run, func) else: - model, meta = _model._load(local_dir_path=extracted_model_dir_path, as_custom_model=True) + runner = functools.partial(func) -features = meta.signatures["{target_method}"].inputs +# Determine preprocess parameters +features = meta.signatures[TARGET_METHOD].inputs input_cols = [feature.name for feature in features] dtype_map = {{feature.name: feature.as_dtype() for feature in features}} -# TODO(halu): Wire `max_batch_size`. -# TODO(halu): Avoid per batch async detection branching. 
-@vectorized(input=pd.DataFrame, max_batch_size=10) -def infer(df): + +# Actual handler +@vectorized(input=pd.DataFrame, max_batch_size=MAX_BATCH_SIZE) +def infer(df: pd.DataFrame) -> dict: input_df = pd.json_normalize(df[0]).astype(dtype=dtype_map) - if inspect.iscoroutinefunction(model.{target_method}): - predictions_df = anyio.run(model.{target_method}, input_df[input_cols]) - else: - predictions_df = model.{target_method}(input_df[input_cols]) + predictions_df = runner(input_df[input_cols]) if "{_KEEP_ORDER_COL_NAME}" in input_df.columns: predictions_df["{_KEEP_ORDER_COL_NAME}"] = input_df["{_KEEP_ORDER_COL_NAME}"] diff --git a/snowflake/ml/model/_deployer.py b/snowflake/ml/model/_deployer.py deleted file mode 100644 index d6debfd2..00000000 --- a/snowflake/ml/model/_deployer.py +++ /dev/null @@ -1,305 +0,0 @@ -from typing import Any, Dict, Optional, Union, cast, overload - -import pandas as pd - -from snowflake.ml._internal.exceptions import ( - error_codes, - exceptions as snowml_exceptions, -) -from snowflake.ml._internal.utils import identifier -from snowflake.ml.model import ( - _model, - deploy_platforms, - model_signature, - type_hints as model_types, -) -from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_deploy -from snowflake.ml.model._deploy_client.utils import constants as snowservice_constants -from snowflake.ml.model._deploy_client.warehouse import ( - deploy as warehouse_deploy, - infer_template, -) -from snowflake.ml.model._signatures import snowpark_handler -from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session, functions as F - - -@overload -def deploy( - session: Session, - *, - name: str, - platform: deploy_platforms.TargetPlatform, - target_method: Optional[str], - model_stage_file_path: str, - options: Optional[model_types.DeployOptions], -) -> Optional[model_types.Deployment]: - """Create a deployment from a model in a zip file in a stage and deploy it to remote platform. - - Args: - session: Snowpark Connection Session. - name: Name of the deployment for the model. - platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in - the model. - model_stage_file_path: Model file in the stage to be deployed. Must be a file with .zip extension. - options: Additional options when deploying the model. - Each target platform will have their own specifications of options. - """ - ... - - -@overload -def deploy( - session: Session, - *, - model_id: str, - name: str, - platform: deploy_platforms.TargetPlatform, - target_method: Optional[str], - model_stage_file_path: str, - deployment_stage_path: str, - options: Optional[model_types.DeployOptions], -) -> Optional[model_types.Deployment]: - """Create a deployment from a model in a local directory and deploy it to remote platform. - - Args: - session: Snowpark Connection Session. - model_id: Internal model ID string. - name: Name of the deployment for the model. - platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in - the model. - model_stage_file_path: Model file in the stage to be deployed. Must be a file with .zip extension. - deployment_stage_path: Path to stage containing snowpark container service deployment artifacts. - options: Additional options when deploying the model. - Each target platform will have their own specifications of options. - """ - ... 
- - -def deploy( - session: Session, - *, - name: str, - platform: deploy_platforms.TargetPlatform, - model_stage_file_path: str, - target_method: Optional[str] = None, - deployment_stage_path: Optional[str] = None, - model_id: Optional[str] = None, - options: Optional[model_types.DeployOptions], -) -> Optional[model_types.Deployment]: - """Create a deployment from a model and deploy it to remote platform. - - Args: - session: Snowpark Connection Session. - model_id: Internal model ID string. - name: Name of the deployment for the model. - platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in - the model. - model_stage_file_path: Model file in the stage to be deployed. Exclusive with `model_dir_path`. - Must be a file with .zip extension. - deployment_stage_path: Path to stage containing deployment artifacts. - options: Additional options when deploying the model. - Each target platform will have their own specifications of options. - - Raises: - SnowflakeMLException: Raised when target platform is unsupported. - SnowflakeMLException: Raised when target method does not exist in model. - - Returns: - The deployment information. - """ - - info = None - - if not options: - options = {} - - meta = _model.load_model(session=session, model_stage_file_path=model_stage_file_path, meta_only=True) - - if target_method is None: - if len(meta.signatures.keys()) == 1: - target_method = list(meta.signatures.keys())[0] - else: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError( - "Only when the model has 1 target methods can target_method be omitted when deploying." - ), - ) - - details: model_types.DeployDetails = {} - if platform == deploy_platforms.TargetPlatform.WAREHOUSE: - warehouse_deploy._deploy_to_warehouse( - session=session, - model_stage_file_path=model_stage_file_path, - model_meta=meta, - udf_name=name, - target_method=target_method, - **options, - ) - - elif platform == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES: - options = cast(model_types.SnowparkContainerServiceDeployOptions, options) - assert model_id, "Require 'model_id' for Snowpark container service deployment" - assert model_stage_file_path, "Require 'model_stage_file_path' for Snowpark container service deployment" - assert deployment_stage_path, "Require 'deployment_stage_path' for Snowpark container service deployment" - if snowservice_constants.COMPUTE_POOL not in options: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError( - "Missing 'compute_pool' in options field for Snowpark container service deployment" - ), - ) - - details = snowservice_deploy._deploy( - session=session, - model_id=model_id, - model_meta=meta, - service_func_name=name, - model_zip_stage_path=model_stage_file_path, - deployment_stage_path=deployment_stage_path, - target_method=target_method, - **options, - ) - - else: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_TYPE, - original_exception=ValueError(f"Unsupported target Platform: {platform}"), - ) - signature = meta.signatures.get(target_method, None) - if not signature: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError(f"Target method {target_method} does not exist in model."), - ) - info = model_types.Deployment( - name=name, 
platform=platform, target_method=target_method, signature=signature, options=options, details=details - ) - return info - - -@overload -def predict( - session: Session, - *, - deployment: model_types.Deployment, - X: model_types.SupportedLocalDataType, - statement_params: Optional[Dict[str, Any]] = None, -) -> pd.DataFrame: - """Execute batch inference of a model remotely on local data. Can be any supported data type. Return a local - Pandas Dataframe. - - Args: - session: Snowpark Connection Session. - deployment: The deployment info to use for predict. - X: The input data. - statement_params: Statement Parameters for telemetry. - """ - ... - - -@overload -def predict( - session: Session, - *, - deployment: model_types.Deployment, - X: SnowparkDataFrame, - statement_params: Optional[Dict[str, Any]] = None, -) -> SnowparkDataFrame: - """Execute batch inference of a model remotely on a Snowpark DataFrame. Return a Snowpark DataFrame. - - Args: - session: Snowpark Connection Session. - deployment: The deployment info to use for predict. - X: The input Snowpark dataframe. - statement_params: Statement Parameters for telemetry. - """ - ... - - -def predict( - session: Session, - *, - deployment: model_types.Deployment, - X: Union[model_types.SupportedDataType, SnowparkDataFrame], - statement_params: Optional[Dict[str, Any]] = None, -) -> Union[pd.DataFrame, SnowparkDataFrame]: - """Execute batch inference of a model remotely. - - Args: - session: Snowpark Connection Session. - deployment: The deployment info to use for predict. - X: The input dataframe. - statement_params: Statement Parameters for telemetry. - - Returns: - The output dataframe. - """ - - # Get options - INTERMEDIATE_OBJ_NAME = "tmp_result" - sig = deployment["signature"] - - # Validate and prepare input - if not isinstance(X, SnowparkDataFrame): - keep_order = True - output_with_input_features = False - df = model_signature._convert_and_validate_local_data(X, sig.inputs) - s_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(session, df, keep_order=keep_order) - else: - keep_order = False - output_with_input_features = True - model_signature._validate_snowpark_data(X, sig.inputs) - s_df = X - - if statement_params: - if s_df._statement_params is not None: - s_df._statement_params.update(statement_params) - else: - s_df._statement_params = statement_params # type: ignore[assignment] - - # Infer and get intermediate result - input_cols = [] - for col_name in s_df.columns: - literal_col_name = identifier.get_unescaped_names(col_name) - input_cols.extend( - [ - F.lit(literal_col_name), - F.col(col_name), - ] - ) - - # TODO[shchen]: SNOW-870032, For SnowService, external function name cannot be double quoted, else it results in - # external function no found. 
- udf_name = deployment["name"] - output_obj = F.call_udf(udf_name, F.object_construct(*input_cols)) - - if output_with_input_features: - df_res = s_df.with_column(INTERMEDIATE_OBJ_NAME, output_obj) - else: - df_res = s_df.select(output_obj.alias(INTERMEDIATE_OBJ_NAME)) - - if keep_order: - df_res = df_res.order_by( - F.col(INTERMEDIATE_OBJ_NAME)[infer_template._KEEP_ORDER_COL_NAME], - ascending=True, - ) - - # Prepare the output - output_cols = [] - for output_feature in sig.outputs: - output_cols.append(F.col(INTERMEDIATE_OBJ_NAME)[output_feature.name].astype(output_feature.as_snowpark_type())) - - df_res = df_res.with_columns( - [identifier.get_inferred_name(output_feature.name) for output_feature in sig.outputs], - output_cols, - ).drop(INTERMEDIATE_OBJ_NAME) - - # Get final result - if not isinstance(X, SnowparkDataFrame): - return snowpark_handler.SnowparkDataFrameHandler.convert_to_df(df_res, features=sig.outputs) - else: - return df_res diff --git a/snowflake/ml/model/_env.py b/snowflake/ml/model/_env.py deleted file mode 100644 index 90d6f6f6..00000000 --- a/snowflake/ml/model/_env.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import warnings -from typing import Any, DefaultDict, Dict, List, Optional, Tuple - -import yaml -from packaging import requirements, version - -from snowflake.ml._internal import env as snowml_env, env_utils -from snowflake.ml._internal.exceptions import ( - error_codes, - exceptions as snowml_exceptions, -) - -_CONDA_ENV_FILE_NAME = "conda.yaml" -_SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake" -_NODEFAULTS = "nodefaults" -_REQUIREMENTS_FILE_NAME = "requirements.txt" - - -def save_conda_env_file( - dir_path: str, - deps: DefaultDict[str, List[requirements.Requirement]], - python_version: Optional[str] = snowml_env.PYTHON_VERSION, -) -> str: - """Generate conda.yaml file given a dict of dependencies after validation. - - Args: - dir_path: Path to the directory where conda.yaml file should be written. - deps: Dict of conda dependencies after validated. - python_version: A string 'major.minor.patchlevel' showing python version relate to model. Default to current. - - Returns: - The path to conda env file. - """ - path = os.path.join(dir_path, _CONDA_ENV_FILE_NAME) - env: Dict[str, Any] = dict() - env["name"] = "snow-env" - # Get all channels in the dependencies, ordered by the number of the packages which belongs to - channels = list(dict(sorted(deps.items(), key=lambda item: len(item[1]), reverse=True)).keys()) - if env_utils.DEFAULT_CHANNEL_NAME in channels: - channels.remove(env_utils.DEFAULT_CHANNEL_NAME) - env["channels"] = [_SNOWFLAKE_CONDA_CHANNEL_URL] + channels + [_NODEFAULTS] - env["dependencies"] = [f"python=={python_version}"] - for chan, reqs in deps.items(): - env["dependencies"].extend([f"{chan}::{str(req)}" if chan else str(req) for req in reqs]) - - with open(path, "w", encoding="utf-8") as f: - yaml.safe_dump(env, stream=f, default_flow_style=False) - - return path - - -def save_requirements_file(dir_path: str, pip_deps: List[requirements.Requirement]) -> str: - """Generate Python requirements.txt file in the given directory path. - - Args: - dir_path: Path to the directory where requirements.txt file should be written. - pip_deps: List of dependencies string after validated. - - Returns: - The path to pip requirements file. 
- """ - requirements = "\n".join(map(str, pip_deps)) - path = os.path.join(dir_path, _REQUIREMENTS_FILE_NAME) - with open(path, "w", encoding="utf-8") as out: - out.write(requirements) - - return path - - -def load_conda_env_file(path: str) -> Tuple[DefaultDict[str, List[requirements.Requirement]], Optional[str]]: - """Read conda.yaml file to get n a dict of dependencies after validation. - - Args: - path: Path to conda.yaml. - - Returns: - A tuple of Dict of conda dependencies after validated and a string 'major.minor.patchlevel' of python version. - """ - with open(path, encoding="utf-8") as f: - env = yaml.safe_load(stream=f) - - assert isinstance(env, dict) - - deps = [] - - python_version = None - - channels = env["channels"] - channels.remove(_SNOWFLAKE_CONDA_CHANNEL_URL) - channels.remove(_NODEFAULTS) - - for dep in env["dependencies"]: - if isinstance(dep, str): - ver = env_utils.parse_python_version_string(dep) - # ver is None: not python, ver is "": python w/o specifier, ver is str: python w/ specifier - if ver is not None: - if ver: - python_version = ver - else: - deps.append(dep) - - conda_dep_dict = env_utils.validate_conda_dependency_string_list(deps) - - if len(channels) > 0: - for channel in channels: - if channel not in conda_dep_dict: - conda_dep_dict[channel] = [] - - return conda_dep_dict, python_version - - -def load_requirements_file(path: str) -> List[requirements.Requirement]: - """Load Python requirements.txt file from the given directory path. - - Args: - path: Path to the requirements.txt file. - - Returns: - List of dependencies string after validated. - """ - with open(path, encoding="utf-8") as f: - reqs = f.readlines() - - return env_utils.validate_pip_requirement_string_list(reqs) - - -def validate_py_runtime_version(provided_py_version_str: str) -> None: - if provided_py_version_str != snowml_env.PYTHON_VERSION: - provided_py_version = version.parse(provided_py_version_str) - current_py_version = version.parse(snowml_env.PYTHON_VERSION) - if ( - provided_py_version.major != current_py_version.major - or provided_py_version.minor != current_py_version.minor - ): - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.LOCAL_ENVIRONMENT_ERROR, - original_exception=RuntimeError( - f"Unable to load model which is saved with Python {provided_py_version_str} " - f"while current Python version is {snowml_env.PYTHON_VERSION}. " - "To load model metadata only, set meta_only to True." - ), - ) - warnings.warn( - ( - f"Model is saved with Python {provided_py_version_str} " - f"while current Python version is {snowml_env.PYTHON_VERSION}. " - "There might be some issues when using loaded model." 
- ), - category=RuntimeWarning, - ) diff --git a/snowflake/ml/model/_env_test.py b/snowflake/ml/model/_env_test.py deleted file mode 100644 index 89cbc5d5..00000000 --- a/snowflake/ml/model/_env_test.py +++ /dev/null @@ -1,142 +0,0 @@ -import collections -import os -import tempfile -from typing import DefaultDict, List - -import yaml -from absl.testing import absltest -from packaging import requirements - -from snowflake.ml._internal import env as snowml_env, env_utils -from snowflake.ml.model import _env - - -class EnvTest(absltest.TestCase): - def test_conda_env_file(self) -> None: - cd: DefaultDict[str, List[requirements.Requirement]] - with tempfile.TemporaryDirectory() as tmpdir: - cd = collections.defaultdict(list) - env_file_path = _env.save_conda_env_file(tmpdir, cd) - loaded_cd, _ = _env.load_conda_env_file(env_file_path) - self.assertEqual(cd, loaded_cd) - - with tempfile.TemporaryDirectory() as tmpdir: - cd = collections.defaultdict(list) - cd[env_utils.DEFAULT_CHANNEL_NAME] = [requirements.Requirement("numpy")] - env_file_path = _env.save_conda_env_file(tmpdir, cd) - loaded_cd, _ = _env.load_conda_env_file(env_file_path) - self.assertEqual(cd, loaded_cd) - - with tempfile.TemporaryDirectory() as tmpdir: - cd = collections.defaultdict(list) - cd[env_utils.DEFAULT_CHANNEL_NAME] = [requirements.Requirement("numpy>=1.22.4")] - env_file_path = _env.save_conda_env_file(tmpdir, cd) - loaded_cd, _ = _env.load_conda_env_file(env_file_path) - self.assertEqual(cd, loaded_cd) - - with tempfile.TemporaryDirectory() as tmpdir: - cd = collections.defaultdict(list) - cd.update( - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], - "conda-forge": [requirements.Requirement("pytorch!=2.0")], - } - ) - env_file_path = _env.save_conda_env_file(tmpdir, cd) - loaded_cd, _ = _env.load_conda_env_file(env_file_path) - self.assertEqual(cd, loaded_cd) - - with tempfile.TemporaryDirectory() as tmpdir: - cd = collections.defaultdict(list) - cd.update( - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], - "apple": [], - "conda-forge": [requirements.Requirement("pytorch!=2.0")], - } - ) - env_file_path = _env.save_conda_env_file(tmpdir, cd) - with open(env_file_path, encoding="utf-8") as f: - written_yaml = yaml.safe_load(f) - self.assertDictEqual( - written_yaml, - { - "name": "snow-env", - "channels": ["https://repo.anaconda.com/pkgs/snowflake", "conda-forge", "apple", "nodefaults"], - "dependencies": [ - f"python=={snowml_env.PYTHON_VERSION}", - "numpy>=1.22.4", - "conda-forge::pytorch!=2.0", - ], - }, - ) - loaded_cd, _ = _env.load_conda_env_file(env_file_path) - self.assertEqual(cd, loaded_cd) - - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME), "w", encoding="utf-8") as f: - yaml.safe_dump( - stream=f, - data={ - "name": "snow-env", - "channels": ["https://repo.anaconda.com/pkgs/snowflake", "nodefaults"], - "dependencies": [ - f"python=={snowml_env.PYTHON_VERSION}", - "::numpy>=1.22.4", - "conda-forge::pytorch!=2.0", - {"pip": "python-package"}, - ], - }, - ) - loaded_cd, python_ver = _env.load_conda_env_file(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME)) - self.assertEqual( - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], - "conda-forge": [requirements.Requirement("pytorch!=2.0")], - }, - loaded_cd, - ) - self.assertEqual(python_ver, snowml_env.PYTHON_VERSION) - - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 
_env._CONDA_ENV_FILE_NAME), "w", encoding="utf-8") as f: - yaml.safe_dump( - stream=f, - data={ - "name": "snow-env", - "channels": ["https://repo.anaconda.com/pkgs/snowflake", "apple", "nodefaults"], - "dependencies": [ - f"python=={snowml_env.PYTHON_VERSION}", - "::numpy>=1.22.4", - "conda-forge::pytorch!=2.0", - {"pip": "python-package"}, - ], - }, - ) - loaded_cd, python_ver = _env.load_conda_env_file(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME)) - self.assertEqual( - { - env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], - "conda-forge": [requirements.Requirement("pytorch!=2.0")], - "apple": [], - }, - loaded_cd, - ) - self.assertEqual(python_ver, snowml_env.PYTHON_VERSION) - - def test_generate_requirements_file(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - rl: List[requirements.Requirement] = [] - pip_file_path = _env.save_requirements_file(tmpdir, rl) - loaded_rl = _env.load_requirements_file(pip_file_path) - self.assertEqual(rl, loaded_rl) - - with tempfile.TemporaryDirectory() as tmpdir: - rl = [requirements.Requirement("python-package==1.0.1")] - pip_file_path = _env.save_requirements_file(tmpdir, rl) - loaded_rl = _env.load_requirements_file(pip_file_path) - self.assertEqual(rl, loaded_rl) - - -if __name__ == "__main__": - absltest.main() diff --git a/snowflake/ml/model/_handlers/_base.py b/snowflake/ml/model/_handlers/_base.py deleted file mode 100644 index 1294462c..00000000 --- a/snowflake/ml/model/_handlers/_base.py +++ /dev/null @@ -1,87 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Generic, Optional - -from typing_extensions import TypeGuard, Unpack - -from snowflake.ml.model import _model_meta, type_hints as model_types - - -class _ModelHandler(ABC, Generic[model_types._ModelType]): - """ - Provides handling for a given type of model defined by `type` class property. - - handler_type: The string type that identify the handler. Should be unique in the library. - MODEL_BLOB_FILE: Relative path of the model blob file in the model subdir. - MODEL_ARTIFACTS_DIR: Relative path of the model artifacts dir in the model subdir. - DEFAULT_TARGET_METHODS: Default target methods to be logged if not specified in this kind of model. - is_auto_signature: Set to True if the model could get model signature automatically and do not require user - inputting sample data or model signature. - """ - - handler_type = "_base" - MODEL_BLOB_FILE = "model.pkl" - MODEL_ARTIFACTS_DIR = "artifacts" - DEFAULT_TARGET_METHODS = ["predict"] - is_auto_signature = False - - @staticmethod - @abstractmethod - def can_handle(model: model_types.SupportedDataType) -> TypeGuard[model_types._ModelType]: - """Whether this handler could support the type of the `model`. - - Args: - model: The model object. - """ - ... - - @staticmethod - @abstractmethod - def cast_model(model: model_types.SupportedModelType) -> model_types._ModelType: - """Cast the model from Union type into the type that handler could handle. - - Args: - model: The model object. - """ - ... - - @staticmethod - @abstractmethod - def _save_model( - name: str, - model: model_types._ModelType, - model_meta: _model_meta.ModelMetadata, - model_blobs_dir_path: str, - sample_input: Optional[model_types.SupportedDataType] = None, - is_sub_model: Optional[bool] = False, - **kwargs: Unpack[model_types.BaseModelSaveOption], - ) -> None: - """Save the model. - - Args: - name: Name of the model. - model: The model object. - model_meta: The model metadata. 
- model_blobs_dir_path: Directory path to the model. - sample_input: Sample input to infer the signatures from. - is_sub_model: Flag to show if it is a sub model, a sub model does not need signature. - kwargs: Additional saving options. - """ - ... - - @staticmethod - @abstractmethod - def _load_model( - name: str, - model_meta: _model_meta.ModelMetadata, - model_blobs_dir_path: str, - **kwargs: Unpack[model_types.ModelLoadOption], - ) -> model_types._ModelType: - """Load the model into memory. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - """ - ... diff --git a/snowflake/ml/model/_handlers/llm.py b/snowflake/ml/model/_handlers/llm.py deleted file mode 100644 index 94a3fa06..00000000 --- a/snowflake/ml/model/_handlers/llm.py +++ /dev/null @@ -1,178 +0,0 @@ -import os -from typing import Optional, cast - -import cloudpickle -import pandas as pd -from packaging import requirements -from typing_extensions import TypeGuard, Unpack - -from snowflake.ml._internal import env_utils, file_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - type_hints as model_types, -) -from snowflake.ml.model._handlers import _base -from snowflake.ml.model._signatures import core -from snowflake.ml.model.models import llm - - -class _LLMHandler(_base._ModelHandler[llm.LLM]): - handler_type = "llm" - MODEL_BLOB_DIR = "model" - LLM_META = "llm_meta" - is_auto_signature = True - - @staticmethod - def can_handle( - model: model_types.SupportedModelType, - ) -> TypeGuard[llm.LLM]: - return isinstance(model, llm.LLM) - - @staticmethod - def cast_model( - model: model_types.SupportedModelType, - ) -> llm.LLM: - assert isinstance(model, llm.LLM) - return cast(llm.LLM, model) - - @staticmethod - def _save_model( - name: str, - model: llm.LLM, - model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, - sample_input: Optional[model_types.SupportedDataType] = None, - is_sub_model: Optional[bool] = False, - **kwargs: Unpack[model_types.BaseModelSaveOption], - ) -> None: - assert not is_sub_model, "LLM can not be sub-model." - model_blob_path = os.path.join(model_blobs_dir_path, name) - os.makedirs(model_blob_path, exist_ok=True) - model_blob_dir_path = os.path.join(model_blob_path, _LLMHandler.MODEL_BLOB_DIR) - model_meta.cuda_version = model_meta_api._DEFAULT_CUDA_VERSION - sig = core.ModelSignature( - inputs=[ - core.FeatureSpec(name="input", dtype=core.DataType.STRING), - ], - outputs=[ - core.FeatureSpec(name="generated_text", dtype=core.DataType.STRING), - ], - ) - model_meta._signatures = {"infer": sig} - assert os.path.isdir(model.model_id_or_path), "Only model dir is supported for now." 
- file_utils.copytree(model.model_id_or_path, model_blob_dir_path) - with open( - os.path.join(model_blob_dir_path, _LLMHandler.LLM_META), - "wb", - ) as f: - cloudpickle.dump(model, f) - - base_meta = model_meta_api._ModelBlobMetadata( - name=name, - model_type=_LLMHandler.handler_type, - path=_LLMHandler.MODEL_BLOB_DIR, - options={ - "batch_size": str(model.max_batch_size), - }, - ) - model_meta.models[name] = base_meta - pkgs_requirements = [ - model_meta_api.Dependency(conda_name="transformers", pip_req="transformers"), - model_meta_api.Dependency(conda_name="pytorch", pip_req="torch==2.0.1"), - ] - if model.model_type == llm.SupportedLLMType.LLAMA_MODEL_TYPE: - pkgs_requirements = [ - model_meta_api.Dependency(conda_name="sentencepiece", pip_req="sentencepiece"), - model_meta_api.Dependency(conda_name="protobuf", pip_req="protobuf"), - *pkgs_requirements, - ] - model_meta._include_if_absent(pkgs_requirements) - # Recent peft versions are only available in PYPI. - env_utils.append_requirement_list( - model_meta._pip_requirements, - requirements.Requirement("peft==0.5.0"), - ) - - @staticmethod - def _load_model( - name: str, - model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, - **kwargs: Unpack[model_types.ModelLoadOption], - ) -> llm.LLM: - model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") - model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") - model_blob_metadata = model_blobs_metadata[name] - model_blob_filename = model_blob_metadata.path - model_blob_dir_path = os.path.join(model_blob_path, model_blob_filename) - assert model_blob_dir_path, "It must be a directory." - with open(os.path.join(model_blob_dir_path, _LLMHandler.LLM_META), "rb") as f: - m = cloudpickle.load(f) - assert isinstance(m, llm.LLM) - # Switch to local path - m.model_id_or_path = model_blob_dir_path - return m - - @staticmethod - def _load_as_custom_model( - name: str, - model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, - **kwargs: Unpack[model_types.ModelLoadOption], - ) -> custom_model.CustomModel: - raw_model = _LLMHandler._load_model( - name, - model_meta, - model_blobs_dir_path, - **kwargs, - ) - import peft - import transformers - - hub_kwargs = { - "revision": raw_model.revision, - "token": raw_model.token, - } - model_dir_path = raw_model.model_id_or_path - hf_model = peft.AutoPeftModelForCausalLM.from_pretrained( # type: ignore[attr-defined] - model_dir_path, - device_map="auto", - torch_dtype="auto", - **hub_kwargs, - ) - peft_config = peft.PeftConfig.from_pretrained(model_dir_path) # type: ignore[attr-defined] - base_model_path = peft_config.base_model_name_or_path - tokenizer = transformers.AutoTokenizer.from_pretrained( - base_model_path, - padding_side="right", - use_fast=False, - **hub_kwargs, - ) - hf_model.eval() - - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - # TODO(lhw): migrate away from hf pipeline - pipe = transformers.pipeline( - task="text-generation", - model=hf_model, - tokenizer=tokenizer, - batch_size=raw_model.max_batch_size, - ) - - class _LLMCustomModel(custom_model.CustomModel): - @custom_model.inference_api - def infer(self, X: pd.DataFrame) -> pd.DataFrame: - input_data = X.to_dict("list")["input"] - res = pipe(input_data, return_full_text=False) - # TODO(lhw): Assume single beam only. 
- return pd.DataFrame({"generated_text": [output[0]["generated_text"] for output in res]}) - - llm_custom = _LLMCustomModel(custom_model.ModelContext()) - - return llm_custom diff --git a/snowflake/ml/model/_handlers/mlflow.py b/snowflake/ml/model/_handlers/mlflow.py deleted file mode 100644 index 8021969e..00000000 --- a/snowflake/ml/model/_handlers/mlflow.py +++ /dev/null @@ -1,318 +0,0 @@ -import itertools -import os -import tempfile -import warnings -from typing import TYPE_CHECKING, Callable, Optional, Type, cast - -import pandas as pd -import yaml -from typing_extensions import TypeGuard, Unpack - -from snowflake.ml._internal import env_utils, file_utils, type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, -) -from snowflake.ml.model._handlers import _base -from snowflake.ml.model._signatures import utils as model_signature_utils - -if TYPE_CHECKING: - import mlflow - - -def _parse_mlflow_env(model_uri: str, model_meta: model_meta_api.ModelMetadata) -> model_meta_api.ModelMetadata: - """Parse MLFlow env file and modify model meta based on MLFlow env. - - Args: - model_uri: Model uri where the env file could be downloaded - model_meta: model meta to be modified - - Raises: - ValueError: Raised when cannot download MLFlow model dependencies file. - - Returns: - Modified model metadata. - """ - import mlflow - - try: - conda_env_file_path = mlflow.pyfunc.get_model_dependencies(model_uri, format="conda") - - with open(conda_env_file_path, encoding="utf-8") as f: - env = yaml.safe_load(stream=f) - except (mlflow.MlflowException, OSError): - raise ValueError("Cannot load MLFlow model dependencies.") - - assert isinstance(env, dict) - - mlflow_conda_deps = [] - mlflow_pip_deps = [] - mlflow_python_version = None - - mlflow_conda_channels = env.get("channels", []) - - for dep in env["dependencies"]: - if isinstance(dep, str): - ver = env_utils.parse_python_version_string(dep) - # ver is None: not python, ver is "": python w/o specifier, ver is str: python w/ specifier - if ver is not None: - if ver: - mlflow_python_version = ver - else: - mlflow_conda_deps.append(dep) - elif isinstance(dep, dict) and "pip" in dep: - mlflow_pip_deps.extend(dep["pip"]) - - if mlflow_python_version: - model_meta.python_version = mlflow_python_version - - mlflow_conda_deps_dict = env_utils.validate_conda_dependency_string_list(mlflow_conda_deps) - mlflow_pip_deps_list = env_utils.validate_pip_requirement_string_list(mlflow_pip_deps) - - for mlflow_channel, mlflow_channel_dependencies in mlflow_conda_deps_dict.items(): - if mlflow_channel != env_utils.DEFAULT_CHANNEL_NAME: - warnings.warn( - ( - "Found dependencies from MLflow specified from non-Snowflake channel." - + " This may prevent model deploying to Snowflake Warehouse." - ), - category=UserWarning, - ) - for mlflow_channel_dependency in mlflow_channel_dependencies: - try: - env_utils.append_conda_dependency( - model_meta._conda_dependencies, (mlflow_channel, mlflow_channel_dependency) - ) - except env_utils.DuplicateDependencyError: - pass - except env_utils.DuplicateDependencyInMultipleChannelsError: - warnings.warn( - ( - f"Dependency {mlflow_channel_dependency.name} appeared in multiple channels." - + " This may be unintentional." - ), - category=UserWarning, - ) - - if mlflow_conda_channels: - warnings.warn( - ( - "Found conda channels specified from MLflow." - + " This may prevent model deploying to Snowflake Warehouse." 
- ), - category=UserWarning, - ) - for channel_name in mlflow_conda_channels: - model_meta._conda_dependencies[channel_name] = [] - - if mlflow_pip_deps_list: - warnings.warn( - ( - "Found dependencies from MLflow specified as pip requirements." - + " This may prevent model deploying to Snowflake Warehouse." - ), - category=UserWarning, - ) - for mlflow_pip_dependency in mlflow_pip_deps_list: - if any( - mlflow_channel_dependency.name == mlflow_pip_dependency.name - for mlflow_channel_dependency in itertools.chain(*mlflow_conda_deps_dict.values()) - ): - continue - env_utils.append_requirement_list(model_meta._pip_requirements, mlflow_pip_dependency) - - return model_meta - - -class _MLFlowHandler(_base._ModelHandler["mlflow.pyfunc.PyFuncModel"]): - """Handler for MLFlow based model. - - Currently mlflow.pyfunc.PyFuncModel based classes are supported. - """ - - handler_type = "mlflow" - MODEL_BLOB_FILE = "model" - _DEFAULT_TARGET_METHOD = "predict" - DEFAULT_TARGET_METHODS = [_DEFAULT_TARGET_METHOD] - is_auto_signature = True - - @staticmethod - def can_handle( - model: model_types.SupportedModelType, - ) -> TypeGuard["mlflow.pyfunc.PyFuncModel"]: - return type_utils.LazyType("mlflow.pyfunc.PyFuncModel").isinstance(model) - - @staticmethod - def cast_model( - model: model_types.SupportedModelType, - ) -> "mlflow.pyfunc.PyFuncModel": - import mlflow - - assert isinstance(model, mlflow.pyfunc.PyFuncModel) - - return cast(mlflow.pyfunc.PyFuncModel, model) - - @staticmethod - def _save_model( - name: str, - model: "mlflow.pyfunc.PyFuncModel", - model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, - sample_input: Optional[model_types.SupportedDataType] = None, - is_sub_model: Optional[bool] = False, - **kwargs: Unpack[model_types.MLFlowSaveOptions], - ) -> None: - import mlflow - - assert isinstance(model, mlflow.pyfunc.PyFuncModel) - - model_info = model.metadata.get_model_info() - model_uri = kwargs.get("model_uri", model_info.model_uri) - - pyfunc_flavor_info = model_info.flavors.get(mlflow.pyfunc.FLAVOR_NAME, None) - if pyfunc_flavor_info is None: - raise ValueError("Cannot save MLFlow model that does not have PyFunc flavor.") - - # Port MLFlow signature - if not is_sub_model: - if model_meta._signatures is not None: - model_meta_api._validate_target_methods(model, list(model_meta.signatures.keys())) - else: - model_meta_api._validate_target_methods(model, _MLFlowHandler.DEFAULT_TARGET_METHODS) - model_meta._signatures = { - _MLFlowHandler._DEFAULT_TARGET_METHOD: model_signature.ModelSignature.from_mlflow_sig( - model_info.signature - ) - } - - # Port MLFlow metadata - mlflow_model_metadata = model_info.metadata - if mlflow_model_metadata and not kwargs.get("ignore_mlflow_metadata", False): - if not model_meta.metadata: - model_meta.metadata = {} - model_meta.metadata.update(mlflow_model_metadata) - - # Port MLFlow dependencies - if kwargs.get("ignore_mlflow_dependencies", False): - model_meta._include_if_absent([model_meta_api.Dependency(conda_name="mlflow", pip_req="mlflow")]) - else: - model_meta = _parse_mlflow_env(model_uri, model_meta) - - model_blob_path = os.path.join(model_blobs_dir_path, name) - - os.makedirs(model_blob_path, exist_ok=True) - with tempfile.TemporaryDirectory() as tmpdir: - try: - local_path = mlflow.artifacts.download_artifacts(model_uri, dst_path=tmpdir) - except (mlflow.MlflowException, OSError): - raise ValueError("Cannot load MLFlow model artifacts.") - - file_utils.copy_file_or_tree(local_path, os.path.join(model_blob_path, 
_MLFlowHandler.MODEL_BLOB_FILE)) - - base_meta = model_meta_api._ModelBlobMetadata( - name=name, - model_type=_MLFlowHandler.handler_type, - path=_MLFlowHandler.MODEL_BLOB_FILE, - options={"artifact_path": model_info.artifact_path}, - ) - model_meta.models[name] = base_meta - - @staticmethod - def _load_model( - name: str, - model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, - **kwargs: Unpack[model_types.ModelLoadOption], - ) -> "mlflow.pyfunc.PyFuncModel": - import mlflow - - model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") - model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") - model_blob_metadata = model_blobs_metadata[name] - - model_blob_options = model_blob_metadata.options - - model_artifact_path = model_blob_options.get("artifact_path", None) - if model_artifact_path is None: - raise ValueError("Cannot find a place to load the MLFlow model.") - - model_blob_filename = model_blob_metadata.path - - # This is to make sure the loaded model can be saved again. - with mlflow.start_run() as run: - mlflow.log_artifacts( - os.path.join(model_blob_path, model_blob_filename, model_artifact_path), - artifact_path=model_artifact_path, - ) - m = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/{model_artifact_path}") - m.metadata.run_id = run.info.run_id - return m - - @staticmethod - def _load_as_custom_model( - name: str, - model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, - **kwargs: Unpack[model_types.ModelLoadOption], - ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. - """ - import mlflow - - from snowflake.ml.model import custom_model - - # We need to redirect the mlruns folder to a writable location in the sandbox. 
- tmpdir = tempfile.TemporaryDirectory(dir="/tmp") - mlflow.set_tracking_uri(f"file://{tmpdir}") - - def _create_custom_model( - raw_model: "mlflow.pyfunc.PyFuncModel", - model_meta: model_meta_api.ModelMetadata, - ) -> Type[custom_model.CustomModel]: - def fn_factory( - raw_model: "mlflow.pyfunc.PyFuncModel", - signature: model_signature.ModelSignature, - target_method: str, - ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]: - @custom_model.inference_api - def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: - res = raw_model.predict(X) - return model_signature_utils.rename_pandas_df( - model_signature._convert_local_data_to_df(res), features=signature.outputs - ) - - return fn - - type_method_dict = {} - for target_method_name, sig in model_meta.signatures.items(): - type_method_dict[target_method_name] = fn_factory(raw_model, sig, target_method_name) - - _MLFlowModel = type( - "_MLFlowModel", - (custom_model.CustomModel,), - type_method_dict, - ) - - return _MLFlowModel - - raw_model = _MLFlowHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) - _MLFlowModel = _create_custom_model(raw_model, model_meta) - mlflow_model = _MLFlowModel(custom_model.ModelContext()) - - return mlflow_model diff --git a/snowflake/ml/model/_model.py b/snowflake/ml/model/_model.py deleted file mode 100644 index a3d2e5ef..00000000 --- a/snowflake/ml/model/_model.py +++ /dev/null @@ -1,496 +0,0 @@ -import os -import posixpath -import tempfile -from types import ModuleType -from typing import Dict, List, Literal, Optional, Tuple, Union, overload - -from absl import logging -from packaging import requirements - -from snowflake.ml._internal import env as snowml_env, env_utils, file_utils -from snowflake.ml._internal.exceptions import ( - error_codes, - exceptions as snowml_exceptions, -) -from snowflake.ml.model import ( - _env, - _model_handler, - _model_meta, - custom_model, - model_signature, - type_hints as model_types, -) -from snowflake.snowpark import FileOperation, Session -from snowflake.snowpark._internal import utils as snowpark_utils - -MODEL_BLOBS_DIR = "models" - - -@overload -def save_model( - *, - name: str, - model: model_types.SupportedNoSignatureRequirementsModelType, - session: Session, - model_stage_file_path: str, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save a model that does not require a signature to a zip file whose path is the provided stage file path. - - Args: - name: Name of the model. - model: Model object. - session: Snowpark connection session. - model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. - Must be a file with .zip extension. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. 
Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - """ - ... - - -@overload -def save_model( - *, - name: str, - model: model_types.SupportedRequireSignatureModelType, - session: Session, - model_stage_file_path: str, - signatures: Dict[str, model_signature.ModelSignature], - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save a model that requires a external signature with user provided signatures - to a zip file whose path is the provided stage file path. - - Args: - name: Name of the model. - model: Model object. - session: Snowpark connection session. - model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. - Must be a file with .zip extension. - signatures: Model data signatures for inputs and output for every target methods. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - """ - ... - - -@overload -def save_model( - *, - name: str, - model: model_types.SupportedRequireSignatureModelType, - session: Session, - model_stage_file_path: str, - sample_input: model_types.SupportedDataType, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save a model that requires a external signature to a zip file whose path is the - provided stage file path with signature inferred from a sample_input_data. - - Args: - name: Name of the model. - model: Model object. - session: Snowpark connection session. - model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. - Must be a file with .zip extension. - sample_input: Sample input data to infer the model signatures from. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. 
Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - """ - ... - - -def save_model( - *, - name: str, - model: model_types.SupportedModelType, - session: Session, - model_stage_file_path: str, - signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, - sample_input: Optional[model_types.SupportedDataType] = None, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save the model. - - Args: - name: Name of the model. - model: Model object. - session: Snowpark connection session. - model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. - Must be a file with .zip extension. - signatures: Model data signatures for inputs and output for every target methods. If it is None, sample_input - would be used to infer the signatures if it is a local (non-SnowML modeling model). - If not None, sample_input should not be specified. Defaults to None. - sample_input: Sample input data to infer the model signatures from. If it is None, signatures must be specified - if it is a local (non-SnowML modeling model). If not None, signatures should not be specified. - Defaults to None. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - - Returns: - Model metadata. - - Raises: - SnowflakeMLException: Raised when the signatures and sample_input specified at the same time, or not presented - when specifying local model. - SnowflakeMLException: Raised when provided model directory is not a directory. - SnowflakeMLException: Raised when provided model stage path is not a zip file. 
- """ - - if (signatures is None) and (sample_input is None) and not _model_handler.is_auto_signature_model(model): - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError( - "Signatures and sample_input both cannot be None at the same time for this kind of model." - ), - ) - - if (signatures is not None) and (sample_input is not None): - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError("Signatures and sample_input both cannot be specified at the same time."), - ) - - if not options: - options = model_types.BaseModelSaveOption() - - assert session and model_stage_file_path - if posixpath.splitext(model_stage_file_path)[1] != ".zip": - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError( - f"Provided model path in the stage {model_stage_file_path} must be a path to a zip file." - ), - ) - - if not snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] - snowml_server_availability = env_utils.validate_requirements_in_snowflake_conda_channel( - session=session, - reqs=[requirements.Requirement(f"snowflake-ml-python=={snowml_env.VERSION}")], - python_version=snowml_env.PYTHON_VERSION, - ) - - if snowml_server_availability is None: - if options.get("embed_local_ml_library", False) is False: - logging.info( - f"Local snowflake-ml-python library has version {snowml_env.VERSION}," - " which is not available in the Snowflake server, embedding local ML library automatically." - ) - options["embed_local_ml_library"] = True - - with tempfile.TemporaryDirectory() as temp_local_model_dir_path: - meta = _save( - name=name, - model=model, - local_dir_path=temp_local_model_dir_path, - signatures=signatures, - sample_input=sample_input, - metadata=metadata, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - python_version=python_version, - ext_modules=ext_modules, - code_paths=code_paths, - options=options, - ) - if signatures is None: - logging.info(f"Model signatures are auto inferred as:\n\n{meta.signatures}") - with file_utils.zip_file_or_directory_to_stream( - temp_local_model_dir_path, leading_path=temp_local_model_dir_path - ) as zf: - assert session and model_stage_file_path - file_operation = FileOperation(session=session) - file_operation.put_stream( - zf, - model_stage_file_path, - auto_compress=False, - overwrite=options.get("allow_overwritten_stage_file", False), - ) - return meta - - -def _save( - *, - name: str, - model: model_types.SupportedModelType, - local_dir_path: str, - signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, - sample_input: Optional[model_types.SupportedDataType] = None, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - if not options: - options = model_types.BaseModelSaveOption() - - local_dir_path = os.path.normpath(local_dir_path) - - handler = _model_handler._find_handler(model) - if handler is None: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_TYPE, original_exception=TypeError(f"{type(model)} is not supported.") - ) - with 
_model_meta._create_model_metadata( - model_dir_path=local_dir_path, - name=name, - model_type=handler.handler_type, - metadata=metadata, - code_paths=code_paths, - signatures=signatures, - ext_modules=ext_modules, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - python_version=python_version, - **options, - ) as meta: - model_blobs_path = os.path.join(local_dir_path, MODEL_BLOBS_DIR) - os.makedirs(model_blobs_path, exist_ok=True) - model = handler.cast_model(model) - handler._save_model( - name=name, - model=model, - model_meta=meta, - model_blobs_dir_path=model_blobs_path, - sample_input=sample_input, - is_sub_model=False, - **options, - ) - - return meta - - -@overload -def load_model( - *, session: Session, model_stage_file_path: str -) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: - """Load the model into memory from a zip file in the stage. - - Args: - session: Snowflake connection session. - model_stage_file_path: The path to zipped model file in the stage. Must be a file with .zip extension. - """ - ... - - -@overload -def load_model( - *, session: Session, model_stage_file_path: str, meta_only: Literal[False] -) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: - """Load the model into memory from a zip file in the stage. - - Args: - session: Snowflake connection session. - model_stage_file_path: The path to zipped model file in the stage. Must be a file with .zip extension. - meta_only: Flag to indicate that if only load metadata. - """ - ... - - -@overload -def load_model(*, session: Session, model_stage_file_path: str, meta_only: Literal[True]) -> _model_meta.ModelMetadata: - """Load the model into memory from a zip file in the stage with metadata only. - - Args: - session: Snowflake connection session. - model_stage_file_path: The path to zipped model file in the stage. Must be a file with .zip extension. - meta_only: Flag to indicate that if only load metadata. - """ - ... - - -def load_model( - *, - session: Session, - model_stage_file_path: str, - meta_only: bool = False, -) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: - """Load the model into memory from directory or a zip file in the stage. - - Args: - session: Snowflake connection session. Must be specified when specifying model_stage_file_path. - Exclusive with model_dir_path. - model_stage_file_path: The path to zipped model file in the stage. Must be specified when specifying session. - Exclusive with model_dir_path. Must be a file with .zip extension. - meta_only: Flag to indicate that if only load metadata. - - Raises: - SnowflakeMLException: Raised if model provided in the stage is not a zip file. - - Returns: - A tuple containing the model object and the model metadata. - """ - - assert session and model_stage_file_path - if posixpath.splitext(model_stage_file_path)[1] != ".zip": - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_ARGUMENT, - original_exception=ValueError( - f"Provided model path in the stage {model_stage_file_path} must be a path to a zip file." - ), - ) - - file_operation = FileOperation(session=session) - zf = file_operation.get_stream(model_stage_file_path) - with file_utils.unzip_stream_in_temp_dir(stream=zf) as temp_local_model_dir_path: - # This is to make mypy happy. 
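# (The duplicated _load call in the branch below lets mypy resolve the Literal overloads of
# _load: passing meta_only=True as a literal selects the metadata-only overload, while the
# bare call selects the tuple-returning one.)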
- if meta_only: - return _load(local_dir_path=temp_local_model_dir_path, meta_only=True) - return _load(local_dir_path=temp_local_model_dir_path) - - -@overload -def _load( - *, - local_dir_path: str, - meta_only: Literal[False] = False, - as_custom_model: Literal[False] = False, - options: Optional[model_types.ModelLoadOption] = None, -) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: - ... - - -@overload -def _load( - *, - local_dir_path: str, - meta_only: Literal[False] = False, - as_custom_model: Literal[True], - options: Optional[model_types.ModelLoadOption] = None, -) -> Tuple[custom_model.CustomModel, _model_meta.ModelMetadata]: - ... - - -@overload -def _load( - *, - local_dir_path: str, - meta_only: Literal[True], - as_custom_model: bool = False, - options: Optional[model_types.ModelLoadOption] = None, -) -> _model_meta.ModelMetadata: - ... - - -def _load( - *, - local_dir_path: str, - meta_only: bool = False, - as_custom_model: bool = False, - options: Optional[model_types.ModelLoadOption] = None, -) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: - """Load the model into memory from directory. Used internal only. - - Args: - local_dir_path: Directory containing the model. - meta_only: Flag to indicate that if only load metadata. - as_custom_model: When set to True, It will try to use _load_as_custom_model method in the handler if provided, - otherwise, it will use _load_model. - options: Model loading options. - - Raises: - SnowflakeMLException: Raised if model is not native format. - - Returns: - ModelMeta data when meta_only is True. - A tuple containing the model object as a custom model and the model metadata when as_custom_model is True. - A tuple containing the model object and the model metadata when as_custom_model is False. 
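Example (an illustrative, hedged sketch of the three call shapes; `local_model_dir` is an assumed path to an already unzipped model directory):

    meta = _load(local_dir_path=local_model_dir, meta_only=True)  # metadata only
    model, meta = _load(local_dir_path=local_model_dir)  # original model object
    wrapped, meta = _load(local_dir_path=local_model_dir, as_custom_model=True)  # CustomModel wrapper for deployment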
- """ - local_dir_path = os.path.normpath(local_dir_path) - meta = _model_meta._load_model_metadata(local_dir_path) - if meta_only: - return meta - - _model_meta._load_code_path(local_dir_path) - - _env.validate_py_runtime_version(meta.python_version) - - handler = _model_handler._load_handler(meta.model_type) - if handler is None: - raise snowml_exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_TYPE, original_exception=TypeError(f"{meta.model_type} is not supported.") - ) - model_blobs_path = os.path.join(local_dir_path, MODEL_BLOBS_DIR) - if as_custom_model: - load_func = getattr(handler, "_load_as_custom_model", None) - if not callable(load_func): - load_func = handler._load_model - else: - load_func = handler._load_model - - if options is None: - options = {} - - m = load_func(meta.name, meta, model_blobs_path, **options) - if as_custom_model: - assert isinstance(m, custom_model.CustomModel) - - return m, meta diff --git a/snowflake/ml/model/_model_meta.py b/snowflake/ml/model/_model_meta.py deleted file mode 100644 index 9be02555..00000000 --- a/snowflake/ml/model/_model_meta.py +++ /dev/null @@ -1,505 +0,0 @@ -import dataclasses -import importlib -import os -import sys -import tempfile -import warnings -import zipfile -from collections import namedtuple -from contextlib import contextmanager -from datetime import datetime -from types import ModuleType -from typing import Any, Callable, Dict, Generator, List, Optional, Sequence, cast - -import cloudpickle -import yaml -from packaging import version - -from snowflake.ml._internal import env as snowml_env, env_utils, file_utils -from snowflake.ml.model import ( - _core_requirements, - _env, - model_signature, - type_hints as model_types, -) -from snowflake.ml.model._signatures import snowpark_handler -from snowflake.snowpark import DataFrame as SnowparkDataFrame - -MODEL_METADATA_VERSION = 1 -_BASIC_DEPENDENCIES = _core_requirements.REQUIREMENTS -_SNOWFLAKE_PKG_NAME = "snowflake" -_SNOWFLAKE_ML_PKG_NAME = f"{_SNOWFLAKE_PKG_NAME}.ml" -# The default CUDA version is chosen based on the driver availability in SPCS. -# If changing this version, we need also change the version of default PyTorch in HuggingFace pipeline handler to -# make sure they are compatible. -_DEFAULT_CUDA_VERSION = "11.7" - -# conda_name: The name of dependency in conda -# pip_req: Full version requirement where name is pypi package name. -Dependency = namedtuple("Dependency", ["conda_name", "pip_req"]) - - -@dataclasses.dataclass -class _ModelBlobMetadata: - """Dataclass to store metadata of an individual model blob (sub-model) in the packed model. - - Attributes: - name: The name to refer the sub-model. - model_type: The type of the model and handler to use. - path: Path to the picked model file. It is a relative path from the model blob directory. - artifacts: Optional, used in custom model to show the mapping between artifact name and relative path - from the model blob directory. 
- options: Optional, used for some model specific metadata storage - """ - - name: str - model_type: str - path: str - artifacts: Dict[str, str] = dataclasses.field(default_factory=dict) - options: Dict[str, str] = dataclasses.field(default_factory=dict) - - -@contextmanager -def _create_model_metadata( - *, - model_dir_path: str, - name: str, - model_type: str, - signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, - metadata: Optional[Dict[str, str]] = None, - code_paths: Optional[List[str]] = None, - ext_modules: Optional[List[ModuleType]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - **kwargs: Any, -) -> Generator["ModelMetadata", None, None]: - """Create a generator for model metadata object. Use generator to ensure correct register and unregister for - cloudpickle. - - Args: - model_dir_path: Path to the directory containing the model to be packed. - name: Name of the model. - model_type: Type of the model. - signatures: Signatures of the model. If None, it will be inferred after the model meta is created. - Defaults to None. - metadata: User provided key-value metadata of the model. Defaults to None. - code_paths: List of paths to additional codes that needs to be packed with. Defaults to None. - ext_modules: List of names of modules that need to be pickled with the model. Defaults to None. - conda_dependencies: List of conda requirements for running the model. Defaults to None. - pip_requirements: List of pip Python packages requirements for running the model. Defaults to None. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - **kwargs: Dict of attributes and values of the metadata. Used when loading from file. - - Raises: - ValueError: Raised when the code path contains reserved file or directory. - - Yields: - A model metadata object. - """ - model_dir_path = os.path.normpath(model_dir_path) - embed_local_ml_library = kwargs.pop("embed_local_ml_library", False) - # Use the last one which is loaded first, that is mean, it is loaded from site-packages. - # We could make sure that user does not overwrite our library with their code follow the same naming. - snowml_path, snowml_start_path = file_utils.get_package_path(_SNOWFLAKE_ML_PKG_NAME, strategy="last") - if os.path.isdir(snowml_start_path): - path_to_copy = snowml_path - # If the package is zip-imported, then the path will be `../path_to_zip.zip/snowflake/ml` - # It is not a valid path in fact and we need to get the path to the zip file to verify it. 
- elif os.path.isfile(snowml_start_path): - extract_root = tempfile.mkdtemp() - with zipfile.ZipFile(os.path.abspath(snowml_start_path), mode="r", compression=zipfile.ZIP_DEFLATED) as zf: - zf.extractall(path=extract_root) - path_to_copy = os.path.join(extract_root, *(_SNOWFLAKE_ML_PKG_NAME.split("."))) - else: - raise ValueError("`snowflake.ml` is imported via a way that embedding local ML library is not supported.") - - if embed_local_ml_library: - kwargs["local_ml_library_version"] = f"{snowml_env.VERSION}+{file_utils.hash_directory(path_to_copy)}" - - model_meta = ModelMetadata( - name=name, - metadata=metadata, - model_type=model_type, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - python_version=python_version, - signatures=signatures, - **kwargs, - ) - - code_dir_path = os.path.join(model_dir_path, ModelMetadata.MODEL_CODE_DIR) - if embed_local_ml_library or code_paths: - os.makedirs(code_dir_path, exist_ok=True) - - if embed_local_ml_library: - snowml_path_in_code = os.path.join(code_dir_path, _SNOWFLAKE_PKG_NAME) - os.makedirs(snowml_path_in_code, exist_ok=True) - file_utils.copy_file_or_tree(path_to_copy, snowml_path_in_code) - - if code_paths: - for code_path in code_paths: - # This part is to prevent users from providing code following our naming and overwrite our code. - if ( - os.path.isfile(code_path) and os.path.splitext(os.path.basename(code_path))[0] == _SNOWFLAKE_PKG_NAME - ) or (os.path.isdir(code_path) and os.path.basename(code_path) == _SNOWFLAKE_PKG_NAME): - raise ValueError("`snowflake` is a reserved name and you cannot contain that into code path.") - file_utils.copy_file_or_tree(code_path, code_dir_path) - - try: - imported_modules = [] - if ext_modules: - registered_modules = cloudpickle.list_registry_pickle_by_value() - for mod in ext_modules: - if mod.__name__ not in registered_modules: - cloudpickle.register_pickle_by_value(mod) - imported_modules.append(mod) - yield model_meta - model_meta.save_model_metadata(model_dir_path) - finally: - for mod in imported_modules: - cloudpickle.unregister_pickle_by_value(mod) - - -def _load_model_metadata(model_dir_path: str) -> "ModelMetadata": - """Load models for a directory. Model is initially loaded normally. If additional codes are included when packed, - the code path is added to system path to be imported with highest priority. - - Args: - model_dir_path: Path to the directory containing the model to be loaded. - - Returns: - A model metadata object. - """ - model_dir_path = os.path.normpath(model_dir_path) - meta = ModelMetadata.load_model_metadata(model_dir_path) - return meta - - -def _load_code_path(model_dir_path: str) -> None: - """Load custom code in the code path into memory. - - Args: - model_dir_path: Path to the directory containing the model to be loaded. - - """ - model_dir_path = os.path.normpath(model_dir_path) - code_path = os.path.join(model_dir_path, ModelMetadata.MODEL_CODE_DIR) - if os.path.exists(code_path): - if code_path in sys.path: - sys.path.remove(code_path) - sys.path.insert(0, code_path) - module_names = file_utils.get_all_modules(code_path) - # If the module_name starts with snowflake, then do not replace it. - # When deploying, we would add them beforehand. - # When in the local, they should not be added. We already prevent user from overwriting us. 
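# (The loop below evicts any already-imported copies of these modules from sys.modules and
# re-imports them, so the copies shipped under the model's code/ directory, which now sits at
# the front of sys.path, take precedence.)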
- module_names = [ - module_name - for module_name in module_names - if not (module_name.startswith(f"{_SNOWFLAKE_PKG_NAME}.") or module_name == _SNOWFLAKE_PKG_NAME) - ] - for module_name in module_names: - actual_module = sys.modules.pop(module_name, None) - if actual_module is not None: - sys.modules[module_name] = importlib.import_module(module_name) - - assert code_path in sys.path - sys.path.remove(code_path) - - -class ModelMetadata: - """Model metadata for Snowflake native model packaged model. - - Attributes: - name: Name of the model. - model_type: Type of the model. - creation_timestamp: Unix timestamp when the model metadata is created. - python_version: String 'major.minor.patchlevel' showing the python version where the model runs. - cuda_version: CUDA version to be used, if None then the model cannot be deployed to instance with GPUs. - """ - - MANIFEST_FILE = "MANIFEST" - ENV_DIR = "env" - MODEL_CODE_DIR = "code" - MODEL_METADATA_FILE = "model.yaml" - - def __init__( - self, - *, - name: str, - model_type: str, - signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - **kwargs: Any, - ) -> None: - """Initialize the model metadata. Anything indicates in kwargs has higher priority. - - Args: - name: Name of the model. - model_type: Type of the model. - signatures: A dict mapping from target function name to input and output signatures. - metadata: User provided key-value metadata of the model. Defaults to None. - conda_dependencies: List of conda requirements for running the model. Defaults to None. - pip_requirements: List of pip Python packages requirements for running the model. Defaults to None. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - **kwargs: Dict of attributes and values of the metadata. Used when loading from file. - - Raises: - ValueError: Raised when the user provided version string is invalid. - """ - self.name = name - self._signatures = signatures - self.metadata = metadata - self.creation_timestamp = str(datetime.utcnow()) - self.model_type = model_type - self._models: Dict[str, _ModelBlobMetadata] = dict() - if python_version: - try: - self.python_version = str(version.parse(python_version)) - # We might have more check here later. 
- except version.InvalidVersion: - raise ValueError(f"{python_version} is not a valid Python version.") - else: - self.python_version = snowml_env.PYTHON_VERSION - - self._conda_dependencies = env_utils.validate_conda_dependency_string_list( - conda_dependencies if conda_dependencies else [] - ) - self._pip_requirements = env_utils.validate_pip_requirement_string_list( - pip_requirements if pip_requirements else [] - ) - if "local_ml_library_version" in kwargs: - self._include_if_absent([Dependency(conda_name=dep, pip_req=dep) for dep in _BASIC_DEPENDENCIES]) - else: - self._include_if_absent( - [Dependency(conda_name=dep, pip_req=dep) for dep in _BASIC_DEPENDENCIES + [env_utils._SNOWML_PKG_NAME]] - ) - self._cuda_version: Optional[str] = None - - self.__dict__.update(kwargs) - - @property - def pip_requirements(self) -> List[str]: - """List of pip Python packages requirements for running the model.""" - return list(sorted(map(str, self._pip_requirements))) - - @property - def conda_dependencies(self) -> List[str]: - """List of conda channel and dependencies from that to run the model""" - return sorted( - f"{chan}::{str(req)}" if chan else str(req) - for chan, reqs in self._conda_dependencies.items() - for req in reqs - ) - - def _include_if_absent(self, pkgs: List[Dependency]) -> None: - conda_reqs_str, pip_reqs_str = tuple(zip(*pkgs)) - pip_reqs = env_utils.validate_pip_requirement_string_list(list(pip_reqs_str)) - conda_reqs = env_utils.validate_conda_dependency_string_list(list(conda_reqs_str)) - - for conda_req, pip_req in zip(conda_reqs[env_utils.DEFAULT_CHANNEL_NAME], pip_reqs): - req_to_add = env_utils.get_local_installed_version_of_pip_package(pip_req) - req_to_add.name = conda_req.name - for added_pip_req in self._pip_requirements: - if added_pip_req.name == pip_req.name: - warnings.warn( - ( - f"Basic dependency {conda_req} specified from PIP requirements." - + " This may prevent model deploying to Snowflake Warehouse." - ), - category=UserWarning, - ) - try: - env_utils.append_conda_dependency( - self._conda_dependencies, (env_utils.DEFAULT_CHANNEL_NAME, req_to_add) - ) - except env_utils.DuplicateDependencyError: - pass - except env_utils.DuplicateDependencyInMultipleChannelsError: - warnings.warn( - ( - f"Basic dependency {conda_req.name} specified from non-Snowflake channel." - + " This may prevent model deploying to Snowflake Warehouse." - ), - category=UserWarning, - ) - - @property - def cuda_version(self) -> Optional[str]: - return self._cuda_version - - @cuda_version.setter - def cuda_version(self, _cuda_version: str) -> None: - if not isinstance(_cuda_version, str): - raise ValueError("Cannot set CUDA version as a non-str object.") - if self._cuda_version is None: - self._cuda_version = _cuda_version - else: - if self._cuda_version != _cuda_version: - raise ValueError( - f"Different CUDA version {self._cuda_version} and {_cuda_version} found in the same model!" - ) - - @property - def signatures(self) -> Dict[str, model_signature.ModelSignature]: - """Signatures of the model. - - Raises: - RuntimeError: Raised when the metadata is not ready to save - - Returns: - Model signatures. - """ - if self._signatures is None: - raise RuntimeError("The meta data is not ready to save.") - return self._signatures - - @property - def models(self) -> Dict[str, _ModelBlobMetadata]: - """Dict showing the mapping from sub-models' name to corresponding model blob metadata.""" - return self._models - - def to_dict(self) -> Dict[str, Any]: - """Serialize to a dictionary. 
- - Raises: - RuntimeError: Raised when the metadata is not ready to save - - Returns: - A dict containing the information of the model metadata. - """ - if self._signatures is None: - raise RuntimeError("The meta data is not ready to save.") - res = {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - res["signatures"] = {func_name: sig.to_dict() for func_name, sig in self._signatures.items()} - res["models"] = {name: dataclasses.asdict(blob_meta) for name, blob_meta in self._models.items()} - res["pip_requirements"] = self.pip_requirements - res["conda_dependencies"] = self.conda_dependencies - res["cuda_version"] = self._cuda_version - return res - - @classmethod - def from_dict(cls, model_dict: Dict[str, Any]) -> "ModelMetadata": - """Deserialize from a dictionary. - - Args: - model_dict: The dict where metadata is stored. - - Returns: - A model metadata object created from the given dict. - """ - model_dict["signatures"] = { - func_name: model_signature.ModelSignature.from_dict(sig) - for func_name, sig in model_dict.pop("signatures").items() - } - model_dict["_models"] = { - name: _ModelBlobMetadata(**blob_meta) for name, blob_meta in model_dict.pop("models").items() - } - model_dict["_cuda_version"] = model_dict.pop("cuda_version", None) - return cls(**model_dict) - - def save_model_metadata(self, path: str) -> None: - """Save the model metadata as a yaml file in the model directory. - - Args: - path: The path of the directory to write a yaml file in it. - """ - model_yaml_path = os.path.join(path, ModelMetadata.MODEL_METADATA_FILE) - with open(model_yaml_path, "w", encoding="utf-8") as out: - yaml.safe_dump({**self.to_dict(), "version": MODEL_METADATA_VERSION}, stream=out, default_flow_style=False) - - env_dir_path = os.path.join(path, ModelMetadata.ENV_DIR) - os.makedirs(env_dir_path, exist_ok=True) - - _env.save_conda_env_file(env_dir_path, self._conda_dependencies, self.python_version) - _env.save_requirements_file(env_dir_path, self._pip_requirements) - - @classmethod - def load_model_metadata(cls, path: str) -> "ModelMetadata": - """Load the model metadata from the model metadata yaml file in the model directory. - - Args: - path: The path of the directory to read the metadata yaml file in it. - - Raises: - NotImplementedError: raised when version is not found or unsupported in metadata file. - - Returns: - Loaded model metadata object. 
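Example (an illustrative, hedged round-trip modeled on the unit tests later in this patch; the dummy signature is an assumed minimal example):

    import tempfile

    from snowflake.ml.model import _model_meta, model_signature

    dummy_sig = {
        "predict": model_signature.ModelSignature(
            inputs=[model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="input")],
            outputs=[model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="output")],
        )
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        # model.yaml and the env/ files are written when the context manager exits.
        with _model_meta._create_model_metadata(
            model_dir_path=tmpdir, name="model1", model_type="custom", signatures=dummy_sig
        ) as meta:
            pass
        loaded = _model_meta.ModelMetadata.load_model_metadata(tmpdir)
        assert loaded.to_dict() == meta.to_dict()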
- """ - model_yaml_path = os.path.join(path, ModelMetadata.MODEL_METADATA_FILE) - with open(model_yaml_path, encoding="utf-8") as f: - loaded_meta = yaml.safe_load(f.read()) - - loaded_meta_version = loaded_meta.pop("version", None) - if not loaded_meta_version or loaded_meta_version != MODEL_METADATA_VERSION: - raise NotImplementedError("Unknown or unsupported model metadata file found.") - - meta = ModelMetadata.from_dict(loaded_meta) - env_dir_path = os.path.join(path, ModelMetadata.ENV_DIR) - meta._conda_dependencies, python_version = _env.load_conda_env_file( - os.path.join(env_dir_path, _env._CONDA_ENV_FILE_NAME) - ) - if python_version: - meta.python_version = python_version - meta._pip_requirements = _env.load_requirements_file(os.path.join(env_dir_path, _env._REQUIREMENTS_FILE_NAME)) - return meta - - -def _is_callable(model: model_types.SupportedModelType, method_name: str) -> bool: - return callable(getattr(model, method_name, None)) - - -def _validate_signature( - model: model_types.SupportedRequireSignatureModelType, - model_meta: ModelMetadata, - target_methods: Sequence[str], - sample_input: Optional[model_types.SupportedDataType], - get_prediction_fn: Callable[[str, model_types.SupportedLocalDataType], model_types.SupportedLocalDataType], -) -> ModelMetadata: - if model_meta._signatures is not None: - _validate_target_methods(model, list(model_meta.signatures.keys())) - return model_meta - - # In this case sample_input should be available, because of the check in save_model. - assert ( - sample_input is not None - ), "Model signature and sample input are None at the same time. This should not happen with local model." - model_meta._signatures = {} - trunc_sample_input = model_signature._truncate_data(sample_input) - if isinstance(sample_input, SnowparkDataFrame): - # Added because of Any from missing stubs. 
- trunc_sample_input = cast(SnowparkDataFrame, trunc_sample_input) - local_sample_input = snowpark_handler.SnowparkDataFrameHandler.convert_to_df(trunc_sample_input) - else: - local_sample_input = trunc_sample_input - for target_method in target_methods: - predictions_df = get_prediction_fn(target_method, local_sample_input) - sig = model_signature.infer_signature(local_sample_input, predictions_df) - model_meta._signatures[target_method] = sig - return model_meta - - -def _get_target_methods( - model: model_types.SupportedModelType, - target_methods: Optional[Sequence[str]], - default_target_methods: Sequence[str], -) -> Sequence[str]: - if target_methods is None: - target_methods = [method_name for method_name in default_target_methods if _is_callable(model, method_name)] - - _validate_target_methods(model, target_methods) - return target_methods - - -def _validate_target_methods(model: model_types.SupportedModelType, target_methods: Sequence[str]) -> None: - for method_name in target_methods: - if method_name not in target_methods: - raise ValueError(f"Target method {method_name} does not exists.") - if not _is_callable(model, method_name): - raise ValueError(f"Target method {method_name} is not callable.") diff --git a/snowflake/ml/model/_model_meta_test.py b/snowflake/ml/model/_model_meta_test.py deleted file mode 100644 index 4b8eb99c..00000000 --- a/snowflake/ml/model/_model_meta_test.py +++ /dev/null @@ -1,297 +0,0 @@ -import os -import tempfile -from importlib import metadata as importlib_metadata - -import yaml -from absl.testing import absltest -from packaging import requirements - -from snowflake.ml._internal import env_utils -from snowflake.ml.model import _model_meta, model_signature - -_DUMMY_SIG = { - "predict": model_signature.ModelSignature( - inputs=[ - model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="input"), - ], - outputs=[model_signature.FeatureSpec(name="output", dtype=model_signature.DataType.FLOAT)], - ) -} - -_BASIC_DEPENDENCIES_TARGET = list( - sorted( - map( - lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), - _model_meta._BASIC_DEPENDENCIES, - ) - ) -) - -_BASIC_DEPENDENCIES_TARGET_WITH_SNOWML = list( - sorted( - map( - lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), - _model_meta._BASIC_DEPENDENCIES + [env_utils._SNOWML_PKG_NAME], - ) - ) -) - - -class ModelMetaTest(absltest.TestCase): - def test_model_meta_dependencies_no_packages(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG - ) as meta: - self.assertListEqual(meta.pip_requirements, []) - self.assertListEqual(meta.conda_dependencies, _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML) - self.assertFalse(hasattr(meta, "local_ml_library_version")) - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, []) - self.assertListEqual(loaded_meta.conda_dependencies, _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML) - self.assertFalse(hasattr(meta, "local_ml_library_version")) - - def test_model_meta_dependencies_no_packages_embedded_snowml(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - embed_local_ml_library=True, - ) as meta: - 
self.assertListEqual(meta.pip_requirements, []) - self.assertListEqual(meta.conda_dependencies, _BASIC_DEPENDENCIES_TARGET) - self.assertTrue(hasattr(meta, "local_ml_library_version")) - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, []) - self.assertListEqual(loaded_meta.conda_dependencies, _BASIC_DEPENDENCIES_TARGET) - self.assertTrue(hasattr(meta, "local_ml_library_version")) - - def test_model_meta_dependencies_dup_basic_dep(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["pandas"], - ) as meta: - dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") - dep_target.append("pandas") - dep_target.sort() - - self.assertListEqual(meta.pip_requirements, []) - self.assertListEqual(meta.conda_dependencies, dep_target) - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, []) - self.assertListEqual(loaded_meta.conda_dependencies, dep_target) - - def test_model_meta_dependencies_dup_basic_dep_other_channel(self) -> None: - with self.assertWarns(UserWarning): - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["conda-forge::pandas"], - ) as meta: - dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") - dep_target.append("conda-forge::pandas") - dep_target.sort() - - self.assertListEqual(meta.pip_requirements, []) - self.assertListEqual(meta.conda_dependencies, dep_target) - - meta_dict = meta.to_dict() - - with self.assertWarns(UserWarning): - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, []) - self.assertListEqual(loaded_meta.conda_dependencies, dep_target) - - def test_model_meta_dependencies_dup_basic_dep_pip(self) -> None: - with self.assertWarns(UserWarning): - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - pip_requirements=["pandas"], - ) as meta: - dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.sort() - - self.assertListEqual(meta.pip_requirements, ["pandas"]) - self.assertListEqual(meta.conda_dependencies, dep_target) - - meta_dict = meta.to_dict() - - with self.assertWarns(UserWarning): - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, ["pandas"]) - self.assertListEqual(loaded_meta.conda_dependencies, dep_target) - - def test_model_meta_dependencies_conda(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["pytorch"], - ) as meta: - dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.append("pytorch") - dep_target.sort() - - self.assertListEqual(meta.pip_requirements, []) - self.assertListEqual(meta.conda_dependencies, dep_target) - - meta_dict = meta.to_dict() - - 
loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, []) - self.assertListEqual(loaded_meta.conda_dependencies, dep_target) - - def test_model_meta_dependencies_pip(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - pip_requirements=["torch"], - ) as meta: - dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.sort() - - self.assertListEqual(meta.pip_requirements, ["torch"]) - self.assertListEqual(meta.conda_dependencies, dep_target) - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, ["torch"]) - self.assertListEqual(loaded_meta.conda_dependencies, dep_target) - - def test_model_meta_dependencies_both(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["pytorch"], - pip_requirements=["torch"], - ) as meta: - dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.append("pytorch") - dep_target.sort() - - self.assertListEqual(meta.pip_requirements, ["torch"]) - self.assertListEqual(meta.conda_dependencies, dep_target) - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertListEqual(loaded_meta.pip_requirements, ["torch"]) - self.assertListEqual(loaded_meta.conda_dependencies, dep_target) - - def test_model_meta_override_py_version(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, python_version="2.7" - ) as meta: - self.assertEqual(meta.python_version, "2.7") - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertEqual(loaded_meta.python_version, "2.7") - - with self.assertRaises(ValueError): - meta = _model_meta.ModelMetadata( - name="model1", model_type="custom", signatures=_DUMMY_SIG, python_version="a" - ) - - def test_model_meta_metadata(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - metadata={"foo": "bar"}, - ) as meta: - saved_meta = meta - loaded_meta = _model_meta.ModelMetadata.load_model_metadata(tmpdir) - - self.assertEqual(saved_meta.metadata, loaded_meta.metadata) - self.assertDictEqual(saved_meta.to_dict(), loaded_meta.to_dict()) - - def test_model_meta_check(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - metadata={"foo": "bar"}, - ): - pass - with open(os.path.join(tmpdir, _model_meta.ModelMetadata.MODEL_METADATA_FILE), encoding="utf-8") as f: - meta_yaml_data = yaml.safe_load(f) - - del meta_yaml_data["version"] - - with open(os.path.join(tmpdir, _model_meta.ModelMetadata.MODEL_METADATA_FILE), "w", encoding="utf-8") as f: - yaml.safe_dump(meta_yaml_data, f) - - with self.assertRaises(NotImplementedError): - _ = _model_meta.ModelMetadata.load_model_metadata(tmpdir) - - def test_model_meta_cuda(self) -> None: - with 
tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG - ) as meta: - with self.assertRaisesRegex(ValueError, "Cannot set CUDA version as a non-str object."): - meta.cuda_version = None - - meta.cuda_version = "11.7" - - meta_dict = meta.to_dict() - - loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) - - self.assertEqual(loaded_meta.cuda_version, "11.7") - - with self.assertRaisesRegex(ValueError, "Different CUDA version .+ and .+ found in the same model!"): - loaded_meta.cuda_version = "12.0" - - -if __name__ == "__main__": - absltest.main() diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py deleted file mode 100644 index 3b23c49a..00000000 --- a/snowflake/ml/model/_model_test.py +++ /dev/null @@ -1,346 +0,0 @@ -import importlib -import os -import sys -import tempfile -from typing import cast -from unittest import mock - -import numpy as np -import pandas as pd -from absl.testing import absltest -from sklearn import linear_model - -from snowflake.ml._internal import env as snowml_env, env_utils, file_utils -from snowflake.ml.model import _model as model_api, custom_model, model_signature -from snowflake.ml.modeling.linear_model import ( # type:ignore[attr-defined] - LinearRegression, -) -from snowflake.ml.test_utils import exception_utils, mock_session -from snowflake.snowpark import FileOperation, Session - - -class DemoModelWithManyArtifacts(custom_model.CustomModel): - def __init__(self, context: custom_model.ModelContext) -> None: - super().__init__(context) - with open(os.path.join(context.path("bias"), "bias1"), encoding="utf-8") as f: - v1 = int(f.read()) - with open(os.path.join(context.path("bias"), "bias2"), encoding="utf-8") as f: - v2 = int(f.read()) - self.bias = v1 + v2 - - @custom_model.inference_api - def predict(self, input: pd.DataFrame) -> pd.DataFrame: - return pd.DataFrame({"output": input["c1"] + self.bias}) - - -class DemoModel(custom_model.CustomModel): - def __init__(self, context: custom_model.ModelContext) -> None: - super().__init__(context) - - @custom_model.inference_api - def predict(self, input: pd.DataFrame) -> pd.DataFrame: - return pd.DataFrame({"output": input["c1"]}) - - -PY_SRC = """\ -def get_name(): - return __name__ -def get_file(): - return __file__ -""" - - -class ModelLoadHygieneTest(absltest.TestCase): - def test_model_load_hygiene(self) -> None: - with tempfile.TemporaryDirectory() as workspace: - with tempfile.TemporaryDirectory() as src_path: - fake_mod_dirpath = os.path.join(src_path, "fake", "fake_module") - os.makedirs(fake_mod_dirpath) - - py_file_path = os.path.join(fake_mod_dirpath, "p.py") - with open(py_file_path, "w", encoding="utf-8") as f: - f.write(PY_SRC) - f.flush() - - sys.path.insert(0, src_path) - - from fake.fake_module import p - - self.assertEqual(p.__file__, py_file_path) - - lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) - arr = np.array([[1, 2, 3], [4, 2, 5]]) - d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - model_api._save( - name="model1", - local_dir_path=os.path.join(workspace, "model1"), - model=lm, - sample_input=d, - metadata={"author": "halu", "version": "1"}, - code_paths=[os.path.join(src_path, "fake")], - ) - - _ = model_api._load(local_dir_path=os.path.join(workspace, "model1")) - from fake.fake_module import p - - self.assertEqual(p.__file__, os.path.join(workspace, "model1", "code", "fake", "fake_module", "p.py")) - - 
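# (By this point _load has removed the model's code/ directory from sys.path again, so
# reloading p should resolve it back to the original file under src_path.)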
importlib.reload(p) - self.assertEqual(p.__file__, py_file_path) - sys.path.remove(src_path) - - def test_model_save_validation(self) -> None: - with tempfile.TemporaryDirectory() as workspace: - with tempfile.TemporaryDirectory() as src_path: - fake_mod_dirpath = os.path.join(src_path, "snowflake", "fake_module") - os.makedirs(fake_mod_dirpath) - - py_file_path = os.path.join(fake_mod_dirpath, "p.py") - with open(py_file_path, "w", encoding="utf-8") as f: - f.write(PY_SRC) - f.flush() - - lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) - arr = np.array([[1, 2, 3], [4, 2, 5]]) - d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - with self.assertRaises(ValueError): - model_api._save( - name="model1", - local_dir_path=os.path.join(workspace, "model1"), - model=lm, - sample_input=d, - metadata={"author": "halu", "version": "1"}, - code_paths=[os.path.join(src_path, "snowflake")], - ) - - with tempfile.TemporaryDirectory() as src_path: - py_file_path = os.path.join(src_path, "snowflake.py") - with open(py_file_path, "w", encoding="utf-8") as f: - f.write(PY_SRC) - f.flush() - - lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) - arr = np.array([[1, 2, 3], [4, 2, 5]]) - d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - with self.assertRaises(ValueError): - model_api._save( - name="model1", - local_dir_path=os.path.join(workspace, "model1"), - model=lm, - sample_input=d, - metadata={"author": "halu", "version": "1"}, - code_paths=[py_file_path], - ) - - def test_zipimport_snowml(self) -> None: - snowml_path, snowml_start_path = file_utils.get_package_path("snowflake.ml", strategy="last") - with tempfile.TemporaryDirectory() as workspace: - zipped_snowml_path = os.path.join(workspace, "snowml.zip") - with open(zipped_snowml_path, "wb") as f: - with file_utils.zip_file_or_directory_to_stream(snowml_path, snowml_start_path) as zip_stream: - f.write(zip_stream.getbuffer()) - - sys.path.append(zipped_snowml_path) - try: - lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) - arr = np.array([[1, 2, 3], [4, 2, 5]]) - d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - model_api._save( - name="model1", - local_dir_path=os.path.join(workspace, "model1"), - model=lm, - sample_input=d, - metadata={"author": "halu", "version": "1"}, - options={"embed_local_ml_library": True}, - ) - self.assertTrue( - os.path.exists(os.path.join(workspace, "model1", "code", "snowflake", "ml", "model", "_model.py")) - ) - finally: - sys.path.remove(zipped_snowml_path) - - -class ModelInterfaceTest(absltest.TestCase): - def test_save_interface(self) -> None: - m_session = mock_session.MockSession(conn=None, test_case=self) - c_session = cast(Session, m_session) - - stage_path = '@"db"."schema"."stage"/model.zip' - - arr = np.array([[1, 2, 3], [4, 2, 5]]) - d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - - with exception_utils.assert_snowml_exceptions( - self, - expected_original_error_type=ValueError, - expected_regex="Signatures and sample_input both cannot be specified at the same time.", - ): - model_api.save_model( # type:ignore[call-overload] - name="model1", - session=c_session, - model_stage_file_path=stage_path, - model=linear_model.LinearRegression(), - sample_input=d, - signatures={"predict": model_signature.ModelSignature(inputs=[], outputs=[])}, - ) - - with exception_utils.assert_snowml_exceptions( - self, - expected_original_error_type=ValueError, - expected_regex="Signatures and sample_input both cannot be None at the same time for 
this kind of model.", - ): - model_api.save_model( - name="model1", - session=c_session, - model_stage_file_path=stage_path, - model=linear_model.LinearRegression(), - ) - - mock_meta = mock.MagicMock() - mock_meta.signatures = mock.MagicMock() - with mock.patch.object(model_api, "_save", return_value=mock_meta) as mock_save: - with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - with mock.patch.object( - env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] - ): - model_api.save_model( - name="model1", - session=c_session, - model_stage_file_path=stage_path, - model=LinearRegression(), - ) - mock_save.assert_called_once() - - with mock.patch.object(model_api, "_save", return_value=mock_meta) as mock_save: - with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - with mock.patch.object( - env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] - ): - model_api.save_model( - name="model1", - session=c_session, - model_stage_file_path=stage_path, - model=LinearRegression(), - ) - - mock_save.assert_called_once() - - with exception_utils.assert_snowml_exceptions( - self, - expected_original_error_type=ValueError, - expected_regex="Provided model path in the stage [^\\s]* must be a path to a zip file.", - ): - model_api.save_model( - name="model1", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path='@"db"."schema"."stage"/model', - sample_input=d, - ) - - with mock.patch.object(model_api, "_save", return_value=mock_meta): - with mock.patch.object(FileOperation, "put_stream", return_value=None): - with mock.patch.object( - env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=None - ): - with self.assertLogs(level="INFO") as cm: - model_api.save_model( - name="model1", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path=stage_path, - sample_input=d, - ) - self.assertListEqual( - cm.output[:1], - [ - ( - f"INFO:absl:Local snowflake-ml-python library has version {snowml_env.VERSION}," - " which is not available in the Snowflake server, embedding local ML " - "library automatically." 
- ) - ], - ) - - with mock.patch.object(model_api, "_save", return_value=mock_meta): - with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - with mock.patch.object( - env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] - ): - model_api.save_model( - name="model1", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path=stage_path, - sample_input=d, - ) - mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=False) - - with mock.patch.object(model_api, "_save", return_value=mock_meta): - with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - with mock.patch.object( - env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] - ): - model_api.save_model( - name="model1", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path=stage_path, - sample_input=d, - options={"allow_overwritten_stage_file": True}, - ) - mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=True) - - def test_load_interface(self) -> None: - m_session = mock_session.MockSession(conn=None, test_case=self) - c_session = cast(Session, m_session) - - with exception_utils.assert_snowml_exceptions( - self, - expected_original_error_type=ValueError, - expected_regex="Provided model path in the stage [^\\s]* must be a path to a zip file.", - ): - model_api.load_model(session=c_session, model_stage_file_path='@"db"."schema"."stage"/model') - - -class ModelTest(absltest.TestCase): - def test_bad_save_model(self) -> None: - tmpdir = self.create_tempdir() - os.mkdir(os.path.join(tmpdir.full_path, "bias")) - with open(os.path.join(tmpdir.full_path, "bias", "bias1"), "w", encoding="utf-8") as f: - f.write("25") - with open(os.path.join(tmpdir.full_path, "bias", "bias2"), "w", encoding="utf-8") as f: - f.write("68") - lm = DemoModelWithManyArtifacts( - custom_model.ModelContext(models={}, artifacts={"bias": os.path.join(tmpdir.full_path, "bias")}) - ) - arr = np.array([[1, 2, 3], [4, 2, 5]]) - d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - s = {"predict": model_signature.infer_signature(d, lm.predict(d))} - - with self.assertRaises(ValueError): - model_api._save( - name="model1", - local_dir_path=os.path.join(tmpdir.full_path, "model1"), - model=lm, - signatures={**s, "another_predict": s["predict"]}, - metadata={"author": "halu", "version": "1"}, - ) - - model_api._save( - name="model1", - local_dir_path=os.path.join(tmpdir.full_path, "model1"), - model=lm, - signatures=s, - metadata={"author": "halu", "version": "1"}, - python_version="3.5.2", - ) - - _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), meta_only=True) - - with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - - -if __name__ == "__main__": - absltest.main() diff --git a/snowflake/ml/model/_module_model/BUILD.bazel b/snowflake/ml/model/_module_model/BUILD.bazel new file mode 100644 index 00000000..552d138d --- /dev/null +++ b/snowflake/ml/model/_module_model/BUILD.bazel @@ -0,0 +1,28 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "module_model", + srcs = ["module_model.py"], + deps = [ + "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal:env_utils", + 
"//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager:model_packager", + ], +) + +py_test( + name = "module_model_test", + srcs = ["module_model_test.py"], + deps = [ + ":module_model", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/test_utils:mock_session", + ], +) diff --git a/snowflake/ml/model/_module_model/module_manifest/BUILD.bazel b/snowflake/ml/model/_module_model/module_manifest/BUILD.bazel new file mode 100644 index 00000000..f2afd157 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_manifest/BUILD.bazel @@ -0,0 +1,8 @@ +load("//bazel:py_rules.bzl", "py_library") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "module_manifest", + srcs = ["module_manifest.py"], +) diff --git a/snowflake/ml/model/_module_model/module_manifest/module_manifest.py b/snowflake/ml/model/_module_model/module_manifest/module_manifest.py new file mode 100644 index 00000000..3a795fd9 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_manifest/module_manifest.py @@ -0,0 +1,2 @@ +class ModuleManifest: + ... diff --git a/snowflake/ml/model/_module_model/module_method/BUILD.bazel b/snowflake/ml/model/_module_model/module_method/BUILD.bazel new file mode 100644 index 00000000..b75316f1 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/BUILD.bazel @@ -0,0 +1,28 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "handler_generator", + srcs = ["handler_generator.py"], + data = [ + "infer_handler.py_template", + ], +) + +py_test( + name = "handler_generator_test", + srcs = ["handler_generator_test.py"], + data = [ + "fixtures/handler_fixture_1.py_fixture", + "fixtures/handler_fixture_2.py_fixture", + ], + deps = [ + ":handler_generator", + ], +) + +py_library( + name = "module_method", + srcs = ["module_method.py"], +) diff --git a/snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_1.py_fixture b/snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_1.py_fixture new file mode 100644 index 00000000..e6c64872 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_1.py_fixture @@ -0,0 +1,78 @@ +import fcntl +import functools +import inspect +import os +import sys +import threading +import zipfile +from types import TracebackType +from typing import Optional, Type + +import anyio +import pandas as pd +from _snowflake import vectorized + +from snowflake.ml.model._packager import model_packager + + +class FileLock: + def __enter__(self) -> None: + self._lock = threading.Lock() + self._lock.acquire() + self._fd = open("/tmp/lockfile.LOCK", "w+") + fcntl.lockf(self._fd, fcntl.LOCK_EX) + + def __exit__( + self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], traceback: Optional[TracebackType] + ) -> None: + self._fd.close() + self._lock.release() + + +# User-defined parameters +MODEL_FILE_NAME = "model.zip" +TARGET_METHOD = "predict" +MAX_BATCH_SIZE = None + + +# Retrieve the model +IMPORT_DIRECTORY_NAME = "snowflake_import_directory" +import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] + +model_dir_name = os.path.splitext(MODEL_FILE_NAME)[0] +zip_model_path = os.path.join(import_dir, MODEL_FILE_NAME) +extracted = "/tmp/models" +extracted_model_dir_path = 
os.path.join(extracted, model_dir_name) + +with FileLock(): + if not os.path.isdir(extracted_model_dir_path): + with zipfile.ZipFile(zip_model_path, "r") as myzip: + myzip.extractall(extracted_model_dir_path) + +# Load the model +pk = model_packager.ModelPackager(extracted_model_dir_path) +pk.load(as_custom_model=True) +assert pk.model, "model is not loaded" +assert pk.meta, "model metadata is not loaded" + +# Determine the actual runner +model = pk.model +meta = pk.meta +func = getattr(model, TARGET_METHOD) +if inspect.iscoroutinefunction(func): + runner = functools.partial(anyio.run, func) +else: + runner = functools.partial(func) + +# Determine preprocess parameters +features = meta.signatures[TARGET_METHOD].inputs +input_cols = [feature.name for feature in features] +dtype_map = {feature.name: feature.as_dtype() for feature in features} + + +# Actual handler +@vectorized(input=pd.DataFrame, max_batch_size=MAX_BATCH_SIZE) +def infer(df: pd.DataFrame) -> dict: + input_df = pd.json_normalize(df[0]).astype(dtype=dtype_map) + predictions_df = runner(input_df[input_cols]) + return predictions_df.to_dict("records") diff --git a/snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_2.py_fixture b/snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_2.py_fixture new file mode 100644 index 00000000..3058fa50 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/fixtures/handler_fixture_2.py_fixture @@ -0,0 +1,78 @@ +import fcntl +import functools +import inspect +import os +import sys +import threading +import zipfile +from types import TracebackType +from typing import Optional, Type + +import anyio +import pandas as pd +from _snowflake import vectorized + +from snowflake.ml.model._packager import model_packager + + +class FileLock: + def __enter__(self) -> None: + self._lock = threading.Lock() + self._lock.acquire() + self._fd = open("/tmp/lockfile.LOCK", "w+") + fcntl.lockf(self._fd, fcntl.LOCK_EX) + + def __exit__( + self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], traceback: Optional[TracebackType] + ) -> None: + self._fd.close() + self._lock.release() + + +# User-defined parameters +MODEL_FILE_NAME = "model.zip" +TARGET_METHOD = "__call__" +MAX_BATCH_SIZE = 10 + + +# Retrieve the model +IMPORT_DIRECTORY_NAME = "snowflake_import_directory" +import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] + +model_dir_name = os.path.splitext(MODEL_FILE_NAME)[0] +zip_model_path = os.path.join(import_dir, MODEL_FILE_NAME) +extracted = "/tmp/models" +extracted_model_dir_path = os.path.join(extracted, model_dir_name) + +with FileLock(): + if not os.path.isdir(extracted_model_dir_path): + with zipfile.ZipFile(zip_model_path, "r") as myzip: + myzip.extractall(extracted_model_dir_path) + +# Load the model +pk = model_packager.ModelPackager(extracted_model_dir_path) +pk.load(as_custom_model=True) +assert pk.model, "model is not loaded" +assert pk.meta, "model metadata is not loaded" + +# Determine the actual runner +model = pk.model +meta = pk.meta +func = getattr(model, TARGET_METHOD) +if inspect.iscoroutinefunction(func): + runner = functools.partial(anyio.run, func) +else: + runner = functools.partial(func) + +# Determine preprocess parameters +features = meta.signatures[TARGET_METHOD].inputs +input_cols = [feature.name for feature in features] +dtype_map = {feature.name: feature.as_dtype() for feature in features} + + +# Actual handler +@vectorized(input=pd.DataFrame, max_batch_size=MAX_BATCH_SIZE) +def infer(df: pd.DataFrame) -> 
dict: + input_df = pd.json_normalize(df[0]).astype(dtype=dtype_map) + predictions_df = runner(input_df[input_cols]) + return predictions_df.to_dict("records") diff --git a/snowflake/ml/model/_module_model/module_method/handler_generator.py b/snowflake/ml/model/_module_model/module_method/handler_generator.py new file mode 100644 index 00000000..90af240e --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/handler_generator.py @@ -0,0 +1,43 @@ +import pathlib +from typing import Optional, TypedDict + +import importlib_resources +from typing_extensions import NotRequired + + +class HandlerGenerateOptions(TypedDict): + max_batch_size: NotRequired[int] + + +class HandlerGenerator: + HANDLER_NAME = "infer" + + def __init__( + self, + model_file_stage_path: pathlib.PurePosixPath, + ) -> None: + self.model_file_stage_path = model_file_stage_path + + def generate( + self, + handler_file_path: pathlib.Path, + target_method: str, + options: Optional[HandlerGenerateOptions] = None, + ) -> None: + if options is None: + options = {} + handler_template = ( + importlib_resources.files("snowflake.ml.model._module_model.module_method") + .joinpath("infer_handler.py_template") # type: ignore[no-untyped-call] + .read_text() + ) + + udf_code = handler_template.format( + model_file_name=self.model_file_stage_path.name, + target_method=target_method, + max_batch_size=options.get("max_batch_size", None), + handler_name=HandlerGenerator.HANDLER_NAME, + ) + with open(handler_file_path, "w", encoding="utf-8") as f: + f.write(udf_code) + f.flush() diff --git a/snowflake/ml/model/_module_model/module_method/handler_generator_test.py b/snowflake/ml/model/_module_model/module_method/handler_generator_test.py new file mode 100644 index 00000000..0336680a --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/handler_generator_test.py @@ -0,0 +1,46 @@ +import pathlib +import tempfile + +import importlib_resources +from absl.testing import absltest + +from snowflake.ml.model._module_model.module_method import handler_generator + + +class HandlerGeneratorTest(absltest.TestCase): + def test_handler_generator(self) -> None: + hg = handler_generator.HandlerGenerator(pathlib.PurePosixPath("@a.b.c/abc/model.zip")) + with tempfile.TemporaryDirectory() as tmpdir: + hg.generate( + pathlib.Path(tmpdir, "handler.py"), + "predict", + ) + with open(pathlib.Path(tmpdir, "handler.py"), encoding="utf-8") as f: + self.assertEqual( + ( + importlib_resources.files("snowflake.ml.model._module_model.module_method") + .joinpath("fixtures") # type: ignore[no-untyped-call] + .joinpath("handler_fixture_1.py_fixture") + .read_text() + ), + f.read(), + ) + hg.generate( + pathlib.Path(tmpdir, "another_handler.py"), + "__call__", + options=handler_generator.HandlerGenerateOptions(max_batch_size=10), + ) + with open(pathlib.Path(tmpdir, "another_handler.py"), encoding="utf-8") as f: + self.assertEqual( + ( + importlib_resources.files("snowflake.ml.model._module_model.module_method") + .joinpath("fixtures") # type: ignore[no-untyped-call] + .joinpath("handler_fixture_2.py_fixture") + .read_text() + ), + f.read(), + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_module_model/module_method/infer_handler.py_template b/snowflake/ml/model/_module_model/module_method/infer_handler.py_template new file mode 100644 index 00000000..14c721d5 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/infer_handler.py_template @@ -0,0 +1,78 @@ +import fcntl +import functools +import inspect 
+import os +import sys +import threading +import zipfile +from types import TracebackType +from typing import Optional, Type + +import anyio +import pandas as pd +from _snowflake import vectorized + +from snowflake.ml.model._packager import model_packager + + +class FileLock: + def __enter__(self) -> None: + self._lock = threading.Lock() + self._lock.acquire() + self._fd = open("/tmp/lockfile.LOCK", "w+") + fcntl.lockf(self._fd, fcntl.LOCK_EX) + + def __exit__( + self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], traceback: Optional[TracebackType] + ) -> None: + self._fd.close() + self._lock.release() + + +# User-defined parameters +MODEL_FILE_NAME = "{model_file_name}" +TARGET_METHOD = "{target_method}" +MAX_BATCH_SIZE = {max_batch_size} + + +# Retrieve the model +IMPORT_DIRECTORY_NAME = "snowflake_import_directory" +import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] + +model_dir_name = os.path.splitext(MODEL_FILE_NAME)[0] +zip_model_path = os.path.join(import_dir, MODEL_FILE_NAME) +extracted = "/tmp/models" +extracted_model_dir_path = os.path.join(extracted, model_dir_name) + +with FileLock(): + if not os.path.isdir(extracted_model_dir_path): + with zipfile.ZipFile(zip_model_path, "r") as myzip: + myzip.extractall(extracted_model_dir_path) + +# Load the model +pk = model_packager.ModelPackager(extracted_model_dir_path) +pk.load(as_custom_model=True) +assert pk.model, "model is not loaded" +assert pk.meta, "model metadata is not loaded" + +# Determine the actual runner +model = pk.model +meta = pk.meta +func = getattr(model, TARGET_METHOD) +if inspect.iscoroutinefunction(func): + runner = functools.partial(anyio.run, func) +else: + runner = functools.partial(func) + +# Determine preprocess parameters +features = meta.signatures[TARGET_METHOD].inputs +input_cols = [feature.name for feature in features] +dtype_map = {{feature.name: feature.as_dtype() for feature in features}} + + +# Actual handler +@vectorized(input=pd.DataFrame, max_batch_size=MAX_BATCH_SIZE) +def {handler_name}(df: pd.DataFrame) -> dict: + input_df = pd.json_normalize(df[0]).astype(dtype=dtype_map) + predictions_df = runner(input_df[input_cols]) + return predictions_df.to_dict("records") diff --git a/snowflake/ml/model/_module_model/module_method/module_method.py b/snowflake/ml/model/_module_model/module_method/module_method.py new file mode 100644 index 00000000..c72d7b65 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_method/module_method.py @@ -0,0 +1,2 @@ +class ModuleMethod: + ... diff --git a/snowflake/ml/model/_module_model/module_model.py b/snowflake/ml/model/_module_model/module_model.py new file mode 100644 index 00000000..a998fb07 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_model.py @@ -0,0 +1,137 @@ +import glob +import pathlib +import tempfile +import zipfile +from types import ModuleType +from typing import Dict, List, Optional + +from absl import logging +from packaging import requirements + +from snowflake.ml._internal import env as snowml_env, env_utils, file_utils +from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model._packager import model_packager +from snowflake.snowpark import Session +from snowflake.snowpark._internal import utils as snowpark_utils + + +class ModuleModel: + """Top-level class to construct and represent contents in a MODEL object in SQL. + + Attributes: + session: The Snowpark Session. 
+ stage_path: A stage path representing the base directory where the content of a MODEL object will exist. + workspace_path: A local path that is the exact mapping of the `stage_path`. + + (TODO) manifest: A ModuleManifest object managing the MANIFEST file generation. + (TODO) runtimes: A list of ModuleRuntime objects managing the runtimes and environment in the MODEL object. + (TODO) methods: A list of ModuleMethod objects managing the methods registered to the MODEL object. + packager: A ModelPackager object managing the (un)packaging of a Snowflake Native Model in the MODEL object. + + _packager_workspace_path: A local path where the packager dumps all of its files and which ModuleModel + will then zip. This would not be required if directory import worked. + """ + + MODEL_FILE_REL_PATH = "model.zip" + + def __init__(self, session: Session, stage_path: str) -> None: + self.session = session + self.stage_path = pathlib.PurePosixPath(stage_path) + + self._workspace = tempfile.TemporaryDirectory() + self._packager_workspace = tempfile.TemporaryDirectory() + + self.packager = model_packager.ModelPackager(local_dir_path=str(self._packager_workspace_path)) + + def __del__(self) -> None: + self._workspace.cleanup() + self._packager_workspace.cleanup() + + @property + def workspace_path(self) -> pathlib.Path: + return pathlib.Path(self._workspace.name) + + @property + def _packager_workspace_path(self) -> pathlib.Path: + return pathlib.Path(self._packager_workspace.name) + + @property + def model_stage_path(self) -> str: + return (self.stage_path / ModuleModel.MODEL_FILE_REL_PATH).as_posix() + + @property + def model_local_path(self) -> str: + return str(self.workspace_path / ModuleModel.MODEL_FILE_REL_PATH) + + def save( + self, + *, + name: str, + model: model_types.SupportedModelType, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + sample_input: Optional[model_types.SupportedDataType] = None, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, + ) -> None: + if not options: + options = model_types.BaseModelSaveOption() + + if not snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] + snowml_server_availability = env_utils.validate_requirements_in_snowflake_conda_channel( + session=self.session, + reqs=[requirements.Requirement(f"snowflake-ml-python=={snowml_env.VERSION}")], + python_version=snowml_env.PYTHON_VERSION, + ) + + if snowml_server_availability is None and options.get("embed_local_ml_library", False) is False: + logging.info( + f"Local snowflake-ml-python library has version {snowml_env.VERSION}," + " which is not available in the Snowflake server, embedding local ML library automatically."
+ ) + options["embed_local_ml_library"] = True + + self.packager.save( + name=name, + model=model, + signatures=signatures, + sample_input=sample_input, + metadata=metadata, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + python_version=python_version, + ext_modules=ext_modules, + code_paths=code_paths, + options=options, + ) + with file_utils.zip_file_or_directory_to_stream( + str(self._packager_workspace_path), + leading_path=str(self._packager_workspace_path), + ) as zf: + with open(self.model_local_path, "wb") as f: + f.write(zf.getbuffer()) + f.flush() + + file_utils.upload_directory_to_stage(self.session, local_path=self.workspace_path, stage_path=self.stage_path) + + def load( + self, + *, + meta_only: bool = False, + options: Optional[model_types.ModelLoadOption] = None, + ) -> None: + file_utils.download_directory_from_stage( + self.session, stage_path=self.stage_path, local_path=self.workspace_path + ) + + # TODO (Server-side Model Rollout): Remove this section. + model_zip_path = pathlib.Path(glob.glob(str(self.workspace_path / "*.zip"))[0]) + ModuleModel.MODEL_FILE_REL_PATH = str(model_zip_path.relative_to(self.workspace_path)) + + with zipfile.ZipFile(self.model_local_path, mode="r", compression=zipfile.ZIP_DEFLATED) as zf: + zf.extractall(path=self._packager_workspace_path) + self.packager.load(meta_only=meta_only, options=options) diff --git a/snowflake/ml/model/_module_model/module_model_test.py b/snowflake/ml/model/_module_model/module_model_test.py new file mode 100644 index 00000000..d7c2a607 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_model_test.py @@ -0,0 +1,57 @@ +from typing import cast +from unittest import mock + +import numpy as np +import pandas as pd +from absl.testing import absltest +from sklearn import linear_model + +from snowflake.ml._internal import env_utils +from snowflake.ml.model._module_model import module_model +from snowflake.ml.modeling.linear_model import ( # type:ignore[attr-defined] + LinearRegression, +) +from snowflake.ml.test_utils import mock_session +from snowflake.snowpark import FileOperation, Session + + +class ModuleInterfaceTest(absltest.TestCase): + def test_save_interface(self) -> None: + m_session = mock_session.MockSession(conn=None, test_case=self) + c_session = cast(Session, m_session) + + stage_path = '@"db"."schema"."stage"' + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + + mock_pk = mock.MagicMock() + mock_pk.meta = mock.MagicMock() + mock_pk.meta.signatures = mock.MagicMock() + m = module_model.ModuleModel(session=c_session, stage_path=stage_path) + with mock.patch.object(m.packager, "save") as mock_save: + with mock.patch.object(FileOperation, "put", return_value=None) as mock_put_stream: + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] + ): + m.save( + name="model1", + model=LinearRegression(), + ) + mock_save.assert_called_once() + + m = module_model.ModuleModel(session=c_session, stage_path=stage_path) + with mock.patch.object(m.packager, "save") as mock_save: + with mock.patch.object(FileOperation, "put", return_value=None) as mock_put_stream: + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] + ): + m.save( + name="model1", + model=linear_model.LinearRegression(), + sample_input=d, + ) + mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=False) + + +if __name__ == 
"__main__": + absltest.main() diff --git a/snowflake/ml/model/_module_model/module_runtime/BUILD.bazel b/snowflake/ml/model/_module_model/module_runtime/BUILD.bazel new file mode 100644 index 00000000..975eb45f --- /dev/null +++ b/snowflake/ml/model/_module_model/module_runtime/BUILD.bazel @@ -0,0 +1,8 @@ +load("//bazel:py_rules.bzl", "py_library") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "module_runtime", + srcs = ["module_runtime.py"], +) diff --git a/snowflake/ml/model/_module_model/module_runtime/module_runtime.py b/snowflake/ml/model/_module_model/module_runtime/module_runtime.py new file mode 100644 index 00000000..a90a8628 --- /dev/null +++ b/snowflake/ml/model/_module_model/module_runtime/module_runtime.py @@ -0,0 +1,2 @@ +class ModuleRuntime: + ... diff --git a/snowflake/ml/model/_packager/BUILD.bazel b/snowflake/ml/model/_packager/BUILD.bazel new file mode 100644 index 00000000..90bffcde --- /dev/null +++ b/snowflake/ml/model/_packager/BUILD.bazel @@ -0,0 +1,68 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "model_handler", + srcs = ["model_handler.py"], + deps = [ + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_handlers:_base", + ], +) + +py_library( + name = "model_packager", + srcs = ["model_packager.py"], + deps = [ + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal/exceptions", + "//snowflake/ml/model:custom_model", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager:model_handler", + "//snowflake/ml/model/_packager/model_handlers:custom", + "//snowflake/ml/model/_packager/model_handlers:huggingface_pipeline", + "//snowflake/ml/model/_packager/model_handlers:llm", + "//snowflake/ml/model/_packager/model_handlers:mlflow", + "//snowflake/ml/model/_packager/model_handlers:pytorch", + "//snowflake/ml/model/_packager/model_handlers:sklearn", + "//snowflake/ml/model/_packager/model_handlers:snowmlmodel", + "//snowflake/ml/model/_packager/model_handlers:tensorflow", + "//snowflake/ml/model/_packager/model_handlers:torchscript", + "//snowflake/ml/model/_packager/model_handlers:xgboost", + "//snowflake/ml/model/_packager/model_meta", + ], +) + +py_test( + name = "model_packager_test", + srcs = ["model_packager_test.py"], + deps = [ + ":model_packager", + "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model:custom_model", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/test_utils:exception_utils", + ], +) + +py_test( + name = "model_handler_test", + srcs = ["model_handler_test.py"], + deps = [ + ":model_handler", + "//snowflake/ml/_internal:env", + "//snowflake/ml/model/_packager/model_handlers:custom", + "//snowflake/ml/model/_packager/model_handlers:huggingface_pipeline", + "//snowflake/ml/model/_packager/model_handlers:mlflow", + "//snowflake/ml/model/_packager/model_handlers:pytorch", + "//snowflake/ml/model/_packager/model_handlers:sklearn", + "//snowflake/ml/model/_packager/model_handlers:snowmlmodel", + "//snowflake/ml/model/_packager/model_handlers:tensorflow", + "//snowflake/ml/model/_packager/model_handlers:torchscript", + "//snowflake/ml/model/_packager/model_handlers:xgboost", + "//snowflake/ml/test_utils:test_env_utils", + ], +) diff --git a/snowflake/ml/model/_packager/model_env/BUILD.bazel b/snowflake/ml/model/_packager/model_env/BUILD.bazel new file mode 
100644 index 00000000..a01f248e --- /dev/null +++ b/snowflake/ml/model/_packager/model_env/BUILD.bazel @@ -0,0 +1,23 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "model_env", + srcs = ["model_env.py"], + deps = [ + "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/model/_packager/model_meta:model_meta_schema", + ], +) + +py_test( + name = "model_env_test", + srcs = ["model_env_test.py"], + deps = [ + ":model_env", + "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal:env_utils", + ], +) diff --git a/snowflake/ml/model/_packager/model_env/model_env.py b/snowflake/ml/model/_packager/model_env/model_env.py new file mode 100644 index 00000000..f9e6fe0f --- /dev/null +++ b/snowflake/ml/model/_packager/model_env/model_env.py @@ -0,0 +1,392 @@ +import collections +import itertools +import os +import pathlib +import warnings +from typing import DefaultDict, List, Optional + +from packaging import requirements, version + +from snowflake.ml._internal import env as snowml_env, env_utils +from snowflake.ml.model._packager.model_meta import model_meta_schema + +# requirement: Full version requirement, where the name is the conda package name. +# pip_name: The name of the dependency in PyPI. +ModelDependency = collections.namedtuple("ModelDependency", ["requirement", "pip_name"]) + +_DEFAULT_ENV_DIR = "env" +_DEFAULT_CONDA_ENV_FILENAME = "conda.yml" +_DEFAULT_PIP_REQUIREMENTS_FILENAME = "requirements.txt" + +# The default CUDA version is chosen based on the driver availability in SPCS. +# If changing this version, we also need to change the version of the default PyTorch in the HuggingFace pipeline handler to +# make sure they are compatible. +DEFAULT_CUDA_VERSION = "11.7" + + +class ModelEnv: + def __init__( + self, + conda_env_rel_path: Optional[str] = None, + pip_requirements_rel_path: Optional[str] = None, + ) -> None: + if conda_env_rel_path is None: + conda_env_rel_path = os.path.join(_DEFAULT_ENV_DIR, _DEFAULT_CONDA_ENV_FILENAME) + if pip_requirements_rel_path is None: + pip_requirements_rel_path = os.path.join(_DEFAULT_ENV_DIR, _DEFAULT_PIP_REQUIREMENTS_FILENAME) + self.conda_env_rel_path = pathlib.PurePosixPath(pathlib.Path(conda_env_rel_path).as_posix()) + self.pip_requirements_rel_path = pathlib.PurePosixPath(pathlib.Path(pip_requirements_rel_path).as_posix()) + self._conda_dependencies: DefaultDict[str, List[requirements.Requirement]] = collections.defaultdict(list) + self._pip_requirements: List[requirements.Requirement] = [] + self._python_version: version.Version = version.parse(snowml_env.PYTHON_VERSION) + self._cuda_version: Optional[version.Version] = None + self._snowpark_ml_version: version.Version = version.parse(snowml_env.VERSION) + + @property + def conda_dependencies(self) -> List[str]: + """List of conda dependencies, prefixed by their channels, required to run the model.""" + return sorted( + f"{chan}::{str(req)}" if chan else str(req) + for chan, reqs in self._conda_dependencies.items() + for req in reqs + ) + + @conda_dependencies.setter + def conda_dependencies( + self, + conda_dependencies: Optional[List[str]] = None, + ) -> None: + self._conda_dependencies = env_utils.validate_conda_dependency_string_list( + conda_dependencies if conda_dependencies else [] + ) + + @property + def pip_requirements(self) -> List[str]: + """List of pip Python package requirements for running the model.""" + return sorted(list(map(str, self._pip_requirements))) + + @pip_requirements.setter +
def pip_requirements( + self, + pip_requirements: Optional[List[str]] = None, + ) -> None: + self._pip_requirements = env_utils.validate_pip_requirement_string_list( + pip_requirements if pip_requirements else [] + ) + + @property + def python_version(self) -> str: + return f"{self._python_version.major}.{self._python_version.minor}" + + @python_version.setter + def python_version(self, python_version: Optional[str] = None) -> None: + if python_version: + self._python_version = version.parse(python_version) + + @property + def cuda_version(self) -> Optional[str]: + if self._cuda_version: + return f"{self._cuda_version.major}.{self._cuda_version.minor}" + return None + + @cuda_version.setter + def cuda_version(self, cuda_version: Optional[str] = None) -> None: + # We need to check this as CUDA version would be set inside the handler, while python_version or snowpark + # ML version would not. + if cuda_version: + parsed_cuda_version = version.parse(cuda_version) + if self._cuda_version is None: + self._cuda_version = parsed_cuda_version + else: + if self.cuda_version != f"{parsed_cuda_version.major}.{parsed_cuda_version.minor}": + raise ValueError( + f"Different CUDA version {self.cuda_version} and {cuda_version} found in the same model!" + ) + + @property + def snowpark_ml_version(self) -> str: + return str(self._snowpark_ml_version) + + @snowpark_ml_version.setter + def snowpark_ml_version(self, snowpark_ml_version: Optional[str] = None) -> None: + if snowpark_ml_version: + self._snowpark_ml_version = version.parse(snowpark_ml_version) + + def include_if_absent(self, pkgs: List[ModelDependency], check_local_version: bool = False) -> None: + """Append requirements into model env if absent. + + Args: + pkgs: A list of ModelDependency namedtuple to be appended. + check_local_version: Flag to indicate if it is required to pin to local version. Defaults to False. + """ + conda_reqs_str, pip_names_str = tuple(zip(*pkgs)) + pip_names = env_utils.validate_pip_requirement_string_list(list(pip_names_str)) + conda_reqs = env_utils.validate_conda_dependency_string_list(list(conda_reqs_str)) + + for conda_req, pip_name in zip(conda_reqs[env_utils.DEFAULT_CHANNEL_NAME], pip_names): + if check_local_version: + req_to_check = requirements.Requirement(f"{pip_name.name}{conda_req.specifier}") + req_to_add = env_utils.get_local_installed_version_of_pip_package(req_to_check) + req_to_add.name = conda_req.name + else: + req_to_add = conda_req + added_in_pip = False + for added_pip_req in self._pip_requirements: + if added_pip_req.name == pip_name.name: + warnings.warn( + ( + f"Basic dependency {req_to_add.name} specified from PIP requirements." + + " This may prevent model deploying to Snowflake Warehouse." + ), + category=UserWarning, + stacklevel=2, + ) + added_in_pip = True + if added_in_pip: + continue + try: + env_utils.append_conda_dependency( + self._conda_dependencies, (env_utils.DEFAULT_CHANNEL_NAME, req_to_add) + ) + except env_utils.DuplicateDependencyError: + pass + except env_utils.DuplicateDependencyInMultipleChannelsError: + warnings.warn( + ( + f"Basic dependency {req_to_add.name} specified from non-Snowflake channel." + + " This may prevent model deploying to Snowflake Warehouse." 
+ ), + category=UserWarning, + stacklevel=2, + ) + + def generate_env_for_cuda(self) -> None: + if self.cuda_version is None: + return + + cuda_spec = env_utils.find_dep_spec( + self._conda_dependencies, self._pip_requirements, conda_pkg_name="cuda", remove_spec=False + ) + if cuda_spec and not cuda_spec.specifier.contains(self.cuda_version): + raise ValueError( + "The CUDA requirement you specified in your conda dependencies or pip requirements is" + " conflicting with CUDA version required. Please do not specify CUDA dependency using conda" + " dependencies or pip requirements." + ) + + if not cuda_spec: + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + ("nvidia", requirements.Requirement(f"cuda=={self.cuda_version}.*")), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + + xgboost_spec = env_utils.find_dep_spec( + self._conda_dependencies, self._pip_requirements, conda_pkg_name="xgboost", remove_spec=True + ) + if xgboost_spec: + xgboost_spec.name = "py-xgboost-gpu" + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + ("conda-forge", xgboost_spec), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + + pytorch_spec = env_utils.find_dep_spec( + self._conda_dependencies, + self._pip_requirements, + conda_pkg_name="pytorch", + pip_pkg_name="torch", + remove_spec=True, + ) + pytorch_cuda_spec = env_utils.find_dep_spec( + self._conda_dependencies, + self._pip_requirements, + conda_pkg_name="pytorch-cuda", + remove_spec=False, + ) + if pytorch_cuda_spec and not pytorch_cuda_spec.specifier.contains(self.cuda_version): + raise ValueError( + "The Pytorch-CUDA requirement you specified in your conda dependencies or pip requirements is" + " conflicting with CUDA version required. Please do not specify Pytorch-CUDA dependency using conda" + " dependencies or pip requirements." 
+ ) + if pytorch_spec: + pytorch_spec.name = "pytorch" + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + ("pytorch", pytorch_spec), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + if not pytorch_cuda_spec: + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + p_chan_dep=("pytorch", requirements.Requirement(f"pytorch-cuda=={self.cuda_version}.*")), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + + tf_spec = env_utils.find_dep_spec( + self._conda_dependencies, self._pip_requirements, conda_pkg_name="tensorflow", remove_spec=True + ) + if tf_spec: + tf_spec.name = "tensorflow-gpu" + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + ("conda-forge", tf_spec), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + + transformers_spec = env_utils.find_dep_spec( + self._conda_dependencies, self._pip_requirements, conda_pkg_name="transformers", remove_spec=False + ) + if transformers_spec: + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + ("conda-forge", requirements.Requirement("accelerate>=0.22.0")), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + + # Required by bitsandbytes + try: + env_utils.append_conda_dependency( + self._conda_dependencies, + ( + env_utils.DEFAULT_CHANNEL_NAME, + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy")), + ), + ) + except (env_utils.DuplicateDependencyError, env_utils.DuplicateDependencyInMultipleChannelsError): + pass + + try: + env_utils.append_requirement_list( + self._pip_requirements, + requirements.Requirement("bitsandbytes>=0.41.0"), + ) + except env_utils.DuplicateDependencyError: + pass + + def relax_version(self) -> None: + """Relax the version requirements for both conda dependencies and pip requirements. + It detects any ==x.y.z in specifiers and replaces it with >=x.y, <(x+1) + """ + self._conda_dependencies = collections.defaultdict( + list, + { + chan: list(map(env_utils.relax_requirement_version, deps)) + for chan, deps in self._conda_dependencies.items() + }, + ) + self._pip_requirements = list(map(env_utils.relax_requirement_version, self._pip_requirements)) + + def load_from_conda_file(self, conda_env_path: pathlib.Path) -> None: + conda_dependencies_dict, pip_requirements_list, python_version = env_utils.load_conda_env_file(conda_env_path) + + for channel, channel_dependencies in conda_dependencies_dict.items(): + if channel != env_utils.DEFAULT_CHANNEL_NAME: + warnings.warn( + ( + "Found dependencies specified in the conda file from non-Snowflake channel." + " This may prevent model deploying to Snowflake Warehouse." + ), + category=UserWarning, + ) + if len(channel_dependencies) == 0 and channel not in self._conda_dependencies: + warnings.warn( + ( + f"Found additional conda channel {channel} specified in the conda file." + " This may prevent model deploying to Snowflake Warehouse."
+ ), + category=UserWarning, + ) + self._conda_dependencies[channel] = [] + + for channel_dependency in channel_dependencies: + try: + env_utils.append_conda_dependency(self._conda_dependencies, (channel, channel_dependency)) + except env_utils.DuplicateDependencyError: + pass + except env_utils.DuplicateDependencyInMultipleChannelsError: + warnings.warn( + ( + f"Dependency {channel_dependency.name} appeared in multiple channels as conda dependency." + " This may be unintentional." + ), + category=UserWarning, + ) + + if pip_requirements_list: + warnings.warn( + ( + "Found dependencies specified as pip requirements." + " This may prevent model deploying to Snowflake Warehouse." + ), + category=UserWarning, + ) + for pip_dependency in pip_requirements_list: + if any( + channel_dependency.name == pip_dependency.name + for channel_dependency in itertools.chain(*self._conda_dependencies.values()) + ): + continue + env_utils.append_requirement_list(self._pip_requirements, pip_dependency) + + if python_version: + self.python_version = python_version + + def load_from_pip_file(self, pip_requirements_path: pathlib.Path) -> None: + pip_requirements_list = env_utils.load_requirements_file(pip_requirements_path) + + if pip_requirements_list: + warnings.warn( + ( + "Found dependencies specified as pip requirements." + " This may prevent model deploying to Snowflake Warehouse." + ), + category=UserWarning, + ) + for pip_dependency in pip_requirements_list: + if any( + channel_dependency.name == pip_dependency.name + for channel_dependency in itertools.chain(*self._conda_dependencies.values()) + ): + continue + env_utils.append_requirement_list(self._pip_requirements, pip_dependency) + + def load_from_dict(self, base_dir: pathlib.Path, env_dict: model_meta_schema.ModelEnvDict) -> None: + self.conda_env_rel_path = pathlib.PurePosixPath(env_dict["conda"]) + self.pip_requirements_rel_path = pathlib.PurePosixPath(env_dict["pip"]) + + self.load_from_conda_file(base_dir / self.conda_env_rel_path) + self.load_from_pip_file(base_dir / self.pip_requirements_rel_path) + + self.python_version = env_dict["python_version"] + self.cuda_version = env_dict.get("cuda_version", None) + self.snowpark_ml_version = env_dict["snowpark_ml_version"] + + def save_as_dict(self, base_dir: pathlib.Path) -> model_meta_schema.ModelEnvDict: + env_utils.save_conda_env_file( + pathlib.Path(base_dir / self.conda_env_rel_path), self._conda_dependencies, self.python_version + ) + env_utils.save_requirements_file( + pathlib.Path(base_dir / self.pip_requirements_rel_path), self._pip_requirements + ) + return { + "conda": self.conda_env_rel_path.as_posix(), + "pip": self.pip_requirements_rel_path.as_posix(), + "python_version": self.python_version, + "cuda_version": self.cuda_version, + "snowpark_ml_version": self.snowpark_ml_version, + } diff --git a/snowflake/ml/model/_packager/model_env/model_env_test.py b/snowflake/ml/model/_packager/model_env/model_env_test.py new file mode 100644 index 00000000..5594c800 --- /dev/null +++ b/snowflake/ml/model/_packager/model_env/model_env_test.py @@ -0,0 +1,781 @@ +import copy +import os +import pathlib +import tempfile + +import yaml +from absl.testing import absltest +from packaging import requirements, version + +from snowflake.ml._internal import env as snowml_env, env_utils +from snowflake.ml.model._packager.model_env import model_env + + +class ModelEnvTest(absltest.TestCase): + def test_empty_model_env(self) -> None: + env = model_env.ModelEnv() + self.assertListEqual(env.conda_dependencies, []) 
+ self.assertListEqual(env.pip_requirements, []) + py_ver = version.parse(snowml_env.PYTHON_VERSION) + self.assertEqual(env.python_version, f"{py_ver.major}.{py_ver.minor}") + self.assertIsNone(env.cuda_version) + self.assertEqual(env.snowpark_ml_version, snowml_env.VERSION) + + def test_conda_dependencies(self) -> None: + env = model_env.ModelEnv() + env.conda_dependencies = ["package"] + self.assertListEqual(env.conda_dependencies, ["package"]) + + env.conda_dependencies = ["some_package"] + self.assertListEqual(env.conda_dependencies, ["some-package"]) + + env.conda_dependencies = ["some_package==1.0.1"] + self.assertListEqual(env.conda_dependencies, ["some-package==1.0.1"]) + + env.conda_dependencies = ["some_package<1.2,>=1.0.1"] + self.assertListEqual(env.conda_dependencies, ["some-package<1.2,>=1.0.1"]) + + env.conda_dependencies = ["channel::some_package<1.2,>=1.0.1"] + self.assertListEqual(env.conda_dependencies, ["channel::some-package<1.2,>=1.0.1"]) + + with self.assertRaisesRegex(ValueError, "Invalid package requirement _some_package<1.2,>=1.0.1 found."): + env.conda_dependencies = ["channel::_some_package<1.2,>=1.0.1"] + + env.conda_dependencies = ["::some_package<1.2,>=1.0.1"] + self.assertListEqual(env.conda_dependencies, ["some-package<1.2,>=1.0.1"]) + + env.conda_dependencies = ["another==1.3", "channel::some_package<1.2,>=1.0.1"] + self.assertListEqual(env.conda_dependencies, ["another==1.3", "channel::some-package<1.2,>=1.0.1"]) + + def test_pip_requirements(self) -> None: + env = model_env.ModelEnv() + env.pip_requirements = ["package"] + self.assertListEqual(env.pip_requirements, ["package"]) + + env.pip_requirements = ["some_package"] + self.assertListEqual(env.pip_requirements, ["some-package"]) + + env.pip_requirements = ["some_package==1.0.1"] + self.assertListEqual(env.pip_requirements, ["some-package==1.0.1"]) + + env.pip_requirements = ["some_package<1.2,>=1.0.1"] + self.assertListEqual(env.pip_requirements, ["some-package<1.2,>=1.0.1"]) + + with self.assertRaisesRegex(ValueError, "Invalid package requirement channel::some_package<1.2,>=1.0.1 found."): + env.pip_requirements = ["channel::some_package<1.2,>=1.0.1"] + + def test_python_version(self) -> None: + env = model_env.ModelEnv() + env.python_version = "3.9" + self.assertEqual(env.python_version, "3.9") + + env.python_version = "3.9.16" + self.assertEqual(env.python_version, "3.9") + + env.python_version = None # type: ignore[assignment] + self.assertEqual(env.python_version, "3.9") + + def test_cuda_version(self) -> None: + env = model_env.ModelEnv() + env.cuda_version = "11.2" + self.assertEqual(env.cuda_version, "11.2") + + env.cuda_version = "11.2.1" + self.assertEqual(env.cuda_version, "11.2") + + env.cuda_version = None + self.assertEqual(env.cuda_version, "11.2") + + with self.assertRaisesRegex(ValueError, "Different CUDA version 11.2 and 12.1 found in the same model!"): + env.cuda_version = "12.1" + + def test_snowpark_ml_version(self) -> None: + env = model_env.ModelEnv() + env.snowpark_ml_version = "1.1.0" + self.assertEqual(env.snowpark_ml_version, "1.1.0") + + env.snowpark_ml_version = "1.2.0" + self.assertEqual(env.snowpark_ml_version, "1.2.0") + + env.snowpark_ml_version = None # type: ignore[assignment] + self.assertEqual(env.snowpark_ml_version, "1.2.0") + + def test_include_if_absent(self) -> None: + env = model_env.ModelEnv() + env.conda_dependencies = ["some-package==1.0.1"] + + env.include_if_absent([model_env.ModelDependency(requirement="some-package", pip_name="some-package")]) +
self.assertListEqual(env.conda_dependencies, ["some-package==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["some-package==1.0.1"] + + env.include_if_absent([model_env.ModelDependency(requirement="some-package==1.0.2", pip_name="some-package")]) + self.assertListEqual(env.conda_dependencies, ["some-package==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["some-package==1.0.1"] + + env.include_if_absent([model_env.ModelDependency(requirement="some-package>=1.0,<2", pip_name="some-package")]) + self.assertListEqual(env.conda_dependencies, ["some-package==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["some-package==1.0.1"] + + env.include_if_absent( + [model_env.ModelDependency(requirement="another-package>=1.0,<2", pip_name="some-package")] + ) + self.assertListEqual(env.conda_dependencies, ["another-package<2,>=1.0", "some-package==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["channel::some-package==1.0.1"] + + with self.assertWarnsRegex(UserWarning, "Basic dependency some-package specified from non-Snowflake channel."): + env.include_if_absent([model_env.ModelDependency(requirement="some-package", pip_name="some-package")]) + self.assertListEqual(env.conda_dependencies, ["channel::some-package==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.pip_requirements = ["some-package==1.0.1"] + + with self.assertWarnsRegex( + UserWarning, + ( + "Basic dependency some-package specified from PIP requirements. " + "This may prevent model deploying to Snowflake Warehouse." 
+ ), + ): + env.include_if_absent([model_env.ModelDependency(requirement="some-package", pip_name="some-package")]) + self.assertListEqual(env.conda_dependencies, []) + self.assertListEqual(env.pip_requirements, ["some-package==1.0.1"]) + + def test_include_if_absent_check_local(self) -> None: + env = model_env.ModelEnv() + env.conda_dependencies = [] + + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual( + env.conda_dependencies, + [str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("numpy")))], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = [] + + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy>=1.0", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual( + env.conda_dependencies, + [str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("numpy")))], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = [] + + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy<1.0", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual( + env.conda_dependencies, + ["numpy<1.0"], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = [] + + env.include_if_absent( + [model_env.ModelDependency(requirement="invalid-package", pip_name="invalid-package")], + check_local_version=True, + ) + self.assertListEqual( + env.conda_dependencies, + ["invalid-package"], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = [] + + env.include_if_absent( + [model_env.ModelDependency(requirement="pytorch", pip_name="torch")], check_local_version=True + ) + self.assertListEqual( + env.conda_dependencies, + [ + "pytorch==" + + list( + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("torch")).specifier + )[0].version, + ], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["numpy==1.0.1"] + + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual(env.conda_dependencies, ["numpy==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["numpy==1.0.1"] + + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy==1.0.2", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual(env.conda_dependencies, ["numpy==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["numpy==1.0.1"] + + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy>=1.0,<2", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual(env.conda_dependencies, ["numpy==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["numpy==1.0.1"] + + env.include_if_absent( + [model_env.ModelDependency(requirement="pytorch>=1.0", pip_name="torch")], check_local_version=True + ) + self.assertListEqual( + env.conda_dependencies, + [ + "numpy==1.0.1", + "pytorch==" + + list( + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("torch")).specifier + )[0].version, + ], + ) + 
self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["channel::numpy==1.0.1"] + + with self.assertWarnsRegex(UserWarning, "Basic dependency numpy specified from non-Snowflake channel."): + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual(env.conda_dependencies, ["channel::numpy==1.0.1"]) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.pip_requirements = ["numpy==1.0.1"] + + with self.assertWarnsRegex( + UserWarning, + ( + "Basic dependency numpy specified from PIP requirements. " + "This may prevent model deploying to Snowflake Warehouse." + ), + ): + env.include_if_absent( + [model_env.ModelDependency(requirement="numpy", pip_name="numpy")], check_local_version=True + ) + self.assertListEqual(env.conda_dependencies, []) + self.assertListEqual(env.pip_requirements, ["numpy==1.0.1"]) + + def test_generate_conda_env_for_cuda(self) -> None: + env = model_env.ModelEnv() + env.conda_dependencies = ["somepackage==1.0.0", "another_channel::another_package==1.0.0"] + original_env = copy.deepcopy(env) + env.generate_env_for_cuda() + + self.assertListEqual(env.conda_dependencies, original_env.conda_dependencies) + self.assertListEqual(env.pip_requirements, original_env.pip_requirements) + + env = model_env.ModelEnv() + env.conda_dependencies = ["somepackage==1.0.0", "another_channel::another_package==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + [ + "another_channel::another-package==1.0.0", + "nvidia::cuda==11.7.*", + "somepackage==1.0.0", + ], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = [ + "nvidia::cuda>=11.7", + "somepackage==1.0.0", + "another_channel::another_package==1.0.0", + ] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + [ + "another_channel::another-package==1.0.0", + "nvidia::cuda>=11.7", + "somepackage==1.0.0", + ], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = [ + "nvidia::cuda==11.8.*", + "somepackage==1.0.0", + "another_channel::another_package==1.0.0", + ] + env.cuda_version = "11.7" + with self.assertRaisesRegex( + ValueError, + "The CUDA requirement you specified in your conda dependencies or pip requirements is" + " conflicting with CUDA version required. 
Please do not specify CUDA dependency using conda" + " dependencies or pip requirements.", + ): + env.generate_env_for_cuda() + + env = model_env.ModelEnv() + env.conda_dependencies = ["pytorch==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["nvidia::cuda==11.7.*", "pytorch::pytorch-cuda==11.7.*", "pytorch::pytorch==1.0.0"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["pytorch>=1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["nvidia::cuda==11.7.*", "pytorch::pytorch-cuda==11.7.*", "pytorch::pytorch>=1.0.0"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["pytorch>=1.0.0", "pytorch::pytorch-cuda>=11.7"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["nvidia::cuda==11.7.*", "pytorch::pytorch-cuda>=11.7", "pytorch::pytorch>=1.0.0"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["pytorch>=1.0.0", "pytorch::pytorch-cuda==11.8.*"] + env.cuda_version = "11.7" + + with self.assertRaisesRegex( + ValueError, + "The Pytorch-CUDA requirement you specified in your conda dependencies or pip requirements is" + " conflicting with CUDA version required. Please do not specify Pytorch-CUDA dependency using conda" + " dependencies or pip requirements.", + ): + env.generate_env_for_cuda() + + env = model_env.ModelEnv() + env.conda_dependencies = ["pytorch::pytorch>=1.1.0", "pytorch::pytorch-cuda==11.7.*"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["nvidia::cuda==11.7.*", "pytorch::pytorch-cuda==11.7.*", "pytorch::pytorch>=1.1.0"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["conda-forge::pytorch==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["nvidia::cuda==11.7.*", "pytorch::pytorch-cuda==11.7.*", "pytorch::pytorch==1.0.0"], + ) + self.assertIn("conda-forge", env._conda_dependencies) + + env = model_env.ModelEnv() + env.pip_requirements = ["torch==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["nvidia::cuda==11.7.*", "pytorch::pytorch-cuda==11.7.*", "pytorch::pytorch==1.0.0"], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["tensorflow==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::tensorflow-gpu==1.0.0", "nvidia::cuda==11.7.*"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["tensorflow>=1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::tensorflow-gpu>=1.0.0", "nvidia::cuda==11.7.*"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["tensorflow==1.0.0", "conda-forge::tensorflow-gpu==1.1.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::tensorflow-gpu==1.1.0", "nvidia::cuda==11.7.*"], + ) + self.assertIn(env_utils.DEFAULT_CHANNEL_NAME, env._conda_dependencies) + + env = model_env.ModelEnv() + env.pip_requirements = ["tensorflow==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + 
["conda-forge::tensorflow-gpu==1.0.0", "nvidia::cuda==11.7.*"], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["xgboost==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::py-xgboost-gpu==1.0.0", "nvidia::cuda==11.7.*"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["xgboost>=1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::py-xgboost-gpu>=1.0.0", "nvidia::cuda==11.7.*"], + ) + + env = model_env.ModelEnv() + env.conda_dependencies = ["xgboost>=1.0.0", "conda-forge::py-xgboost-gpu>=1.1.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::py-xgboost-gpu>=1.1.0", "nvidia::cuda==11.7.*"], + ) + self.assertIn(env_utils.DEFAULT_CHANNEL_NAME, env._conda_dependencies) + + env = model_env.ModelEnv() + env.conda_dependencies = ["conda-forge::xgboost>=1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::py-xgboost-gpu>=1.0.0", "nvidia::cuda==11.7.*"], + ) + + env = model_env.ModelEnv() + env.pip_requirements = ["xgboost>=1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + ["conda-forge::py-xgboost-gpu>=1.0.0", "nvidia::cuda==11.7.*"], + ) + self.assertListEqual(env.pip_requirements, []) + + env = model_env.ModelEnv() + env.conda_dependencies = ["transformers==1.0.0", "pytorch==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + [ + "conda-forge::accelerate>=0.22.0", + "nvidia::cuda==11.7.*", + "pytorch::pytorch-cuda==11.7.*", + "pytorch::pytorch==1.0.0", + str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy"))), + "transformers==1.0.0", + ], + ) + + self.assertListEqual(env.pip_requirements, ["bitsandbytes>=0.41.0"]) + + env = model_env.ModelEnv() + env.conda_dependencies = ["transformers==1.0.0", "scipy==1.0.0", "conda-forge::accelerate==1.0.0"] + env.pip_requirements = ["bitsandbytes==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + [ + "conda-forge::accelerate==1.0.0", + "nvidia::cuda==11.7.*", + "scipy==1.0.0", + "transformers==1.0.0", + ], + ) + + self.assertListEqual(env.pip_requirements, ["bitsandbytes==1.0.0"]) + + env = model_env.ModelEnv() + env.conda_dependencies = ["conda-forge::transformers==1.0.0", "conda-forge::accelerate==1.0.0"] + env.cuda_version = "11.7" + + env.generate_env_for_cuda() + + self.assertListEqual( + env.conda_dependencies, + [ + "conda-forge::accelerate==1.0.0", + "conda-forge::transformers==1.0.0", + "nvidia::cuda==11.7.*", + str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy"))), + ], + ) + + self.assertListEqual(env.pip_requirements, ["bitsandbytes>=0.41.0"]) + + def test_relax_version(self) -> None: + env = model_env.ModelEnv() + env.conda_dependencies = [ + "somepackage==1.0.0,!=1.1", + "random-package>=2.3", + "another_channel::another-package==1.0", + ] + env.pip_requirements = ["pip-packages==3"] + + env.relax_version() + + self.assertListEqual( + env.conda_dependencies, + [ + "another_channel::another-package<2,>=1.0", + "random-package>=2.3", + 
"somepackage!=1.1,<2,>=1.0", + ], + ) + + self.assertListEqual(env.pip_requirements, ["pip-packages<4,>=3.0"]) + + def test_load_from_conda_file(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + env_file_path = pathlib.Path(os.path.join(tmpdir, "conda.yml")) + with open(env_file_path, "w", encoding="utf-8") as f: + yaml.safe_dump( + stream=f, + data={ + "name": "snow-env", + "channels": ["https://repo.anaconda.com/pkgs/snowflake", "apple", "nodefaults"], + "dependencies": [ + "python=3.10", + "::numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + {"pip": ["python-package", "numpy==1.22.4"]}, + ], + }, + ) + + env = model_env.ModelEnv() + with self.assertWarnsRegex( + UserWarning, + ( + "Found dependencies specified in the conda file from non-Snowflake channel." + " This may prevent model deploying to Snowflake Warehouse." + ), + ): + env.load_from_conda_file(env_file_path) + + env = model_env.ModelEnv() + with self.assertWarnsRegex( + UserWarning, + ( + "Found additional conda channel apple specified in the conda file." + " This may prevent model deploying to Snowflake Warehouse." + ), + ): + env.load_from_conda_file(env_file_path) + + env = model_env.ModelEnv() + with self.assertWarnsRegex( + UserWarning, + ( + "Found dependencies specified as pip requirements." + " This may prevent model deploying to Snowflake Warehouse." + ), + ): + env.load_from_conda_file(env_file_path) + + self.assertListEqual(env.conda_dependencies, ["conda-forge::pytorch!=2.0", "numpy>=1.22.4"]) + self.assertIn("apple", env._conda_dependencies) + self.assertListEqual(env.pip_requirements, ["python-package"]) + self.assertEqual(env.python_version, "3.10") + + env = model_env.ModelEnv() + env.conda_dependencies = ["pandas==1.5.3"] + env.pip_requirements = ["pip-only==3.0"] + env.load_from_conda_file(env_file_path) + + self.assertListEqual( + env.conda_dependencies, ["conda-forge::pytorch!=2.0", "numpy>=1.22.4", "pandas==1.5.3"] + ) + self.assertIn("apple", env._conda_dependencies) + self.assertListEqual(env.pip_requirements, ["pip-only==3.0", "python-package"]) + self.assertEqual(env.python_version, "3.10") + + env = model_env.ModelEnv() + env.conda_dependencies = ["numpy==1.22.4"] + env.load_from_conda_file(env_file_path) + + self.assertListEqual(env.conda_dependencies, ["conda-forge::pytorch!=2.0", "numpy==1.22.4"]) + self.assertIn("apple", env._conda_dependencies) + self.assertListEqual(env.pip_requirements, ["python-package"]) + self.assertEqual(env.python_version, "3.10") + + env = model_env.ModelEnv() + env.conda_dependencies = ["pytorch==2.1"] + with self.assertWarnsRegex( + UserWarning, + "Dependency pytorch appeared in multiple channels as conda dependency. This may be unintentional.", + ): + env.load_from_conda_file(env_file_path) + + self.assertListEqual(env.conda_dependencies, ["numpy>=1.22.4", "pytorch==2.1"]) + self.assertIn("apple", env._conda_dependencies) + self.assertListEqual(env.pip_requirements, ["python-package"]) + self.assertEqual(env.python_version, "3.10") + + def test_load_from_pip_file(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + pip_file_path = pathlib.Path(os.path.join(tmpdir, "requirements.txt")) + with open(pip_file_path, "w", encoding="utf-8") as f: + f.writelines(["python-package\n", "numpy==1.22.4\n"]) + + env = model_env.ModelEnv() + with self.assertWarnsRegex( + UserWarning, + ( + "Found dependencies specified as pip requirements." + " This may prevent model deploying to Snowflake Warehouse." 
+ ), + ): + env.load_from_pip_file(pip_file_path) + + self.assertListEqual(env.pip_requirements, ["numpy==1.22.4", "python-package"]) + + env = model_env.ModelEnv() + env.conda_dependencies = ["numpy>=1.22.4"] + env.load_from_pip_file(pip_file_path) + + self.assertListEqual(env.pip_requirements, ["python-package"]) + + env = model_env.ModelEnv() + env.conda_dependencies = ["conda-forge::numpy>=1.22.4"] + env.load_from_pip_file(pip_file_path) + + self.assertListEqual(env.pip_requirements, ["python-package"]) + + def test_save_and_load(self) -> None: + def check_env_equality(this: model_env.ModelEnv, that: model_env.ModelEnv) -> bool: + return all( + getattr(this, attr) == getattr(that, attr) + for attr in [ + "conda_env_rel_path", + "pip_requirements_rel_path", + "conda_dependencies", + "pip_requirements", + "python_version", + "cuda_version", + "snowpark_ml_version", + ] + ) + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = pathlib.Path(tmpdir) + env = model_env.ModelEnv() + saved_dict = env.save_as_dict(tmpdir_path) + + loaded_env = model_env.ModelEnv() + loaded_env.load_from_dict(tmpdir_path, saved_dict) + self.assertTrue(check_env_equality(env, loaded_env), "Loaded env object is different.") + + env = model_env.ModelEnv() + env.conda_dependencies = ["another==1.3", "channel::some_package<1.2,>=1.0.1"] + env.pip_requirements = ["pip-package<1.2,>=1.0.1"] + env.python_version = "3.10.2" + env.cuda_version = "11.7.1" + env.snowpark_ml_version = "1.1.0" + + saved_dict = env.save_as_dict(tmpdir_path) + + self.assertDictEqual( + saved_dict, + { + "conda": "env/conda.yml", + "pip": "env/requirements.txt", + "python_version": "3.10", + "cuda_version": "11.7", + "snowpark_ml_version": "1.1.0", + }, + ) + + loaded_env = model_env.ModelEnv() + loaded_env.load_from_dict(tmpdir_path, saved_dict) + self.assertTrue(check_env_equality(env, loaded_env), "Loaded env object is different.") + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_model_handler.py b/snowflake/ml/model/_packager/model_handler.py similarity index 63% rename from snowflake/ml/model/_model_handler.py rename to snowflake/ml/model/_packager/model_handler.py index 79a9cb5f..163f6268 100644 --- a/snowflake/ml/model/_model_handler.py +++ b/snowflake/ml/model/_packager/model_handler.py @@ -1,15 +1,14 @@ import functools import importlib -import os import pkgutil from types import ModuleType from typing import Any, Callable, Dict, Optional, Type, TypeVar, cast from snowflake.ml.model import type_hints as model_types -from snowflake.ml.model._handlers import _base +from snowflake.ml.model._packager.model_handlers import _base -_HANDLERS_BASE = "_handlers" -_MODEL_HANDLER_REGISTRY: Dict[str, Type[_base._ModelHandler[model_types.SupportedModelType]]] = dict() +_HANDLERS_BASE = "snowflake.ml.model._packager.model_handlers" +_MODEL_HANDLER_REGISTRY: Dict[str, Type[_base.BaseModelHandler[model_types.SupportedModelType]]] = dict() _IS_HANDLER_LOADED = False @@ -18,22 +17,22 @@ def _register_handlers() -> None: Scan all Python modules in _HANDLERS_BASE directory and register every found non-base ModelHandler automatically. """ - model_module = importlib.import_module("snowflake.ml.model") + model_module = importlib.import_module(_HANDLERS_BASE) model_path = model_module.__path__ - for _, name, _ in pkgutil.iter_modules( - map(lambda x: os.path.join(x, _HANDLERS_BASE), model_path), "snowflake.ml.model._handlers." 
- ): + for _, name, _ in pkgutil.iter_modules(model_path, f"{_HANDLERS_BASE}."): + if name.startswith("_"): + continue handler_module = importlib.import_module(name) - if type(handler_module) == ModuleType: + if isinstance(handler_module, ModuleType): for c in dir(handler_module): k_class = getattr(handler_module, c) if ( isinstance(k_class, type) - and k_class is not _base._ModelHandler - and issubclass(k_class, _base._ModelHandler) + and k_class is not _base.BaseModelHandler + and issubclass(k_class, _base.BaseModelHandler) ): - _MODEL_HANDLER_REGISTRY[k_class.handler_type] = k_class + _MODEL_HANDLER_REGISTRY[k_class.HANDLER_TYPE] = k_class F = TypeVar("F", bound=Callable[..., Any]) @@ -53,9 +52,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: @ensure_handlers_registration -def _find_handler( +def find_handler( model: model_types.SupportedModelType, -) -> Optional[Type[_base._ModelHandler[model_types.SupportedModelType]]]: +) -> Optional[Type[_base.BaseModelHandler[model_types.SupportedModelType]]]: for handler in _MODEL_HANDLER_REGISTRY.values(): if handler.can_handle(model): return handler @@ -63,7 +62,9 @@ def _find_handler( @ensure_handlers_registration -def _load_handler(target_model_type: str) -> Optional[Type[_base._ModelHandler[model_types.SupportedModelType]]]: +def load_handler( + target_model_type: model_types.SupportedModelHandlerType, +) -> Optional[Type[_base.BaseModelHandler[model_types.SupportedModelType]]]: for model_type, handler in _MODEL_HANDLER_REGISTRY.items(): if target_model_type == model_type: return handler @@ -74,5 +75,5 @@ def _load_handler(target_model_type: str) -> Optional[Type[_base._ModelHandler[m def is_auto_signature_model(model: model_types.SupportedModelType) -> bool: for handler in _MODEL_HANDLER_REGISTRY.values(): if handler.can_handle(model): - return handler.is_auto_signature + return handler.IS_AUTO_SIGNATURE return False diff --git a/snowflake/ml/model/_packager/model_handler_test.py b/snowflake/ml/model/_packager/model_handler_test.py new file mode 100644 index 00000000..cd3e89c0 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handler_test.py @@ -0,0 +1,62 @@ +import datetime + +from absl.testing import absltest + +from snowflake.ml._internal import env as snowml_env +from snowflake.ml.model._packager import model_handler +from snowflake.ml.test_utils import test_env_utils + + +class ModelHandlerTest(absltest.TestCase): + def test_registered_handler(self) -> None: + model_handler._register_handlers() + self.assertGreater(len(model_handler._MODEL_HANDLER_REGISTRY), 0, "No model handlers are registered.") + for handler_name, handler in model_handler._MODEL_HANDLER_REGISTRY.items(): + with self.subTest(f"Testing Handler for {handler_name}"): + # Validate name + self.assertEqual(handler_name, handler.HANDLER_TYPE) + # Validate version + datetime.datetime.strptime(handler.HANDLER_VERSION, "%Y-%m-%d") + # Validate min snowpark ml version + if handler._MIN_SNOWPARK_ML_VERSION != snowml_env.VERSION: + self.assertIn( + handler._MIN_SNOWPARK_ML_VERSION, + test_env_utils.get_snowpark_ml_released_versions(), + "The min Snowpark ML version is not released or not current.", + ) + all_source_versions = set() + all_target_versions = set() + for source_version, migrator_plan in handler._HANDLER_MIGRATOR_PLANS.items(): + self.assertNotEqual( + handler.HANDLER_VERSION, + source_version, + "There shouldn't be a migrator whose source version is current handler version.", + ) + self.assertEqual( + source_version, + migrator_plan.source_version, + "There 
shouldn't be a migrator whose source version does not equal to the key in the plans.", + ) + self.assertLess( + datetime.datetime.strptime(migrator_plan.source_version, "%Y-%m-%d"), + datetime.datetime.strptime(migrator_plan.target_version, "%Y-%m-%d"), + "Migrator should not be able to downgrade.", + ) + if migrator_plan.target_version != handler.HANDLER_VERSION: + self.assertIn( + migrator_plan.target_version, + handler._HANDLER_MIGRATOR_PLANS.keys(), + ( + "There shouldn't be a migrator whose target version " + "is not current version and has not a migrator plan" + ), + ) + all_source_versions.add(migrator_plan.source_version) + all_target_versions.add(migrator_plan.target_version) + self.assertEqual( + len(all_source_versions), len(all_target_versions), "The migrator plan is not monotonic." + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_handlers/BUILD.bazel b/snowflake/ml/model/_packager/model_handlers/BUILD.bazel similarity index 55% rename from snowflake/ml/model/_handlers/BUILD.bazel rename to snowflake/ml/model/_packager/model_handlers/BUILD.bazel index 158867fb..fa431296 100644 --- a/snowflake/ml/model/_handlers/BUILD.bazel +++ b/snowflake/ml/model/_packager/model_handlers/BUILD.bazel @@ -6,8 +6,20 @@ py_library( name = "_base", srcs = ["_base.py"], deps = [ - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + ], +) + +py_library( + name = "_utils", + srcs = ["_utils.py"], + deps = [ + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_signatures:snowpark_handler", ], ) @@ -16,12 +28,15 @@ py_library( srcs = ["custom.py"], deps = [ ":_base", - "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_handler", - "//snowflake/ml/model:_model_meta", + ":_utils", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager:model_handler", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", ], ) @@ -30,11 +45,15 @@ py_library( srcs = ["sklearn.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", "//snowflake/ml/model/_signatures:numpy_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -45,11 +64,15 @@ py_library( srcs = ["snowmlmodel.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", "//snowflake/ml/model/_signatures:numpy_handler", 
"//snowflake/ml/model/_signatures:utils", "//snowflake/ml/modeling/framework", @@ -61,10 +84,15 @@ py_library( srcs = ["xgboost.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", + "//snowflake/ml/model/_packager/model_meta:model_meta_schema", "//snowflake/ml/model/_signatures:numpy_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -75,11 +103,15 @@ py_library( srcs = ["pytorch.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", "//snowflake/ml/model/_signatures:pytorch_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -90,11 +122,15 @@ py_library( srcs = ["torchscript.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", "//snowflake/ml/model/_signatures:pytorch_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -105,11 +141,15 @@ py_library( srcs = ["tensorflow.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", "//snowflake/ml/model/_signatures:numpy_handler", "//snowflake/ml/model/_signatures:tensorflow_handler", "//snowflake/ml/model/_signatures:utils", @@ -121,14 +161,17 @@ py_library( srcs = ["mlflow.py"], deps = [ ":_base", - "//snowflake/ml/_internal:env_utils", + ":_utils", "//snowflake/ml/_internal:file_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_env", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", + "//snowflake/ml/model/_packager/model_meta:model_meta_schema", "//snowflake/ml/model/_signatures:utils", ], ) @@ -138,11 +181,16 @@ py_library( srcs = ["huggingface_pipeline.py"], deps = [ ":_base", + ":_utils", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", 
"//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_handlers_migrator:base_migrator", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", + "//snowflake/ml/model/_packager/model_meta:model_meta_schema", "//snowflake/ml/model/_signatures:builtins_handler", "//snowflake/ml/model/_signatures:utils", "//snowflake/ml/model/models:huggingface_pipeline", @@ -156,11 +204,11 @@ py_library( ":_base", "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", - "//snowflake/ml/model/_signatures:core", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_meta", "//snowflake/ml/model/models:llm_model", ], ) diff --git a/snowflake/ml/model/_packager/model_handlers/_base.py b/snowflake/ml/model/_packager/model_handlers/_base.py new file mode 100644 index 00000000..617d2847 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers/_base.py @@ -0,0 +1,161 @@ +from abc import abstractmethod +from typing import Dict, Generic, Optional, Protocol, Type, final + +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml.model import custom_model, type_hints as model_types +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import model_meta + + +class _BaseModelHandlerProtocol(Protocol[model_types._ModelType]): + HANDLER_TYPE: model_types.SupportedModelHandlerType + HANDLER_VERSION: str + _MIN_SNOWPARK_ML_VERSION: str + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] + + @classmethod + @abstractmethod + def can_handle(cls, model: model_types.SupportedDataType) -> TypeGuard[model_types._ModelType]: + """Whether this handler could support the type of the `model`. + + Args: + model: The model object. + + Raises: + NotImplementedError: Not Implemented + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def cast_model(cls, model: model_types.SupportedModelType) -> model_types._ModelType: + """Cast the model from Union type into the type that handler could handle. + + Args: + model: The model object. + + Raises: + NotImplementedError: Not Implemented + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def save_model( + cls, + name: str, + model: model_types._ModelType, + model_meta: model_meta.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.BaseModelSaveOption], + ) -> None: + """Save the model. + + Args: + name: Name of the model. + model: The model object. + model_meta: The model metadata. + model_blobs_dir_path: Directory path to the model. + sample_input: Sample input to infer the signatures from. + is_sub_model: Flag to show if it is a sub model, a sub model does not need signature. + kwargs: Additional saving options. + + Raises: + NotImplementedError: Not Implemented + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def load_model( + cls, + name: str, + model_meta: model_meta.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> model_types._ModelType: + """Load the model into memory. 
+ + Args: + name: Name of the model. + model_meta: The model metadata. + model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. + + Raises: + NotImplementedError: Not Implemented + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def convert_as_custom_model( + cls, + raw_model: model_types._ModelType, + model_meta: model_meta.ModelMetadata, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> custom_model.CustomModel: + """Create a custom model class wrap for unified interface when being deployed. The predict method will be + re-targeted based on target_method metadata. + + Args: + raw_model: original model object, + model_meta: The model metadata. + kwargs: Options when converting the model. + + Raises: + NotImplementedError: Not Implemented + """ + raise NotImplementedError + + +class BaseModelHandler(Generic[model_types._ModelType], _BaseModelHandlerProtocol[model_types._ModelType]): + """ + Provides handling for a given type of model defined by `HANDLER_TYPE` class property. + + HANDLER_TYPE: The string type that identify the handler. Should be unique in the library. + HANDLER_VERSION: The version of the handler. + _MIN_SNOWPARK_ML_VERSION: The minimal version of Snowpark ML library to use the current handler. + _HANDLER_MIGRATOR_PLANS: Dict holding handler migrator plans. + + MODELE_BLOB_FILE_OR_DIR: Relative path of the model blob file in the model subdir. Default to "model.pkl". + MODEL_ARTIFACTS_DIR: Relative path of the model artifacts dir in the model subdir. Default to "artifacts" + DEFAULT_TARGET_METHODS: Default target methods to be logged if not specified in this kind of model. Default to + ["predict"] + IS_AUTO_SIGNATURE: Set to True if the model could get model signature automatically and do not require user + inputting sample data or model signature. Default to False. + """ + + MODELE_BLOB_FILE_OR_DIR = "model.pkl" + MODEL_ARTIFACTS_DIR = "artifacts" + DEFAULT_TARGET_METHODS = ["predict"] + IS_AUTO_SIGNATURE = False + + @classmethod + @final + def try_upgrade(cls, name: str, model_meta: model_meta.ModelMetadata, model_blobs_dir_path: str) -> None: + """Try upgrade the stored model to adapt latest handler + + Args: + name: Name of the model. + model_meta: The model metadata. + model_blobs_dir_path: Directory path to the whole model. + + Raises: + RuntimeError: Raised when there is no corresponding migrator available. + """ + while model_meta.models[name].handler_version != cls.HANDLER_VERSION: + if model_meta.models[name].handler_version not in cls._HANDLER_MIGRATOR_PLANS.keys(): + raise RuntimeError( + f"Can not find migrator to migrate model {name} from {model_meta.models[name].handler_version}" + f" to version {cls.HANDLER_VERSION}." 
+ ) + migrator = cls._HANDLER_MIGRATOR_PLANS[model_meta.models[name].handler_version]() + migrator.try_upgrade( + name=name, + model_meta=model_meta, + model_blobs_dir_path=model_blobs_dir_path, + ) diff --git a/snowflake/ml/model/_packager/model_handlers/_utils.py b/snowflake/ml/model/_packager/model_handlers/_utils.py new file mode 100644 index 00000000..54a1c183 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers/_utils.py @@ -0,0 +1,57 @@ +from typing import Callable, Iterable, Optional, Sequence, cast + +from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_meta import model_meta +from snowflake.ml.model._signatures import snowpark_handler +from snowflake.snowpark import DataFrame as SnowparkDataFrame + + +def _is_callable(model: model_types.SupportedModelType, method_name: str) -> bool: + return callable(getattr(model, method_name, None)) + + +def validate_signature( + model: model_types.SupportedRequireSignatureModelType, + model_meta: model_meta.ModelMetadata, + target_methods: Iterable[str], + sample_input: Optional[model_types.SupportedDataType], + get_prediction_fn: Callable[[str, model_types.SupportedLocalDataType], model_types.SupportedLocalDataType], +) -> model_meta.ModelMetadata: + if model_meta.signatures: + validate_target_methods(model, list(model_meta.signatures.keys())) + return model_meta + + # In this case sample_input should be available, because of the check in save_model. + assert ( + sample_input is not None + ), "Model signature and sample input are None at the same time. This should not happen with local model." + trunc_sample_input = model_signature._truncate_data(sample_input) + if isinstance(sample_input, SnowparkDataFrame): + # Added because of Any from missing stubs. 
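Editorial aside, not part of the patch: when no signature is stored, the helper above runs each target method on a truncated sample and infers a signature from the input/output pair. A minimal sketch of that inference step using the public `model_signature` API; the toy frames are invented for illustration.

    import pandas as pd
    from snowflake.ml.model import model_signature

    sample_input = pd.DataFrame({"feature": [1.0, 2.0, 3.0]})
    predictions = pd.DataFrame({"output": [0, 1, 0]})  # stand-in for get_prediction_fn output
    sig = model_signature.infer_signature(input_data=sample_input, output_data=predictions)
    print(sig.inputs, sig.outputs)  # feature specs inferred from the toy frames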
+ trunc_sample_input = cast(SnowparkDataFrame, trunc_sample_input) + local_sample_input = snowpark_handler.SnowparkDataFrameHandler.convert_to_df(trunc_sample_input) + else: + local_sample_input = trunc_sample_input + for target_method in target_methods: + predictions_df = get_prediction_fn(target_method, local_sample_input) + sig = model_signature.infer_signature(local_sample_input, predictions_df) + model_meta.signatures[target_method] = sig + return model_meta + + +def get_target_methods( + model: model_types.SupportedModelType, + target_methods: Optional[Sequence[str]], + default_target_methods: Iterable[str], +) -> Sequence[str]: + if target_methods is None: + target_methods = [method_name for method_name in default_target_methods if _is_callable(model, method_name)] + + validate_target_methods(model, target_methods) + return target_methods + + +def validate_target_methods(model: model_types.SupportedModelType, target_methods: Iterable[str]) -> None: + for method_name in target_methods: + if not _is_callable(model, method_name): + raise ValueError(f"Target method {method_name} is not callable or does not exist in the model.") diff --git a/snowflake/ml/model/_handlers/custom.py b/snowflake/ml/model/_packager/model_handlers/custom.py similarity index 66% rename from snowflake/ml/model/_handlers/custom.py rename to snowflake/ml/model/_packager/model_handlers/custom.py index 23903f20..350527fe 100644 --- a/snowflake/ml/model/_handlers/custom.py +++ b/snowflake/ml/model/_packager/model_handlers/custom.py @@ -2,44 +2,45 @@ import os import pathlib import sys -from typing import TYPE_CHECKING, Dict, Optional +from typing import Dict, Optional, Type, final import anyio import cloudpickle import pandas as pd from typing_extensions import TypeGuard, Unpack -from snowflake.ml._internal import file_utils, type_utils -from snowflake.ml.model import ( - _model_handler, - _model_meta as model_meta_api, - model_signature, - type_hints as model_types, +from snowflake.ml._internal import file_utils +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager import model_handler +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, ) -from snowflake.ml.model._handlers import _base -if TYPE_CHECKING: - from snowflake.ml.model import custom_model - -class _CustomModelHandler(_base._ModelHandler["custom_model.CustomModel"]): +@final +class CustomModelHandler(_base.BaseModelHandler["custom_model.CustomModel"]): """Handler for custom model.""" - handler_type = "custom" - - @staticmethod - def can_handle(model: model_types.SupportedModelType) -> TypeGuard["custom_model.CustomModel"]: - return bool(type_utils.LazyType("snowflake.ml.model.custom_model.CustomModel").isinstance(model)) + HANDLER_TYPE = "custom" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} - @staticmethod - def cast_model(model: model_types.SupportedModelType) -> "custom_model.CustomModel": - from snowflake.ml.model import custom_model + @classmethod + def can_handle(cls, model: model_types.SupportedModelType) -> TypeGuard["custom_model.CustomModel"]: + return isinstance(model, custom_model.CustomModel) + @classmethod + def cast_model(cls, model: 
model_types.SupportedModelType) -> "custom_model.CustomModel": assert isinstance(model, custom_model.CustomModel) return model - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: "custom_model.CustomModel", model_meta: model_meta_api.ModelMetadata, @@ -48,8 +49,6 @@ def _save_model( is_sub_model: Optional[bool] = False, **kwargs: Unpack[model_types.CustomModelSaveOption], ) -> None: - from snowflake.ml.model import custom_model - assert isinstance(model, custom_model.CustomModel) def get_prediction( @@ -70,7 +69,7 @@ def get_prediction( return predictions_df if not is_sub_model: - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=[method.__name__ for method in model._get_infer_methods()], @@ -81,7 +80,7 @@ def get_prediction( model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) if model.context.artifacts: - artifacts_path = os.path.join(model_blob_path, _CustomModelHandler.MODEL_ARTIFACTS_DIR) + artifacts_path = os.path.join(model_blob_path, cls.MODEL_ARTIFACTS_DIR) os.makedirs(artifacts_path, exist_ok=True) for _name, uri in model.context.artifacts.items(): file_utils.copy_file_or_tree(uri, artifacts_path) @@ -89,10 +88,10 @@ def get_prediction( # Save sub-models if model.context.model_refs: for sub_name, model_ref in model.context.model_refs.items(): - handler = _model_handler._find_handler(model_ref.model) + handler = model_handler.find_handler(model_ref.model) assert handler is not None sub_model = handler.cast_model(model_ref.model) - handler._save_model( + handler.save_model( name=sub_name, model=sub_model, model_meta=model_meta, @@ -103,15 +102,16 @@ def get_prediction( # Make sure that the module where the model is defined get pickled by value as well. cloudpickle.register_pickle_by_value(sys.modules[model.__module__]) picked_obj = (model.__class__, model.context) - with open(os.path.join(model_blob_path, _CustomModelHandler.MODEL_BLOB_FILE), "wb") as f: + with open(os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR), "wb") as f: cloudpickle.dump(picked_obj, f) - model_meta.models[name] = model_meta_api._ModelBlobMetadata( + model_meta.models[name] = model_blob_meta.ModelBlobMeta( name=name, - model_type=_CustomModelHandler.handler_type, - path=_CustomModelHandler.MODEL_BLOB_FILE, + model_type=cls.HANDLER_TYPE, + path=cls.MODELE_BLOB_FILE_OR_DIR, + handler_version=cls.HANDLER_VERSION, artifacts={ name: pathlib.Path( - os.path.join(_CustomModelHandler.MODEL_ARTIFACTS_DIR, os.path.basename(os.path.normpath(path=uri))) + os.path.join(cls.MODEL_ARTIFACTS_DIR, os.path.basename(os.path.normpath(path=uri))) ).as_posix() for name, uri in model.context.artifacts.items() }, @@ -120,24 +120,20 @@ def get_prediction( # For Custom we set only when user set it. 
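Editorial illustration, not part of the patch: the save path above re-dispatches every entry in `model.context.model_refs` to its own handler via `model_handler.find_handler`. A sketch of the kind of wrapper model that exercises that path; the class name and the sklearn sub-model are invented for the example.

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from snowflake.ml.model import custom_model

    class WrapperModel(custom_model.CustomModel):
        @custom_model.inference_api
        def predict(self, X: pd.DataFrame) -> pd.DataFrame:
            # Delegate to the sub-model held in the context; CustomModelHandler
            # saves that sub-model through whichever handler claims it.
            return pd.DataFrame({"output": self.context.model_ref("base").predict(X)})

    ctx = custom_model.ModelContext(models={"base": LinearRegression().fit([[0.0], [1.0]], [0.0, 1.0])})
    wrapper = WrapperModel(ctx)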
cuda_version = kwargs.get("cuda_version", None) if cuda_version: - model_meta.cuda_version = cuda_version + model_meta.env.cuda_version = cuda_version - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> "custom_model.CustomModel": - from snowflake.ml.model import custom_model - model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") + model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: @@ -155,9 +151,14 @@ def _load_model( models: Dict[str, model_types.SupportedModelType] = dict() for sub_model_name, _ref in context.model_refs.items(): model_type = model_meta.models[sub_model_name].model_type - handler = _model_handler._load_handler(model_type) + handler = model_handler.load_handler(model_type) assert handler - sub_model = handler._load_model( + handler.try_upgrade( + name=sub_model_name, + model_meta=model_meta, + model_blobs_dir_path=model_blobs_dir_path, + ) + sub_model = handler.load_model( name=sub_model_name, model_meta=model_meta, model_blobs_dir_path=model_blobs_dir_path, @@ -168,3 +169,12 @@ def _load_model( assert isinstance(model, custom_model.CustomModel) return model + + @classmethod + def convert_as_custom_model( + cls, + raw_model: custom_model.CustomModel, + model_meta: model_meta_api.ModelMetadata, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> custom_model.CustomModel: + return raw_model diff --git a/snowflake/ml/model/_handlers/huggingface_pipeline.py b/snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py similarity index 77% rename from snowflake/ml/model/_handlers/huggingface_pipeline.py rename to snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py index 8546f5c6..7dcabd30 100644 --- a/snowflake/ml/model/_handlers/huggingface_pipeline.py +++ b/snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py @@ -1,7 +1,17 @@ import json import os import warnings -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Type, Union +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, + Type, + Union, + cast, + final, +) import cloudpickle import numpy as np @@ -10,24 +20,27 @@ from typing_extensions import TypeGuard, Unpack from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, + model_meta_schema, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import ( builtins_handler, utils as model_signature_utils, ) from snowflake.ml.model.models import huggingface_pipeline +from snowflake.snowpark._internal import utils as 
snowpark_utils if TYPE_CHECKING: import transformers -def get_requirements_from_task(task: str) -> List[model_meta_api.Dependency]: +def get_requirements_from_task(task: str, spcs_only: bool = False) -> List[model_env.ModelDependency]: # Text if task in [ "conversational", @@ -43,7 +56,11 @@ def get_requirements_from_task(task: str) -> List[model_meta_api.Dependency]: "text2text-generation", "zero-shot-classification", ] or task.startswith("translation"): - return [model_meta_api.Dependency(conda_name="tokenizers", pip_req="tokenizers")] + return ( + [model_env.ModelDependency(requirement="tokenizers>=0.13.3", pip_name="tokenizers")] + if spcs_only + else [model_env.ModelDependency(requirement="tokenizers", pip_name="tokenizers")] + ) return [] @@ -60,19 +77,25 @@ def default(self, z: object) -> object: return super().default(z) -class _HuggingFacePipelineHandler( - _base._ModelHandler[Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]] +@final +class HuggingFacePipelineHandler( + _base.BaseModelHandler[Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]] ): """Handler for custom model.""" - handler_type = "huggingface_pipeline" - MODEL_BLOB_FILE = "model" + HANDLER_TYPE = "huggingface_pipeline" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model" ADDITIONAL_CONFIG_FILE = "pipeline_config.pt" DEFAULT_TARGET_METHODS = ["__call__"] - is_auto_signature = True + IS_AUTO_SIGNATURE = True - @staticmethod + @classmethod def can_handle( + cls, model: model_types.SupportedModelType, ) -> TypeGuard[Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]]: if type_utils.LazyType("transformers.Pipeline").isinstance(model): @@ -81,8 +104,9 @@ def can_handle( return True return False - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]: try: @@ -97,8 +121,9 @@ def cast_model( assert isinstance(model, transformers.Pipeline) return model - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"], model_meta: model_meta_api.ModelMetadata, @@ -130,15 +155,15 @@ def _save_model( inferred_pipe_sig = model_signature_utils.huggingface_pipeline_signature_auto_infer(task, params=params) if not is_sub_model: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), - default_target_methods=_HuggingFacePipelineHandler.DEFAULT_TARGET_METHODS, + default_target_methods=cls.DEFAULT_TARGET_METHODS, ) - if model_meta._signatures is not None: + if model_meta.signatures: if type_utils.LazyType("transformers.Pipeline").isinstance(model): - model_meta_api._validate_target_methods(model, list(model_meta.signatures.keys())) + handlers_utils.validate_target_methods(model, list(model_meta.signatures.keys())) else: warnings.warn( "It is impossible to validate your model signatures when using a" @@ -147,7 +172,7 @@ def _save_model( UserWarning, ) else: - model_meta_api._validate_target_methods(model, target_methods) + handlers_utils.validate_target_methods(model, target_methods) if sample_input is not None: warnings.warn( "Inferring model signature from sample input for 
hugggingface pipeline is not supported. " @@ -158,14 +183,14 @@ def _save_model( if inferred_pipe_sig is None: raise NotImplementedError(f"Cannot auto infer the signature of pipeline for task {task}") - model_meta._signatures = {"__call__": inferred_pipe_sig} + model_meta.signatures = {"__call__": inferred_pipe_sig} model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) if type_utils.LazyType("transformers.Pipeline").isinstance(model): model.save_pretrained( # type:ignore[attr-defined] - os.path.join(model_blob_path, _HuggingFacePipelineHandler.MODEL_BLOB_FILE) + os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR) ) pipeline_params = { "_batch_size": model._batch_size, # type:ignore[attr-defined] @@ -177,88 +202,96 @@ def _save_model( with open( os.path.join( model_blob_path, - _HuggingFacePipelineHandler.MODEL_BLOB_FILE, - _HuggingFacePipelineHandler.ADDITIONAL_CONFIG_FILE, + cls.MODELE_BLOB_FILE_OR_DIR, + cls.ADDITIONAL_CONFIG_FILE, ), "wb", ) as f: cloudpickle.dump(pipeline_params, f) else: with open( - os.path.join(model_blob_path, _HuggingFacePipelineHandler.MODEL_BLOB_FILE), + os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR), "wb", ) as f: cloudpickle.dump(model, f) - model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) - base_meta = model_meta_api._ModelBlobMetadata( + base_meta = model_blob_meta.ModelBlobMeta( name=name, - model_type=_HuggingFacePipelineHandler.handler_type, - path=_HuggingFacePipelineHandler.MODEL_BLOB_FILE, - options={ - "task": task, - "batch_size": batch_size if batch_size is not None else "1", - }, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, + options=model_meta_schema.HuggingFacePipelineModelBlobOptions( + { + "task": task, + "batch_size": batch_size if batch_size is not None else 1, + } + ), ) model_meta.models[name] = base_meta + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION pkgs_requirements = [ - model_meta_api.Dependency(conda_name="transformers", pip_req="transformers"), - ] + get_requirements_from_task(task) + model_env.ModelDependency(requirement="transformers>=4.32.1", pip_name="transformers"), + ] + get_requirements_from_task( + task, spcs_only=(not type_utils.LazyType("transformers.Pipeline").isinstance(model)) + ) if framework is None or framework == "pt": # Since we set default cuda version to be 11.7, to make sure it works with GPU, we need to have a default # Pytorch version that works with CUDA 11.7 as well. This is required for huggingface pipelines only as # users are not required to install pytorch locally if they are using the wrapper. 
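Editorial note, not part of the patch: several branches above hinge on whether the model is a real `transformers.Pipeline` or the `HuggingFacePipelineModel` wrapper. A rough sketch of the two kinds of input; the task and model id are arbitrary examples, and the wrapper's constructor arguments are assumed to mirror `transformers.pipeline`.

    import transformers
    from snowflake.ml.model.models import huggingface_pipeline

    # Option 1: a locally instantiated pipeline (weights downloaded now).
    local_pipe = transformers.pipeline(task="text-classification")

    # Option 2: the lightweight wrapper (weights materialized at deployment time).
    wrapped = huggingface_pipeline.HuggingFacePipelineModel(
        task="text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )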
- pkgs_requirements.append(model_meta_api.Dependency(conda_name="pytorch", pip_req="torch==2.0.1")) + pkgs_requirements.append(model_env.ModelDependency(requirement="pytorch==2.0.1", pip_name="torch")) elif framework == "tf": - pkgs_requirements.append(model_meta_api.Dependency(conda_name="tensorflow", pip_req="tensorflow")) - model_meta._include_if_absent(pkgs_requirements) + pkgs_requirements.append(model_env.ModelDependency(requirement="tensorflow", pip_name="tensorflow")) + model_meta.env.include_if_absent( + pkgs_requirements, check_local_version=(type_utils.LazyType("transformers.Pipeline").isinstance(model)) + ) + model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION) @staticmethod def _get_device_config() -> Dict[str, str]: - from accelerate import utils - device_config = {} - utils.write_basic_config(mixed_precision="fp16") device_config["device_map"] = "auto" return device_config - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]: + if snowpark_utils.is_in_stored_procedure(): # type: ignore[no-untyped-call] + # We need to redirect the some folders to a writable location in the sandbox. + os.environ["HF_HOME"] = "/tmp" + os.environ["XDG_CACHE_HOME"] = "/tmp" + model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path - model_blob_options = model_blob_metadata.options + model_blob_options = cast(model_meta_schema.HuggingFacePipelineModelBlobOptions, model_blob_metadata.options) + if "task" not in model_blob_options: + raise ValueError("Missing field `task` in model blob metadata for type `huggingface_pipeline`") + if "batch_size" not in model_blob_options: + raise ValueError("Missing field `batch_size` in model blob metadata for type `huggingface_pipeline`") model_blob_file_or_dir_path = os.path.join(model_blob_path, model_blob_filename) if os.path.isdir(model_blob_file_or_dir_path): import transformers - if "task" not in model_blob_options: - raise ValueError("`task` must be specified in options.") - with open( os.path.join( model_blob_file_or_dir_path, - _HuggingFacePipelineHandler.ADDITIONAL_CONFIG_FILE, + cls.ADDITIONAL_CONFIG_FILE, ), "rb", ) as f: pipeline_params = cloudpickle.load(f) if kwargs.get("use_gpu", False): - device_config = _HuggingFacePipelineHandler._get_device_config() + device_config = cls._get_device_config() else: device_config = {} @@ -280,32 +313,19 @@ def _load_model( and getattr(m, "device_map", None) is None and kwargs.get("use_gpu", False) ): - m.__dict__.update(_HuggingFacePipelineHandler._get_device_config()) + m.__dict__.update(cls._get_device_config()) if getattr(m, "torch_dtype", None) is None and kwargs.get("use_gpu", False): m.__dict__.update(torch_dtype="auto") return m - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"], model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: 
Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. - """ - import transformers from snowflake.ml.model import custom_model @@ -411,7 +431,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _HFPipelineModel - raw_model = _HuggingFacePipelineHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) if isinstance(raw_model, huggingface_pipeline.HuggingFacePipelineModel): if version.parse(transformers.__version__) < version.parse("4.32.0"): # Backward compatibility since HF interface change. diff --git a/snowflake/ml/model/_packager/model_handlers/llm.py b/snowflake/ml/model/_packager/model_handlers/llm.py new file mode 100644 index 00000000..86840a71 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers/llm.py @@ -0,0 +1,276 @@ +import logging +import os +from typing import Dict, Optional, Type, cast, final + +import cloudpickle +import pandas as pd +from packaging import requirements +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml._internal import env_utils, file_utils +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, + model_meta_schema, +) +from snowflake.ml.model.models import llm + +logger = logging.getLogger(__name__) + + +@final +class LLMHandler(_base.BaseModelHandler[llm.LLM]): + HANDLER_TYPE = "llm" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model" + LLM_META = "llm_meta" + IS_AUTO_SIGNATURE = True + + @classmethod + def can_handle( + cls, + model: model_types.SupportedModelType, + ) -> TypeGuard[llm.LLM]: + return isinstance(model, llm.LLM) + + @classmethod + def cast_model( + cls, + model: model_types.SupportedModelType, + ) -> llm.LLM: + assert isinstance(model, llm.LLM) + return cast(llm.LLM, model) + + @classmethod + def save_model( + cls, + name: str, + model: llm.LLM, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.LLMSaveOptions], + ) -> None: + assert not is_sub_model, "LLM can not be sub-model." 
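Editorial sketch, not part of the patch: like the other handlers in this package, `LLMHandler` is reached through the dispatch module rather than called directly. Assuming `some_model` is any object a handler claims via `can_handle`, the lookup looks roughly like this; `describe_handler` is a made-up helper name.

    from snowflake.ml.model._packager import model_handler

    def describe_handler(some_model: object) -> None:
        handler = model_handler.find_handler(some_model)  # None when no handler claims the model
        if handler is None:
            raise ValueError("No packager handler supports this model type.")
        print(handler.HANDLER_TYPE, handler.HANDLER_VERSION)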
+ model_blob_path = os.path.join(model_blobs_dir_path, name) + os.makedirs(model_blob_path, exist_ok=True) + model_blob_dir_path = os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR) + + sig = model_signature.ModelSignature( + inputs=[ + model_signature.FeatureSpec(name="input", dtype=model_signature.DataType.STRING), + ], + outputs=[ + model_signature.FeatureSpec(name="generated_text", dtype=model_signature.DataType.STRING), + ], + ) + model_meta.signatures = {"infer": sig} + if os.path.isdir(model.model_id_or_path): + file_utils.copytree(model.model_id_or_path, model_blob_dir_path) + + os.makedirs(model_blob_dir_path, exist_ok=True) + with open( + os.path.join(model_blob_dir_path, cls.LLM_META), + "wb", + ) as f: + cloudpickle.dump(model, f) + + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, + options=model_meta_schema.LLMModelBlobOptions( + { + "batch_size": model.max_batch_size, + } + ), + ) + model_meta.models[name] = base_meta + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION + + pkgs_requirements = [ + model_env.ModelDependency(requirement="transformers>=4.32.1", pip_name="transformers"), + model_env.ModelDependency(requirement="pytorch==2.0.1", pip_name="torch"), + ] + if model.model_type == llm.SupportedLLMType.LLAMA_MODEL_TYPE.value: + pkgs_requirements = [ + model_env.ModelDependency(requirement="sentencepiece", pip_name="sentencepiece"), + model_env.ModelDependency(requirement="protobuf", pip_name="protobuf"), + *pkgs_requirements, + ] + model_meta.env.include_if_absent(pkgs_requirements, check_local_version=True) + # Recent peft versions are only available in PYPI. + env_utils.append_requirement_list( + model_meta.env._pip_requirements, + requirements.Requirement("peft==0.5.0"), + ) + env_utils.append_requirement_list( + model_meta.env._pip_requirements, + requirements.Requirement("vllm==0.2.1.post1"), + ) + + model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION) + + @classmethod + def load_model( + cls, + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> llm.LLM: + model_blob_path = os.path.join(model_blobs_dir_path, name) + if not hasattr(model_meta, "models"): + raise ValueError("Ill model metadata found.") + model_blobs_metadata = model_meta.models + if name not in model_blobs_metadata: + raise ValueError(f"Blob of model {name} does not exist.") + model_blob_metadata = model_blobs_metadata[name] + model_blob_filename = model_blob_metadata.path + model_blob_dir_path = os.path.join(model_blob_path, model_blob_filename) + assert model_blob_dir_path, "It must be a directory." + with open(os.path.join(model_blob_dir_path, cls.LLM_META), "rb") as f: + m = cloudpickle.load(f) + assert isinstance(m, llm.LLM) + if m.mode == llm.LLM.Mode.LOCAL_LORA: + # Switch to local path + m.model_id_or_path = model_blob_dir_path + return m + + @classmethod + def convert_as_custom_model( + cls, + raw_model: llm.LLM, + model_meta: model_meta_api.ModelMetadata, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> custom_model.CustomModel: + import gc + import tempfile + + import torch + import transformers + import vllm + + assert torch.cuda.is_available(), "LLM inference only works on GPUs." 
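Editorial aside, not part of the patch: the PyPI-only pins above (`peft==0.5.0`, `vllm==0.2.1.post1`) are appended as `packaging` requirement objects rather than raw strings, which keeps name and specifier handling consistent. A tiny standalone illustration of that API:

    from packaging import requirements

    req = requirements.Requirement("vllm==0.2.1.post1")
    print(req.name)                               # "vllm"
    print(req.specifier.contains("0.2.1.post1"))  # True
    print(req.specifier.contains("0.2.2"))        # False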
+ device_count = torch.cuda.device_count() + logger.warning(f"There's total {device_count} GPUs visible to use.") + + class _LLMCustomModel(custom_model.CustomModel): + def _memory_stats(self, msg: str) -> None: + logger.warning(msg) + logger.warning(f"Torch VRAM {torch.cuda.memory_allocated()/1024**2} MB allocated.") + logger.warning(f"Torch VRAM {torch.cuda.memory_reserved()/1024**2} MB reserved.") + + def _merge_model(self, local_dir_path: str): # type: ignore[no-untyped-def] + import peft + + hub_kwargs = { + "revision": raw_model.revision, + "token": raw_model.token, + } + model_dir_path = raw_model.model_id_or_path + peft_config = peft.PeftConfig.from_pretrained(model_dir_path) # type: ignore[attr-defined] + base_model_path = peft_config.base_model_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained( + base_model_path, + padding_side="right", + use_fast=False, + **hub_kwargs, + ) + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.save_pretrained(local_dir_path) + logger.warning(f"Tokenizer state is saved to {local_dir_path}.") + hf_model = peft.AutoPeftModelForCausalLM.from_pretrained( # type: ignore[attr-defined] + model_dir_path, + device_map="auto", + torch_dtype="auto", + **hub_kwargs, + ) + hf_model.eval() + hf_model = hf_model.merge_and_unload() + hf_model.save_pretrained(local_dir_path) + logger.warning(f"Merged model state is saved to {local_dir_path}.") + return hf_model + + def _init_engine_for_remote_pretrain(self) -> None: + hub_kwargs = { + "revision": raw_model.revision, + "token": raw_model.token, + } + model_dir_path = raw_model.model_id_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, + padding_side="right", + use_fast=False, + **hub_kwargs, + ) + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + t = tempfile.TemporaryDirectory() + local_dir_path = t.name + tokenizer.save_pretrained(local_dir_path) + hf_model = transformers.AutoModelForCausalLM.from_pretrained( + model_dir_path, + device_map="auto", + torch_dtype="auto", + **hub_kwargs, + ) + hf_model.eval() + hf_model.save_pretrained(local_dir_path) + logger.warning(f"Model state is saved to {local_dir_path}.") + del tokenizer + del hf_model + gc.collect() + torch.cuda.empty_cache() + self._memory_stats("After GC on model.") + self.llm_engine = vllm.LLM( + model=t.name, + # TODO(halu): Update if raylet issued resolved. + tensor_parallel_size=1, + ) + + def _init_engine_for_lora(self) -> None: + t = tempfile.TemporaryDirectory() + self._memory_stats("Before model load & merge.") + hf_model = self._merge_model(t.name) + self._memory_stats("After model load & merge.") + del hf_model + gc.collect() + torch.cuda.empty_cache() + self._memory_stats("After GC on model.") + + tp_size = torch.cuda.device_count() if raw_model.enable_tp else 1 + self.llm_engine = vllm.LLM(model=t.name, tensor_parallel_size=tp_size) + logger.warning(f"vLLM engine init is done. 
tp: {tp_size}") + + def __init__(self, context: custom_model.ModelContext) -> None: + if raw_model.mode == llm.LLM.Mode.LOCAL_LORA: + self._init_engine_for_lora() + elif raw_model.mode == llm.LLM.Mode.REMOTE_PRETRAIN: + self._init_engine_for_remote_pretrain() + + self.sampling_params = vllm.SamplingParams( + temperature=raw_model.temperature, + top_p=raw_model.top_p, + max_tokens=raw_model.max_tokens, + ) + + @custom_model.inference_api + def infer(self, X: pd.DataFrame) -> pd.DataFrame: + input_data = X.to_dict("list")["input"] + res = self.llm_engine.generate(input_data, self.sampling_params) + return pd.DataFrame({"generated_text": [o.outputs[0].text for o in res]}) + + llm_custom = _LLMCustomModel(custom_model.ModelContext()) + + return llm_custom diff --git a/snowflake/ml/model/_packager/model_handlers/mlflow.py b/snowflake/ml/model/_packager/model_handlers/mlflow.py new file mode 100644 index 00000000..0ae720c1 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers/mlflow.py @@ -0,0 +1,233 @@ +import os +import tempfile +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, cast, final + +import pandas as pd +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml._internal import file_utils, type_utils +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, + model_meta_schema, +) +from snowflake.ml.model._signatures import utils as model_signature_utils +from snowflake.snowpark._internal import utils as snowpark_utils + +if TYPE_CHECKING: + import mlflow + + +def _parse_mlflow_env(model_uri: str, env: model_env.ModelEnv) -> model_env.ModelEnv: + """Parse MLFlow env file and modify model env in model meta based on MLFlow env. + + Args: + model_uri: Model uri where the env file could be downloaded + env: ModelEnv object to be modified + + Raises: + ValueError: Raised when cannot download MLFlow model dependencies file. + + Returns: + Modified model env. + """ + import mlflow + + try: + conda_env_file_path = mlflow.pyfunc.get_model_dependencies(model_uri, format="conda") + except (mlflow.MlflowException, OSError): + raise ValueError("Cannot load MLFlow model dependencies.") + + if not os.path.exists(conda_env_file_path): + raise ValueError("Cannot load MLFlow model dependencies.") + + env.load_from_conda_file(conda_env_file_path) + + return env + + +@final +class MLFlowHandler(_base.BaseModelHandler["mlflow.pyfunc.PyFuncModel"]): + """Handler for MLFlow based model. + + Currently mlflow.pyfunc.PyFuncModel based classes are supported. 
+ """ + + HANDLER_TYPE = "mlflow" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model" + _DEFAULT_TARGET_METHOD = "predict" + DEFAULT_TARGET_METHODS = [_DEFAULT_TARGET_METHOD] + IS_AUTO_SIGNATURE = True + + @classmethod + def can_handle( + cls, + model: model_types.SupportedModelType, + ) -> TypeGuard["mlflow.pyfunc.PyFuncModel"]: + return type_utils.LazyType("mlflow.pyfunc.PyFuncModel").isinstance(model) + + @classmethod + def cast_model( + cls, + model: model_types.SupportedModelType, + ) -> "mlflow.pyfunc.PyFuncModel": + import mlflow + + assert isinstance(model, mlflow.pyfunc.PyFuncModel) + + return cast(mlflow.pyfunc.PyFuncModel, model) + + @classmethod + def save_model( + cls, + name: str, + model: "mlflow.pyfunc.PyFuncModel", + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.MLFlowSaveOptions], + ) -> None: + import mlflow + + assert isinstance(model, mlflow.pyfunc.PyFuncModel) + + model_info = model.metadata.get_model_info() + model_uri = kwargs.get("model_uri", model_info.model_uri) + + pyfunc_flavor_info = model_info.flavors.get(mlflow.pyfunc.FLAVOR_NAME, None) + if pyfunc_flavor_info is None: + raise ValueError("Cannot save MLFlow model that does not have PyFunc flavor.") + + # Port MLFlow signature + if not is_sub_model: + if model_meta.signatures: + handlers_utils.validate_target_methods(model, list(model_meta.signatures.keys())) + else: + handlers_utils.validate_target_methods(model, cls.DEFAULT_TARGET_METHODS) + model_meta.signatures = { + cls._DEFAULT_TARGET_METHOD: model_signature.ModelSignature.from_mlflow_sig(model_info.signature) + } + + # Port MLFlow metadata + mlflow_model_metadata = model_info.metadata + if mlflow_model_metadata and not kwargs.get("ignore_mlflow_metadata", False): + if not model_meta.metadata: + model_meta.metadata = {} + model_meta.metadata.update(mlflow_model_metadata) + + # Port MLFlow dependencies + if kwargs.get("ignore_mlflow_dependencies", False): + model_meta.env.include_if_absent( + [model_env.ModelDependency(requirement="mlflow", pip_name="mlflow")], check_local_version=True + ) + else: + model_meta.env = _parse_mlflow_env(model_uri, model_meta.env) + + model_blob_path = os.path.join(model_blobs_dir_path, name) + + os.makedirs(model_blob_path, exist_ok=True) + with tempfile.TemporaryDirectory() as tmpdir: + try: + local_path = mlflow.artifacts.download_artifacts(model_uri, dst_path=tmpdir) + except (mlflow.MlflowException, OSError): + raise ValueError("Cannot load MLFlow model artifacts.") + + file_utils.copy_file_or_tree(local_path, os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR)) + + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, + options=model_meta_schema.MLFlowModelBlobOptions({"artifact_path": model_info.artifact_path}), + ) + model_meta.models[name] = base_meta + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION + + @classmethod + def load_model( + cls, + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> "mlflow.pyfunc.PyFuncModel": + import mlflow + + if snowpark_utils.is_in_stored_procedure(): # type: 
ignore[no-untyped-call] + # We need to redirect the mlruns folder to a writable location in the sandbox. + tmpdir = tempfile.TemporaryDirectory(dir="/tmp") + mlflow.set_tracking_uri(f"file://{tmpdir}") + + model_blob_path = os.path.join(model_blobs_dir_path, name) + model_blobs_metadata = model_meta.models + model_blob_metadata = model_blobs_metadata[name] + model_blob_options = cast(model_meta_schema.MLFlowModelBlobOptions, model_blob_metadata.options) + if "artifact_path" not in model_blob_options: + raise ValueError("Missing field `artifact_path` in model blob metadata for type `mlflow`") + + model_artifact_path = model_blob_options["artifact_path"] + model_blob_filename = model_blob_metadata.path + + # This is to make sure the loaded model can be saved again. + with mlflow.start_run() as run: + mlflow.log_artifacts( + os.path.join(model_blob_path, model_blob_filename, model_artifact_path), + artifact_path=model_artifact_path, + ) + m = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/{model_artifact_path}") + m.metadata.run_id = run.info.run_id + return m + + @classmethod + def convert_as_custom_model( + cls, + raw_model: "mlflow.pyfunc.PyFuncModel", + model_meta: model_meta_api.ModelMetadata, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> custom_model.CustomModel: + from snowflake.ml.model import custom_model + + def _create_custom_model( + raw_model: "mlflow.pyfunc.PyFuncModel", + model_meta: model_meta_api.ModelMetadata, + ) -> Type[custom_model.CustomModel]: + def fn_factory( + raw_model: "mlflow.pyfunc.PyFuncModel", + signature: model_signature.ModelSignature, + target_method: str, + ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]: + @custom_model.inference_api + def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: + res = raw_model.predict(X) + return model_signature_utils.rename_pandas_df( + model_signature._convert_local_data_to_df(res), features=signature.outputs + ) + + return fn + + type_method_dict = {} + for target_method_name, sig in model_meta.signatures.items(): + type_method_dict[target_method_name] = fn_factory(raw_model, sig, target_method_name) + + _MLFlowModel = type( + "_MLFlowModel", + (custom_model.CustomModel,), + type_method_dict, + ) + + return _MLFlowModel + + _MLFlowModel = _create_custom_model(raw_model, model_meta) + mlflow_model = _MLFlowModel(custom_model.ModelContext()) + + return mlflow_model diff --git a/snowflake/ml/model/_handlers/pytorch.py b/snowflake/ml/model/_packager/model_handlers/pytorch.py similarity index 74% rename from snowflake/ml/model/_handlers/pytorch.py rename to snowflake/ml/model/_packager/model_handlers/pytorch.py index 2c83a8ea..e3c74d1c 100644 --- a/snowflake/ml/model/_handlers/pytorch.py +++ b/snowflake/ml/model/_packager/model_handlers/pytorch.py @@ -1,19 +1,20 @@ import os import sys -from typing import TYPE_CHECKING, Callable, Optional, Type, cast +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, cast, final import cloudpickle import pandas as pd from typing_extensions import TypeGuard, Unpack from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from 
snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import ( pytorch_handler, utils as model_signature_utils, @@ -23,26 +24,33 @@ import torch -class _PyTorchHandler(_base._ModelHandler["torch.nn.Module"]): +@final +class PyTorchHandler(_base.BaseModelHandler["torch.nn.Module"]): """Handler for PyTorch based model. Currently torch.nn.Module based classes are supported. """ - handler_type = "pytorch" - MODEL_BLOB_FILE = "model.pt" + HANDLER_TYPE = "pytorch" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model.pt" DEFAULT_TARGET_METHODS = ["forward"] - @staticmethod + @classmethod def can_handle( + cls, model: model_types.SupportedModelType, ) -> TypeGuard["torch.nn.Module"]: return type_utils.LazyType("torch.nn.Module").isinstance(model) and not type_utils.LazyType( "torch.jit.ScriptModule" ).isinstance(model) - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> "torch.nn.Module": import torch @@ -51,8 +59,9 @@ def cast_model( return cast(torch.nn.Module, model) - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: "torch.nn.Module", model_meta: model_meta_api.ModelMetadata, @@ -66,10 +75,10 @@ def _save_model( assert isinstance(model, torch.nn.Module) if not is_sub_model: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), - default_target_methods=_PyTorchHandler.DEFAULT_TARGET_METHODS, + default_target_methods=cls.DEFAULT_TARGET_METHODS, ) def get_prediction( @@ -90,7 +99,7 @@ def get_prediction( predictions_df = [predictions_df] return predictions_df - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=target_methods, @@ -103,18 +112,25 @@ def get_prediction( cloudpickle.register_pickle_by_value(sys.modules[model.__module__]) model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) - with open(os.path.join(model_blob_path, _PyTorchHandler.MODEL_BLOB_FILE), "wb") as f: + with open(os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR), "wb") as f: torch.save(model, f, pickle_module=cloudpickle) - base_meta = model_meta_api._ModelBlobMetadata( - name=name, model_type=_PyTorchHandler.handler_type, path=_PyTorchHandler.MODEL_BLOB_FILE + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, ) model_meta.models[name] = base_meta - model_meta._include_if_absent([model_meta_api.Dependency(conda_name="pytorch", pip_req="torch")]) + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION - model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + model_meta.env.include_if_absent( + [model_env.ModelDependency(requirement="pytorch", pip_name="torch")], check_local_version=True + ) + model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION) - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: 
str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, @@ -123,11 +139,7 @@ def _load_model( import torch model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: @@ -136,25 +148,13 @@ def _load_model( return m - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: "torch.nn.Module", model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. - """ import torch from snowflake.ml.model import custom_model @@ -203,7 +203,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _PyTorchModel - raw_model = _PyTorchHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _PyTorchModel = _create_custom_model(raw_model, model_meta) pytorch_model = _PyTorchModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/sklearn.py b/snowflake/ml/model/_packager/model_handlers/sklearn.py similarity index 74% rename from snowflake/ml/model/_handlers/sklearn.py rename to snowflake/ml/model/_packager/model_handlers/sklearn.py index 2b2c6dc2..fef8a181 100644 --- a/snowflake/ml/model/_handlers/sklearn.py +++ b/snowflake/ml/model/_packager/model_handlers/sklearn.py @@ -1,5 +1,5 @@ import os -from typing import TYPE_CHECKING, Callable, Optional, Type, Union, cast +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union, cast, final import cloudpickle import numpy as np @@ -7,13 +7,14 @@ from typing_extensions import TypeGuard, Unpack from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils if TYPE_CHECKING: @@ -21,17 +22,23 @@ import sklearn.pipeline -class _SKLModelHandler(_base._ModelHandler[Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"]]): +@final +class SKLModelHandler(_base.BaseModelHandler[Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"]]): """Handler for scikit-learn based model. Currently sklearn.base.BaseEstimator and sklearn.pipeline.Pipeline based classes are supported. 
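# Illustrative sketch (not part of this diff): the scikit-learn handler persists the estimator by
# pickling it with cloudpickle, which is roughly the round trip below. Assumes scikit-learn and
# cloudpickle are installed; the blob file name is a hypothetical stand-in.
import os
import tempfile

import cloudpickle
from sklearn.linear_model import LogisticRegression

model = LogisticRegression().fit([[0.0], [1.0]], [0, 1])

with tempfile.TemporaryDirectory() as blob_dir:
    blob_path = os.path.join(blob_dir, "model.pkl")
    with open(blob_path, "wb") as f:
        cloudpickle.dump(model, f)      # what save_model() writes into the model blob directory
    with open(blob_path, "rb") as f:
        restored = cloudpickle.load(f)  # what load_model() reads back
    print(restored.predict([[0.2]]))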
""" - handler_type = "sklearn" + HANDLER_TYPE = "sklearn" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + DEFAULT_TARGET_METHODS = ["predict", "transform", "predict_proba", "predict_log_proba", "decision_function"] - @staticmethod + @classmethod def can_handle( + cls, model: model_types.SupportedModelType, ) -> TypeGuard[Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"]]: return ( @@ -42,12 +49,13 @@ def can_handle( and (not type_utils.LazyType("xgboost.XGBModel").isinstance(model)) # XGBModel is actually a BaseEstimator and any( (hasattr(model, method) and callable(getattr(model, method, None))) - for method in _SKLModelHandler.DEFAULT_TARGET_METHODS + for method in cls.DEFAULT_TARGET_METHODS ) ) - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"]: import sklearn.base @@ -57,8 +65,9 @@ def cast_model( return cast(Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"], model) - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"], model_meta: model_meta_api.ModelMetadata, @@ -73,10 +82,10 @@ def _save_model( assert isinstance(model, sklearn.base.BaseEstimator) or isinstance(model, sklearn.pipeline.Pipeline) if not is_sub_model: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), - default_target_methods=_SKLModelHandler.DEFAULT_TARGET_METHODS, + default_target_methods=cls.DEFAULT_TARGET_METHODS, ) def get_prediction( @@ -90,7 +99,7 @@ def get_prediction( predictions_df = target_method(sample_input) return predictions_df - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=target_methods, @@ -100,27 +109,31 @@ def get_prediction( model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) - with open(os.path.join(model_blob_path, _SKLModelHandler.MODEL_BLOB_FILE), "wb") as f: + with open(os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR), "wb") as f: cloudpickle.dump(model, f) - base_meta = model_meta_api._ModelBlobMetadata( - name=name, model_type=_SKLModelHandler.handler_type, path=_SKLModelHandler.MODEL_BLOB_FILE + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, ) model_meta.models[name] = base_meta - model_meta._include_if_absent([model_meta_api.Dependency(conda_name="scikit-learn", pip_req="scikit-learn")]) + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION + + model_meta.env.include_if_absent( + [model_env.ModelDependency(requirement="scikit-learn", pip_name="scikit-learn")], check_local_version=True + ) - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"]: model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = 
model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: @@ -132,25 +145,13 @@ def _load_model( assert isinstance(m, sklearn.base.BaseEstimator) or isinstance(m, sklearn.pipeline.Pipeline) return m - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"], model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. - """ from snowflake.ml.model import custom_model def _create_custom_model( @@ -189,7 +190,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _SKLModel - raw_model = _SKLModelHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _SKLModel = _create_custom_model(raw_model, model_meta) skl_model = _SKLModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/snowmlmodel.py b/snowflake/ml/model/_packager/model_handlers/snowmlmodel.py similarity index 70% rename from snowflake/ml/model/_handlers/snowmlmodel.py rename to snowflake/ml/model/_packager/model_handlers/snowmlmodel.py index eb0199c5..9d84233a 100644 --- a/snowflake/ml/model/_handlers/snowmlmodel.py +++ b/snowflake/ml/model/_packager/model_handlers/snowmlmodel.py @@ -1,50 +1,57 @@ import os -from typing import TYPE_CHECKING, Callable, Optional, Type, cast +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, cast, final import cloudpickle import numpy as np import pandas as pd from typing_extensions import TypeGuard, Unpack -from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml._internal import env as snowml_env, env_utils, type_utils +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils if TYPE_CHECKING: from snowflake.ml.modeling.framework.base import BaseEstimator -class _SnowMLModelHandler(_base._ModelHandler["BaseEstimator"]): +@final +class SnowMLModelHandler(_base.BaseModelHandler["BaseEstimator"]): """Handler for SnowML based model. Currently snowflake.ml.modeling.framework.base.BaseEstimator and snowflake.ml.modeling.pipeline.Pipeline based classes are supported. 
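# Illustrative sketch (not part of this diff): both the scikit-learn handler above and the SnowML
# handler here decide which methods to expose by probing the model for the names listed in
# DEFAULT_TARGET_METHODS; this is the same hasattr/callable check written out standalone.
# Assumes scikit-learn is installed.
from sklearn.linear_model import LogisticRegression

DEFAULT_TARGET_METHODS = ["predict", "transform", "predict_proba", "predict_log_proba", "decision_function"]

model = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
exposed = [
    method
    for method in DEFAULT_TARGET_METHODS
    if hasattr(model, method) and callable(getattr(model, method, None))
]
print(exposed)  # ['predict', 'predict_proba', 'predict_log_proba', 'decision_function']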
""" - handler_type = "snowml" + HANDLER_TYPE = "snowml" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + DEFAULT_TARGET_METHODS = ["predict", "transform", "predict_proba", "predict_log_proba", "decision_function"] - is_auto_signature = True + IS_AUTO_SIGNATURE = True - @staticmethod + @classmethod def can_handle( + cls, model: model_types.SupportedModelType, ) -> TypeGuard["BaseEstimator"]: return ( type_utils.LazyType("snowflake.ml.modeling.framework.base.BaseEstimator").isinstance(model) # Pipeline is inherited from BaseEstimator, so no need to add one more check ) and any( - (hasattr(model, method) and callable(getattr(model, method, None))) - for method in _SnowMLModelHandler.DEFAULT_TARGET_METHODS + (hasattr(model, method) and callable(getattr(model, method, None))) for method in cls.DEFAULT_TARGET_METHODS ) - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> "BaseEstimator": from snowflake.ml.modeling.framework.base import BaseEstimator @@ -54,8 +61,9 @@ def cast_model( return cast("BaseEstimator", model) - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: "BaseEstimator", model_meta: model_meta_api.ModelMetadata, @@ -70,15 +78,14 @@ def _save_model( # Pipeline is inherited from BaseEstimator, so no need to add one more check if not is_sub_model: - # TODO(xjiang): get model signature from modeling. - if model_meta._signatures is None and sample_input is None: + if (not model_meta.signatures) and sample_input is None: assert hasattr(model, "model_signatures") - model_meta._signatures = getattr(model, "model_signatures", {}) + model_meta.signatures = getattr(model, "model_signatures", {}) else: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), - default_target_methods=_SnowMLModelHandler.DEFAULT_TARGET_METHODS, + default_target_methods=cls.DEFAULT_TARGET_METHODS, ) def get_prediction( @@ -92,7 +99,7 @@ def get_prediction( predictions_df = target_method(sample_input) return predictions_df - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=target_methods, @@ -102,32 +109,41 @@ def get_prediction( model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) - with open(os.path.join(model_blob_path, _SnowMLModelHandler.MODEL_BLOB_FILE), "wb") as f: + with open(os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR), "wb") as f: cloudpickle.dump(model, f) - base_meta = model_meta_api._ModelBlobMetadata( - name=name, model_type=_SnowMLModelHandler.handler_type, path=_SnowMLModelHandler.MODEL_BLOB_FILE + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, ) model_meta.models[name] = base_meta + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION + _include_if_absent_pkgs = [] model_dependencies = model._get_dependencies() for dep in model_dependencies: pkg_name = dep.split("==")[0] - _include_if_absent_pkgs.append(model_meta_api.Dependency(conda_name=pkg_name, pip_req=pkg_name)) - model_meta._include_if_absent(_include_if_absent_pkgs) + 
_include_if_absent_pkgs.append(model_env.ModelDependency(requirement=pkg_name, pip_name=pkg_name)) + if not model_meta.env._snowpark_ml_version.local: + _include_if_absent_pkgs.append( + model_env.ModelDependency( + requirement=f"{env_utils.SNOWPARK_ML_PKG_NAME}=={snowml_env.VERSION}", + pip_name=env_utils.SNOWPARK_ML_PKG_NAME, + ) + ) + model_meta.env.include_if_absent(_include_if_absent_pkgs, check_local_version=True) - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> "BaseEstimator": model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: @@ -138,25 +154,13 @@ def _load_model( assert isinstance(m, BaseEstimator) return m - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: "BaseEstimator", model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. 
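# Illustrative sketch (not part of this diff): the convert_as_custom_model implementations in these
# handlers all follow the same pattern -- build one closure per target method with a factory, then
# assemble a wrapper class dynamically with type(). Below is a library-free version of that pattern;
# _StandInModel and _WrappedModel are hypothetical stand-ins (the real code derives from
# custom_model.CustomModel and decorates each method with @custom_model.inference_api).
import pandas as pd


class _StandInModel:
    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        return df * 2


def fn_factory(raw_model, target_method):
    def fn(self, X: pd.DataFrame) -> pd.DataFrame:
        # Re-target the wrapper method to the raw model's method of the same name.
        return getattr(raw_model, target_method)(X)

    return fn


raw_model = _StandInModel()
type_method_dict = {name: fn_factory(raw_model, name) for name in ["predict"]}
_WrappedModel = type("_WrappedModel", (object,), type_method_dict)

wrapped = _WrappedModel()
print(wrapped.predict(pd.DataFrame({"c1": [1, 2]})))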
- """ from snowflake.ml.model import custom_model def _create_custom_model( @@ -195,7 +199,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _SnowMLModel - raw_model = _SnowMLModelHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _SnowMLModel = _create_custom_model(raw_model, model_meta) snowml_model = _SnowMLModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/tensorflow.py b/snowflake/ml/model/_packager/model_handlers/tensorflow.py similarity index 75% rename from snowflake/ml/model/_handlers/tensorflow.py rename to snowflake/ml/model/_packager/model_handlers/tensorflow.py index 84aed261..b130c3a7 100644 --- a/snowflake/ml/model/_handlers/tensorflow.py +++ b/snowflake/ml/model/_packager/model_handlers/tensorflow.py @@ -1,18 +1,19 @@ import os -from typing import TYPE_CHECKING, Callable, Optional, Type, cast +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, cast, final import numpy as np import pandas as pd from typing_extensions import TypeGuard, Unpack from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import ( numpy_handler, tensorflow_handler, @@ -23,24 +24,31 @@ import tensorflow -class _TensorFlowHandler(_base._ModelHandler["tensorflow.Module"]): +@final +class TensorFlowHandler(_base.BaseModelHandler["tensorflow.Module"]): """Handler for TensorFlow based model. Currently tensorflow.Module based classes are supported. 
""" - handler_type = "tensorflow" - MODEL_BLOB_FILE = "model" + HANDLER_TYPE = "tensorflow" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model" DEFAULT_TARGET_METHODS = ["__call__"] - @staticmethod + @classmethod def can_handle( + cls, model: model_types.SupportedModelType, ) -> TypeGuard["tensorflow.nn.Module"]: return type_utils.LazyType("tensorflow.Module").isinstance(model) - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> "tensorflow.Module": import tensorflow @@ -49,8 +57,9 @@ def cast_model( return cast(tensorflow.Module, model) - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: "tensorflow.Module", model_meta: model_meta_api.ModelMetadata, @@ -66,10 +75,10 @@ def _save_model( if isinstance(model, tensorflow.keras.Model): default_target_methods = ["predict"] else: - default_target_methods = _TensorFlowHandler.DEFAULT_TARGET_METHODS + default_target_methods = cls.DEFAULT_TARGET_METHODS if not is_sub_model: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), default_target_methods=default_target_methods, @@ -94,7 +103,7 @@ def get_prediction( return predictions_df - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=target_methods, @@ -105,20 +114,27 @@ def get_prediction( model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) if isinstance(model, tensorflow.keras.Model): - tensorflow.keras.models.save_model(model, os.path.join(model_blob_path, _TensorFlowHandler.MODEL_BLOB_FILE)) + tensorflow.keras.models.save_model(model, os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR)) else: - tensorflow.saved_model.save(model, os.path.join(model_blob_path, _TensorFlowHandler.MODEL_BLOB_FILE)) + tensorflow.saved_model.save(model, os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR)) - base_meta = model_meta_api._ModelBlobMetadata( - name=name, model_type=_TensorFlowHandler.handler_type, path=_TensorFlowHandler.MODEL_BLOB_FILE + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, ) model_meta.models[name] = base_meta - model_meta._include_if_absent([model_meta_api.Dependency(conda_name="tensorflow", pip_req="tensorflow")]) + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION - model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + model_meta.env.include_if_absent( + [model_env.ModelDependency(requirement="tensorflow", pip_name="tensorflow")], check_local_version=True + ) + model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION) - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, @@ -127,11 +143,7 @@ def _load_model( import tensorflow model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model 
{name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path m = tensorflow.keras.models.load_model(os.path.join(model_blob_path, model_blob_filename), compile=False) @@ -139,25 +151,13 @@ def _load_model( return m return cast(tensorflow.Module, m) - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: "tensorflow.Module", model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. - """ import tensorflow from snowflake.ml.model import custom_model @@ -206,7 +206,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _TensorFlowModel - raw_model = _TensorFlowHandler()._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _TensorFlowModel = _create_custom_model(raw_model, model_meta) tf_model = _TensorFlowModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/torchscript.py b/snowflake/ml/model/_packager/model_handlers/torchscript.py similarity index 74% rename from snowflake/ml/model/_handlers/torchscript.py rename to snowflake/ml/model/_packager/model_handlers/torchscript.py index be5894b6..4742fe51 100644 --- a/snowflake/ml/model/_handlers/torchscript.py +++ b/snowflake/ml/model/_packager/model_handlers/torchscript.py @@ -1,17 +1,18 @@ import os -from typing import TYPE_CHECKING, Callable, Optional, Type, cast +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, cast, final import pandas as pd from typing_extensions import TypeGuard, Unpack from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import ( pytorch_handler, utils as model_signature_utils, @@ -21,24 +22,31 @@ import torch -class _TorchScriptHandler(_base._ModelHandler["torch.jit.ScriptModule"]): # type:ignore[name-defined] +@final +class TorchScriptHandler(_base.BaseModelHandler["torch.jit.ScriptModule"]): # type:ignore[name-defined] """Handler for PyTorch JIT based model. Currently torch.jit.ScriptModule based classes are supported. 
""" - handler_type = "torchscript" - MODEL_BLOB_FILE = "model.pt" + HANDLER_TYPE = "torchscript" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model.pt" DEFAULT_TARGET_METHODS = ["forward"] - @staticmethod + @classmethod def can_handle( + cls, model: model_types.SupportedModelType, ) -> TypeGuard["torch.jit.ScriptModule"]: # type:ignore[name-defined] return type_utils.LazyType("torch.jit.ScriptModule").isinstance(model) - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> "torch.jit.ScriptModule": # type:ignore[name-defined] import torch @@ -47,8 +55,9 @@ def cast_model( return cast(torch.jit.ScriptModule, model) # type:ignore[name-defined] - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: "torch.jit.ScriptModule", # type:ignore[name-defined] model_meta: model_meta_api.ModelMetadata, @@ -62,10 +71,10 @@ def _save_model( assert isinstance(model, torch.jit.ScriptModule) # type:ignore[attr-defined] if not is_sub_model: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), - default_target_methods=_TorchScriptHandler.DEFAULT_TARGET_METHODS, + default_target_methods=cls.DEFAULT_TARGET_METHODS, ) def get_prediction( @@ -87,7 +96,7 @@ def get_prediction( return predictions_df - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=target_methods, @@ -97,18 +106,25 @@ def get_prediction( model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) - with open(os.path.join(model_blob_path, _TorchScriptHandler.MODEL_BLOB_FILE), "wb") as f: + with open(os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR), "wb") as f: torch.jit.save(model, f) # type:ignore[attr-defined] - base_meta = model_meta_api._ModelBlobMetadata( - name=name, model_type=_TorchScriptHandler.handler_type, path=_TorchScriptHandler.MODEL_BLOB_FILE + base_meta = model_blob_meta.ModelBlobMeta( + name=name, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, ) model_meta.models[name] = base_meta - model_meta._include_if_absent([model_meta_api.Dependency(conda_name="pytorch", pip_req="torch")]) + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION - model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + model_meta.env.include_if_absent( + [model_env.ModelDependency(requirement="pytorch", pip_name="torch")], check_local_version=True + ) + model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION) - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, @@ -117,11 +133,7 @@ def _load_model( import torch model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path with open(os.path.join(model_blob_path, 
model_blob_filename), "rb") as f: @@ -135,25 +147,13 @@ def _load_model( return m - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: "torch.jit.ScriptModule", # type:ignore[name-defined] model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. - """ from snowflake.ml.model import custom_model def _create_custom_model( @@ -203,7 +203,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _TorchScriptModel - raw_model = _TorchScriptHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _TorchScriptModel = _create_custom_model(raw_model, model_meta) torchscript_model = _TorchScriptModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/xgboost.py b/snowflake/ml/model/_packager/model_handlers/xgboost.py similarity index 67% rename from snowflake/ml/model/_handlers/xgboost.py rename to snowflake/ml/model/_packager/model_handlers/xgboost.py index ba6f69cd..478cdac8 100644 --- a/snowflake/ml/model/_handlers/xgboost.py +++ b/snowflake/ml/model/_packager/model_handlers/xgboost.py @@ -1,47 +1,56 @@ # mypy: disable-error-code="import" import os -from typing import TYPE_CHECKING, Callable, Optional, Type, Union +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union, cast, final import numpy as np import pandas as pd from typing_extensions import TypeGuard, Unpack from snowflake.ml._internal import type_utils -from snowflake.ml.model import ( - _model_meta as model_meta_api, - custom_model, - model_signature, - type_hints as model_types, +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, + model_meta_schema, ) -from snowflake.ml.model._handlers import _base from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils if TYPE_CHECKING: import xgboost -class _XGBModelHandler(_base._ModelHandler[Union["xgboost.Booster", "xgboost.XGBModel"]]): +@final +class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.XGBModel"]]): """Handler for XGBoost based model. Currently xgboost.XGBModel based classes are supported. 
""" - handler_type = "xgboost" - MODEL_BLOB_FILE = "model.ubj" + HANDLER_TYPE = "xgboost" + HANDLER_VERSION = "2023-12-01" + _MIN_SNOWPARK_ML_VERSION = "1.0.12" + _HANDLER_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelHandlerMigrator]] = {} + + MODELE_BLOB_FILE_OR_DIR = "model.ubj" DEFAULT_TARGET_METHODS = ["apply", "predict", "predict_proba"] - @staticmethod - def can_handle(model: model_types.SupportedModelType) -> TypeGuard[Union["xgboost.Booster", "xgboost.XGBModel"]]: + @classmethod + def can_handle( + cls, model: model_types.SupportedModelType + ) -> TypeGuard[Union["xgboost.Booster", "xgboost.XGBModel"]]: return ( type_utils.LazyType("xgboost.Booster").isinstance(model) or type_utils.LazyType("xgboost.XGBModel").isinstance(model) ) and any( - (hasattr(model, method) and callable(getattr(model, method, None))) - for method in _XGBModelHandler.DEFAULT_TARGET_METHODS + (hasattr(model, method) and callable(getattr(model, method, None))) for method in cls.DEFAULT_TARGET_METHODS ) - @staticmethod + @classmethod def cast_model( + cls, model: model_types.SupportedModelType, ) -> Union["xgboost.Booster", "xgboost.XGBModel"]: import xgboost @@ -50,8 +59,9 @@ def cast_model( return model - @staticmethod - def _save_model( + @classmethod + def save_model( + cls, name: str, model: Union["xgboost.Booster", "xgboost.XGBModel"], model_meta: model_meta_api.ModelMetadata, @@ -65,10 +75,10 @@ def _save_model( assert isinstance(model, xgboost.Booster) or isinstance(model, xgboost.XGBModel) if not is_sub_model: - target_methods = model_meta_api._get_target_methods( + target_methods = handlers_utils.get_target_methods( model=model, target_methods=kwargs.pop("target_methods", None), - default_target_methods=_XGBModelHandler.DEFAULT_TARGET_METHODS, + default_target_methods=cls.DEFAULT_TARGET_METHODS, ) def get_prediction( @@ -85,7 +95,7 @@ def get_prediction( predictions_df = target_method(sample_input) return predictions_df - model_meta = model_meta_api._validate_signature( + model_meta = handlers_utils.validate_signature( model=model, model_meta=model_meta, target_methods=target_methods, @@ -95,25 +105,29 @@ def get_prediction( model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) - model.save_model(os.path.join(model_blob_path, _XGBModelHandler.MODEL_BLOB_FILE)) - base_meta = model_meta_api._ModelBlobMetadata( + model.save_model(os.path.join(model_blob_path, cls.MODELE_BLOB_FILE_OR_DIR)) + base_meta = model_blob_meta.ModelBlobMeta( name=name, - model_type=_XGBModelHandler.handler_type, - path=_XGBModelHandler.MODEL_BLOB_FILE, - options={"xgb_estimator_type": model.__class__.__name__}, + model_type=cls.HANDLER_TYPE, + handler_version=cls.HANDLER_VERSION, + path=cls.MODELE_BLOB_FILE_OR_DIR, + options=model_meta_schema.XgboostModelBlobOptions({"xgb_estimator_type": model.__class__.__name__}), ) model_meta.models[name] = base_meta - model_meta._include_if_absent( + model_meta.min_snowpark_ml_version = cls._MIN_SNOWPARK_ML_VERSION + + model_meta.env.include_if_absent( [ - model_meta_api.Dependency(conda_name="scikit-learn", pip_req="scikit-learn"), - model_meta_api.Dependency(conda_name="xgboost", pip_req="xgboost"), - ] + model_env.ModelDependency(requirement="scikit-learn", pip_name="scikit-learn"), + model_env.ModelDependency(requirement="xgboost", pip_name="xgboost"), + ], + check_local_version=True, ) + model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION) - model_meta.cuda_version = kwargs.get("cuda_version", 
model_meta_api._DEFAULT_CUDA_VERSION) - - @staticmethod - def _load_model( + @classmethod + def load_model( + cls, name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str, @@ -122,16 +136,16 @@ def _load_model( import xgboost model_blob_path = os.path.join(model_blobs_dir_path, name) - if not hasattr(model_meta, "models"): - raise ValueError("Ill model metadata found.") model_blobs_metadata = model_meta.models - if name not in model_blobs_metadata: - raise ValueError(f"Blob of model {name} does not exist.") model_blob_metadata = model_blobs_metadata[name] model_blob_filename = model_blob_metadata.path - xgb_estimator_type = model_blob_metadata.options.get("xgb_estimator_type", None) - if not xgb_estimator_type or not hasattr(xgboost, xgb_estimator_type): - raise ValueError("Type of XGB estimator unknown or illegal.") + model_blob_options = cast(model_meta_schema.XgboostModelBlobOptions, model_blob_metadata.options) + if "xgb_estimator_type" not in model_blob_options: + raise ValueError("Missing field `xgb_estimator_type` in model blob metadata for type `xgboost`") + + xgb_estimator_type = model_blob_options["xgb_estimator_type"] + if not hasattr(xgboost, xgb_estimator_type): + raise ValueError("Type of XGB estimator is illegal.") m = getattr(xgboost, xgb_estimator_type)() m.load_model(os.path.join(model_blob_path, model_blob_filename)) @@ -145,25 +159,13 @@ def _load_model( assert isinstance(m, xgboost.Booster) or isinstance(m, xgboost.XGBModel) return m - @staticmethod - def _load_as_custom_model( - name: str, + @classmethod + def convert_as_custom_model( + cls, + raw_model: Union["xgboost.Booster", "xgboost.XGBModel"], model_meta: model_meta_api.ModelMetadata, - model_blobs_dir_path: str, **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: - """Create a custom model class wrap for unified interface when being deployed. The predict method will be - re-targeted based on target_method metadata. - - Args: - name: Name of the model. - model_meta: The model metadata. - model_blobs_dir_path: Directory path to the whole model. - kwargs: Options when loading the model. - - Returns: - The model object as a custom model. 
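# Illustrative sketch (not part of this diff): the XGBoost handler above records the estimator's
# class name as `xgb_estimator_type` next to the model.ubj blob, so loading reduces to
# re-instantiating that class and calling load_model(). Assumes xgboost and scikit-learn are
# installed; the path is hypothetical.
import os
import tempfile

import xgboost

X, y = [[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1]
clf = xgboost.XGBClassifier(n_estimators=2).fit(X, y)

blob_path = os.path.join(tempfile.mkdtemp(), "model.ubj")
clf.save_model(blob_path)
xgb_estimator_type = type(clf).__name__             # what save_model() stores in the blob options

restored = getattr(xgboost, xgb_estimator_type)()   # what load_model() does with that field
restored.load_model(blob_path)
print(restored.predict([[1.5]]))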
- """ import xgboost from snowflake.ml.model import custom_model @@ -207,7 +209,6 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _XGBModel - raw_model = _XGBModelHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _XGBModel = _create_custom_model(raw_model, model_meta) xgb_model = _XGBModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_packager/model_handlers_migrator/BUILD.bazel b/snowflake/ml/model/_packager/model_handlers_migrator/BUILD.bazel new file mode 100644 index 00000000..e92ee471 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers_migrator/BUILD.bazel @@ -0,0 +1,24 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "base_migrator", + srcs = ["base_migrator.py"], + deps = [ + "//snowflake/ml/_internal:migrator_utils", + "//snowflake/ml/model/_packager/model_meta", + ], +) + +py_test( + name = "base_migrator_test", + srcs = ["base_migrator_test.py"], + deps = [ + ":base_migrator", + "//snowflake/ml/_internal:migrator_utils", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager/model_meta", + "//snowflake/ml/model/_packager/model_meta:model_blob_meta", + ], +) diff --git a/snowflake/ml/model/_packager/model_handlers_migrator/base_migrator.py b/snowflake/ml/model/_packager/model_handlers_migrator/base_migrator.py new file mode 100644 index 00000000..4d2d80ca --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers_migrator/base_migrator.py @@ -0,0 +1,36 @@ +from abc import abstractmethod +from typing import Protocol, final + +from snowflake.ml._internal import migrator_utils +from snowflake.ml.model._packager.model_meta import model_meta + + +class _BaseModelHandlerMigratorProtocol(Protocol): + source_version: str + target_version: str + + @staticmethod + @abstractmethod + def upgrade( + name: str, + model_meta: model_meta.ModelMetadata, + model_blobs_dir_path: str, + ) -> None: + raise NotImplementedError + + +class BaseModelHandlerMigrator(_BaseModelHandlerMigratorProtocol): + @final + def try_upgrade(self, name: str, model_meta: model_meta.ModelMetadata, model_blobs_dir_path: str) -> None: + assert ( + model_meta.models[name].handler_version == self.__class__.source_version + ), "Incorrect source handler version found." + try: + self.upgrade(name=name, model_meta=model_meta, model_blobs_dir_path=model_blobs_dir_path) + model_meta.models[name].handler_version = self.__class__.target_version + except migrator_utils.UnableToUpgradeError as e: + raise RuntimeError( + f"Can not upgrade your model {name} from version {self.__class__.source_version} to" + f" {self.__class__.target_version}." + f"The latest version support the original version of Snowpark ML library is {e.last_supported_version}." 
+ ) diff --git a/snowflake/ml/model/_packager/model_handlers_migrator/base_migrator_test.py b/snowflake/ml/model/_packager/model_handlers_migrator/base_migrator_test.py new file mode 100644 index 00000000..075acc32 --- /dev/null +++ b/snowflake/ml/model/_packager/model_handlers_migrator/base_migrator_test.py @@ -0,0 +1,68 @@ +import tempfile + +from absl.testing import absltest + +from snowflake.ml._internal import migrator_utils +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager.model_handlers_migrator import base_migrator +from snowflake.ml.model._packager.model_meta import ( + model_blob_meta, + model_meta as model_meta_api, +) + +_DUMMY_SIG = { + "predict": model_signature.ModelSignature( + inputs=[ + model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="input"), + ], + outputs=[model_signature.FeatureSpec(name="output", dtype=model_signature.DataType.FLOAT)], + ) +} + + +class HandlerMigrator_1(base_migrator.BaseModelHandlerMigrator): + source_version = "version_0" + target_version = "version_1" + + @staticmethod + def upgrade(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str) -> None: + model_meta.models[name].path = "changed_path" + + +class HandlerMigrator_2(base_migrator.BaseModelHandlerMigrator): + source_version = "version_1" + target_version = "version_2" + + @staticmethod + def upgrade(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str) -> None: + raise migrator_utils.UnableToUpgradeError(last_supported_version="1.0.9") + + +class BaseMigratorTest(absltest.TestCase): + def test_model_meta_dependencies_no_packages(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta_api.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + ) as meta: + meta.models["model1"] = model_blob_meta.ModelBlobMeta( + name="model1", model_type="custom", path="mock_path", handler_version="version_0" + ) + + migrator_1 = HandlerMigrator_1() + migrator_1.try_upgrade(name="model1", model_meta=meta, model_blobs_dir_path=tmpdir) + + self.assertEqual(meta.models["model1"].path, "changed_path") + + migrator_2 = HandlerMigrator_2() + with self.assertRaisesRegex( + RuntimeError, + ( + "Can not upgrade your model model1 from version version_1 to version_2." + "The latest version support the original version of Snowpark ML library is 1.0.9." 
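# Illustrative sketch (not part of this diff, and the keying is an assumption): each handler above
# declares _HANDLER_MIGRATOR_PLANS as a dict of migrator classes, so upgrading an old model blob
# plausibly means walking migrators from the blob's recorded handler_version until the handler's
# current HANDLER_VERSION is reached, calling try_upgrade() at each hop as in the test below.
def upgrade_to_current(name, model_meta, model_blobs_dir_path, migrator_plans, current_version):
    # migrator_plans: hypothetical mapping of source handler version -> migrator class.
    while model_meta.models[name].handler_version != current_version:
        migrator_cls = migrator_plans[model_meta.models[name].handler_version]
        migrator_cls().try_upgrade(
            name=name, model_meta=model_meta, model_blobs_dir_path=model_blobs_dir_path
        )  # on success this bumps handler_version to the migrator's target_version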
+ ), + ): + migrator_2.try_upgrade(name="model1", model_meta=meta, model_blobs_dir_path=tmpdir) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_handlers_test/BUILD.bazel b/snowflake/ml/model/_packager/model_handlers_test/BUILD.bazel similarity index 79% rename from snowflake/ml/model/_handlers_test/BUILD.bazel rename to snowflake/ml/model/_packager/model_handlers_test/BUILD.bazel index c1f75f2b..1814b081 100644 --- a/snowflake/ml/model/_handlers_test/BUILD.bazel +++ b/snowflake/ml/model/_packager/model_handlers_test/BUILD.bazel @@ -6,9 +6,9 @@ py_test( name = "custom_test", srcs = ["custom_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", ], ) @@ -16,8 +16,8 @@ py_test( name = "mlflow_test", srcs = ["mlflow_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", ], ) @@ -25,8 +25,8 @@ py_test( name = "pytorch_test", srcs = ["pytorch_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", "//snowflake/ml/model/_signatures:pytorch_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -36,9 +36,9 @@ py_test( name = "sklearn_test", srcs = ["sklearn_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager:model_packager", ], ) @@ -46,8 +46,8 @@ py_test( name = "snowmlmodel_test", srcs = ["snowmlmodel_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", "//snowflake/ml/modeling/linear_model:linear_regression", ], ) @@ -56,8 +56,8 @@ py_test( name = "tensorflow_test", srcs = ["tensorflow_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", "//snowflake/ml/model/_signatures:tensorflow_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -67,8 +67,8 @@ py_test( name = "torchscript_test", srcs = ["torchscript_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", "//snowflake/ml/model/_signatures:pytorch_handler", "//snowflake/ml/model/_signatures:utils", ], @@ -78,8 +78,8 @@ py_test( name = "xgboost_test", srcs = ["xgboost_test.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_packager:model_packager", ], ) @@ -88,7 +88,7 @@ py_test( srcs = ["huggingface_pipeline_test.py"], compatible_with_snowpark = False, deps = [ - "//snowflake/ml/model:_model", + "//snowflake/ml/model/_packager:model_packager", "//snowflake/ml/model/_signatures:utils", "//snowflake/ml/model/models:huggingface_pipeline", ], diff --git a/snowflake/ml/model/_handlers_test/custom_test.py b/snowflake/ml/model/_packager/model_handlers_test/custom_test.py similarity index 69% rename from snowflake/ml/model/_handlers_test/custom_test.py rename to snowflake/ml/model/_packager/model_handlers_test/custom_test.py index c1e747ce..c5716eaf 100644 --- a/snowflake/ml/model/_handlers_test/custom_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/custom_test.py @@ -7,7 +7,8 @@ import pandas as pd from absl.testing import absltest -from snowflake.ml.model 
import _model as model_api, custom_model, model_signature +from snowflake.ml.model import custom_model, model_signature +from snowflake.ml.model._packager import model_packager class DemoModel(custom_model.CustomModel): @@ -94,9 +95,8 @@ def test_custom_model_with_multiple_artifacts(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) s = {"predict": model_signature.infer_signature(d, lm.predict(d))} - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=lm, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -104,30 +104,38 @@ def test_custom_model_with_multiple_artifacts(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, DemoModelWithManyArtifacts) - res = m.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, DemoModelWithManyArtifacts) + res = pk.model.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) - m_UDF, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - assert isinstance(m_UDF, DemoModelWithManyArtifacts) - res = m_UDF.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert isinstance(pk.model, DemoModelWithManyArtifacts) + res = pk.model.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) - self.assertEqual(meta.metadata["author"] if meta.metadata else None, "halu") + self.assertEqual(pk.meta.metadata["author"] if pk.meta.metadata else None, "halu") - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=lm, sample_input=d, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - assert isinstance(m, DemoModelWithManyArtifacts) - res = m.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, DemoModelWithManyArtifacts) + res = pk.model.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) - self.assertEqual(s, meta.signatures) + self.assertEqual(s, pk.meta.signatures) def test_model_composition(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) @@ -145,20 +153,25 @@ def test_model_composition(self) -> None: p2 = acm.predict(d) s = {"predict": model_signature.infer_signature(d, p2)} with tempfile.TemporaryDirectory() as tmpdir: - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=acm, signatures=s, metadata={"author": "halu", "version": "1"}, ) - lm, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(lm, ComposeModel) - p3 = lm.predict(d) - - m_UDF, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - assert isinstance(m_UDF, ComposeModel) - p4 = m_UDF.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + 
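# Illustrative sketch (not part of this diff): the test changes here replace the old
# model_api._save/_load calls with the new ModelPackager object. A condensed round trip, assuming
# snowflake-ml-python 1.0.12 with these internal modules (internal APIs may change):
import os
import tempfile

import numpy as np
import pandas as pd

from snowflake.ml.model import custom_model, model_signature
from snowflake.ml.model._packager import model_packager


class AddOneModel(custom_model.CustomModel):
    @custom_model.inference_api
    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        return pd.DataFrame({"output": X["c1"] + 1})


m = AddOneModel(custom_model.ModelContext())
d = pd.DataFrame(np.array([[1, 2, 3], [4, 2, 5]]), columns=["c1", "c2", "c3"])
s = {"predict": model_signature.infer_signature(d, m.predict(d))}

with tempfile.TemporaryDirectory() as tmpdir:
    pkg_dir = os.path.join(tmpdir, "model1")
    model_packager.ModelPackager(pkg_dir).save(
        name="model1", model=m, signatures=s, metadata={"author": "halu"}
    )
    pk = model_packager.ModelPackager(pkg_dir)
    pk.load(as_custom_model=True)  # pk.model and pk.meta are populated by load()
    assert pk.model is not None and pk.meta is not None
    print(pk.model.predict(d))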
assert isinstance(pk.model, ComposeModel) + p3 = pk.model.predict(d) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert isinstance(pk.model, ComposeModel) + p4 = pk.model.predict(d) np.testing.assert_allclose(p1, p2) np.testing.assert_allclose(p2, p3) np.testing.assert_allclose(p2, p4) @@ -179,20 +192,25 @@ async def _test(self: "CustomHandlerTest") -> None: p2 = await acm.predict(d) s = {"predict": model_signature.infer_signature(d, p2)} with tempfile.TemporaryDirectory() as tmpdir: - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=acm, signatures=s, metadata={"author": "halu", "version": "1"}, ) - lm, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(lm, AsyncComposeModel) - p3 = await lm.predict(d) # type: ignore[misc] - - m_UDF, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - assert isinstance(m_UDF, AsyncComposeModel) - p4 = await m_UDF.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, AsyncComposeModel) + p3 = await pk.model.predict(d) # type: ignore[misc] + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert isinstance(pk.model, AsyncComposeModel) + p4 = await pk.model.predict(d) # type: ignore[misc] np.testing.assert_allclose(p1, p2) np.testing.assert_allclose(p2, p3) np.testing.assert_allclose(p2, p4) @@ -209,17 +227,19 @@ def test_custom_model_with_artifacts(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) s = {"predict": model_signature.infer_signature(d, lm.predict(d))} - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=lm, signatures=s, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, DemoModelWithArtifacts) - res = m.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, DemoModelWithArtifacts) + res = pk.model.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([11, 14]))) # test re-init when loading the model @@ -228,12 +248,15 @@ def test_custom_model_with_artifacts(self) -> None: ) as f: f.write("20") - m_UDF, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - assert isinstance(m_UDF, DemoModelWithArtifacts) - res = m_UDF.predict(d) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert isinstance(pk.model, DemoModelWithArtifacts) + res = pk.model.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([21, 24]))) - self.assertEqual(meta.metadata["author"] if meta.metadata else None, "halu") + self.assertEqual(pk.meta.metadata["author"] if pk.meta.metadata else None, "halu") if __name__ == "__main__": diff --git a/snowflake/ml/model/_handlers_test/huggingface_pipeline_test.py b/snowflake/ml/model/_packager/model_handlers_test/huggingface_pipeline_test.py similarity index 90% rename 
from snowflake/ml/model/_handlers_test/huggingface_pipeline_test.py rename to snowflake/ml/model/_packager/model_handlers_test/huggingface_pipeline_test.py index bfc52a7c..5093c7d6 100644 --- a/snowflake/ml/model/_handlers_test/huggingface_pipeline_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/huggingface_pipeline_test.py @@ -9,7 +9,7 @@ import torch from absl.testing import absltest -from snowflake.ml.model import _model as model_api +from snowflake.ml.model._packager import model_packager from snowflake.ml.model._signatures import utils from snowflake.ml.model.models import huggingface_pipeline @@ -76,53 +76,59 @@ def _basic_test_case( with tempfile.TemporaryDirectory() as tmpdir: with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures={**s, "another_predict": s["__call__"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, transformers.Pipeline) - self._check_loaded_pipeline_object(model, m) - - check_pipeline_fn(model, m) - - m_UDF, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - assert callable(m_UDF) - res = m_UDF(udf_test_input.copy(deep=True)) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, transformers.Pipeline) + self._check_loaded_pipeline_object(model, pk.model) + + check_pipeline_fn(model, pk.model) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert callable(pk.model) + res = pk.model(udf_test_input.copy(deep=True)) check_udf_res_fn(res) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=model, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - assert isinstance(m, transformers.Pipeline) - self._check_loaded_pipeline_object(model, m) - - check_pipeline_fn(model, m) - self.assertEqual(s, meta.signatures) - - m_UDF, meta = model_api._load( - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), - as_custom_model=True, - ) - assert callable(m_UDF) - res = m_UDF(udf_test_input.copy(deep=True)) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, transformers.Pipeline) + self._check_loaded_pipeline_object(model, pk.model) + + check_pipeline_fn(model, pk.model) + self.assertEqual(s, pk.meta.signatures) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert callable(pk.model) + res = pk.model(udf_test_input.copy(deep=True)) check_udf_res_fn(res) wrapper_model = huggingface_pipeline.HuggingFacePipelineModel( @@ -131,50 +137,50 @@ def _basic_test_case( with tempfile.TemporaryDirectory() as tmpdir: with self.assertWarns(UserWarning): - model_api._save( + 
model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=wrapper_model, signatures=s, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=wrapper_model, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - assert isinstance(m, huggingface_pipeline.HuggingFacePipelineModel) - self._check_loaded_pipeline_wrapper_object(wrapper_model, m) - self.assertEqual(s, meta.signatures) - - m_UDF, meta = model_api._load( - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), - as_custom_model=True, - ) - assert callable(m_UDF) - res = m_UDF(udf_test_input.copy(deep=True)) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, huggingface_pipeline.HuggingFacePipelineModel) + self._check_loaded_pipeline_wrapper_object(wrapper_model, pk.model) + self.assertEqual(s, pk.meta.signatures) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert callable(pk.model) + res = pk.model(udf_test_input.copy(deep=True)) check_udf_res_fn(res) if check_gpu: - m, meta = model_api._load( - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), - options={"use_gpu": True}, - ) - assert isinstance(m, huggingface_pipeline.HuggingFacePipelineModel) - self._check_loaded_pipeline_wrapper_object(wrapper_model, m, use_gpu=True) - self.assertEqual(s, meta.signatures) - - m_UDF, meta = model_api._load( - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), - as_custom_model=True, - options={"use_gpu": True}, - ) - assert callable(m_UDF) - res = m_UDF(udf_test_input.copy(deep=True)) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(options={"use_gpu": True}) + assert pk.model + assert pk.meta + assert isinstance(pk.model, huggingface_pipeline.HuggingFacePipelineModel) + self._check_loaded_pipeline_wrapper_object(wrapper_model, pk.model, use_gpu=True) + self.assertEqual(s, pk.meta.signatures) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True, options={"use_gpu": True}) + assert pk.model + assert pk.meta + assert callable(pk.model) + res = pk.model(udf_test_input.copy(deep=True)) check_udf_res_fn(res) def test_conversational_pipeline(self) -> None: diff --git a/snowflake/ml/model/_handlers_test/mlflow_test.py b/snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py similarity index 67% rename from snowflake/ml/model/_handlers_test/mlflow_test.py rename to snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py index 9aeb2033..feb0eb46 100644 --- a/snowflake/ml/model/_handlers_test/mlflow_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py @@ -9,7 +9,8 @@ from absl.testing import absltest from sklearn import datasets, ensemble, model_selection -from snowflake.ml.model import _model as model_api, model_signature +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager import model_packager class MLFlowHandlerTest(absltest.TestCase): @@ -53,16 +54,18 @@ def test_mlflow_model(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: mlflow_pyfunc_model = 
mlflow.pyfunc.load_model(f"runs:/{run_id}/model") - saved_meta = model_api._save( + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, ) + assert pk.model + assert pk.meta - self.assertEqual(saved_meta.python_version, "3.8.13") - self.assertDictEqual(saved_meta.metadata, {"author": "halu", "version": "1"}) + self.assertEqual(pk.meta.env.python_version, "3.8") + self.assertDictEqual(pk.meta.metadata, {"author": "halu", "version": "1"}) self.assertDictEqual( - saved_meta.signatures, + pk.meta.signatures, { "predict": model_signature.ModelSignature( inputs=[ @@ -77,35 +80,34 @@ def test_mlflow_model(self) -> None: }, ) self.assertListEqual( - sorted(saved_meta.pip_requirements), + sorted(pk.meta.env.pip_requirements), sorted( [ "mlflow<3,>=2.3", - "cloudpickle==2.0.0", - "numpy==1.23.4", "psutil==5.9.0", "scikit-learn==1.2.2", "scipy==1.9.3", - "typing-extensions==4.5.0", ] ), ) - self.assertIn("pip<=23.0.1", saved_meta.conda_dependencies) + self.assertIn("pip<=23.0.1", pk.meta.env.conda_dependencies) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, mlflow.pyfunc.PyFuncModel) - self.assertNotEqual(m.metadata.run_id, run_id) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, mlflow.pyfunc.PyFuncModel) + self.assertNotEqual(pk.model.metadata.run_id, run_id) - _ = model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_again")).save( name="model1_again", - local_dir_path=os.path.join(tmpdir, "model1_again"), - model=m, + model=mlflow_pyfunc_model, ) - self.assertEqual(meta.python_version, "3.8.13") - self.assertDictEqual(meta.metadata, {"author": "halu", "version": "1"}) + self.assertEqual(pk.meta.env.python_version, "3.8") + self.assertDictEqual(pk.meta.metadata, {"author": "halu", "version": "1"}) self.assertDictEqual( - meta.signatures, + pk.meta.signatures, { "predict": model_signature.ModelSignature( inputs=[ @@ -120,25 +122,25 @@ def test_mlflow_model(self) -> None: }, ) self.assertListEqual( - sorted(meta.pip_requirements), + sorted(pk.meta.env.pip_requirements), sorted( [ "mlflow<3,>=2.3", - "cloudpickle==2.0.0", - "numpy==1.23.4", "psutil==5.9.0", "scikit-learn==1.2.2", "scipy==1.9.3", - "typing-extensions==4.5.0", ] ), ) - self.assertIn("pip<=23.0.1", meta.conda_dependencies) + self.assertIn("pip<=23.0.1", pk.meta.env.conda_dependencies) - np.testing.assert_allclose(predictions, m.predict(X_test)) + np.testing.assert_allclose(predictions, pk.model.predict(X_test)) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) X_df = pd.DataFrame(X_test) np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_df).to_numpy()) @@ -163,20 +165,25 @@ def test_mlflow_model_df_inputs(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: mlflow_pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") - _ = model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), 
model=mlflow_pyfunc_model, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, mlflow.pyfunc.PyFuncModel) - self.assertNotEqual(m.metadata.run_id, run_id) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, mlflow.pyfunc.PyFuncModel) + self.assertNotEqual(pk.model.metadata.run_id, run_id) - np.testing.assert_allclose(predictions, m.predict(X_test)) + np.testing.assert_allclose(predictions, pk.model.predict(X_test)) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_test).to_numpy()) @@ -204,52 +211,59 @@ def test_mlflow_model_bad_case(self) -> None: mlflow_pyfunc_model = mlflow.pyfunc.load_model(local_path) mlflow_pyfunc_model.metadata.run_id = uuid.uuid4().hex.lower() with self.assertRaisesRegex(ValueError, "Cannot load MLFlow model artifacts."): - _ = model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, options={"ignore_mlflow_dependencies": True}, ) - saved_meta = model_api._save( + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, options={"model_uri": local_path, "ignore_mlflow_dependencies": True}, ) + assert pk.model + assert pk.meta - self.assertEmpty(saved_meta.pip_requirements) + self.assertEmpty(pk.meta.env.pip_requirements) with self.assertRaisesRegex(ValueError, "Cannot load MLFlow model dependencies."): - _ = model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, ) - saved_meta = model_api._save( + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model2")) + pk.save( name="model2", - local_dir_path=os.path.join(tmpdir, "model2"), model=mlflow_pyfunc_model, options={"model_uri": local_path, "ignore_mlflow_metadata": True}, ) + assert pk.model + assert pk.meta - self.assertIsNone(saved_meta.metadata) + self.assertIsNone(pk.meta.metadata) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model2")) - assert isinstance(m, mlflow.pyfunc.PyFuncModel) - self.assertNotEqual(m.metadata.run_id, run_id) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model2")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, mlflow.pyfunc.PyFuncModel) + self.assertNotEqual(pk.model.metadata.run_id, run_id) - np.testing.assert_allclose(predictions, m.predict(X_test)) + np.testing.assert_allclose(predictions, pk.model.predict(X_test)) - _ = model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model2_again")).save( name="model2_again", - local_dir_path=os.path.join(tmpdir, "model2_again"), - model=m, + model=pk.model, ) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model2"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model2")) + 
pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_test).to_numpy()) @@ -280,19 +294,24 @@ def test_mlflow_model_pytorch(self) -> None: predictions = pytorch_pyfunc.predict(input_x) with tempfile.TemporaryDirectory() as tmpdir: - _ = model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=pytorch_pyfunc, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, mlflow.pyfunc.PyFuncModel) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, mlflow.pyfunc.PyFuncModel) - np.testing.assert_allclose(predictions, m.predict(input_x)) + np.testing.assert_allclose(predictions, pk.model.predict(input_x)) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose( np.expand_dims(predictions, axis=1), predict_method(pd.DataFrame(input_x)).to_numpy() diff --git a/snowflake/ml/model/_handlers_test/pytorch_test.py b/snowflake/ml/model/_packager/model_handlers_test/pytorch_test.py similarity index 63% rename from snowflake/ml/model/_handlers_test/pytorch_test.py rename to snowflake/ml/model/_packager/model_handlers_test/pytorch_test.py index 1216286c..4fea18e5 100644 --- a/snowflake/ml/model/_handlers_test/pytorch_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/pytorch_test.py @@ -7,7 +7,8 @@ import torch from absl.testing import absltest -from snowflake.ml.model import _model as model_api, model_signature +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager import model_packager from snowflake.ml.model._signatures import ( pytorch_handler, utils as model_signature_utils, @@ -54,17 +55,15 @@ def test_pytorch(self) -> None: model, data_x, data_y = _prepare_torch_model() s = {"forward": model_signature.infer_signature([data_x], [data_y])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures={**s, "another_forward": s["forward"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -81,15 +80,22 @@ def test_pytorch(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x), y_pred) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, torch.nn.Module) + torch.testing.assert_close(pk.model.forward(data_x), y_pred) with self.assertRaisesRegex(RuntimeError, "Attempting to deserialize object 
on a CUDA device"): - _, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), options={"use_gpu": True}) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "forward", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(options={"use_gpu": True}) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "forward", None) assert callable(predict_method) torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( @@ -98,21 +104,26 @@ def test_pytorch(self) -> None: y_pred, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")).save( name="model1_no_sig_1", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model, sample_input=[data_x], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) - assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x), y_pred) - self.assertEqual(s["forward"], meta.signatures["forward"]) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) - predict_method = getattr(m_udf, "forward", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, torch.nn.Module) + torch.testing.assert_close(pk.model.forward(data_x), y_pred) + self.assertEqual(s["forward"], pk.meta.signatures["forward"]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "forward", None) assert callable(predict_method) torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ @@ -134,20 +145,25 @@ def test_torch_df_sample_input(self) -> None: pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False), s["forward"].inputs, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")).save( name="model1_no_sig_1", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model, sample_input=x_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) - assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x), y_pred) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) - predict_method = getattr(m_udf, "forward", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, torch.nn.Module) + torch.testing.assert_close(pk.model.forward(data_x), y_pred) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "forward", None) assert callable(predict_method) torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred @@ -156,20 +172,25 @@ def test_torch_df_sample_input(self) -> None: model_script.eval() y_pred = 
model_script.forward(data_x).detach() - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_2")).save( name="model1_no_sig_2", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), model=model_script, sample_input=x_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_2")) - assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x), y_pred) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), as_custom_model=True) - predict_method = getattr(m_udf, "forward", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_2")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, torch.jit.ScriptModule) # type:ignore[attr-defined] + torch.testing.assert_close(pk.model.forward(data_x), y_pred) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_2")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "forward", None) assert callable(predict_method) torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred diff --git a/snowflake/ml/model/_handlers_test/sklearn_test.py b/snowflake/ml/model/_packager/model_handlers_test/sklearn_test.py similarity index 58% rename from snowflake/ml/model/_handlers_test/sklearn_test.py rename to snowflake/ml/model/_packager/model_handlers_test/sklearn_test.py index 8bbc218b..2ed41d7c 100644 --- a/snowflake/ml/model/_handlers_test/sklearn_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/sklearn_test.py @@ -7,11 +7,8 @@ from absl.testing import absltest from sklearn import datasets, ensemble, linear_model, multioutput -from snowflake.ml.model import ( - _model as model_api, - model_signature, - type_hints as model_types, -) +from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model._packager import model_packager class SKLearnHandlerTest(absltest.TestCase): @@ -24,9 +21,8 @@ def test_skl_multiple_output_proba(self) -> None: model.fit(iris_X_df[:-10], dual_target[:-10]) with tempfile.TemporaryDirectory() as tmpdir: s = {"predict_proba": model_signature.infer_signature(iris_X_df, model.predict_proba(iris_X_df))} - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -35,14 +31,19 @@ def test_skl_multiple_output_proba(self) -> None: orig_res = model.predict_proba(iris_X_df[-10:]) - m: multioutput.MultiOutputClassifier - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - - loaded_res = m.predict_proba(iris_X_df[-10:]) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, multioutput.MultiOutputClassifier) + loaded_res = pk.model.predict_proba(iris_X_df[-10:]) np.testing.assert_allclose(np.hstack(orig_res), np.hstack(loaded_res)) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict_proba", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, 
"predict_proba", None) assert callable(predict_method) udf_res = predict_method(iris_X_df[-10:]) np.testing.assert_allclose( @@ -50,33 +51,38 @@ def test_skl_multiple_output_proba(self) -> None: ) with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(local_dir_path=os.path.join(tmpdir, "model1_no_sig_bad")).save( name="model1_no_sig_bad", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_bad"), model=model, sample_input=iris_X_df, metadata={"author": "halu", "version": "1"}, options=model_types.SKLModelSaveOptions({"target_methods": ["random"]}), ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=model, sample_input=iris_X_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, multioutput.MultiOutputClassifier) np.testing.assert_allclose( - np.hstack(model.predict_proba(iris_X_df[-10:])), np.hstack(m.predict_proba(iris_X_df[-10:])) + np.hstack(model.predict_proba(iris_X_df[-10:])), np.hstack(pk.model.predict_proba(iris_X_df[-10:])) ) - np.testing.assert_allclose(model.predict(iris_X_df[-10:]), m.predict(iris_X_df[-10:])) - self.assertEqual(s["predict_proba"], meta.signatures["predict_proba"]) + np.testing.assert_allclose(model.predict(iris_X_df[-10:]), pk.model.predict(iris_X_df[-10:])) + self.assertEqual(s["predict_proba"], pk.meta.signatures["predict_proba"]) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta - predict_method = getattr(m_udf, "predict_proba", None) + predict_method = getattr(pk.model, "predict_proba", None) assert callable(predict_method) udf_res = predict_method(iris_X_df[-10:]) np.testing.assert_allclose( @@ -84,7 +90,7 @@ def test_skl_multiple_output_proba(self) -> None: np.hstack([np.array(udf_res[col].to_list()) for col in udf_res]), ) - predict_method = getattr(m_udf, "predict", None) + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(model.predict(iris_X_df[-10:]), predict_method(iris_X_df[-10:]).to_numpy()) @@ -96,17 +102,15 @@ def test_skl(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(iris_X_df, regr.predict(iris_X_df))} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -115,28 +119,41 @@ def test_skl(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m: linear_model.LinearRegression - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - np.testing.assert_allclose(np.array([-0.08254936]), m.predict(iris_X_df[:1])) - m_udf, _ = 
model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, linear_model.LinearRegression) + np.testing.assert_allclose(np.array([-0.08254936]), pk.model.predict(iris_X_df[:1])) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(iris_X_df[:1])) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=iris_X_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - np.testing.assert_allclose(np.array([-0.08254936]), m.predict(iris_X_df[:1])) - self.assertEqual(s["predict"], meta.signatures["predict"]) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, linear_model.LinearRegression) + np.testing.assert_allclose(np.array([-0.08254936]), pk.model.predict(iris_X_df[:1])) + self.assertEqual(s["predict"], pk.meta.signatures["predict"]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(iris_X_df[:1])) diff --git a/snowflake/ml/model/_handlers_test/snowmlmodel_test.py b/snowflake/ml/model/_packager/model_handlers_test/snowmlmodel_test.py similarity index 54% rename from snowflake/ml/model/_handlers_test/snowmlmodel_test.py rename to snowflake/ml/model/_packager/model_handlers_test/snowmlmodel_test.py index effb6e1b..7e3b1c32 100644 --- a/snowflake/ml/model/_handlers_test/snowmlmodel_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/snowmlmodel_test.py @@ -7,7 +7,8 @@ from absl.testing import absltest from sklearn import datasets -from snowflake.ml.model import _model as model_api, model_signature +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager import model_packager from snowflake.ml.modeling.linear_model import ( # type:ignore[attr-defined] LinearRegression, ) @@ -31,17 +32,15 @@ def test_snowml_all_input(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ 
-50,31 +49,44 @@ def test_snowml_all_input(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m: LinearRegression - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=df[INPUT_COLUMNS], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - np.testing.assert_allclose(np.array([[-0.08254936]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, desired=pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) s = regr.model_signatures - self.assertEqual(s["predict"], meta.signatures["predict"]) + self.assertEqual(s["predict"], pk.meta.signatures["predict"]) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) - np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) + np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) def test_snowml_signature_partial_input(self) -> None: iris = datasets.load_iris() @@ -93,17 +105,15 @@ def test_snowml_signature_partial_input(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -112,32 +122,45 @@ def test_snowml_signature_partial_input(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m: LinearRegression - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - np.testing.assert_allclose(predictions, 
m.predict(df[:1])[[OUTPUT_COLUMNS]]) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - np.testing.assert_allclose(np.array([[0.17150434]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) s = regr.model_signatures # Compare the Model Signature without indexing - self.assertItemsEqual(s["predict"].to_dict(), meta.signatures["predict"].to_dict()) + self.assertItemsEqual(s["predict"].to_dict(), pk.meta.signatures["predict"].to_dict()) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) - np.testing.assert_allclose(np.array([[0.17150434]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) + np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) def test_snowml_signature_drop_input_cols(self) -> None: iris = datasets.load_iris() @@ -158,17 +181,15 @@ def test_snowml_signature_drop_input_cols(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -177,32 +198,45 @@ def test_snowml_signature_drop_input_cols(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m: LinearRegression - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = 
model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - np.testing.assert_allclose(np.array([[-0.08254936]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) s = regr.model_signatures # Compare the Model Signature without indexing - self.assertItemsEqual(s["predict"].to_dict(), meta.signatures["predict"].to_dict()) + self.assertItemsEqual(s["predict"].to_dict(), pk.meta.signatures["predict"].to_dict()) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) - np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) + np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) if __name__ == "__main__": diff --git a/snowflake/ml/model/_handlers_test/tensorflow_test.py b/snowflake/ml/model/_packager/model_handlers_test/tensorflow_test.py similarity index 62% rename from snowflake/ml/model/_handlers_test/tensorflow_test.py rename to snowflake/ml/model/_packager/model_handlers_test/tensorflow_test.py index a2a4ca91..41a78cb9 100644 --- a/snowflake/ml/model/_handlers_test/tensorflow_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/tensorflow_test.py @@ -7,7 +7,8 @@ import tensorflow as tf from absl.testing import absltest -from snowflake.ml.model import _model as model_api, model_signature +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager import model_packager from snowflake.ml.model._signatures import ( tensorflow_handler, utils as model_signature_utils, @@ -64,17 +65,15 @@ def test_tensorflow(self) -> None: y_pred = simple_module(x) s = {"__call__": model_signature.infer_signature([x], [y_pred])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=simple_module, signatures={**s, "another_forward": s["__call__"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=simple_module, signatures=s, metadata={"author": "halu", 
"version": "1"}, @@ -88,48 +87,67 @@ def test_tensorflow(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert callable(m) - tf.assert_equal(m.__call__(x), y_pred) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - assert callable(m_udf) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert callable(pk.model) + tf.assert_equal(pk.model(x), y_pred) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert callable(pk.model) tf.assert_equal( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[ - 0 - ], + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + pk.model(x_df), s["__call__"].outputs + )[0], y_pred, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")).save( name="model1_no_sig_1", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=simple_module, sample_input=[x], metadata={"author": "halu", "version": "1"}, ) - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) - assert callable(m) - tf.assert_equal(m(x), y_pred) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) - assert callable(m_udf) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load() + assert pk.model + assert pk.meta + assert callable(pk.model) + tf.assert_equal(pk.model(x), y_pred) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert callable(pk.model) tf.assert_equal( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[0], + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(pk.model(x_df), s["__call__"].outputs)[ + 0 + ], y_pred, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_2")).save( name="model1_no_sig_2", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), model=simple_module, sample_input=x_df, metadata={"author": "halu", "version": "1"}, ) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), as_custom_model=True) - assert callable(m_udf) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_2")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + assert callable(pk.model) tf.assert_equal( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[0], + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(pk.model(x_df), s["__call__"].outputs)[ + 0 + ], y_pred, ) @@ -138,17 +156,15 @@ def test_tensorflow_keras(self) -> None: model, data_x, data_y = _prepare_keras_model() s = {"predict": model_signature.infer_signature([data_x], [data_y])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures={**s, "another_forward": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - 
local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -164,11 +180,18 @@ def test_tensorflow_keras(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, tf.keras.Model) - tf.debugging.assert_near(m.predict(data_x), y_pred) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, tf.keras.Model) + tf.debugging.assert_near(pk.model.predict(data_x), y_pred) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) tf.debugging.assert_near( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( @@ -177,21 +200,26 @@ def test_tensorflow_keras(self) -> None: y_pred, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")).save( name="model1_no_sig_1", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model, sample_input=[data_x], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) - assert isinstance(m, tf.keras.Model) - tf.debugging.assert_near(m.predict(data_x), y_pred) - self.assertEqual(s["predict"], meta.signatures["predict"]) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, tf.keras.Model) + tf.debugging.assert_near(pk.model.predict(data_x), y_pred) + self.assertEqual(s["predict"], pk.meta.signatures["predict"]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) tf.debugging.assert_near( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( diff --git a/snowflake/ml/model/_handlers_test/torchscript_test.py b/snowflake/ml/model/_packager/model_handlers_test/torchscript_test.py similarity index 68% rename from snowflake/ml/model/_handlers_test/torchscript_test.py rename to snowflake/ml/model/_packager/model_handlers_test/torchscript_test.py index b471a1cb..fac3e679 100644 --- a/snowflake/ml/model/_handlers_test/torchscript_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/torchscript_test.py @@ -7,7 +7,8 @@ import torch from absl.testing import absltest -from snowflake.ml.model import _model as model_api, model_signature +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager import model_packager from snowflake.ml.model._signatures import ( pytorch_handler, utils as model_signature_utils, @@ -56,17 +57,15 @@ def test_torchscript(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"forward": model_signature.infer_signature([data_x], [data_y])} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( 
name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model_script, signatures={**s, "another_forward": s["forward"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=model_script, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -83,15 +82,21 @@ def test_torchscript(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x), y_pred) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, torch.jit.ScriptModule) # type:ignore[attr-defined] + torch.testing.assert_close(pk.model.forward(data_x), y_pred) with self.assertRaisesRegex(RuntimeError, "Attempting to deserialize object on a CUDA device"): - _, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), options={"use_gpu": True}) + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).load(options={"use_gpu": True}) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "forward", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "forward", None) assert callable(predict_method) torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( @@ -100,21 +105,26 @@ def test_torchscript(self) -> None: y_pred, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")).save( name="model1_no_sig_1", - local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model_script, sample_input=[data_x], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) - assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x), y_pred) - self.assertEqual(s["forward"], meta.signatures["forward"]) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) - predict_method = getattr(m_udf, "forward", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, torch.jit.ScriptModule) # type:ignore[attr-defined] + torch.testing.assert_close(pk.model.forward(data_x), y_pred) + self.assertEqual(s["forward"], pk.meta.signatures["forward"]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig_1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "forward", None) assert callable(predict_method) torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ diff --git a/snowflake/ml/model/_handlers_test/xgboost_test.py b/snowflake/ml/model/_packager/model_handlers_test/xgboost_test.py similarity index 54% rename from snowflake/ml/model/_handlers_test/xgboost_test.py rename to snowflake/ml/model/_packager/model_handlers_test/xgboost_test.py index 673c7636..3cd4e989 100644 --- 
a/snowflake/ml/model/_handlers_test/xgboost_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/xgboost_test.py @@ -8,7 +8,8 @@ from absl.testing import absltest from sklearn import datasets, model_selection -from snowflake.ml.model import _model as model_api, model_signature +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager import model_packager class XgboostHandlerTest(absltest.TestCase): @@ -23,17 +24,15 @@ def test_xgb_booster(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(cal_X_test, y_pred)} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -42,29 +41,40 @@ def test_xgb_booster(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, xgboost.Booster) - np.testing.assert_allclose(m.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, xgboost.Booster) + np.testing.assert_allclose(pk.model.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regressor, sample_input=cal_X_test, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - assert isinstance(m, xgboost.Booster) - np.testing.assert_allclose(m.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) - self.assertEqual(s["predict"], meta.signatures["predict"]) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, xgboost.Booster) + np.testing.assert_allclose(pk.model.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) + self.assertEqual(s["predict"], pk.meta.signatures["predict"]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) @@ -80,17 +90,15 @@ def test_xgb(self) -> None: with 
tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(cal_X_test, y_pred)} with self.assertRaises(ValueError): - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( name="model1", - local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -99,34 +107,45 @@ def test_xgb(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) - assert isinstance(m, xgboost.XGBClassifier) - np.testing.assert_allclose(m.predict(cal_X_test), y_pred) - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, xgboost.XGBClassifier) + np.testing.assert_allclose(pk.model.predict(cal_X_test), y_pred) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) - model_api._save( + model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")).save( name="model1_no_sig", - local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regressor, sample_input=cal_X_test, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) - assert isinstance(m, xgboost.XGBClassifier) - np.testing.assert_allclose(m.predict(cal_X_test), y_pred) - np.testing.assert_allclose(m.predict_proba(cal_X_test), y_pred_proba) - self.assertEqual(s["predict"], meta.signatures["predict"]) - - m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) - predict_method = getattr(m_udf, "predict", None) + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, xgboost.XGBClassifier) + np.testing.assert_allclose(pk.model.predict(cal_X_test), y_pred) + np.testing.assert_allclose(pk.model.predict_proba(cal_X_test), y_pred_proba) + self.assertEqual(s["predict"], pk.meta.signatures["predict"]) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1_no_sig")) + pk.load(as_custom_model=True) + assert pk.model + assert pk.meta + predict_method = getattr(pk.model, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) - predict_method = getattr(m_udf, "predict_proba", None) + predict_method = getattr(pk.model, "predict_proba", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), y_pred_proba) diff --git a/snowflake/ml/model/_packager/model_meta/BUILD.bazel b/snowflake/ml/model/_packager/model_meta/BUILD.bazel new file mode 100644 index 00000000..8a55af08 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta/BUILD.bazel @@ -0,0 +1,74 @@ 
+load("//bazel:py_rules.bzl", "py_genrule", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +GEN_CORE_REQ_CMD = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode version_requirements --format python --filter_by_tag deployment_core > $@" + +py_genrule( + name = "gen_core_requirements", + srcs = [ + "//:requirements.yml", + "//bazel/requirements:requirements.schema.json", + ], + outs = ["_core_requirements.py"], + cmd = GEN_CORE_REQ_CMD, + tools = ["//bazel/requirements:parse_and_generate_requirements"], +) + +py_library( + name = "_core_requirements", + srcs = [":gen_core_requirements"], +) + +py_library( + name = "model_meta_schema", + srcs = ["model_meta_schema.py"], + deps = [ + "//snowflake/ml/model:type_hints", + ], +) + +py_library( + name = "model_blob_meta", + srcs = ["model_blob_meta.py"], + deps = [ + "//snowflake/ml/model:type_hints", + ], +) + +py_library( + name = "model_meta", + srcs = ["model_meta.py"], + deps = [ + ":_core_requirements", + ":model_blob_meta", + ":model_meta_schema", + "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_packager/model_env", + "//snowflake/ml/model/_packager/model_meta_migrator:migrator_plans", + ], +) + +py_test( + name = "model_meta_test", + srcs = ["model_meta_test.py"], + deps = [ + ":model_meta", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/model:model_signature", + ], +) + +py_test( + name = "model_meta_schema_test", + srcs = ["model_meta_schema_test.py"], + deps = [ + ":model_meta_schema", + "//snowflake/ml/_internal:env", + "//snowflake/ml/test_utils:test_env_utils", + ], +) diff --git a/snowflake/ml/model/_packager/model_meta/model_blob_meta.py b/snowflake/ml/model/_packager/model_meta/model_blob_meta.py new file mode 100644 index 00000000..7a7c8993 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta/model_blob_meta.py @@ -0,0 +1,48 @@ +from typing import Dict, cast + +from typing_extensions import Unpack + +from snowflake.ml.model._packager.model_meta import model_meta_schema + + +class ModelBlobMeta: + """Metadata of an individual model blob (sub-model) in the packed model. + + Attributes: + name: The name to refer the sub-model. + model_type: The type of the model and handler to use. + path: Path to the picked model file. It is a relative path from the model blob directory. + handler_version: The version of the handler. + artifacts: Optional, used in custom model to show the mapping between artifact name and relative path + from the model blob directory. 
+ options: Optional, used for some model specific metadata storage + """ + + def __init__(self, **kwargs: Unpack[model_meta_schema.ModelBlobMetadataDict]) -> None: + self.name = kwargs["name"] + self.model_type = kwargs["model_type"] + self.path = kwargs["path"] + self.handler_version = kwargs["handler_version"] + + self.artifacts: Dict[str, str] = {} + artifacts = kwargs.get("artifacts", None) + if artifacts: + self.artifacts = artifacts + + self.options: model_meta_schema.ModelBlobOptions = cast( + model_meta_schema.ModelBlobOptions, kwargs.get("options", {}) + ) + + def to_dict(self) -> model_meta_schema.ModelBlobMetadataDict: + return model_meta_schema.ModelBlobMetadataDict( + name=self.name, + model_type=self.model_type, + path=self.path, + handler_version=self.handler_version, + artifacts=self.artifacts, + options=self.options, + ) + + @classmethod + def from_dict(cls, blob_dict: model_meta_schema.ModelBlobMetadataDict) -> "ModelBlobMeta": + return cls(**blob_dict) diff --git a/snowflake/ml/model/_packager/model_meta/model_meta.py b/snowflake/ml/model/_packager/model_meta/model_meta.py new file mode 100644 index 00000000..e787e0e1 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta/model_meta.py @@ -0,0 +1,357 @@ +import importlib +import os +import pathlib +import sys +import tempfile +import zipfile +from contextlib import contextmanager +from datetime import datetime +from types import ModuleType +from typing import Any, Dict, Generator, List, Optional + +import cloudpickle +import yaml +from packaging import version + +from snowflake.ml._internal import env as snowml_env, env_utils, file_utils +from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model._packager.model_env import model_env +from snowflake.ml.model._packager.model_meta import ( + _core_requirements, + model_blob_meta, + model_meta_schema, +) +from snowflake.ml.model._packager.model_meta_migrator import migrator_plans + +MODEL_METADATA_FILE = "model.yaml" +MODEL_CODE_DIR = "code" + +_PACKAGING_CORE_DEPENDENCIES = _core_requirements.REQUIREMENTS +_SNOWFLAKE_PKG_NAME = "snowflake" +_SNOWFLAKE_ML_PKG_NAME = f"{_SNOWFLAKE_PKG_NAME}.ml" + + +@contextmanager +def create_model_metadata( + *, + model_dir_path: str, + name: str, + model_type: model_types.SupportedModelHandlerType, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + metadata: Optional[Dict[str, str]] = None, + code_paths: Optional[List[str]] = None, + ext_modules: Optional[List[ModuleType]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + **kwargs: Any, +) -> Generator["ModelMetadata", None, None]: + """Create a generator for model metadata object. Use generator to ensure correct register and unregister for + cloudpickle. + + Args: + model_dir_path: Path to the directory containing the model to be packed. + name: Name of the model. + model_type: Type of the model. + signatures: Signatures of the model. If None, it will be inferred after the model meta is created. + Defaults to None. + metadata: User provided key-value metadata of the model. Defaults to None. + code_paths: List of paths to additional codes that needs to be packed with. Defaults to None. + ext_modules: List of names of modules that need to be pickled with the model. Defaults to None. + conda_dependencies: List of conda requirements for running the model. Defaults to None. 
+ pip_requirements: List of pip Python packages requirements for running the model. Defaults to None. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + **kwargs: Dict of attributes and values of the metadata. Used when loading from file. + + Raises: + ValueError: Raised when the code path contains reserved file or directory. + + Yields: + A model metadata object. + """ + model_dir_path = os.path.normpath(model_dir_path) + embed_local_ml_library = kwargs.pop("embed_local_ml_library", False) + # Use the last one which is loaded first, that is mean, it is loaded from site-packages. + # We could make sure that user does not overwrite our library with their code follow the same naming. + snowml_path, snowml_start_path = file_utils.get_package_path(_SNOWFLAKE_ML_PKG_NAME, strategy="last") + if os.path.isdir(snowml_start_path): + path_to_copy = snowml_path + # If the package is zip-imported, then the path will be `../path_to_zip.zip/snowflake/ml` + # It is not a valid path in fact and we need to get the path to the zip file to verify it. + elif os.path.isfile(snowml_start_path): + extract_root = tempfile.mkdtemp() + with zipfile.ZipFile(os.path.abspath(snowml_start_path), mode="r", compression=zipfile.ZIP_DEFLATED) as zf: + zf.extractall(path=extract_root) + path_to_copy = os.path.join(extract_root, *(_SNOWFLAKE_ML_PKG_NAME.split("."))) + else: + raise ValueError("`snowflake.ml` is imported via a way that embedding local ML library is not supported.") + + env = _create_env_for_model_metadata( + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + python_version=python_version, + embed_local_ml_library=embed_local_ml_library, + ) + + if embed_local_ml_library: + env.snowpark_ml_version = f"{snowml_env.VERSION}+{file_utils.hash_directory(path_to_copy)}" + + model_meta = ModelMetadata( + name=name, + env=env, + metadata=metadata, + model_type=model_type, + signatures=signatures, + ) + + code_dir_path = os.path.join(model_dir_path, MODEL_CODE_DIR) + if embed_local_ml_library or code_paths: + os.makedirs(code_dir_path, exist_ok=True) + + if embed_local_ml_library: + snowml_path_in_code = os.path.join(code_dir_path, _SNOWFLAKE_PKG_NAME) + os.makedirs(snowml_path_in_code, exist_ok=True) + file_utils.copy_file_or_tree(path_to_copy, snowml_path_in_code) + + if code_paths: + for code_path in code_paths: + # This part is to prevent users from providing code following our naming and overwrite our code. 
+ if ( + os.path.isfile(code_path) and os.path.splitext(os.path.basename(code_path))[0] == _SNOWFLAKE_PKG_NAME + ) or (os.path.isdir(code_path) and os.path.basename(code_path) == _SNOWFLAKE_PKG_NAME): + raise ValueError("`snowflake` is a reserved name and you cannot contain that into code path.") + file_utils.copy_file_or_tree(code_path, code_dir_path) + + try: + imported_modules = [] + if ext_modules: + registered_modules = cloudpickle.list_registry_pickle_by_value() + for mod in ext_modules: + if mod.__name__ not in registered_modules: + cloudpickle.register_pickle_by_value(mod) + imported_modules.append(mod) + yield model_meta + model_meta.save(model_dir_path) + finally: + for mod in imported_modules: + cloudpickle.unregister_pickle_by_value(mod) + + +def _create_env_for_model_metadata( + *, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + embed_local_ml_library: bool = False, +) -> model_env.ModelEnv: + env = model_env.ModelEnv() + + # Mypy doesn't like getter and setter have different types. See python/mypy #3004 + env.conda_dependencies = conda_dependencies # type: ignore[assignment] + env.pip_requirements = pip_requirements # type: ignore[assignment] + env.python_version = python_version # type: ignore[assignment] + env.snowpark_ml_version = snowml_env.VERSION + if embed_local_ml_library: + env.include_if_absent( + [model_env.ModelDependency(requirement=dep, pip_name=dep) for dep in _PACKAGING_CORE_DEPENDENCIES], + check_local_version=True, + ) + else: + env.include_if_absent( + [ + model_env.ModelDependency(requirement=dep, pip_name=dep) + for dep in _PACKAGING_CORE_DEPENDENCIES + [env_utils.SNOWPARK_ML_PKG_NAME] + ], + check_local_version=True, + ) + + return env + + +def load_code_path(model_dir_path: str) -> None: + """Load custom code in the code path into memory. + + Args: + model_dir_path: Path to the directory containing the model to be loaded. + + """ + code_path = os.path.join(model_dir_path, MODEL_CODE_DIR) + if os.path.exists(code_path): + if code_path in sys.path: + sys.path.remove(code_path) + sys.path.insert(0, code_path) + module_names = file_utils.get_all_modules(code_path) + # If the module_name starts with snowflake, then do not replace it. + # When deploying, we would add them beforehand. + # When in the local, they should not be added. We already prevent user from overwriting us. + module_names = [ + module_name + for module_name in module_names + if not (module_name.startswith(f"{_SNOWFLAKE_PKG_NAME}.") or module_name == _SNOWFLAKE_PKG_NAME) + ] + for module_name in module_names: + actual_module = sys.modules.pop(module_name, None) + if actual_module is not None: + sys.modules[module_name] = importlib.import_module(module_name) + + assert code_path in sys.path + sys.path.remove(code_path) + + +class ModelMetadata: + """Model metadata for Snowflake native model packaged model. + + Attributes: + name: Name of the model. + model_type: Type of the model. + env: ModelEnv object containing all environment related object + models: Dict of model blob metadata + signatures: A dict mapping from target function name to input and output signatures. + metadata: User provided key-value metadata of the model. Defaults to None. + creation_timestamp: Unix timestamp when the model metadata is created. 
+ """ + + def __init__( + self, + *, + name: str, + env: model_env.ModelEnv, + model_type: model_types.SupportedModelHandlerType, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + metadata: Optional[Dict[str, str]] = None, + creation_timestamp: Optional[str] = None, + min_snowpark_ml_version: Optional[str] = None, + models: Optional[Dict[str, model_blob_meta.ModelBlobMeta]] = None, + original_metadata_version: Optional[str] = model_meta_schema.MODEL_METADATA_VERSION, + ) -> None: + self.name = name + self.signatures: Dict[str, model_signature.ModelSignature] = dict() + if signatures: + self.signatures = signatures + self.metadata = metadata + self.model_type = model_type + self.env = env + self.creation_timestamp = creation_timestamp if creation_timestamp else str(datetime.utcnow()) + self._min_snowpark_ml_version = version.parse( + min_snowpark_ml_version + if min_snowpark_ml_version + else model_meta_schema.MODEL_METADATA_MIN_SNOWPARK_ML_VERSION + ) + + self.models: Dict[str, model_blob_meta.ModelBlobMeta] = dict() + if models: + self.models = models + + self.original_metadata_version = original_metadata_version + + @property + def min_snowpark_ml_version(self) -> str: + return self._min_snowpark_ml_version.base_version + + @min_snowpark_ml_version.setter + def min_snowpark_ml_version(self, min_snowpark_ml_version: str) -> None: + parsed_min_snowpark_ml_version = version.parse(min_snowpark_ml_version) + self._min_snowpark_ml_version = max(self._min_snowpark_ml_version, parsed_min_snowpark_ml_version) + + def save(self, model_dir_path: str) -> None: + """Save the model metadata + + Raises: + RuntimeError: Raised when the metadata is not ready to save + + Args: + model_dir_path: Path to the directory containing the model to be loaded. 
+ """ + model_yaml_path = os.path.join(model_dir_path, MODEL_METADATA_FILE) + + if (not self.signatures) or (self.name not in self.models): + raise RuntimeError("The meta data is not ready to save.") + + model_dict = model_meta_schema.ModelMetadataDict( + { + "creation_timestamp": self.creation_timestamp, + "env": self.env.save_as_dict(pathlib.Path(model_dir_path)), + "metadata": self.metadata, + "model_type": self.model_type, + "models": {model_name: blob.to_dict() for model_name, blob in self.models.items()}, + "name": self.name, + "signatures": {func_name: sig.to_dict() for func_name, sig in self.signatures.items()}, + "version": model_meta_schema.MODEL_METADATA_VERSION, + "min_snowpark_ml_version": self.min_snowpark_ml_version, + } + ) + + with open(model_yaml_path, "w", encoding="utf-8") as out: + yaml.safe_dump( + model_dict, + stream=out, + default_flow_style=False, + ) + + @staticmethod + def _validate_model_metadata(loaded_meta: Any) -> model_meta_schema.ModelMetadataDict: + if not isinstance(loaded_meta, dict): + raise ValueError(f"Read ill-formatted model metadata, should be a dict, received {type(loaded_meta)}") + + original_loaded_meta_version = loaded_meta.get("version", None) + if not original_loaded_meta_version: + raise ValueError("Unable to get the version of the metadata file.") + + loaded_meta = migrator_plans.migrate_metadata(loaded_meta) + + loaded_meta_min_snowpark_ml_version = loaded_meta.get("min_snowpark_ml_version", None) + if not loaded_meta_min_snowpark_ml_version or version.parse( + loaded_meta_min_snowpark_ml_version + ) < version.parse(snowml_env.VERSION): + raise RuntimeError( + f"The minimal version required to load the model is {loaded_meta_min_snowpark_ml_version}," + f"while current version of Snowpark ML library is {snowml_env.VERSION}." + ) + return model_meta_schema.ModelMetadataDict( + creation_timestamp=loaded_meta["creation_timestamp"], + env=loaded_meta["env"], + metadata=loaded_meta.get("metadata", None), + model_type=loaded_meta["model_type"], + models=loaded_meta["models"], + name=loaded_meta["name"], + signatures=loaded_meta["signatures"], + version=original_loaded_meta_version, + min_snowpark_ml_version=loaded_meta_min_snowpark_ml_version, + ) + + @classmethod + def load(cls, model_dir_path: str) -> "ModelMetadata": + """Load models for a directory. Model is initially loaded normally. If additional codes are included when + packed, the code path is added to system path to be imported with highest priority. + + Args: + model_dir_path: Path to the directory containing the model to be loaded. + + Returns: + A model metadata object. 
+ """ + model_yaml_path = os.path.join(model_dir_path, MODEL_METADATA_FILE) + with open(model_yaml_path, encoding="utf-8") as f: + loaded_meta = yaml.safe_load(f.read()) + + model_dict = cls._validate_model_metadata(loaded_meta) + + signatures = { + func_name: model_signature.ModelSignature.from_dict(sig) + for func_name, sig in model_dict["signatures"].items() + } + models = {name: model_blob_meta.ModelBlobMeta(**blob_meta) for name, blob_meta in model_dict["models"].items()} + env = model_env.ModelEnv() + env.load_from_dict(pathlib.Path(model_dir_path), model_dict["env"]) + return cls( + name=model_dict["name"], + model_type=model_dict["model_type"], + env=env, + signatures=signatures, + metadata=model_dict.get("metadata", None), + creation_timestamp=model_dict["creation_timestamp"], + min_snowpark_ml_version=model_dict["min_snowpark_ml_version"], + models=models, + original_metadata_version=model_dict["version"], + ) diff --git a/snowflake/ml/model/_packager/model_meta/model_meta_schema.py b/snowflake/ml/model/_packager/model_meta/model_meta_schema.py new file mode 100644 index 00000000..e1473a67 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta/model_meta_schema.py @@ -0,0 +1,70 @@ +# This files contains schema definition of what will be written into model.yml +# Changing this file should lead to a change of the schema version. + +from typing import Any, Dict, Optional, TypedDict, Union + +from typing_extensions import NotRequired, Required + +from snowflake.ml.model import type_hints + +MODEL_METADATA_VERSION = "2023-12-01" +MODEL_METADATA_MIN_SNOWPARK_ML_VERSION = "1.0.12" + + +class ModelEnvDict(TypedDict): + conda: Required[str] + pip: Required[str] + python_version: Required[str] + cuda_version: NotRequired[Optional[str]] + snowpark_ml_version: Required[str] + + +class BaseModelBlobOptions(TypedDict): + ... 
+ + +class HuggingFacePipelineModelBlobOptions(BaseModelBlobOptions): + task: Required[str] + batch_size: Required[int] + + +class LLMModelBlobOptions(BaseModelBlobOptions): + batch_size: Required[int] + + +class MLFlowModelBlobOptions(BaseModelBlobOptions): + artifact_path: Required[str] + + +class XgboostModelBlobOptions(BaseModelBlobOptions): + xgb_estimator_type: Required[str] + + +ModelBlobOptions = Union[ + BaseModelBlobOptions, + HuggingFacePipelineModelBlobOptions, + LLMModelBlobOptions, + MLFlowModelBlobOptions, + XgboostModelBlobOptions, +] + + +class ModelBlobMetadataDict(TypedDict): + name: Required[str] + model_type: Required[type_hints.SupportedModelHandlerType] + path: Required[str] + handler_version: Required[str] + artifacts: NotRequired[Dict[str, str]] + options: NotRequired[ModelBlobOptions] + + +class ModelMetadataDict(TypedDict): + creation_timestamp: Required[str] + env: Required[ModelEnvDict] + metadata: NotRequired[Optional[Dict[str, str]]] + model_type: Required[type_hints.SupportedModelHandlerType] + models: Required[Dict[str, ModelBlobMetadataDict]] + name: Required[str] + signatures: Required[Dict[str, Dict[str, Any]]] + version: Required[str] + min_snowpark_ml_version: Required[str] diff --git a/snowflake/ml/model/_packager/model_meta/model_meta_schema_test.py b/snowflake/ml/model/_packager/model_meta/model_meta_schema_test.py new file mode 100644 index 00000000..c4ad1b4a --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta/model_meta_schema_test.py @@ -0,0 +1,21 @@ +import datetime + +from absl.testing import absltest + +from snowflake.ml._internal import env as snowml_env +from snowflake.ml.model._packager.model_meta import model_meta_schema +from snowflake.ml.test_utils import test_env_utils + + +class ModelMetaSchemaTest(absltest.TestCase): + def test_model_meta_schema_version(self) -> None: + datetime.datetime.strptime(model_meta_schema.MODEL_METADATA_VERSION, "%Y-%m-%d") + if model_meta_schema.MODEL_METADATA_MIN_SNOWPARK_ML_VERSION != snowml_env.VERSION: + self.assertIn( + model_meta_schema.MODEL_METADATA_MIN_SNOWPARK_ML_VERSION, + test_env_utils.get_snowpark_ml_released_versions(), + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_packager/model_meta/model_meta_test.py b/snowflake/ml/model/_packager/model_meta/model_meta_test.py new file mode 100644 index 00000000..85f0390f --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta/model_meta_test.py @@ -0,0 +1,293 @@ +import os +import tempfile +from importlib import metadata as importlib_metadata + +import yaml +from absl.testing import absltest +from packaging import requirements, version + +from snowflake.ml._internal import env as snowml_env, env_utils +from snowflake.ml.model import model_signature +from snowflake.ml.model._packager.model_meta import model_blob_meta, model_meta + +_DUMMY_SIG = { + "predict": model_signature.ModelSignature( + inputs=[ + model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="input"), + ], + outputs=[model_signature.FeatureSpec(name="output", dtype=model_signature.DataType.FLOAT)], + ) +} + +_DUMMY_BLOB = model_blob_meta.ModelBlobMeta( + name="model1", model_type="custom", path="mock_path", handler_version="version_0" +) + +_BASIC_DEPENDENCIES_TARGET = list( + sorted( + map( + lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), + model_meta._PACKAGING_CORE_DEPENDENCIES, + ) + ) +) + +_BASIC_DEPENDENCIES_TARGET_WITH_SNOWML = list( + sorted( + map( + lambda x: 
str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), + model_meta._PACKAGING_CORE_DEPENDENCIES + [env_utils.SNOWPARK_ML_PKG_NAME], + ) + ) +) + + +class ModelMetaEnvTest(absltest.TestCase): + def test_model_meta_dependencies_no_packages(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML) + self.assertEqual(meta.env.snowpark_ml_version, snowml_env.VERSION) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML) + self.assertEqual(meta.env.snowpark_ml_version, snowml_env.VERSION) + + def test_model_meta_dependencies_no_packages_embedded_snowml(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + embed_local_ml_library=True, + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, _BASIC_DEPENDENCIES_TARGET) + self.assertIsNotNone(meta.env._snowpark_ml_version.local) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, _BASIC_DEPENDENCIES_TARGET) + self.assertIsNotNone(meta.env._snowpark_ml_version.local) + + def test_model_meta_dependencies_dup_basic_dep(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["pandas"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] + dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") + dep_target.append("pandas") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_dup_basic_dep_other_channel(self) -> None: + with self.assertWarns(UserWarning): + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["conda-forge::pandas"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] + dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") + dep_target.append("conda-forge::pandas") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + with self.assertWarns(UserWarning): + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, 
dep_target) + + def test_model_meta_dependencies_dup_basic_dep_pip(self) -> None: + with self.assertWarns(UserWarning): + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + pip_requirements=["pandas"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] + dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, ["pandas"]) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + with self.assertWarns(UserWarning): + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, ["pandas"]) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_conda(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["pytorch"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] + dep_target.append("pytorch") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_pip(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + pip_requirements=["torch"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, ["torch"]) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, ["torch"]) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_both(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["pytorch"], + pip_requirements=["torch"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] + dep_target.append("pytorch") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, ["torch"]) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, ["torch"]) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_override_py_version(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, python_version="2.7" + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + self.assertEqual(meta.env.python_version, "2.7") + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + 
self.assertEqual(loaded_meta.env.python_version, "2.7") + + with tempfile.TemporaryDirectory() as tmpdir: + with self.assertRaises(version.InvalidVersion): + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, python_version="a" + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + + def test_model_meta_metadata(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + metadata={"foo": "bar"}, + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + saved_meta = meta + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertEqual(saved_meta.metadata, loaded_meta.metadata) + + def test_model_meta_check(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + metadata={"foo": "bar"}, + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + with open(os.path.join(tmpdir, model_meta.MODEL_METADATA_FILE), encoding="utf-8") as f: + meta_yaml_data = yaml.safe_load(f) + + del meta_yaml_data["version"] + + with open(os.path.join(tmpdir, model_meta.MODEL_METADATA_FILE), "w", encoding="utf-8") as f: + yaml.safe_dump(meta_yaml_data, f) + + with self.assertRaisesRegex(ValueError, "Unable to get the version of the metadata file."): + model_meta.ModelMetadata.load(tmpdir) + + def test_model_meta_cuda(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + meta.env.cuda_version = "11.7" + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertEqual(loaded_meta.env.cuda_version, "11.7") + + with self.assertRaisesRegex(ValueError, "Different CUDA version .+ and .+ found in the same model!"): + loaded_meta.env.cuda_version = "12.0" + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_packager/model_meta_migrator/BUILD.bazel b/snowflake/ml/model/_packager/model_meta_migrator/BUILD.bazel new file mode 100644 index 00000000..68e43653 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/BUILD.bazel @@ -0,0 +1,55 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "base_migrator", + srcs = ["base_migrator.py"], + deps = [ + "//snowflake/ml/_internal:migrator_utils", + ], +) + +py_test( + name = "base_migrator_test", + srcs = ["base_migrator_test.py"], + deps = [ + ":base_migrator", + "//snowflake/ml/_internal:migrator_utils", + ], +) + +py_library( + name = "migrator_plans", + srcs = ["migrator_plans.py"], + deps = [ + ":base_migrator", + ":migrator_v1", + ], +) + +py_test( + name = "migrator_plans_test", + srcs = ["migrator_plans_test.py"], + deps = [ + ":migrator_plans", + "//snowflake/ml/model/_packager/model_meta:model_meta_schema", + ], +) + +py_library( + name = "migrator_v1", + srcs = ["migrator_v1.py"], + deps = [ + ":base_migrator", + "//snowflake/ml/_internal:env", + ], +) + +py_test( + name = "migrator_v1_test", + srcs = ["migrator_v1_test.py"], + deps = [ + "//snowflake/ml/model/_packager/model_meta", + ], +) diff --git a/snowflake/ml/model/_packager/model_meta_migrator/base_migrator.py 
b/snowflake/ml/model/_packager/model_meta_migrator/base_migrator.py new file mode 100644 index 00000000..94e35e15 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/base_migrator.py @@ -0,0 +1,33 @@ +import copy +from abc import abstractmethod +from typing import Any, Dict, Protocol, final + +from snowflake.ml._internal import migrator_utils + + +class _BaseModelMetaMigratorProtocol(Protocol): + source_version: str + target_version: str + + @staticmethod + @abstractmethod + def upgrade(original_meta_dict: Dict[str, Any]) -> Dict[str, Any]: + raise NotImplementedError + + +class BaseModelMetaMigrator(_BaseModelMetaMigratorProtocol): + @final + def try_upgrade(self, original_meta_dict: Dict[str, Any]) -> Dict[str, Any]: + loaded_meta_version = original_meta_dict.get("version", None) + if not loaded_meta_version or str(loaded_meta_version) != self.source_version: + raise NotImplementedError( + f"Unknown or unsupported model metadata file with version {loaded_meta_version} found." + ) + try: + return self.upgrade(copy.deepcopy(original_meta_dict)) + except migrator_utils.UnableToUpgradeError as e: + raise RuntimeError( + f"Can not upgrade your model metadata from version {self.__class__.source_version} to" + f" {self.__class__.target_version}." + f"The latest version support the original version of Snowpark ML library is {e.last_supported_version}." + ) diff --git a/snowflake/ml/model/_packager/model_meta_migrator/base_migrator_test.py b/snowflake/ml/model/_packager/model_meta_migrator/base_migrator_test.py new file mode 100644 index 00000000..f0bf407e --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/base_migrator_test.py @@ -0,0 +1,54 @@ +from typing import Any, Dict + +from absl.testing import absltest + +from snowflake.ml._internal import migrator_utils +from snowflake.ml.model._packager.model_meta_migrator import base_migrator + + +class MetaMigrator_1(base_migrator.BaseModelMetaMigrator): + source_version = "version_0" + target_version = "version_1" + + @staticmethod + def upgrade(original_meta_dict: Dict[str, Any]) -> Dict[str, Any]: + return original_meta_dict + + +class MetaMigrator_2(base_migrator.BaseModelMetaMigrator): + source_version = "version_1" + target_version = "version_2" + + @staticmethod + def upgrade(original_meta_dict: Dict[str, Any]) -> Dict[str, Any]: + raise migrator_utils.UnableToUpgradeError(last_supported_version="1.0.9") + + +class BaseMigratorTest(absltest.TestCase): + def test_model_meta_dependencies_no_packages(self) -> None: + bad_meta: Dict[str, Any] = {} + migrator_1 = MetaMigrator_1() + with self.assertRaisesRegex( + NotImplementedError, + "Unknown or unsupported model metadata file with version .* found.", + ): + migrator_1.try_upgrade(bad_meta) + + good_meta = {"version": "version_0"} + + self.assertDictEqual(good_meta, migrator_1.try_upgrade(good_meta)) + self.assertIsNot(good_meta, migrator_1.try_upgrade(good_meta)) + + migrator_2 = MetaMigrator_2() + with self.assertRaisesRegex( + RuntimeError, + ( + "Can not upgrade your model metadata from version version_1 to version_2." + "The latest version support the original version of Snowpark ML library is 1.0.9." 
+ ), + ): + migrator_2.try_upgrade({"version": "version_1"}) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py b/snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py new file mode 100644 index 00000000..50ca1cf8 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py @@ -0,0 +1,21 @@ +from typing import Any, Dict, Type + +from snowflake.ml.model._packager.model_meta import model_meta_schema +from snowflake.ml.model._packager.model_meta_migrator import base_migrator, migrator_v1 + +MODEL_META_MIGRATOR_PLANS: Dict[str, Type[base_migrator.BaseModelMetaMigrator]] = {"1": migrator_v1.MetaMigrator_v1} + + +def migrate_metadata(loaded_meta: Dict[str, Any]) -> Dict[str, Any]: + loaded_meta_version = str(loaded_meta.get("version", None)) + while loaded_meta_version != model_meta_schema.MODEL_METADATA_VERSION: + if loaded_meta_version not in MODEL_META_MIGRATOR_PLANS.keys(): + raise RuntimeError( + f"Can not find migrator to migrate model metadata from {loaded_meta_version}" + f" to version {model_meta_schema.MODEL_METADATA_VERSION}." + ) + migrator = MODEL_META_MIGRATOR_PLANS[loaded_meta_version]() + loaded_meta = migrator.try_upgrade(original_meta_dict=loaded_meta) + loaded_meta_version = str(loaded_meta["version"]) + + return loaded_meta diff --git a/snowflake/ml/model/_packager/model_meta_migrator/migrator_plans_test.py b/snowflake/ml/model/_packager/model_meta_migrator/migrator_plans_test.py new file mode 100644 index 00000000..331172ef --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/migrator_plans_test.py @@ -0,0 +1,48 @@ +import datetime + +from absl.testing import absltest + +from snowflake.ml.model._packager.model_meta import model_meta_schema +from snowflake.ml.model._packager.model_meta_migrator import migrator_plans + + +class ModelMetaMigratorTest(absltest.TestCase): + def test_registered_handler(self) -> None: + all_source_versions = set() + all_target_versions = set() + for source_version, migrator_plan in migrator_plans.MODEL_META_MIGRATOR_PLANS.items(): + self.assertNotEqual( + model_meta_schema.MODEL_METADATA_VERSION, + source_version, + "There shouldn't be a migrator whose source version is current handler version.", + ) + self.assertEqual( + source_version, + migrator_plan.source_version, + "There shouldn't be a migrator whose source version does not equal to the key in the plans.", + ) + if source_version == "1": + # Legacy check + self.assertEqual(migrator_plan.target_version, "2023-12-01") + else: + self.assertLess( + datetime.datetime.strptime(migrator_plan.source_version, "%Y-%m-%d"), + datetime.datetime.strptime(migrator_plan.target_version, "%Y-%m-%d"), + "Migrator should not be able to downgrade.", + ) + if migrator_plan.target_version != model_meta_schema.MODEL_METADATA_VERSION: + self.assertIn( + migrator_plan.target_version, + migrator_plans.MODEL_META_MIGRATOR_PLANS.keys(), + ( + "There shouldn't be a migrator whose target version " + "is not current version and has not a migrator plan" + ), + ) + all_source_versions.add(migrator_plan.source_version) + all_target_versions.add(migrator_plan.target_version) + self.assertEqual(len(all_source_versions), len(all_target_versions), "The migrator plan is not monotonic.") + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_packager/model_meta_migrator/migrator_v1.py b/snowflake/ml/model/_packager/model_meta_migrator/migrator_v1.py new file mode 100644 
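# A sketch of how migrate_metadata() above walks the registered plans: a version-"1"
# metadata dict is upgraded plan by plan until it reaches MODEL_METADATA_VERSION.
# The dict below is a hypothetical, minimal legacy document modeled on the fixtures
# in migrator_v1_test.py.
from snowflake.ml.model._packager.model_meta_migrator import migrator_plans

legacy_meta = {
    "version": 1,
    "python_version": "3.8.13",
    "local_ml_library_version": "1.0.9",
    "creation_timestamp": "2023-09-21 18:12:39.409911",
    "metadata": None,
    "model_type": "custom",
    "models": {"model1": {"name": "model1", "model_type": "custom", "path": "model", "artifacts": {}}},
    "name": "model1",
    "signatures": {},
    "conda_dependencies": [],
    "pip_requirements": [],
}

migrated = migrator_plans.migrate_metadata(legacy_meta)
assert migrated["version"] == "2023-12-01"
assert migrated["models"]["model1"]["handler_version"] == "2023-12-01"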
index 00000000..2fcb7c62 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/migrator_v1.py @@ -0,0 +1,48 @@ +from typing import Any, Dict + +from packaging import requirements, version + +from snowflake.ml._internal import env as snowml_env +from snowflake.ml.model._packager.model_meta_migrator import base_migrator + + +class MetaMigrator_v1(base_migrator.BaseModelMetaMigrator): + source_version = "1" + target_version = "2023-12-01" + + @staticmethod + def upgrade(original_meta_dict: Dict[str, Any]) -> Dict[str, Any]: + loaded_python_version = version.parse(original_meta_dict["python_version"]) + if original_meta_dict.get("local_ml_library_version", None): + loaded_lib_version = str(version.parse(original_meta_dict["local_ml_library_version"])) + else: + lib_spec_str = next( + filter( + lambda x: requirements.Requirement(x).name == "snowflake-ml-python", + original_meta_dict["conda_dependencies"], + ), + None, + ) + if lib_spec_str is None: + loaded_lib_version = snowml_env.VERSION + loaded_lib_version = list(requirements.Requirement(str(lib_spec_str)).specifier)[0].version + + return dict( + creation_timestamp=original_meta_dict["creation_timestamp"], + env=dict( + conda="env/conda.yaml", + pip="env/requirements.txt", + python_version=f"{loaded_python_version.major}.{loaded_python_version.minor}", + cuda_version=original_meta_dict.get("cuda_version", None), + snowpark_ml_version=loaded_lib_version, + ), + metadata=original_meta_dict.get("metadata", None), + model_type=original_meta_dict["model_type"], + models={ + name: {**value, "handler_version": "2023-12-01"} for name, value in original_meta_dict["models"].items() + }, + name=original_meta_dict["name"], + signatures=original_meta_dict["signatures"], + version=MetaMigrator_v1.target_version, + min_snowpark_ml_version="1.0.12", + ) diff --git a/snowflake/ml/model/_packager/model_meta_migrator/migrator_v1_test.py b/snowflake/ml/model/_packager/model_meta_migrator/migrator_v1_test.py new file mode 100644 index 00000000..735863d8 --- /dev/null +++ b/snowflake/ml/model/_packager/model_meta_migrator/migrator_v1_test.py @@ -0,0 +1,132 @@ +import os +import tempfile + +from absl.testing import absltest + +from snowflake.ml.model._packager.model_meta import model_meta + +YAML_1 = """ +conda_dependencies: +- absl-py==1.3.0 +- anyio==3.5.0 +- cloudpickle==2.0.0 +- numpy==1.24.3 +- packaging==23.0 +- pandas==1.5.3 +- pytorch==2.0.1 +- pyyaml==6.0 +- snowflake-snowpark-python==1.5.1 +- tokenizers==0.13.2 +- transformers==4.29.2 +- typing-extensions==4.5.0 +creation_timestamp: '2023-09-21 18:12:39.409911' +cuda_version: '11.7' +local_ml_library_version: 1.0.9+df2e394bae177167b9d9a8becc792ed899f3432d +metadata: null +model_type: huggingface_pipeline +models: + llama-2-7b-chat: + artifacts: {} + model_type: huggingface_pipeline + name: llama-2-7b-chat + options: + batch_size: '1' + task: text-generation + path: model +name: llama-2-7b-chat +pip_requirements: [] +python_version: 3.8.13 +signatures: + __call__: + inputs: + - name: inputs + type: STRING + outputs: + - name: outputs + type: STRING +version: 1 +""" + +CONDA_FILE = """ +channels: +- https://repo.anaconda.com/pkgs/snowflake +- nodefaults +dependencies: +- python==3.8.13 +- absl-py==1.3.0 +- anyio==3.5.0 +- cloudpickle==2.0.0 +- numpy==1.24.3 +- packaging==23.0 +- pandas==1.5.3 +- pyyaml==6.0 +- snowflake-snowpark-python==1.5.1 +- typing-extensions==4.5.0 +- transformers==4.29.2 +- tokenizers==0.13.2 +- pytorch==2.0.1 +name: snow-env +""" + +YAML_2 = """ +conda_dependencies: 
+- absl-py==1.3.0 +- anyio==3.5.0 +- cloudpickle==2.0.0 +- numpy==1.24.3 +- packaging==23.0 +- pandas==1.5.3 +- pytorch==2.0.1 +- pyyaml==6.0 +- snowflake-ml-python==1.0.9 +- snowflake-snowpark-python==1.5.1 +- tokenizers==0.13.2 +- transformers==4.29.2 +- typing-extensions==4.5.0 +creation_timestamp: '2023-09-21 18:12:39.409911' +cuda_version: '11.7' +metadata: null +model_type: huggingface_pipeline +models: + llama-2-7b-chat: + artifacts: {} + model_type: huggingface_pipeline + name: llama-2-7b-chat + options: + batch_size: '1' + task: text-generation + path: model +name: llama-2-7b-chat +pip_requirements: [] +python_version: 3.8.13 +signatures: + __call__: + inputs: + - name: inputs + type: STRING + outputs: + - name: outputs + type: STRING +version: 1 +""" + + +class MigratorV1Test(absltest.TestCase): + def test_yaml_load(self) -> None: + for yaml_str in [YAML_1, YAML_2]: + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, model_meta.MODEL_METADATA_FILE), "w", encoding="utf-8") as f: + f.write(yaml_str) + + os.makedirs(os.path.join(tmpdir, "env"), exist_ok=True) + with open(os.path.join(tmpdir, "env", "conda.yaml"), "w", encoding="utf-8") as f: + f.write(CONDA_FILE) + + with open(os.path.join(tmpdir, "env", "requirements.txt"), "w", encoding="utf-8") as f: + f.write("") + + model_meta.ModelMetadata.load(tmpdir) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_packager/model_packager.py b/snowflake/ml/model/_packager/model_packager.py new file mode 100644 index 00000000..71127327 --- /dev/null +++ b/snowflake/ml/model/_packager/model_packager.py @@ -0,0 +1,149 @@ +import os +from types import ModuleType +from typing import Dict, List, Optional + +from absl import logging + +from snowflake.ml._internal import env_utils +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) +from snowflake.ml.model import custom_model, model_signature, type_hints as model_types +from snowflake.ml.model._packager import model_handler +from snowflake.ml.model._packager.model_meta import model_meta + + +class ModelPackager: + """Top-level class to save/load and manage a Snowflake Native formatted model. + It maintains the actual model blob files, environment required by model itself and signatures to do the + inference with the model. + + Attributes: + local_dir_path: A path to a local directory will files to dump and load. + model: The model object to be saved / loaded from file. + meta: The model metadata (ModelMetadata object) to be saved / loaded from file. + model and meta will be set once save / load method is called. 
+ + """ + + MODEL_BLOBS_DIR = "models" + + def __init__(self, local_dir_path: str) -> None: + self.local_dir_path = os.path.normpath(local_dir_path) + self.model: Optional[model_types.SupportedModelType] = None + self.meta: Optional[model_meta.ModelMetadata] = None + + def save( + self, + *, + name: str, + model: model_types.SupportedModelType, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + sample_input: Optional[model_types.SupportedDataType] = None, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, + ) -> None: + if (signatures is None) and (sample_input is None) and not model_handler.is_auto_signature_model(model): + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + "Signatures and sample_input both cannot be None at the same time for this kind of model." + ), + ) + + if (signatures is not None) and (sample_input is not None): + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError("Signatures and sample_input both cannot be specified at the same time."), + ) + + if not options: + options = model_types.BaseModelSaveOption() + + handler = model_handler.find_handler(model) + if handler is None: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_TYPE, + original_exception=TypeError(f"{type(model)} is not supported."), + ) + with model_meta.create_model_metadata( + model_dir_path=self.local_dir_path, + name=name, + model_type=handler.HANDLER_TYPE, + metadata=metadata, + code_paths=code_paths, + signatures=signatures, + ext_modules=ext_modules, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + python_version=python_version, + **options, + ) as meta: + model_blobs_path = os.path.join(self.local_dir_path, ModelPackager.MODEL_BLOBS_DIR) + os.makedirs(model_blobs_path, exist_ok=True) + model = handler.cast_model(model) + handler.save_model( + name=name, + model=model, + model_meta=meta, + model_blobs_dir_path=model_blobs_path, + sample_input=sample_input, + is_sub_model=False, + **options, + ) + if signatures is None: + logging.info(f"Model signatures are auto inferred as:\n\n{meta.signatures}") + + self.model = model + self.meta = meta + + def load( + self, + *, + meta_only: bool = False, + as_custom_model: bool = False, + options: Optional[model_types.ModelLoadOption] = None, + ) -> None: + """Load the model into memory from directory. Used internal only. + + Args: + meta_only: Flag to indicate that if only load metadata. + as_custom_model: When set to True, It will try to convert the model as custom model after load. + options: Model loading options. + + Raises: + SnowflakeMLException: Raised if model is not native format. 
+ """ + + self.meta = model_meta.ModelMetadata.load(self.local_dir_path) + if meta_only: + return + + model_meta.load_code_path(self.local_dir_path) + + env_utils.validate_py_runtime_version(self.meta.env.python_version) + + handler = model_handler.load_handler(self.meta.model_type) + if handler is None: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_TYPE, + original_exception=TypeError(f"{self.meta.model_type} is not supported."), + ) + model_blobs_path = os.path.join(self.local_dir_path, ModelPackager.MODEL_BLOBS_DIR) + if options is None: + options = {} + + handler.try_upgrade(self.meta.name, self.meta, model_blobs_path) + m = handler.load_model(self.meta.name, self.meta, model_blobs_path, **options) + + if as_custom_model: + m = handler.convert_as_custom_model(m, self.meta, **options) + assert isinstance(m, custom_model.CustomModel) + + self.model = m diff --git a/snowflake/ml/model/_packager/model_packager_test.py b/snowflake/ml/model/_packager/model_packager_test.py new file mode 100644 index 00000000..60dbfbae --- /dev/null +++ b/snowflake/ml/model/_packager/model_packager_test.py @@ -0,0 +1,260 @@ +import importlib +import os +import sys +import tempfile + +import numpy as np +import pandas as pd +from absl.testing import absltest +from sklearn import datasets, linear_model + +from snowflake.ml._internal import file_utils +from snowflake.ml.model import custom_model, model_signature +from snowflake.ml.model._packager import model_packager +from snowflake.ml.modeling.linear_model import ( # type:ignore[attr-defined] + LinearRegression, +) +from snowflake.ml.test_utils import exception_utils + + +class DemoModelWithManyArtifacts(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + with open(os.path.join(context.path("bias"), "bias1"), encoding="utf-8") as f: + v1 = int(f.read()) + with open(os.path.join(context.path("bias"), "bias2"), encoding="utf-8") as f: + v2 = int(f.read()) + self.bias = v1 + v2 + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"output": input["c1"] + self.bias}) + + +class DemoModel(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"output": input["c1"]}) + + +PY_SRC = """\ +def get_name(): + return __name__ +def get_file(): + return __file__ +""" + + +class ModelLoadHygieneTest(absltest.TestCase): + def test_model_load_hygiene(self) -> None: + with tempfile.TemporaryDirectory() as workspace: + with tempfile.TemporaryDirectory() as src_path: + fake_mod_dirpath = os.path.join(src_path, "fake", "fake_module") + os.makedirs(fake_mod_dirpath) + + py_file_path = os.path.join(fake_mod_dirpath, "p.py") + with open(py_file_path, "w", encoding="utf-8") as f: + f.write(PY_SRC) + f.flush() + + sys.path.insert(0, src_path) + + from fake.fake_module import p + + self.assertEqual(p.__file__, py_file_path) + + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + + model_packager.ModelPackager(os.path.join(workspace, "model1")).save( + name="model1", + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + code_paths=[os.path.join(src_path, "fake")], + ) + + 
model_packager.ModelPackager(os.path.join(workspace, "model1")).load() + from fake.fake_module import p + + self.assertEqual(p.__file__, os.path.join(workspace, "model1", "code", "fake", "fake_module", "p.py")) + + importlib.reload(p) + self.assertEqual(p.__file__, py_file_path) + sys.path.remove(src_path) + + def test_model_save_validation(self) -> None: + with tempfile.TemporaryDirectory() as workspace: + with tempfile.TemporaryDirectory() as src_path: + fake_mod_dirpath = os.path.join(src_path, "snowflake", "fake_module") + os.makedirs(fake_mod_dirpath) + + py_file_path = os.path.join(fake_mod_dirpath, "p.py") + with open(py_file_path, "w", encoding="utf-8") as f: + f.write(PY_SRC) + f.flush() + + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + with self.assertRaises(ValueError): + model_packager.ModelPackager(os.path.join(workspace, "model1")).save( + name="model1", + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + code_paths=[os.path.join(src_path, "snowflake")], + ) + + with tempfile.TemporaryDirectory() as src_path: + py_file_path = os.path.join(src_path, "snowflake.py") + with open(py_file_path, "w", encoding="utf-8") as f: + f.write(PY_SRC) + f.flush() + + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + with self.assertRaises(ValueError): + model_packager.ModelPackager(os.path.join(workspace, "model1")).save( + name="model1", + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + code_paths=[py_file_path], + ) + + def test_zipimport_snowml(self) -> None: + snowml_path, snowml_start_path = file_utils.get_package_path("snowflake.ml", strategy="last") + with tempfile.TemporaryDirectory() as workspace: + zipped_snowml_path = os.path.join(workspace, "snowml.zip") + with open(zipped_snowml_path, "wb") as f: + with file_utils.zip_file_or_directory_to_stream(snowml_path, snowml_start_path) as zip_stream: + f.write(zip_stream.getbuffer()) + + sys.path.append(zipped_snowml_path) + try: + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + model_packager.ModelPackager(os.path.join(workspace, "model1")).save( + name="model1", + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + options={"embed_local_ml_library": True}, + ) + self.assertTrue( + os.path.exists( + os.path.join( + workspace, "model1", "code", "snowflake", "ml", "model", "_packager", "model_packager.py" + ) + ) + ) + finally: + sys.path.remove(zipped_snowml_path) + + +class ModelPackagerTest(absltest.TestCase): + def test_save_validation_1(self) -> None: + with tempfile.TemporaryDirectory() as workspace: + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + pk = model_packager.ModelPackager(os.path.join(workspace, "model1")) + + with exception_utils.assert_snowml_exceptions( + self, + expected_original_error_type=ValueError, + expected_regex="Signatures and sample_input both cannot be specified at the same time.", + ): + pk.save( + name="model1", + model=linear_model.LinearRegression(), + sample_input=d, + signatures={"predict": model_signature.ModelSignature(inputs=[], outputs=[])}, + ) + + with exception_utils.assert_snowml_exceptions( + self, + 
expected_original_error_type=ValueError, + expected_regex=( + "Signatures and sample_input both cannot be None at the same time for this kind of model." + ), + ): + pk.save( + name="model1", + model=linear_model.LinearRegression(), + ) + + def test_save_validation_2(self) -> None: + iris = datasets.load_iris() + + df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) + df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = LinearRegression(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + regr.fit(df) + + predictions = regr.predict(df[:1])[[OUTPUT_COLUMNS]] + + with tempfile.TemporaryDirectory() as tmpdir: + model_packager.ModelPackager(os.path.join(tmpdir, "model1")).save( + name="model1", + model=regr, + metadata={"author": "halu", "version": "1"}, + ) + + pk = model_packager.ModelPackager(os.path.join(tmpdir, "model1")) + pk.load() + assert pk.model + assert pk.meta + assert isinstance(pk.model, LinearRegression) + np.testing.assert_allclose(predictions, desired=pk.model.predict(df[:1])[[OUTPUT_COLUMNS]]) + + def test_bad_save_model(self) -> None: + with tempfile.TemporaryDirectory() as workspace: + os.mkdir(os.path.join(workspace, "bias")) + with open(os.path.join(workspace, "bias", "bias1"), "w", encoding="utf-8") as f: + f.write("25") + with open(os.path.join(workspace, "bias", "bias2"), "w", encoding="utf-8") as f: + f.write("68") + lm = DemoModelWithManyArtifacts( + custom_model.ModelContext(models={}, artifacts={"bias": os.path.join(workspace, "bias")}) + ) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + s = {"predict": model_signature.infer_signature(d, lm.predict(d))} + + with self.assertRaises(ValueError): + model_packager.ModelPackager(os.path.join(workspace, "model1")).save( + name="model1", + model=lm, + signatures={**s, "another_predict": s["predict"]}, + metadata={"author": "halu", "version": "1"}, + ) + + model_packager.ModelPackager(os.path.join(workspace, "model1")).save( + name="model1", + model=lm, + signatures=s, + metadata={"author": "halu", "version": "1"}, + python_version="3.5.2", + ) + + pk = model_packager.ModelPackager(os.path.join(workspace, "model1")) + pk.load(meta_only=True) + + with exception_utils.assert_snowml_exceptions(self, expected_original_error_type=RuntimeError): + pk = model_packager.ModelPackager(os.path.join(workspace, "model1")) + pk.load() + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/models/llm.py b/snowflake/ml/model/models/llm.py index 52488852..c6bf036e 100644 --- a/snowflake/ml/model/models/llm.py +++ b/snowflake/ml/model/models/llm.py @@ -29,9 +29,20 @@ class LLMOptions: revision: Optional[str] = field(default=None) token: Optional[str] = field(default=None) max_batch_size: int = field(default=1) + # TODO(halu): Debug raylet die issue. + # TP on vLLM is not supported yet. + enable_tp: bool = field(default=False) + # TODO(halu): Below could be per query call param instead. 
+ temperature: float = field(default=0.01) + top_p: float = field(default=1.0) + max_tokens: int = field(default=100) class LLM: + class Mode(Enum): + LOCAL_LORA = "local_lora" + REMOTE_PRETRAIN = "remote_pretrain" + def __init__( self, model_id_or_path: str, @@ -41,35 +52,55 @@ def __init__( """ Args: - model_id_or_path: Local dir to PEFT weights. + model_id_or_path: model_id or local dir to PEFT lora weights. options: Options for LLM. Defaults to be None. Raises: ValueError: When unsupported. """ - if not (os.path.isdir(model_id_or_path) and os.path.isfile(os.path.join(model_id_or_path, _PEFT_CONFIG_NAME))): - raise ValueError("Peft config is not found.") - import peft - import transformers - if not options: options = LLMOptions() - hub_kwargs = { "revision": options.revision, "token": options.token, } - peft_config = peft.PeftConfig.from_pretrained(model_id_or_path, **hub_kwargs) # type: ignore[attr-defined] - if peft_config.peft_type != peft.PeftType.LORA: # type: ignore[attr-defined] - raise ValueError("Only LORA is supported.") - if peft_config.task_type != peft.TaskType.CAUSAL_LM: # type: ignore[attr-defined] - raise ValueError("Only CAUSAL_LM is supported.") - base_model = peft_config.base_model_name_or_path - base_config = transformers.AutoConfig.from_pretrained(base_model, **hub_kwargs) - assert base_config.model_type in SupportedLLMType.valid_values(), f"{base_config.model_type} is not supported." + import transformers + + if os.path.isdir(model_id_or_path): + if not os.path.isfile(os.path.join(model_id_or_path, _PEFT_CONFIG_NAME)): + raise ValueError("Peft config is not found.") + + import peft + + peft_config = peft.PeftConfig.from_pretrained(model_id_or_path, **hub_kwargs) # type: ignore[attr-defined] + if peft_config.peft_type != peft.PeftType.LORA: # type: ignore[attr-defined] + raise ValueError("Only LORA is supported.") + if peft_config.task_type != peft.TaskType.CAUSAL_LM: # type: ignore[attr-defined] + raise ValueError("Only CAUSAL_LM is supported.") + base_model = peft_config.base_model_name_or_path + base_config = transformers.AutoConfig.from_pretrained(base_model, **hub_kwargs) + assert ( + base_config.model_type in SupportedLLMType.valid_values() + ), f"{base_config.model_type} is not supported." + self.mode = LLM.Mode.LOCAL_LORA + self.model_type = base_config.model_type + else: + # We support pre-train model as well + model_config = transformers.AutoConfig.from_pretrained( + model_id_or_path, + **hub_kwargs, + ) + assert ( + model_config.model_type in SupportedLLMType.valid_values() + ), f"{model_config.model_type} is not supported." 
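# A hedged usage sketch of the two modes selected above: a local directory must
# contain the PEFT config and is loaded as LoRA weights (LLM.Mode.LOCAL_LORA),
# while any other string is resolved as a pretrained checkpoint id
# (LLM.Mode.REMOTE_PRETRAIN). The model id, local path, and token below are
# purely illustrative; constructing LLM requires real weights or network access.
from snowflake.ml.model.models import llm

opts = llm.LLMOptions(token="<hf-token>", max_batch_size=2, temperature=0.01, top_p=1.0, max_tokens=100)
lora_model = llm.LLM("/path/to/peft_lora_output", options=opts)      # -> LLM.Mode.LOCAL_LORA
base_model = llm.LLM("meta-llama/Llama-2-7b-chat-hf", options=opts)  # -> LLM.Mode.REMOTE_PRETRAIN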
+ self.mode = LLM.Mode.REMOTE_PRETRAIN + self.model_type = model_config.model_type self.model_id_or_path = model_id_or_path self.token = options.token self.revision = options.revision self.max_batch_size = options.max_batch_size - self.model_type = base_config.model_type + self.temperature = options.temperature + self.top_p = options.top_p + self.max_tokens = options.max_tokens + self.enable_tp = options.enable_tp diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index 9c58e344..eef1a9e7 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -1,5 +1,5 @@ # mypy: disable-error-code="import" -from typing import TYPE_CHECKING, Sequence, TypedDict, TypeVar, Union +from typing import TYPE_CHECKING, Literal, Sequence, TypedDict, TypeVar, Union import numpy.typing as npt from typing_extensions import NotRequired, Required @@ -79,7 +79,7 @@ SupportedNoSignatureRequirementsModelType, ] """This is defined as the type that Snowflake native model packaging could accept. -Here is all acceptable types of Snowflake native model packaging and its handler file in _handlers/ folder. +Here is all acceptable types of Snowflake native model packaging and its handler file in _model_handlers/ folder. | Type | Handler File | Handler | |---------------------------------|--------------|---------------------| @@ -97,6 +97,18 @@ | huggingface_pipeline.HuggingFacePipelineModel | huggingface_pipeline.py | _HuggingFacePipelineHandler | """ +SupportedModelHandlerType = Literal[ + "custom", + "huggingface_pipeline", + "mlflow", + "pytorch", + "sklearn", + "snowml", + "tensorflow", + "torchscript", + "xgboost", + "llm", +] _ModelType = TypeVar("_ModelType", bound=SupportedModelType) @@ -113,7 +125,8 @@ class WarehouseDeployOptions(DeployOptions): permanent_udf_stage_location: A Snowflake stage option where the UDF should be persisted. If specified, the model will be deployed as a permanent UDF, otherwise temporary. - relax_version: Whether or not relax the version constraints of the dependencies if unresolvable. Defaults to False. + relax_version: Whether or not relax the version constraints of the dependencies if unresolvable. It detects any + ==x.y.z in specifiers and replaced with >=x.y, <(x+1). Defaults to False. replace_udf: Flag to indicate when deploying model as permanent UDF, whether overwriting existed UDF is allowed. Default to False. """ @@ -157,18 +170,16 @@ class SnowparkContainerServiceDeployOptions(DeployOptions): enable_remote_image_build: NotRequired[bool] force_image_build: NotRequired[bool] model_in_image: NotRequired[bool] + debug_mode: NotRequired[bool] class BaseModelSaveOption(TypedDict): """Options for saving the model. embed_local_ml_library: Embedding local SnowML into the code directory of the folder. - allow_overwritten_stage_file: Flag to indicate when saving the model as a stage file, whether overwriting existed - file is allowed. Default to False. 
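# A minimal sketch of the relax_version behaviour documented above: a pinned
# "==x.y.z" specifier is loosened to ">=x.y, <(x+1)". The helper name and the
# use of the `packaging` library here are illustrative, not the library's
# actual implementation.
from packaging.requirements import Requirement

def relax_pin(requirement: str) -> str:
    req = Requirement(requirement)
    specs = list(req.specifier)
    if len(specs) == 1 and specs[0].operator == "==":
        major, minor, *_ = specs[0].version.split(".")
        return f"{req.name}>={major}.{minor}, <{int(major) + 1}"
    return requirement

print(relax_pin("xgboost==1.7.3"))  # -> "xgboost>=1.7, <2"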
""" embed_local_ml_library: NotRequired[bool] - allow_overwritten_stage_file: NotRequired[bool] class CustomModelSaveOption(BaseModelSaveOption): @@ -214,6 +225,10 @@ class HuggingFaceSaveOptions(BaseModelSaveOption): cuda_version: NotRequired[str] +class LLMSaveOptions(BaseModelSaveOption): + cuda_version: NotRequired[str] + + ModelSaveOption = Union[ BaseModelSaveOption, CustomModelSaveOption, @@ -225,6 +240,7 @@ class HuggingFaceSaveOptions(BaseModelSaveOption): TensorflowSaveOptions, MLFlowSaveOptions, HuggingFaceSaveOptions, + LLMSaveOptions, ] diff --git a/snowflake/ml/modeling/_internal/snowpark_handlers.py b/snowflake/ml/modeling/_internal/snowpark_handlers.py index b78aded3..2ab5f328 100644 --- a/snowflake/ml/modeling/_internal/snowpark_handlers.py +++ b/snowflake/ml/modeling/_internal/snowpark_handlers.py @@ -1,7 +1,6 @@ import importlib import inspect import io -import json import os import posixpath import sys @@ -38,10 +37,9 @@ TempObjectType, random_name_for_temp_object, ) -from snowflake.snowpark.functions import col, pandas_udf, sproc +from snowflake.snowpark.functions import col, pandas_udf, sproc, udtf from snowflake.snowpark.stored_procedure import StoredProcedure from snowflake.snowpark.types import ( - FloatType, IntegerType, PandasSeries, StringType, @@ -50,6 +48,8 @@ VariantType, ) +cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path)) + _PROJECT = "ModelDevelopment" @@ -75,7 +75,6 @@ def fit_wrapper_function( ) -> str: import inspect import os - import tempfile import cloudpickle as cp import pandas as pd @@ -91,9 +90,7 @@ def fit_wrapper_function( df: pd.DataFrame = sp_df.to_pandas(statement_params=statement_params) df.columns = sp_df.columns - local_transform_file = tempfile.NamedTemporaryFile(delete=True) - local_transform_file_name = local_transform_file.name - local_transform_file.close() + local_transform_file_name = get_temp_file_path() session.file.get(stage_transform_file_name, local_transform_file_name, statement_params=statement_params) @@ -114,9 +111,7 @@ def fit_wrapper_function( estimator.fit(**args) - local_result_file = tempfile.NamedTemporaryFile(delete=True) - local_result_file_name = local_result_file.name - local_result_file.close() + local_result_file_name = get_temp_file_path() with open(local_result_file_name, mode="w+b") as local_result_file_obj: cp.dump(estimator, local_result_file_obj) @@ -635,7 +630,6 @@ def score_wrapper_sproc( ) -> float: import inspect import os - import tempfile import cloudpickle as cp @@ -648,10 +642,7 @@ def score_wrapper_sproc( df: pd.DataFrame = sp_df.to_pandas(statement_params=statement_params) df.columns = sp_df.columns - local_score_file = tempfile.NamedTemporaryFile(delete=True) - local_score_file_name = local_score_file.name - local_score_file.close() - + local_score_file_name = get_temp_file_path() session.file.get(stage_score_file_name, local_score_file_name, statement_params=statement_params) local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0]) @@ -733,47 +724,61 @@ def fit_search_snowpark( # Store GridSearchCV's refit variable. If user set it as False, we don't need to refit it again refit_bool = estimator.refit - # Create a temp file and dump the score to that file. + # Create a temp file and dump the estimator to that file. 
estimator_file_name = get_temp_file_path() with open(estimator_file_name, mode="w+b") as local_estimator_file_obj: # Set GridSearchCV refit as False and fit it again after retrieving the best param estimator.refit = False cp.dump(estimator, local_estimator_file_obj) stage_estimator_file_name = posixpath.join(temp_stage_name, os.path.basename(estimator_file_name)) - statement_params = telemetry.get_function_usage_statement_params( + sproc_statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, subproject=self._subproject, function_name=telemetry.get_statement_params_full_func_name( inspect.currentframe(), self.__class__.__name__ ), api_calls=[sproc], - custom_tags=dict([("autogen", False)]) if self._autogenerated else None, + custom_tags=dict([("autogen", True)]) if self._autogenerated else None, + ) + udtf_statement_params = telemetry.get_function_usage_statement_params( + project=_PROJECT, + subproject=self._subproject, + function_name=telemetry.get_statement_params_full_func_name( + inspect.currentframe(), self.__class__.__name__ + ), + api_calls=[udtf], + custom_tags=dict([("autogen", True)]) if self._autogenerated else None, ) - # Put locally serialized score on stage. + # Put locally serialized estimator on stage. put_result = session.file.put( estimator_file_name, temp_stage_name, auto_compress=False, overwrite=True, - statement_params=statement_params, ) estimator_location = put_result[0].target imports.append(f"@{temp_stage_name}/{estimator_location}") search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION) - random_table_name = random_name_for_temp_object(TempObjectType.TABLE) + + required_deps = dependencies + [ + "snowflake-snowpark-python<2", + "fastparquet<2023.11", + "pyarrow<14", + "cachetools<5", + ] @sproc( # type: ignore[misc] is_permanent=False, name=search_sproc_name, - packages=dependencies + ["snowflake-snowpark-python", "pyarrow", "fastparquet"], # type: ignore[arg-type] + packages=required_deps, # type: ignore[arg-type] replace=True, session=session, anonymous=True, imports=imports, # type: ignore[arg-type] - statement_params=statement_params, + statement_params=sproc_statement_params, ) def _distributed_search( session: Session, @@ -781,17 +786,17 @@ def _distributed_search( stage_estimator_file_name: str, input_cols: List[str], label_cols: List[str], - statement_params: Dict[str, str], ) -> str: import copy import os - import tempfile import time from typing import Iterator, List import cloudpickle as cp import pandas as pd import pyarrow.parquet as pq + from sklearn.metrics import check_scoring + from sklearn.metrics._scorer import _check_multimetric_scoring for import_name in udf_imports: importlib.import_module(import_name) @@ -810,11 +815,8 @@ def _distributed_search( X = df[input_cols] y = df[label_cols].squeeze() - local_estimator_file = tempfile.NamedTemporaryFile(delete=True) - local_estimator_file_name = local_estimator_file.name - local_estimator_file.close() - - session.file.get(stage_estimator_file_name, local_estimator_file_name, statement_params=statement_params) + local_estimator_file_name = get_temp_file_path() + session.file.get(stage_estimator_file_name, local_estimator_file_name) local_estimator_file_path = os.path.join( local_estimator_file_name, os.listdir(local_estimator_file_name)[0] @@ -824,15 +826,19 @@ def _distributed_search( cv_orig = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator)) indices = 
[test for _, test in cv_orig.split(X, y)] - indices_df = pd.DataFrame({"TEST": indices}) - indices_df = session.create_dataframe(indices_df) + local_indices_file_name = get_temp_file_path() + with open(local_indices_file_name, mode="w+b") as local_indices_file_obj: + cp.dump(indices, local_indices_file_obj) - remote_file_path = f"{temp_stage_name}/indices.parquet" - indices_df.write.copy_into_location( - remote_file_path, file_format_type="parquet", header=True, overwrite=True + # Put locally serialized indices on stage. + put_result = session.file.put( + local_indices_file_name, + temp_stage_name, + auto_compress=False, + overwrite=True, ) - imports.extend([f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}/indices").collect()]) - + indices_location = put_result[0].target + imports.append(f"@{temp_stage_name}/{indices_location}") indices_len = len(indices) assert estimator is not None @@ -870,16 +876,11 @@ def _load_data_into_udf() -> Tuple[ estimator = cp.load(local_estimator_file_obj) # load indices - indices_files = [ - filename - for filename in os.listdir(sys._xoptions["snowflake_import_directory"]) - if filename.startswith("indices") - ] - indices_partial_df = [ - pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas() - for file_name in indices_files - ] - indices = pd.concat(indices_partial_df, ignore_index=True) + local_indices_file_path = os.path.join( + sys._xoptions["snowflake_import_directory"], f"{indices_location}" + ) + with open(local_indices_file_path, mode="rb") as local_indices_file_obj: + indices = cp.load(local_indices_file_obj) argspec = inspect.getfullargspec(estimator.fit) args = {"X": df[input_cols]} @@ -902,13 +903,13 @@ def __init__(self) -> None: def process( self, params: List[dict], idx: int # type:ignore[type-arg] - ) -> Iterator[Tuple[float, str, str]]: + ) -> Iterator[Tuple[str]]: if hasattr(estimator, "param_grid"): self.estimator.param_grid = params else: self.estimator.param_distributions = params full_indices = np.array([i for i in range(self.data_length)]) - test_indice = json.loads(self.indices["TEST"][idx]) + test_indice = self.indices[idx] train_indice = np.setdiff1d(full_indices, test_indice) self.estimator.cv = [(train_indice, test_indice)] self.estimator.fit(**self.args) @@ -917,27 +918,21 @@ def process( cp.dump(self.estimator.cv_results_, f) f.seek(0) binary_cv_results = f.getvalue().hex() - yield (self.estimator.best_score_, json.dumps(self.estimator.best_params_), binary_cv_results) + yield (binary_cv_results,) def end_partition(self) -> None: ... 
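# A minimal sketch of the serialization round trip used by the SearchCV UDTF
# above and the driver code below: each fold's cv_results_ dict is cloudpickled
# and hex-encoded so it fits in the single string output column, then decoded
# and unpickled when the per-fold results are merged.
import cloudpickle as cp

cv_results = {"split0_test_score": [0.91], "mean_fit_time": [0.03]}
encoded = cp.dumps(cv_results).hex()         # inside process(): yield (encoded,)
decoded = cp.loads(bytes.fromhex(encoded))   # on the driver, per collected row
assert decoded == cv_results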
session.udtf.register( SearchCV, - output_schema=StructType( - [ - StructField("BEST_SCORE", FloatType()), - StructField("BEST_PARAMS", StringType()), - StructField("CV_RESULTS", StringType()), - ] - ), + output_schema=StructType([StructField("CV_RESULTS", StringType())]), input_types=[VariantType(), IntegerType()], name=random_udtf_name, - packages=dependencies + ["pyarrow", "fastparquet"], # type: ignore[arg-type] + packages=required_deps, # type: ignore[arg-type] replace=True, is_permanent=False, imports=imports, # type: ignore[arg-type] - statement_params=statement_params, + statement_params=udtf_statement_params, ) HP_TUNING = F.table_function(random_udtf_name) @@ -963,12 +958,10 @@ def end_partition(self) -> None: (HP_TUNING(df["PARAMS"], df["TRAIN_IND"]).over(partition_by=df["PARAM_INDEX"])), ) - results.write.saveAsTable(random_table_name, mode="overwrite", table_type="temporary") - table_result = session.table(random_table_name).sort(col("PARAM_INDEX")) - # cv_result maintains the original order + multimetric = False cv_results_ = dict() - for i, val in enumerate(table_result.select("CV_RESULTS").collect()): + for i, val in enumerate(results.select("CV_RESULTS").sort(col("PARAM_INDEX")).collect()): # retrieved string had one more double quote in the front and end of the string. # use [1:-1] to remove the extra double quotes hex_str = bytes.fromhex(val[0]) @@ -977,8 +970,12 @@ def end_partition(self) -> None: for k, v in each_cv_result.items(): cur_cv = i % idx_length key = k - if k == "split0_test_score": - key = f"split{cur_cv}_test_score" + if "split0_test" in k: + # For multi-metric evaluation, the scores for all the scorers are available in the + # cv_results_ dict at the keys ending with that scorer’s name ('_') + # instead of '_score'. + multimetric = True if k.split("_")[-1] != "score" else False + key = k.replace("split0_test", f"split{cur_cv}_test") elif k.startswith("param"): if cur_cv != 0: key = False @@ -1014,10 +1011,36 @@ def end_partition(self) -> None: cv_results_["rank_test_score"] = rankdata(-cv_results_["mean_test_score"], method="min") # best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared best_param_index = np.where(cv_results_["rank_test_score"] == 1)[0][0] - - estimator.best_params_ = cv_results_["params"][best_param_index] - estimator.best_score_ = cv_results_["mean_test_score"][best_param_index] estimator.cv_results_ = cv_results_ + estimator.multimetric_ = multimetric + + # Reconstruct the sklearn estimator. 
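# A small sketch of the per-fold key rewrite a few lines above: every UDTF
# partition fits a single CV fold, so its result keys always read split0_*; the
# driver renames them to split<fold>_* and treats any suffix other than "_score"
# (e.g. "split0_test_accuracy") as a sign of multi-metric scoring.
def rename_fold_key(key, fold):
    multimetric = False
    if "split0_test" in key:
        multimetric = key.split("_")[-1] != "score"
        key = key.replace("split0_test", f"split{fold}_test")
    return key, multimetric

print(rename_fold_key("split0_test_score", 2))     # ('split2_test_score', False)
print(rename_fold_key("split0_test_accuracy", 2))  # ('split2_test_accuracy', True)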
+ refit_metric = "score" + if callable(estimator.scoring): + scorers = estimator.scoring + elif estimator.scoring is None or isinstance(estimator.scoring, str): + scorers = check_scoring(estimator.estimator, estimator.scoring) + else: + scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring) + estimator._check_refit_for_multimetric(scorers) + refit_metric = estimator.refit + + estimator.scorer_ = scorers + + # check refit_metric now for a callabe scorer that is multimetric + if callable(estimator.scoring) and estimator.multimetric_: + refit_metric = estimator.refit + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if estimator.refit or not estimator.multimetric_: + estimator.best_index_ = estimator._select_best_index(estimator.refit, refit_metric, cv_results_) + if not callable(estimator.refit): + # With a non-custom callable, we can select the best score + # based on the best index + estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_] + estimator.best_params_ = cv_results_["params"][best_param_index] if refit_bool: estimator.best_estimator_ = copy.deepcopy( @@ -1040,9 +1063,10 @@ def end_partition(self) -> None: refit_end_time = time.time() estimator.refit_time_ = refit_end_time - refit_start_time - local_result_file = tempfile.NamedTemporaryFile(delete=True) - local_result_file_name = local_result_file.name - local_result_file.close() + if hasattr(estimator.best_estimator_, "feature_names_in_"): + estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_ + + local_result_file_name = get_temp_file_path() with open(local_result_file_name, mode="w+b") as local_result_file_obj: cp.dump(estimator, local_result_file_obj) @@ -1052,7 +1076,6 @@ def end_partition(self) -> None: temp_stage_name, auto_compress=False, overwrite=True, - statement_params=statement_params, ) # Note: you can add something like + "|" + str(df) to the return string @@ -1065,14 +1088,12 @@ def end_partition(self) -> None: stage_estimator_file_name, input_cols, label_cols, - statement_params, ) local_estimator_path = get_temp_file_path() session.file.get( posixpath.join(temp_stage_name, sproc_export_file_name), local_estimator_path, - statement_params=statement_params, ) with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj: diff --git a/snowflake/ml/modeling/metrics/classification.py b/snowflake/ml/modeling/metrics/classification.py index 4bca6c2c..7efe3425 100644 --- a/snowflake/ml/modeling/metrics/classification.py +++ b/snowflake/ml/modeling/metrics/classification.py @@ -1,5 +1,6 @@ import inspect import json +import math import warnings from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -518,6 +519,24 @@ def log_loss( y_true = y_true_col_names if isinstance(y_true_col_names, list) else [y_true_col_names] y_pred = y_pred_col_names if isinstance(y_pred_col_names, list) else [y_pred_col_names] + # If it is binary classification, use SQL because it is faster. 
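# A minimal numpy check of what the Snowpark expressions below compute on the
# binary fast path (illustration only, not the library code): probabilities are
# clipped to [eps, 1 - eps] with eps = machine epsilon, the per-sample loss is
# -(y*log(p) + (1-y)*log(1-p)), and weighted_sum then averages or sums it.
import numpy as np

def binary_log_loss(y_true, y_pred, normalize=True):
    eps = float(np.finfo(float).eps)
    p = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    y = np.asarray(y_true, dtype=float)
    losses = -(y * np.log(p) + (1 - y) * np.log(1 - p))
    return losses.mean() if normalize else losses.sum()

print(binary_log_loss([1, 0, 1], [0.9, 0.2, 0.7]))  # ~0.2284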
+ if len(y_pred) == 1 and eps == "auto": + metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names) + eps = float(np.finfo(float).eps) + y_true_col = y_true[0] + y_pred_col = y_pred[0] + y_pred_eps_min = F.iff(df[y_pred_col] < (1 - eps), df[y_pred_col], 1 - eps) + y_pred_eps = F.iff(y_pred_eps_min > eps, y_pred_eps_min, eps) + neg_loss_column = F.iff(df[y_true_col] == 1, F.log(math.e, y_pred_eps), F.log(math.e, 1 - y_pred_eps)) + loss_column = F.negate(neg_loss_column) + return metrics_utils.weighted_sum( + df=df, + sample_score_column=loss_column, + sample_weight_column=df[sample_weight_col_name] if sample_weight_col_name else None, + normalize=normalize, + statement_params=statement_params, + ) + # Since we are processing samples individually, we need to explicitly specify the output labels # in the case that there is one output label. if len(y_true) == 1 and not labels: diff --git a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py index 00a1283d..bf7c9e47 100644 --- a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py +++ b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py @@ -5,9 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Union from uuid import uuid4 -import cachetools -import cloudpickle as cp -import fsspec import numpy as np import pandas as pd import sklearn.model_selection @@ -34,7 +31,7 @@ validate_sklearn_args, ) from snowflake.ml.modeling._internal.snowpark_handlers import ( - SklearnWrapperProvider, + SklearnModelSelectionWrapperProvider, SnowparkHandlers as HandlersImpl, ) from snowflake.ml.modeling.framework.base import BaseTransformer @@ -231,13 +228,7 @@ def __init__( # type: ignore[no-untyped-def] sample_weight_col: Optional[str] = None, ) -> None: super().__init__() - deps: Set[str] = { - f"numpy=={np.__version__}", - f"scikit-learn=={sklearn.__version__}", - f"cloudpickle=={cp.__version__}", - f"cachetools=={cachetools.__version__}", # type: ignore[attr-defined] - f"fsspec=={fsspec.__version__}", - } + deps: Set[str] = set(SklearnModelSelectionWrapperProvider().dependencies) deps = deps | gather_dependencies(estimator) self._deps = list(deps) estimator = transform_snowml_obj_to_sklearn_obj(estimator) @@ -264,7 +255,9 @@ def __init__( # type: ignore[no-untyped-def] self.set_drop_input_cols(drop_input_cols) self.set_sample_weight_col(sample_weight_col) self._handlers: CVHandlers = HandlersImpl( - class_name=self.__class__.__name__, subproject=_SUBPROJECT, wrapper_provider=SklearnWrapperProvider() + class_name=self.__class__.__name__, + subproject=_SUBPROJECT, + wrapper_provider=SklearnModelSelectionWrapperProvider(), ) def _get_rand_id(self) -> str: @@ -301,7 +294,6 @@ def _get_active_columns(self) -> List[str]: @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GridSearchCV": """Run fit with all sets of parameters @@ -536,12 +528,10 @@ def _sklearn_inference( @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: """Call predict on the estimator with the best found parameters @@ -584,12 +574,10 @@ def 
predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, p @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: """Call transform on the estimator with the best found parameters @@ -662,12 +650,10 @@ def _get_output_column_names(self, output_cols_prefix: str) -> List[str]: @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def predict_proba( self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_proba_" @@ -705,12 +691,10 @@ def predict_proba( @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def predict_log_proba( self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_log_proba_" @@ -749,12 +733,10 @@ def predict_log_proba( @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def decision_function( self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "decision_function_" @@ -810,7 +792,7 @@ def score(self, dataset: Union[DataFrame, pd.DataFrame]) -> float: return output_score def _score_snowpark(self, dataset: DataFrame) -> float: - # Specify input columns so column pruing will be enforced + # Specify input columns so column pruning will be enforced selected_cols = self._get_active_columns() if len(selected_cols) > 0: dataset = dataset.select(selected_cols) diff --git a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py index 1849c881..f92fe2c9 100644 --- a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py +++ b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py @@ -1,9 +1,6 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Union from uuid import uuid4 -import cachetools -import cloudpickle as cp -import fsspec import numpy as np import pandas as pd import sklearn @@ -31,7 +28,7 @@ validate_sklearn_args, ) from snowflake.ml.modeling._internal.snowpark_handlers import ( - SklearnWrapperProvider, + SklearnModelSelectionWrapperProvider, SnowparkHandlers as HandlersImpl, ) from snowflake.ml.modeling.framework.base import BaseTransformer @@ -241,13 +238,7 @@ def __init__( # type: ignore[no-untyped-def] sample_weight_col: Optional[str] = None, ) -> None: super().__init__() - deps: Set[str] = { - f"numpy=={np.__version__}", - f"scikit-learn=={sklearn.__version__}", - f"cloudpickle=={cp.__version__}", - f"cachetools=={cachetools.__version__}", # type: ignore[attr-defined] - f"fsspec=={fsspec.__version__}", - } + deps: Set[str] = set(SklearnModelSelectionWrapperProvider().dependencies) deps = deps | gather_dependencies(estimator) self._deps = list(deps) estimator = transform_snowml_obj_to_sklearn_obj(estimator) @@ 
-276,7 +267,9 @@ def __init__( # type: ignore[no-untyped-def] self.set_drop_input_cols(drop_input_cols) self.set_sample_weight_col(sample_weight_col) self._handlers: CVHandlers = HandlersImpl( - class_name=self.__class__.__name__, subproject=_SUBPROJECT, wrapper_provider=SklearnWrapperProvider() + class_name=self.__class__.__name__, + subproject=_SUBPROJECT, + wrapper_provider=SklearnModelSelectionWrapperProvider(), ) def _get_rand_id(self) -> str: @@ -313,7 +306,6 @@ def _get_active_columns(self) -> List[str]: @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "RandomizedSearchCV": """Run fit with all sets of parameters @@ -552,12 +544,10 @@ def _sklearn_inference( @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: """Call predict on the estimator with the best found parameters @@ -599,12 +589,10 @@ def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, p @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: """Call transform on the estimator with the best found parameters @@ -677,12 +665,10 @@ def _get_output_column_names(self, output_cols_prefix: str) -> List[str]: @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def predict_proba( self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_proba_" @@ -720,12 +706,10 @@ def predict_proba( @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def predict_log_proba( self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_log_proba_" @@ -764,12 +748,10 @@ def predict_log_proba( @telemetry.send_api_usage_telemetry( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) @telemetry.add_stmt_params_to_df( project=_PROJECT, subproject=_SUBPROJECT, - custom_tags=dict([("autogen", True)]), ) def decision_function( self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "decision_function_" @@ -825,7 +807,7 @@ def score(self, dataset: Union[DataFrame, pd.DataFrame]) -> float: return output_score def _score_snowpark(self, dataset: DataFrame) -> float: - # Specify input columns so column pruing will be enforced + # Specify input columns so column pruning will be enforced selected_cols = self._get_active_columns() if len(selected_cols) > 0: dataset = dataset.select(selected_cols) diff --git a/snowflake/ml/modeling/preprocessing/ordinal_encoder.py b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py index 12615e3a..760715a8 100644 --- 
a/snowflake/ml/modeling/preprocessing/ordinal_encoder.py +++ b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import numbers import uuid -from typing import Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union import numpy as np import pandas as pd @@ -13,10 +13,12 @@ from snowflake.ml._internal.utils import identifier from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F, types as T +from snowflake.snowpark._internal import utils as snowpark_utils _COLUMN_NAME = "_COLUMN_NAME" _CATEGORY = "_CATEGORY" _INDEX = "_INDEX" +_COLUMN_BATCH_SIZE = 20 # constants used to validate the compatibility of the kwargs passed to the sklearn # transformer with the sklearn version @@ -123,7 +125,7 @@ def __init__( self._categories_list: List[type_utils.LiteralNDArrayType] = [] self._missing_indices: Dict[int, int] = {} self._infrequent_enabled = False - self._vocab_table_name = "snowml_preprocessing_ordinal_encoder_temp_table_" + uuid.uuid4().hex + self._vocab_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE) self.set_input_cols(input_cols) self.set_output_cols(output_cols) @@ -472,30 +474,46 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame suffix = "_" + uuid.uuid4().hex.upper() transformed_dataset = dataset - for idx, input_col in enumerate(self.input_cols): - output_col = self.output_cols[idx] - input_col_state_df = state_df.filter(F.col(_COLUMN_NAME) == input_col)[ - [_CATEGORY, _INDEX] - ].with_column_renamed(_INDEX, output_col) - - # index values through a join operation over dataset and its states - # In case of inplace transform, origin column name adds suffix (lsuffix=suffix) - transformed_dataset = ( - transformed_dataset.join( - input_col_state_df, - on=transformed_dataset[input_col].cast(T.StringType()).equal_null(input_col_state_df[_CATEGORY]), - how="left", - lsuffix=suffix, + for batch_start in range(0, len(self.input_cols), _COLUMN_BATCH_SIZE): + batch_end = min(batch_start + _COLUMN_BATCH_SIZE, len(self.input_cols)) + batch_input_cols = self.input_cols[batch_start:batch_end] + batch_output_cols = self.output_cols[batch_start:batch_end] + + for input_col, output_col in zip(batch_input_cols, batch_output_cols): + input_col_state_df = state_df.filter(F.col(_COLUMN_NAME) == input_col)[ + [_CATEGORY, _INDEX] + ].with_column_renamed(_INDEX, output_col) + + # index values through a join operation over dataset and its states + # In case of inplace transform, origin column name adds suffix (lsuffix=suffix) + transformed_dataset = ( + transformed_dataset.join( + input_col_state_df, + on=transformed_dataset[input_col] + .cast(T.StringType()) + .equal_null(input_col_state_df[_CATEGORY]), + how="left", + lsuffix=suffix, + ) + .drop(_CATEGORY) + .drop(identifier.concat_names([input_col, suffix])) ) - .drop(_CATEGORY) - .drop(identifier.concat_names([input_col, suffix])) - ) - # in case of duplicate column, filter them - output_cols = transformed_dataset.columns - if output_col not in output_cols: - output_cols.append(output_col) - transformed_dataset = transformed_dataset[output_cols] + # in case of duplicate column, filter them + output_cols = transformed_dataset.columns + if output_col not in output_cols: + output_cols.append(output_col) + transformed_dataset = transformed_dataset[output_cols] + + batch_table_name = 
snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE) + transformed_dataset.write.save_as_table( # type: ignore[call-overload] + batch_table_name, + mode="overwrite", + table_type="temporary", + statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__), + ) + assert transformed_dataset._session is not None + transformed_dataset = transformed_dataset._session.table(batch_table_name) if _CATEGORY + suffix in transformed_dataset.columns: transformed_dataset = transformed_dataset.with_column_renamed(F.col(_CATEGORY + suffix), _CATEGORY) @@ -589,52 +607,106 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) Returns: Transformed dataset with unknown values handled. + """ + if self.handle_unknown == "error": + # batch columns to avoid query compilation OOM + self._check_unknown( + transformed_dataset, + batch=len(self.input_cols) > _COLUMN_BATCH_SIZE, + statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__), + ) + + if self.handle_unknown == "use_encoded_value": + # left outer join has already filled unknown values with null + if not (self.unknown_value is None or sklearn_utils.is_scalar_nan(self.unknown_value)): + transformed_dataset = transformed_dataset.na.fill(self.unknown_value, self.output_cols) + + return transformed_dataset + + def _check_unknown( + self, + dataset: snowpark.DataFrame, + statement_params: Dict[str, Any], + batch: bool = False, + ) -> None: + """ + Check if there are unknown values in the output of the given dataset. + + Args: + dataset: Dataset to check. + statement_params: Statement parameters for telemetry tracking. + batch: Whether to batch the dataset. Raises: - SnowflakeMLException: If `self.handle_unknown="error"` and unknown values exist in the - transformed dataset. + SnowflakeMLException: If unknown values exist in the output of the given dataset. """ - if self.handle_unknown == "error": + + def create_unknown_df( + dataset: snowpark.DataFrame, + input_cols: List[str], + output_cols: List[str], + ) -> snowpark.DataFrame: # dataframe with unknown values # columns: COLUMN_NAME, UNKNOWN_VALUE unknown_df: Optional[snowpark.DataFrame] = None - for idx, input_col in enumerate(self.input_cols): - output_col = self.output_cols[idx] + for input_col, output_col in zip(input_cols, output_cols): unknown_columns = [ F.lit(input_col), F.col(input_col), ] temp_df = ( - transformed_dataset[list({input_col, output_col})] + dataset[list({input_col, output_col})] .distinct() .filter(F.col(output_col).is_null()) .select(unknown_columns) .to_df(["COLUMN_NAME", "UNKNOWN_VALUE"]) ) unknown_df = unknown_df.union_by_name(temp_df) if unknown_df is not None else temp_df - - if unknown_df is None: - raise exceptions.SnowflakeMLException( - error_code=error_codes.INTERNAL_PYTHON_ERROR, - original_exception=ValueError( - "Internal error caused by handle_unknown='error': empty input columns." - ), + assert unknown_df is not None, "Internal error by handle_unknown='error': Empty input columns." 
+ return unknown_df + + unknown_pandas_list = [] + if batch: + batch_writes = [] + for batch_start in range(0, len(self.input_cols), _COLUMN_BATCH_SIZE): + batch_end = min(batch_start + _COLUMN_BATCH_SIZE, len(self.input_cols)) + batch_input_cols = self.input_cols[batch_start:batch_end] + batch_output_cols = self.output_cols[batch_start:batch_end] + batch_dataset = dataset[list(set(batch_input_cols + batch_output_cols))] + batch_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE) + job = batch_dataset.write.save_as_table( + batch_table_name, + mode="overwrite", + table_type="temporary", + block=False, + statement_params=statement_params, ) - - unknown_pandas = unknown_df.to_pandas( - statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__) - ) - if not unknown_pandas.empty: - raise exceptions.SnowflakeMLException( - error_code=error_codes.INVALID_DATA, - original_exception=ValueError( - f"Found unknown categories during transform:\n{unknown_pandas.to_string()}" - ), + batch_writes.append((job, batch_table_name, batch_input_cols, batch_output_cols)) + + to_pandas_async_jobs = [] + for job, batch_table_name, batch_input_cols, batch_output_cols in batch_writes: + job.result(result_type="no_result") + assert dataset._session is not None + unknown_df = create_unknown_df( + dataset._session.table(batch_table_name), batch_input_cols, batch_output_cols ) + job = unknown_df.to_pandas(block=False, statement_params=statement_params) + to_pandas_async_jobs.append(job) - if self.handle_unknown == "use_encoded_value": - # left outer join has already filled unknown values with null - if not (self.unknown_value is None or sklearn_utils.is_scalar_nan(self.unknown_value)): - transformed_dataset = transformed_dataset.na.fill(self.unknown_value, self.output_cols) + for job in to_pandas_async_jobs: + unknown_pandas = job.result(result_type="pandas") + if not unknown_pandas.empty: + unknown_pandas_list.append(unknown_pandas) + else: + unknown_df = create_unknown_df(dataset, self.input_cols, self.output_cols) + unknown_pandas = unknown_df.to_pandas(statement_params=statement_params) + if not unknown_pandas.empty: + unknown_pandas_list.append(unknown_pandas) - return transformed_dataset + if unknown_pandas_list: + concat_unknown_pandas = pd.concat(unknown_pandas_list, ignore_index=True) + msg = f"Found unknown categories during transform:\n{concat_unknown_pandas.to_string()}" + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError(msg), + ) diff --git a/snowflake/ml/registry/BUILD.bazel b/snowflake/ml/registry/BUILD.bazel index 479176b3..7d57ca50 100644 --- a/snowflake/ml/registry/BUILD.bazel +++ b/snowflake/ml/registry/BUILD.bazel @@ -18,8 +18,7 @@ py_library( "//snowflake/ml/_internal/utils:table_manager", "//snowflake/ml/_internal/utils:uri", "//snowflake/ml/dataset", - "//snowflake/ml/model:_deployer", - "//snowflake/ml/model:_model", + "//snowflake/ml/model:_api", "//snowflake/ml/model:deploy_platforms", "//snowflake/ml/modeling/framework", ], diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index c5c52257..71bd92d5 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -1,8 +1,7 @@ import inspect import json -import os -import posixpath import sys +import textwrap import types from typing import ( TYPE_CHECKING, @@ -30,8 +29,7 @@ ) from snowflake.ml.dataset import dataset 
from snowflake.ml.model import ( - _deployer, - _model as model_api, + _api as model_api, deploy_platforms, model_signature, type_hints as model_types, @@ -773,7 +771,7 @@ def _get_model_path( raise connector.DataError(f"No files in model artifact for id {id} located at {model_uri}.") if len(model_file_list) > 1: raise NotImplementedError("Restoring models consisting of multiple files is currently not supported.") - return f"{self._fully_qualified_schema_name()}.{model_file_list[0].name}" + return f"{_STAGE_PREFIX}{model_stage_path}" def _log_model_path( self, @@ -876,6 +874,21 @@ def _register_model_with_id( else: raise connector.DatabaseError("Failed to insert the model properties to the registry table.") + def _get_deployment(self, *, model_name: str, model_version: str, deployment_name: str) -> snowpark.Row: + statement_params = self._get_statement_params(inspect.currentframe()) + deployment_lst = ( + self._session.sql(f"SELECT * FROM {self._fully_qualified_permanent_deployment_view_name()}") + .filter(snowpark.Column("DEPLOYMENT_NAME") == deployment_name) + .filter(snowpark.Column("MODEL_NAME") == model_name) + .filter(snowpark.Column("MODEL_VERSION") == model_version) + ).collect(statement_params=statement_params) + if len(deployment_lst) == 0: + raise KeyError( + f"Unable to find deployment named {deployment_name} in the model {model_name}/{model_version}." + ) + assert len(deployment_lst) == 1, "_get_deployment should return exactly 1 deployment" + return cast(snowpark.Row, deployment_lst[0]) + # Registry operations @telemetry.send_api_usage_telemetry( @@ -1384,13 +1397,13 @@ def log_model( model_name=model_name, model_version=model_version, ) - model_stage_file_path = posixpath.join(f"{_STAGE_PREFIX}{fully_qualified_model_stage_name}", f"{model_id}.zip") + stage_path = f"{_STAGE_PREFIX}{fully_qualified_model_stage_name}" model = cast(model_types.SupportedModelType, model) try: - model_metadata = model_api.save_model( # type: ignore[call-overload, misc] + module_model = model_api.save_model( # type: ignore[call-overload, misc] name=model_name, session=self._session, - model_stage_file_path=model_stage_file_path, + stage_path=stage_path, model=model, signatures=signatures, metadata=tags, @@ -1411,8 +1424,8 @@ def log_model( model_name=model_name, model_version=model_version, model_id=model_id, - type=model_metadata.model_type, - uri=uri.get_uri_from_snowflake_stage_path(model_stage_file_path), + type=module_model.packager.meta.model_type, + uri=uri.get_uri_from_snowflake_stage_path(stage_path), description=description, tags=tags, artifacts=artifacts, @@ -1438,9 +1451,9 @@ def load_model(self, model_name: str, model_version: str) -> Any: remote_model_path = self._get_model_path(model_name=model_name, model_version=model_version) restored_model = None - restored_model, _ = model_api.load_model(session=self._session, model_stage_file_path=remote_model_path) + restored_model = model_api.load_model(session=self._session, stage_path=remote_model_path) - return restored_model + return restored_model.packager.model # Repository Operations @@ -1480,6 +1493,7 @@ def deploy( Raises: RuntimeError: Raised when parameters are not properly enabled when deploying to Warehouse with temporary UDF + RuntimeError: Raised when deploying to SPCS with db/schema that starts with underscore. 
""" statement_params = self._get_statement_params(inspect.currentframe()) self._svm.validate_schema_version(statement_params) @@ -1490,6 +1504,24 @@ def deploy( deployment_stage_path = "" if platform == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES: + if self._name.startswith("_") or self._schema.startswith("_"): + error_message = """\ + Model deployment to Snowpark Container Service does not support a database/schema name that starts with + an underscore. Please ensure you pass in a valid db/schema name when initializing the registry with: + + model_registry.create_model_registry( + session=session, + database_name=db, + schema_name=schema + ) + + registry = model_registry.ModelRegistry( + session=session, + database_name=db, + schema_name=schema + ) + """ + raise RuntimeError(textwrap.dedent(error_message)) permanent = True options = cast(model_types.SnowparkContainerServiceDeployOptions, options) deployment_stage_path = f"{self._prepare_deployment_stage()}/{deployment_name}/" @@ -1506,7 +1538,7 @@ def deploy( ) options["permanent_udf_stage_location"] = deployment_stage_path - remote_model_path = "@" + self._get_model_path(model_name=model_name, model_version=model_version) + remote_model_path = self._get_model_path(model_name=model_name, model_version=model_version) model_id = self._get_model_id(model_name, model_version) # https://snowflakecomputing.atlassian.net/browse/SNOW-858376 @@ -1527,15 +1559,15 @@ def deploy( "Temporary deployment to the warehouse is currently not supported. Please use " "permanent deployment by setting the 'permanent' parameter to True" ) - remote_model_path = f"{unencrypted_stage}/{os.path.basename(remote_model_path)}" + remote_model_path = unencrypted_stage # Step 1: Deploy to get the UDF - deployment_info = _deployer.deploy( + deployment_info = model_api.deploy( session=self._session, name=self._fully_qualified_deployment_name(deployment_name), platform=platform, target_method=target_method, - model_stage_file_path=remote_model_path, + stage_path=remote_model_path, deployment_stage_path=deployment_stage_path, model_id=model_id, options=options, @@ -1703,20 +1735,10 @@ def delete_deployment(self, model_name: str, model_version: str, *, deployment_n model_version: Model Version string. deployment_name: Name of the deployment that is getting deleted. - Raises: - KeyError: Raised if the target deployment is not found. """ - deployment = ( - self._session.sql(f"SELECT * FROM {self._fully_qualified_permanent_deployment_view_name()}") - .filter(snowpark.Column("DEPLOYMENT_NAME") == deployment_name) - .filter(snowpark.Column("MODEL_NAME") == model_name) - .filter(snowpark.Column("MODEL_VERSION") == model_version) - ).collect() - if len(deployment) == 0: - raise KeyError( - f"Unable to find deployment named {deployment_name} in the model {model_name}/{model_version}." - ) - deployment = deployment[0] + deployment = self._get_deployment( + model_name=model_name, model_version=model_version, deployment_name=deployment_name + ) # TODO(SNOW-759526): The following sequence should be a transaction. # Step 1: Drop the UDF @@ -1745,7 +1767,9 @@ def delete_deployment(self, model_name: str, model_version: str, *, deployment_n # Optional Step 5: Delete Snowpark container service. 
if deployment["TARGET_PLATFORM"] == deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES.value: - service_name = f"service_{deployment['MODEL_ID']}" + service_name = identifier.get_schema_level_object_identifier( + self._name, self._schema, f"service_{deployment['MODEL_ID']}" + ) query_result_checker.SqlResultValidator( self._session, f"DROP SERVICE IF EXISTS {service_name}", @@ -1966,7 +1990,7 @@ def predict(self, deployment_name: str, data: Any) -> "pd.DataFrame": self._registry._svm.validate_schema_version(statement_params) if di: - return _deployer.predict( + return model_api.predict( session=self._registry._session, deployment=di, X=data, statement_params=statement_params ) @@ -1996,7 +2020,7 @@ def predict(self, deployment_name: str, data: Any) -> "pd.DataFrame": signature=signature, options=options, ) - return _deployer.predict( + return model_api.predict( session=self._registry._session, deployment=di, X=data, statement_params=statement_params ) except KeyError: diff --git a/snowflake/ml/registry/model_registry_test.py b/snowflake/ml/registry/model_registry_test.py index 2487b95e..5a63380b 100644 --- a/snowflake/ml/registry/model_registry_test.py +++ b/snowflake/ml/registry/model_registry_test.py @@ -1,7 +1,6 @@ import datetime import itertools import json -import posixpath from typing import Any, Dict, List, Union, cast from absl.testing import absltest @@ -9,7 +8,7 @@ from snowflake import connector, snowpark from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils import formatting, identifier, uri -from snowflake.ml.model import _model +from snowflake.ml.model import _api from snowflake.ml.registry import _initial_schema, _schema, model_registry from snowflake.ml.test_utils import mock_data_frame, mock_session @@ -1116,8 +1115,7 @@ def test_log_model(self) -> None: + "." 
+ f"SNOWML_MODEL_{expected_stage_postfix}" ) - model_path = posixpath.join(f"@{expected_stage_path}", f"{self.model_id}.zip") - + model_path = f"@{expected_stage_path}" with absltest.mock.patch.object( model_registry, "_list_selected_models", @@ -1130,9 +1128,11 @@ def test_log_model(self) -> None: ) as mock_path: mock_model = absltest.mock.MagicMock() mock_type = absltest.mock.MagicMock() - mock_metadata = absltest.mock.MagicMock(model_type=mock_type) + mock_module_model = absltest.mock.MagicMock( + packager=absltest.mock.MagicMock(meta=absltest.mock.MagicMock(model_type=mock_type)) + ) with absltest.mock.patch.object( - target=_model, attribute="save_model", return_value=mock_metadata + target=_api, attribute="save_model", return_value=mock_module_model ) as mock_save: with absltest.mock.patch.object( target=model_registry, attribute="_register_model_with_id", return_value=None @@ -1151,7 +1151,7 @@ def test_log_model(self) -> None: mock_save.assert_called_once_with( name=model_name, session=self._session, - model_stage_file_path=model_path, + stage_path=model_path, model=mock_model, signatures=m_signatures, metadata=None, @@ -1211,8 +1211,7 @@ def test_log_model(self) -> None: ) as mock_path: mock_model = absltest.mock.MagicMock() mock_type = absltest.mock.MagicMock() - mock_metadata = absltest.mock.MagicMock(model_type=mock_type) - with absltest.mock.patch.object(target=_model, attribute="save_model") as mock_save: + with absltest.mock.patch.object(target=_api, attribute="save_model") as mock_save: mock_save.side_effect = ValueError("Mock Error") with self.assertRaises(ValueError): model_registry.log_model( diff --git a/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb index bdd99926..66669f72 100644 --- a/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb +++ b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "id": "18a75d71", "metadata": {}, "outputs": [], @@ -71,10 +71,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "id": "58dd3604", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. \n" + ] + } + ], "source": [ "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", "from snowflake.snowpark import Session\n", @@ -84,10 +92,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "27dfbc42", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:absl:The database SHULIN_DB_TEST already exists. Skipping creation.\n" + ] + } + ], "source": [ "from snowflake.ml.registry import model_registry\n", "from snowflake.ml._internal.utils import identifier\n", @@ -110,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "id": "574e7a43", "metadata": {}, "outputs": [ @@ -121,16 +138,6 @@ "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. 
\n", "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. \n" ] - }, - { - "data": { - "text/plain": [ - "'0aa236602be711ee89915ac3f3b698e1'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -156,7 +163,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, + "id": "64d286fb-bc80-4ce4-85e6-bcc46b38ce7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Optionally enable INFO log level to show more logging during model deployment.\n", + "import logging\n", + "logging.basicConfig()\n", + "logging.getLogger().setLevel(logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "id": "72ff114f", "metadata": {}, "outputs": [ @@ -164,16 +184,47 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", - "WARNING:root:Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/42374efe274011eea4ff5ac3f3b698e1:latest' in the options field of the deploy() function\n" + "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Similar environment detected. Using existing image sfengineering-mlplatformtest.registry.snowflakecomputing.com/shulin_db_test/shulin_db_schema/snowml_repo/d3e4770e34443205e0d53f9f84c602ba7fc2876b:latest to skip image build. To disable this feature, set 'force_image_build=True' in deployment options\n", + "WARNING:snowflake.ml.model._deploy_client.utils.snowservice_client:Best-effort log streaming from SPCS will be enabled when python logging level is set to INFO.Alternatively, you can also query the logs by running the query 'CALL SYSTEM$GET_SERVICE_LOGS('SHULIN_DB_TEST.SHULIN_DB_SCHEMA.service_919991487d4211ee92415ac3f3b698df', '0', 'inference-server')'\n" ] + }, + { + "data": { + "text/plain": [ + "{'name': 'SHULIN_DB_TEST.SHULIN_DB_SCHEMA.LOGISTIC_FUNC',\n", + " 'platform': ,\n", + " 'target_method': 'predict',\n", + " 'signature': ModelSignature(\n", + " inputs=[\n", + " FeatureSpec(dtype=DataType.DOUBLE, name='SEPALLENGTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='SEPALWIDTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='PETALLENGTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='PETALWIDTH')\n", + " ],\n", + " outputs=[\n", + " FeatureSpec(dtype=DataType.DOUBLE, name='SEPALLENGTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='SEPALWIDTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='PETALLENGTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='PETALWIDTH'),\n", + " \t\tFeatureSpec(dtype=DataType.DOUBLE, name='PREDICTED_TARGET')\n", + " ]\n", + " ),\n", + " 'options': {'compute_pool': 'REGTEST_INFERENCE_CPU_POOL'},\n", + " 'details': {'image_name': 'sfengineering-mlplatformtest.registry.snowflakecomputing.com/shulin_db_test/shulin_db_schema/snowml_repo/d3e4770e34443205e0d53f9f84c602ba7fc2876b:latest',\n", + " 'service_spec': \"spec:\\n container:\\n - env:\\n MODEL_ZIP_STAGE_PATH: SHULIN_DB_TEST.SHULIN_DB_SCHEMA.SNOWML_MODEL_919991487D4211EE92415AC3F3B698DF/model.zip\\n NUM_WORKERS: None\\n SNOWML_USE_GPU: false\\n TARGET_METHOD: predict\\n image: 
sfengineering-mlplatformtest.registry.snowflakecomputing.com/shulin_db_test/shulin_db_schema/snowml_repo/d3e4770e34443205e0d53f9f84c602ba7fc2876b:latest\\n name: inference-server\\n readinessProbe:\\n path: /health\\n port: 5000\\n volumeMounts:\\n - mountPath: /local/user/vol1\\n name: vol1\\n - mountPath: SHULIN_DB_TEST.SHULIN_DB_SCHEMA.SNOWML_MODEL_919991487D4211EE92415AC3F3B698DF\\n name: stage\\n endpoint:\\n - name: predict\\n port: 5000\\n volume:\\n - name: vol1\\n source: local\\n - gid: 1000\\n name: stage\\n source: '@SHULIN_DB_TEST.SHULIN_DB_SCHEMA.SNOWML_MODEL_919991487D4211EE92415AC3F3B698DF'\\n uid: 1000\\n\",\n", + " 'service_function_sql': \"\\n CREATE OR REPLACE FUNCTION SHULIN_DB_TEST.SHULIN_DB_SCHEMA.LOGISTIC_FUNC(input OBJECT)\\n RETURNS OBJECT\\n SERVICE=SHULIN_DB_TEST.SHULIN_DB_SCHEMA.service_919991487d4211ee92415ac3f3b698df\\n ENDPOINT=predict\\n \\n AS '/predict'\\n \"}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "from snowflake.ml.model import deploy_platforms\n", "from snowflake import snowpark\n", "\n", - "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created compute pool\n", + "compute_pool = \"REGTEST_INFERENCE_CPU_POOL\" # Pre-created compute pool\n", "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", "\n", "model_ref.deploy(\n", @@ -187,6 +238,18 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8709ee24-f7c0-458a-bc54-a2b78d5cc2cb", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig()\n", + "logging.getLogger().setLevel(logging.WARNING)" + ] + }, { "cell_type": "markdown", "id": "1c754e72", @@ -197,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "id": "a5c02328", "metadata": {}, "outputs": [ @@ -328,7 +391,7 @@ "9 4.9 3.1 1.5 0.1 0.0" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -339,10 +402,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "12991f07", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.delete_deployment() is in private preview since 1.0.1. Do not use it in production. 
\n" + ] + } + ], "source": [ "model_ref.delete_deployment(deployment_name=deployment_name)" ] @@ -360,9 +431,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:local_snowml]", + "display_name": "Python [conda env:micromamba-snowml_1.0.12] *", "language": "python", - "name": "conda-env-local_snowml-py" + "name": "conda-env-micromamba-snowml_1.0.12-py" }, "language_info": { "codemirror_mode": { @@ -374,7 +445,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.10.8" } }, "nbformat": 4, diff --git a/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb b/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb index 6037a7d3..8bc24c03 100644 --- a/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb +++ b/snowflake/ml/registry/notebooks/Finetune_Registry.ipynb @@ -15,19 +15,220 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, + "id": "255a02dd-9208-4489-9468-fb98231e859b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution -ackaging (/opt/conda/envs/pytorch/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0mCollecting snowflake-snowpark-python==1.8.0\n", + " Using cached snowflake_snowpark_python-1.8.0-py3-none-any.whl (326 kB)\n", + "Collecting setuptools>=40.6.0 (from snowflake-snowpark-python==1.8.0)\n", + " Using cached setuptools-68.2.2-py3-none-any.whl (807 kB)\n", + "Collecting wheel (from snowflake-snowpark-python==1.8.0)\n", + " Using cached wheel-0.41.3-py3-none-any.whl (65 kB)\n", + "Collecting snowflake-connector-python<4.0.0,>=3.2.0 (from snowflake-snowpark-python==1.8.0)\n", + " Using cached snowflake_connector_python-3.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)\n", + "Collecting pyyaml (from snowflake-snowpark-python==1.8.0)\n", + " Using cached PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (705 kB)\n", + "Collecting cloudpickle<=2.0.0,>=1.6.0 (from snowflake-snowpark-python==1.8.0)\n", + " Using cached cloudpickle-2.0.0-py3-none-any.whl (25 kB)\n", + "Collecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n", + "Collecting cffi<2.0.0,>=1.9 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (443 kB)\n", + "Collecting cryptography<42.0.0,>=3.1.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached cryptography-41.0.5-cp37-abi3-manylinux_2_28_x86_64.whl (4.4 MB)\n", + "Collecting oscrypto<2.0.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached oscrypto-1.3.0-py2.py3-none-any.whl (194 kB)\n", + "Collecting pyOpenSSL<24.0.0,>=16.2.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached pyOpenSSL-23.3.0-py3-none-any.whl (58 kB)\n", + "Collecting pycryptodomex!=3.5.0,<4.0.0,>=3.2 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n", + "Collecting pyjwt<3.0.0 (from 
snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached PyJWT-2.8.0-py3-none-any.whl (22 kB)\n", + "Collecting pytz (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)\n", + "Collecting requests<3.0.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached requests-2.31.0-py3-none-any.whl (62 kB)\n", + "Collecting packaging (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached packaging-23.2-py3-none-any.whl (53 kB)\n", + "Collecting charset-normalizer<4,>=2 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (142 kB)\n", + "Collecting idna<4,>=2.5 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached idna-3.4-py3-none-any.whl (61 kB)\n", + "Collecting urllib3<1.27,>=1.21.1 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached urllib3-1.26.18-py2.py3-none-any.whl (143 kB)\n", + "Collecting certifi>=2017.4.17 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached certifi-2023.7.22-py3-none-any.whl (158 kB)\n", + "Collecting typing-extensions<5,>=4.3 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached typing_extensions-4.8.0-py3-none-any.whl (31 kB)\n", + "Collecting filelock<4,>=3.5 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached filelock-3.13.1-py3-none-any.whl (11 kB)\n", + "Collecting sortedcontainers>=2.4.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n", + "Collecting platformdirs<4.0.0,>=2.6.0 (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached platformdirs-3.11.0-py3-none-any.whl (17 kB)\n", + "Collecting tomlkit (from snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached tomlkit-0.12.1-py3-none-any.whl (37 kB)\n", + "Collecting pycparser (from cffi<2.0.0,>=1.9->snowflake-connector-python<4.0.0,>=3.2.0->snowflake-snowpark-python==1.8.0)\n", + " Using cached pycparser-2.21-py2.py3-none-any.whl (118 kB)\n", + "\u001b[33mWARNING: Ignoring invalid distribution -ackaging (/opt/conda/envs/pytorch/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0mInstalling collected packages: sortedcontainers, pytz, asn1crypto, wheel, urllib3, typing-extensions, tomlkit, setuptools, pyyaml, pyjwt, pycryptodomex, pycparser, platformdirs, packaging, oscrypto, idna, filelock, cloudpickle, charset-normalizer, certifi, requests, cffi, cryptography, pyOpenSSL, snowflake-connector-python, snowflake-snowpark-python\n", + " Attempting uninstall: sortedcontainers\n", + " Found existing installation: sortedcontainers 2.4.0\n", + " Uninstalling sortedcontainers-2.4.0:\n", + " Successfully uninstalled sortedcontainers-2.4.0\n", + " Attempting uninstall: pytz\n", + " Found existing installation: pytz 2023.3.post1\n", + " Uninstalling pytz-2023.3.post1:\n", + " Successfully uninstalled pytz-2023.3.post1\n", + " Attempting uninstall: asn1crypto\n", + " Found existing installation: asn1crypto 
1.5.1\n", + " Uninstalling asn1crypto-1.5.1:\n", + " Successfully uninstalled asn1crypto-1.5.1\n", + " Attempting uninstall: wheel\n", + " Found existing installation: wheel 0.41.3\n", + " Uninstalling wheel-0.41.3:\n", + " Successfully uninstalled wheel-0.41.3\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 1.26.18\n", + " Uninstalling urllib3-1.26.18:\n", + " Successfully uninstalled urllib3-1.26.18\n", + " Attempting uninstall: typing-extensions\n", + " Found existing installation: typing_extensions 4.8.0\n", + " Uninstalling typing_extensions-4.8.0:\n", + " Successfully uninstalled typing_extensions-4.8.0\n", + " Attempting uninstall: tomlkit\n", + " Found existing installation: tomlkit 0.12.1\n", + " Uninstalling tomlkit-0.12.1:\n", + " Successfully uninstalled tomlkit-0.12.1\n", + " Attempting uninstall: setuptools\n", + " Found existing installation: setuptools 68.2.2\n", + " Uninstalling setuptools-68.2.2:\n", + " Successfully uninstalled setuptools-68.2.2\n", + " Attempting uninstall: pyyaml\n", + " Found existing installation: PyYAML 6.0.1\n", + " Uninstalling PyYAML-6.0.1:\n", + " Successfully uninstalled PyYAML-6.0.1\n", + " Attempting uninstall: pyjwt\n", + " Found existing installation: PyJWT 2.8.0\n", + " Uninstalling PyJWT-2.8.0:\n", + " Successfully uninstalled PyJWT-2.8.0\n", + " Attempting uninstall: pycryptodomex\n", + " Found existing installation: pycryptodomex 3.19.0\n", + " Uninstalling pycryptodomex-3.19.0:\n", + " Successfully uninstalled pycryptodomex-3.19.0\n", + " Attempting uninstall: pycparser\n", + " Found existing installation: pycparser 2.21\n", + " Uninstalling pycparser-2.21:\n", + " Successfully uninstalled pycparser-2.21\n", + " Attempting uninstall: platformdirs\n", + " Found existing installation: platformdirs 3.11.0\n", + " Uninstalling platformdirs-3.11.0:\n", + " Successfully uninstalled platformdirs-3.11.0\n", + " Attempting uninstall: packaging\n", + " Found existing installation: packaging 23.2\n", + " Uninstalling packaging-23.2:\n", + " Successfully uninstalled packaging-23.2\n", + " Attempting uninstall: oscrypto\n", + " Found existing installation: oscrypto 1.3.0\n", + " Uninstalling oscrypto-1.3.0:\n", + " Successfully uninstalled oscrypto-1.3.0\n", + " Attempting uninstall: idna\n", + " Found existing installation: idna 3.4\n", + " Uninstalling idna-3.4:\n", + " Successfully uninstalled idna-3.4\n", + " Attempting uninstall: filelock\n", + " Found existing installation: filelock 3.12.2\n", + " Uninstalling filelock-3.12.2:\n", + " Successfully uninstalled filelock-3.12.2\n", + " Attempting uninstall: cloudpickle\n", + " Found existing installation: cloudpickle 2.0.0\n", + " Uninstalling cloudpickle-2.0.0:\n", + " Successfully uninstalled cloudpickle-2.0.0\n", + " Attempting uninstall: charset-normalizer\n", + " Found existing installation: charset-normalizer 3.1.0\n", + " Uninstalling charset-normalizer-3.1.0:\n", + " Successfully uninstalled charset-normalizer-3.1.0\n", + " Attempting uninstall: certifi\n", + " Found existing installation: certifi 2023.5.7\n", + " Uninstalling certifi-2023.5.7:\n", + " Successfully uninstalled certifi-2023.5.7\n", + " Attempting uninstall: requests\n", + " Found existing installation: requests 2.31.0\n", + " Uninstalling requests-2.31.0:\n", + " Successfully uninstalled requests-2.31.0\n", + " Attempting uninstall: cffi\n", + " Found existing installation: cffi 1.15.1\n", + " Uninstalling cffi-1.15.1:\n", + " Successfully uninstalled cffi-1.15.1\n", + " Attempting 
uninstall: cryptography\n", + " Found existing installation: cryptography 39.0.2\n", + " Uninstalling cryptography-39.0.2:\n", + " Successfully uninstalled cryptography-39.0.2\n", + " Attempting uninstall: pyOpenSSL\n", + " Found existing installation: pyOpenSSL 23.2.0\n", + " Uninstalling pyOpenSSL-23.2.0:\n", + " Successfully uninstalled pyOpenSSL-23.2.0\n", + " Attempting uninstall: snowflake-connector-python\n", + " Found existing installation: snowflake-connector-python 3.3.1\n", + " Uninstalling snowflake-connector-python-3.3.1:\n", + " Successfully uninstalled snowflake-connector-python-3.3.1\n", + " Attempting uninstall: snowflake-snowpark-python\n", + " Found existing installation: snowflake-snowpark-python 1.9.0\n", + " Uninstalling snowflake-snowpark-python-1.9.0:\n", + " Successfully uninstalled snowflake-snowpark-python-1.9.0\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "triton 2.0.0 requires cmake, which is not installed.\n", + "triton 2.0.0 requires lit, which is not installed.\n", + "awscli 1.27.151 requires botocore==1.29.151, but you have botocore 1.31.17 which is incompatible.\n", + "awscli 1.27.151 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0.1 which is incompatible.\n", + "sagemaker 2.164.0 requires cloudpickle==2.2.1, but you have cloudpickle 2.0.0 which is incompatible.\n", + "sagemaker 2.164.0 requires PyYAML==6.0, but you have pyyaml 6.0.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed asn1crypto-1.5.1 certifi-2023.7.22 cffi-1.16.0 charset-normalizer-3.3.2 cloudpickle-2.0.0 cryptography-41.0.5 filelock-3.13.1 idna-3.4 oscrypto-1.3.0 packaging-23.2 platformdirs-3.11.0 pyOpenSSL-23.3.0 pycparser-2.21 pycryptodomex-3.19.0 pyjwt-2.8.0 pytz-2023.3.post1 pyyaml-6.0.1 requests-2.31.0 setuptools-68.2.2 snowflake-connector-python-3.3.1 snowflake-snowpark-python-1.8.0 sortedcontainers-2.4.0 tomlkit-0.12.1 typing-extensions-4.8.0 urllib3-1.26.18 wheel-0.41.3\n" + ] + } + ], + "source": [ + "!pip install --upgrade --force-reinstall snowflake-snowpark-python==1.8.0" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "id": "1ed66db9", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution -ackaging (/opt/conda/envs/pytorch/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0mProcessing /home/ubuntu/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-1.0.12-py3-none-any.whl\n", + "Installing collected packages: snowflake-ml-python\n", + " Attempting uninstall: snowflake-ml-python\n", + " Found existing installation: snowflake-ml-python 1.0.12\n", + " Uninstalling snowflake-ml-python-1.0.12:\n", + " Successfully uninstalled snowflake-ml-python-1.0.12\n", + "Successfully installed snowflake-ml-python-1.0.12\n" + ] + } + ], "source": [ - "!pip install /Users/halu/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-1.0.10-py3-none-any.whl" + "!pip install --force-reinstall --no-deps /home/ubuntu/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-1.0.12-py3-none-any.whl" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "292e9f48", "metadata": {}, "outputs": [ @@ -42,14 +243,6 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension 
is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] } ], "source": [ @@ -62,7 +255,26 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, + "id": "4c6b1310-9941-4ba3-b126-6e58c01fb613", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution -ackaging (/opt/conda/envs/pytorch/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0msnowflake-snowpark-python 1.8.0\n" + ] + } + ], + "source": [ + "! pip list | grep snowpark" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "7585077b", "metadata": {}, "outputs": [], @@ -81,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "f876232e", "metadata": {}, "outputs": [ @@ -99,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "id": "c6aee8c9", "metadata": { "scrolled": true @@ -111,7 +323,7 @@ "('\"HALU_FT\"', '\"PUBLIC\"')" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -122,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "id": "72c16c14", "metadata": {}, "outputs": [], @@ -133,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "id": "c420807b", "metadata": {}, "outputs": [ @@ -142,6 +354,7 @@ "output_type": "stream", "text": [ "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:absl:The database HALU_MR already exists. Skipping creation.\n", "WARNING:absl:The schema HALU_MR.PUBLIC already exists. Skipping creation.\n" ] } @@ -159,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 47, "id": "0adc9637", "metadata": {}, "outputs": [], @@ -169,37 +382,31 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 48, "id": "18323af6", "metadata": {}, "outputs": [], "source": [ - "options = llm.LLMOptions(token=\"....\")\n", + "options = llm.LLMOptions(\n", + " token=\"...\",\n", + " max_batch_size=20,\n", + ")\n", "model = llm.LLM(\n", - " model_id_or_path=\"/Users/halu/Downloads/halu_peft_ft\",\n", + " model_id_or_path=\"/home/ubuntu/projects/test_ft_weights\",\n", " options=options\n", ")" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 50, "id": "dac3fc56", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", - "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. \n" - ] - } - ], + "outputs": [], "source": [ "svc_model = registry.log_model(\n", - " model_name='halu_ft_model_1',\n", - " model_version='v1',\n", + " model_name='build_demo_1101',\n", + " model_version='v5',\n", " model=model,\n", " options={\"embed_local_ml_library\": True},\n", ")" @@ -207,21 +414,26 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 51, "id": "b17b1fbb", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Similar environment detected. 
Using existing image sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/4b7980a43a1ff656d23b9401e4471bcd4f021d39:latest to skip image build. To disable this feature, set 'force_image_build=True' in deployment options\n" + "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Debug model is enabled, deployment artifacts will be available in /tmp/tmpqkmmoahf\n", + "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", + "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Image successfully built! For future model deployments, the image will be reused if possible, saving model deployment time. To enforce using the same image, include 'prebuilt_snowflake_image': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/50e52d564ecc126d1f53452aa4dd734efa4e3a0a:latest' in the deploy() function's options.\n", + "WARNING:urllib3.connectionpool:Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))': /login\n" ] }, { "data": { "text/plain": [ - "{'name': 'HALU_MR.PUBLIC.halu_ft_deploy_1',\n", + "{'name': 'HALU_MR.PUBLIC.build_demo_1101_4',\n", " 'platform': ,\n", " 'target_method': 'infer',\n", " 'signature': ModelSignature(\n", @@ -236,13 +448,14 @@ " 'num_gpus': 1,\n", " 'image_repo': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo',\n", " 'enable_remote_image_build': True,\n", - " 'model_in_image': True},\n", - " 'details': {'image_name': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/4b7980a43a1ff656d23b9401e4471bcd4f021d39:latest',\n", - " 'service_spec': 'spec:\\n container:\\n - env:\\n NUM_WORKERS: 1\\n SNOWML_USE_GPU: true\\n TARGET_METHOD: infer\\n image: sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/4b7980a43a1ff656d23b9401e4471bcd4f021d39:latest\\n name: inference-server\\n readinessProbe:\\n path: /health\\n port: 5000\\n resources:\\n limits:\\n nvidia.com/gpu: 1\\n requests:\\n nvidia.com/gpu: 1\\n volumeMounts:\\n - mountPath: /local/user/vol1\\n name: vol1\\n endpoint:\\n - name: predict\\n port: 5000\\n volume:\\n - name: vol1\\n source: local\\n',\n", - " 'service_function_sql': \"\\n CREATE OR REPLACE FUNCTION HALU_MR.PUBLIC.halu_ft_deploy_1(input OBJECT)\\n RETURNS OBJECT\\n SERVICE=HALU_MR.PUBLIC.service_d289e6506e3111eeb21b769aea86b514\\n ENDPOINT=predict\\n MAX_BATCH_ROWS = 1\\n AS '/predict'\\n \"}}" + " 'model_in_image': True,\n", + " 'debug_mode': True},\n", + " 'details': {'image_name': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/50e52d564ecc126d1f53452aa4dd734efa4e3a0a:latest',\n", + " 'service_spec': 'spec:\\n container:\\n - env:\\n NUM_WORKERS: 1\\n SNOWML_USE_GPU: true\\n TARGET_METHOD: infer\\n _CONCURRENT_REQUESTS_MAX: 1\\n image: sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/50e52d564ecc126d1f53452aa4dd734efa4e3a0a:latest\\n name: inference-server\\n readinessProbe:\\n path: /health\\n port: 5000\\n resources:\\n limits:\\n nvidia.com/gpu: 1\\n requests:\\n nvidia.com/gpu: 1\\n volumeMounts:\\n - mountPath: /local/user/vol1\\n name: vol1\\n endpoint:\\n - name: predict\\n port: 5000\\n volume:\\n - name: vol1\\n 
source: local\\n',\n", + " 'service_function_sql': \"\\n CREATE OR REPLACE FUNCTION HALU_MR.PUBLIC.build_demo_1101_4(input OBJECT)\\n RETURNS OBJECT\\n SERVICE=HALU_MR.PUBLIC.service_3b9880c078d711ee861c06f9498c0da3\\n ENDPOINT=predict\\n MAX_BATCH_ROWS = 20\\n AS '/predict'\\n \"}}" ] }, - "execution_count": 27, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -256,10 +469,12 @@ " \"image_repo\": 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo',\n", " \"enable_remote_image_build\": True,\n", " \"model_in_image\": True,\n", + " \"debug_mode\": True,\n", + " #'prebuilt_snowflake_image': 'sfengineering-servicesnow.registry.snowflakecomputing.com/halu_ft_db/public/haul_repo/93d14fc687640746235f8f880a6af8c730ce3eaf:latest'\n", "}\n", " \n", "deploy_info = svc_model.deploy(\n", - " deployment_name=\"halu_ft_deploy_1\",\n", + " deployment_name=\"build_demo_1101_4\",\n", " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", " permanent=True,\n", " options=deployment_options\n", @@ -269,32 +484,55 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "b25baf1c", + "execution_count": null, + "id": "f14753df-6fdc-422e-864d-10d93fbc05a6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "9475d6cd-5222-4bcb-9883-9d8924354d6a", "metadata": {}, "outputs": [], "source": [ - "sample = \"\"\"\n", + "PROMPT_TEMPLATE = \"\"\"\n", + "\n", "[INST] <>\n", "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n", "<>\n", "### Instruction:\n", - "Extract JSON response with 'location' and 'toy_list' as keys.\n", - "'location': Location string of the caller.\n", + "Extract JSON response with 'location' and 'toy_list' as keys. Start response by \"{\".\n", + "'location': Location of the caller. Include city only.\n", "'toy_list\": List of toy names from the caller.\n", + "\n", "### Input:\n", - " \"frosty: Hello, good friend! You're talking to Frosty! What's your name?\n", - "caller: My name's Oliver. And I'm calling from Perth.\n", - "frosty: Nice to meet you, Oliver from Perth! 
So, what's on your wish list this year?\n", - "caller: I want a mickey, please.\n", - "frosty: Look forward to some Mickey adventures!\"\n", - "[/INST]\n", - "\"\"\"" + "\"\"\"\n", + "\n", + "def build_prompt(input):\n", + " return PROMPT_TEMPLATE + input + \"\\n[/INST]\"" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, + "id": "484f44fe-bf03-497d-97b3-147fcb4074c3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "b25baf1c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, "id": "2a84b44b", "metadata": {}, "outputs": [], @@ -302,6 +540,147 @@ "import pandas as pd" ] }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d067a009-567d-4869-9e8a-44694e169cc0", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json('/home/ubuntu/projects/v8.jsonl', lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fdbfa6ef-e179-44e1-898b-8c103cf09d4d", + "metadata": {}, + "outputs": [], + "source": [ + "dfl = df['transcript'].to_list()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c484ec44-672d-4269-830b-42ec037cef13", + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [build_prompt(t) for t in dfl]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e8377d97-a70d-4709-b1c7-ea638634a557", + "metadata": {}, + "outputs": [], + "source": [ + "input_df = pd.DataFrame({'input': prompts})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3477c070-f067-471e-83b1-302dfec392b9", + "metadata": {}, + "outputs": [], + "source": [ + "res = svc_model.predict(\n", + " deployment_name='build_demo_1101_2',\n", + " data=input_df\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df9106f-30f4-4de5-b731-68d3d71901e9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac42369c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e0da3cd-c983-4b12-b7ca-bc038a75ff9b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fb574d1-2671-46d4-baa4-659951b1f4cc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4afd2ce-aaec-43d1-8961-a484964e2997", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "702e9749-e8e3-432c-9fb6-d083794fbe0b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f84287-550f-4190-94c0-59b16aa64880", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5babe0a7-1272-4ca0-8f23-d9c57da21fce", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2b93fa3-97ae-422a-949d-5f3d72037ad9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a4f591b-48d8-4187-8ffa-1ba747800ee1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0ac832c-e8c9-4083-9429-e8050dd2b215", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": 
"code", "execution_count": 29, @@ -415,7 +794,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl index 618e7401..403a547b 100755 --- a/snowflake/ml/requirements.bzl +++ b/snowflake/ml/requirements.bzl @@ -1,6 +1,63 @@ # DO NOT EDIT! # Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' -EXTRA_REQUIREMENTS = {"all": ["lightgbm==3.3.5", "mlflow>=2.1.0,<2.4", "peft>=0.5.0,<1", "sentencepiece>=0.1.95,<0.2", "shap==0.42.1", "tensorflow>=2.9,<3,!=2.12.0", "tokenizers>=0.10,<1", "torchdata>=0.4,<1", "transformers>=4.29.2,<5"], "lightgbm": ["lightgbm==3.3.5"], "llm": ["peft>=0.5.0,<1"], "mlflow": ["mlflow>=2.1.0,<2.4"], "shap": ["shap==0.42.1"], "tensorflow": ["tensorflow>=2.9,<3,!=2.12.0"], "torch": ["torchdata>=0.4,<1"], "transformers": ["sentencepiece>=0.1.95,<0.2", "tokenizers>=0.10,<1", "transformers>=4.29.2,<5"]} +EXTRA_REQUIREMENTS = { + "all": [ + "lightgbm==3.3.5", + "mlflow>=2.1.0,<2.4", + "peft>=0.5.0,<1", + "sentencepiece>=0.1.95,<0.2", + "shap==0.42.1", + "tensorflow>=2.9,<3,!=2.12.0", + "tokenizers>=0.10,<1", + "torchdata>=0.4,<1", + "transformers>=4.32.1,<5", + "vllm>=0.2.1.post1,<1" + ], + "lightgbm": [ + "lightgbm==3.3.5" + ], + "llm": [ + "peft>=0.5.0,<1", + "vllm>=0.2.1.post1,<1" + ], + "mlflow": [ + "mlflow>=2.1.0,<2.4" + ], + "shap": [ + "shap==0.42.1" + ], + "tensorflow": [ + "tensorflow>=2.9,<3,!=2.12.0" + ], + "torch": [ + "torchdata>=0.4,<1" + ], + "transformers": [ + "sentencepiece>=0.1.95,<0.2", + "tokenizers>=0.10,<1", + "transformers>=4.32.1,<5" + ] +} -REQUIREMENTS = ["absl-py>=0.15,<2", "anyio>=3.5.0,<4", "cachetools>=3.1.1,<5", "cloudpickle>=2.0.0", "fsspec[http]>=2022.11,<2024", "numpy>=1.23,<2", "packaging>=20.9,<24", "pandas>=1.0.0,<2", "pytimeparse>=1.1.8,<2", "pyyaml>=6.0,<7", "s3fs>=2022.11,<2024", "scikit-learn>=1.2.1,<1.4", "scipy>=1.9,<2", "snowflake-connector-python[pandas]>=3.0.4,<4", "snowflake-snowpark-python>=1.5.1,<2", "sqlparse>=0.4,<1", "typing-extensions>=4.1.0,<5", "xgboost>=1.7.3,<2"] +REQUIREMENTS = [ + "absl-py>=0.15,<2", + "anyio>=3.5.0,<4", + "cachetools>=3.1.1,<5", + "cloudpickle>=2.0.0", + "fsspec[http]>=2022.11,<2024", + "importlib_resources>=5.1.4, <6", + "numpy>=1.23,<2", + "packaging>=20.9,<24", + "pandas>=1.0.0,<2", + "pytimeparse>=1.1.8,<2", + "pyyaml>=6.0,<7", + "s3fs>=2022.11,<2024", + "scikit-learn>=1.2.1,<1.4", + "scipy>=1.9,<2", + "snowflake-connector-python[pandas]>=3.0.4,<4", + "snowflake-snowpark-python>=1.5.1,<2", + "sqlparse>=0.4,<1", + "typing-extensions>=4.1.0,<5", + "xgboost>=1.7.3,<2" +] diff --git a/snowflake/ml/test_utils/BUILD.bazel b/snowflake/ml/test_utils/BUILD.bazel index 4a1be2e2..25bf068e 100644 --- a/snowflake/ml/test_utils/BUILD.bazel +++ b/snowflake/ml/test_utils/BUILD.bazel @@ -11,6 +11,12 @@ py_library( ], ) +py_library( + name = "test_env_utils", + testonly = True, + srcs = ["test_env_utils.py"], +) + py_library( name = "mock_snowml_base", testonly = True, diff --git a/snowflake/ml/test_utils/test_env_utils.py b/snowflake/ml/test_utils/test_env_utils.py new file mode 100644 index 00000000..a2b531f7 --- /dev/null +++ b/snowflake/ml/test_utils/test_env_utils.py @@ -0,0 +1,11 @@ +import functools +from typing import List + +import requests + + +@functools.lru_cache +def get_snowpark_ml_released_versions() -> List[str]: + releases_url = 
"https://api.github.com/repos/snowflakedb/snowflake-ml-python/releases" + releases_resp = requests.get(releases_url).json() + return [rel["tag_name"] for rel in releases_resp] diff --git a/snowflake/ml/utils/connection_params.py b/snowflake/ml/utils/connection_params.py index 44e1f323..02ee97ea 100644 --- a/snowflake/ml/utils/connection_params.py +++ b/snowflake/ml/utils/connection_params.py @@ -150,7 +150,7 @@ def SnowflakeLoginOptions(connection_name: str = "", login_file: Optional[str] = >> session = Session.builder.configs(SnowflakeLoginOptions()).create() Usage Note: - Ideally one should have a snoqsql config file. Read more here: + Ideally one should have a snowsql config file. Read more here: https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings Args: diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 9dfd36a1..abf8795b 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. -VERSION = "1.0.11" +VERSION = "1.0.12" diff --git a/tests/integ/snowflake/ml/_internal/BUILD.bazel b/tests/integ/snowflake/ml/_internal/BUILD.bazel index dd7be706..d907170a 100644 --- a/tests/integ/snowflake/ml/_internal/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/BUILD.bazel @@ -24,7 +24,7 @@ py_test( name = "grid_search_integ_test", timeout = "long", srcs = ["grid_search_integ_test.py"], - shard_count = 2, + shard_count = 3, deps = [ "//snowflake/ml/modeling/ensemble:random_forest_classifier", "//snowflake/ml/modeling/model_selection/_internal:_grid_search_cv", @@ -38,6 +38,7 @@ py_test( name = "randomized_search_integ_test", timeout = "long", srcs = ["randomized_search_integ_test.py"], + shard_count = 2, deps = [ "//snowflake/ml/modeling/ensemble:random_forest_classifier", "//snowflake/ml/modeling/model_selection/_internal:_randomized_search_cv", diff --git a/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py b/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py index a2bd5ad0..a3f6f974 100644 --- a/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py +++ b/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py @@ -2,7 +2,7 @@ import inflection import numpy as np -from absl.testing.absltest import TestCase, main +from absl.testing import absltest, parameterized from sklearn.datasets import load_diabetes, load_iris from sklearn.model_selection import GridSearchCV as SkGridSearchCV from sklearn.svm import SVR as SkSVR @@ -15,7 +15,7 @@ from snowflake.snowpark import Session -class GridSearchCVTest(TestCase): +class GridSearchCVTest(parameterized.TestCase): def setUp(self): """Creates Snowpark and Snowflake environments for testing.""" self._session = Session.builder.configs(SnowflakeLoginOptions()).create() @@ -30,10 +30,10 @@ def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: for k, v in cv_result_1.items(): if isinstance(v, np.ndarray): if k.startswith("param_"): # compare the masked array - np.ma.allequal(v, cv_result_2[k]) + self.assertTrue(np.ma.allequal(v, cv_result_2[k])) elif k == "params": # compare the parameter combination - self.assertEqual(v.tolist(), cv_result_2[k]) - elif k.endswith("test_score"): # compare the test score + self.assertItemsEqual(v.tolist(), cv_result_2[k]) + elif ("test_") in k: # compare the test score np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-1, atol=1.0e-2) # Do not compare the fit time @@ -71,8 +71,10 @@ def 
test_fit_and_compare_results(self, mock_if_single_node) -> None: actual_arr_pd = reg.predict(input_df.to_pandas()).sort_values(by="INDEX")[output_cols].to_numpy() np.testing.assert_allclose(actual_arr_pd.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + @parameterized.parameters({"is_single_node": True}, {"is_single_node": False}) @mock.patch("snowflake.ml.modeling.model_selection._internal._grid_search_cv.if_single_node") - def test_fit_xgboost(self, mock_if_single_node) -> None: + def test_fit_xgboost_multimetric_and_compare_results(self, mock_if_single_node, is_single_node) -> None: + mock_if_single_node.return_value = is_single_node mock_if_single_node.return_value = True # falls back to HPO implementation input_df_pandas = load_iris(as_frame=True).frame input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] @@ -83,27 +85,58 @@ def test_fit_xgboost(self, mock_if_single_node) -> None: sk_estimator = SkXGBClassifier(seed=42, n_jobs=1) parameters = { - "max_depth": range(2, 6, 1), + "max_depth": [2, 6], "learning_rate": [0.1, 0.01], } - sklearn_reg = SkGridSearchCV(estimator=sk_estimator, param_grid=parameters, verbose=True) + scoring = ["accuracy", "f1_macro"] + + sklearn_reg = SkGridSearchCV( + estimator=sk_estimator, param_grid=parameters, scoring=scoring, refit="f1_macro", verbose=True + ) sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze()) estimator = XGBClassifier(seed=42, n_jobs=1) - reg = GridSearchCV(estimator=estimator, param_grid=parameters, verbose=True) + reg = GridSearchCV(estimator=estimator, param_grid=parameters, scoring=scoring, refit="f1_macro", verbose=True) reg.set_input_cols(input_cols) output_cols = ["OUTPUT_" + c for c in label_col] reg.set_output_cols(output_cols) reg.set_label_cols(label_col) reg.fit(input_df) + # the result of SnowML grid search cv should behave the same as sklearn's + sk_obj = reg.to_sklearn() + np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) + self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) + self.assertEqual(sk_obj.best_params_, sklearn_reg.best_params_) + self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) + self.assertEqual(sklearn_reg.multimetric_, True) + self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) + + # n_features_in_ is available because `refit` is set to `True`. 
+ self.assertEqual(sk_obj.n_features_in_, sklearn_reg.n_features_in_) + + # classes are available because this is a classifier + for idx, class_ in enumerate(sk_obj.classes_): + self.assertEqual(class_, sklearn_reg.classes_[idx]) + actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) - - np.testing.assert_allclose(reg._sklearn_object.best_score_, sklearn_reg.best_score_) - self._compare_cv_results(reg._sklearn_object.cv_results_, sklearn_reg.cv_results_) np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + # Test predict_proba + actual_inference_result = ( + reg.predict_proba(input_df, output_cols_prefix="OUTPUT_").to_pandas().sort_values(by="INDEX") + ) + actual_output_cols = [c for c in actual_inference_result.columns if c.find("OUTPUT_") >= 0] + actual_inference_result = actual_inference_result[actual_output_cols].to_numpy() + sklearn_predict_prob_array = sklearn_reg.predict_proba(input_df_pandas[input_cols]) + np.testing.assert_allclose(actual_inference_result.flatten(), sklearn_predict_prob_array.flatten()) + + # Test score + actual_score = reg.score(input_df) + sklearn_score = sklearn_reg.score(input_df_pandas[input_cols], input_df_pandas[label_col]) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-1, atol=1.0e-2) + if __name__ == "__main__": - main() + absltest.main() diff --git a/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py b/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py index eac60515..ac4fd9b0 100644 --- a/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py +++ b/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py @@ -2,7 +2,7 @@ import inflection import numpy as np -from absl.testing.absltest import TestCase, main +from absl.testing import absltest, parameterized from scipy.stats import randint from sklearn.datasets import load_iris from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier @@ -14,7 +14,7 @@ from snowflake.snowpark import Session -class RandomizedSearchCVTest(TestCase): +class RandomizedSearchCVTest(parameterized.TestCase): def setUp(self): """Creates Snowpark and Snowflake environments for testing.""" self._session = Session.builder.configs(SnowflakeLoginOptions()).create() @@ -29,16 +29,17 @@ def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: for k, v in cv_result_1.items(): if isinstance(v, np.ndarray): if k.startswith("param_"): # compare the masked array - np.ma.allequal(v, cv_result_2[k]) + self.assertTrue(np.ma.allequal(v, cv_result_2[k])) elif k == "params": # compare the parameter combination self.assertItemsEqual(v.tolist(), cv_result_2[k]) - elif k.endswith("test_score"): # compare the test score + elif ("test_") in k: # compare the test score np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-1, atol=1.0e-2) # Do not compare the fit time + @parameterized.parameters({"is_single_node": True}, {"is_single_node": False}) @mock.patch("snowflake.ml.modeling.model_selection._internal._randomized_search_cv.if_single_node") - def test_fit_and_compare_results(self, mock_if_single_node) -> None: - mock_if_single_node.return_value = True # falls back to HPO implementation + def test_fit_and_compare_results(self, mock_if_single_node, is_single_node) -> None: + mock_if_single_node.return_value = is_single_node input_df_pandas = load_iris(as_frame=True).frame 
input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] @@ -46,7 +47,7 @@ def test_fit_and_compare_results(self, mock_if_single_node) -> None: input_df_pandas["INDEX"] = input_df_pandas.reset_index().index input_df = self._session.create_dataframe(input_df_pandas) param_distribution = { - "n_estimators": randint(50, 200), + "n_estimators": [50, 200], "max_depth": randint(3, 8), } @@ -68,22 +69,47 @@ def test_fit_and_compare_results(self, mock_if_single_node) -> None: reg.fit(input_df) sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze()) - - actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() - sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) sk_obj = reg.to_sklearn() # the result of SnowML grid search cv should behave the same as sklearn's np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) - assert sk_obj.best_params_ == sklearn_reg.best_params_ + self.assertEqual(sk_obj.best_params_, sklearn_reg.best_params_) + self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) + self.assertEqual(sklearn_reg.multimetric_, False) + self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) + actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() + sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) # Test on fitting on snowpark Dataframe, and predict on pandas dataframe actual_arr_pd = reg.predict(input_df.to_pandas()).sort_values(by="INDEX")[output_cols].to_numpy() np.testing.assert_allclose(actual_arr_pd.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + # Test predict_proba + actual_inference_result = ( + reg.predict_proba(input_df, output_cols_prefix="OUTPUT_").to_pandas().sort_values(by="INDEX") + ) + actual_output_cols = [c for c in actual_inference_result.columns if c.find("OUTPUT_") >= 0] + actual_inference_result = actual_inference_result[actual_output_cols].to_numpy() + + sklearn_predict_prob_array = sklearn_reg.predict_proba(input_df_pandas[input_cols]) + np.testing.assert_allclose(actual_inference_result.flatten(), sklearn_predict_prob_array.flatten()) + + # Test predict_log_proba + actual_log_proba_result = ( + reg.predict_log_proba(input_df, output_cols_prefix="OUTPUT_").to_pandas().sort_values(by="INDEX") + ) + actual_log_proba_result = actual_log_proba_result[actual_output_cols].to_numpy() + sklearn_log_prob_array = sklearn_reg.predict_log_proba(input_df_pandas[input_cols]) + np.testing.assert_allclose(actual_log_proba_result.flatten(), sklearn_log_prob_array.flatten()) + + # Test score + actual_score = reg.score(input_df) + sklearn_score = sklearn_reg.score(input_df_pandas[input_cols], input_df_pandas[label_col]) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-1, atol=1.0e-2) + if __name__ == "__main__": - main() + absltest.main() diff --git a/tests/integ/snowflake/ml/image_builds/image_registry_client_integ_test.py b/tests/integ/snowflake/ml/image_builds/image_registry_client_integ_test.py index 0ff24582..bb1e9a0d 100644 --- a/tests/integ/snowflake/ml/image_builds/image_registry_client_integ_test.py +++ 
b/tests/integ/snowflake/ml/image_builds/image_registry_client_integ_test.py @@ -9,18 +9,16 @@ class ImageRegistryClientIntegTest(spcs_integ_test_base.SpcsIntegTestBase): - @classmethod - def setUpClass(cls) -> None: - super().setUpClass() - cls._TEST_REPO = "TEST_REPO" - client = snowservice_client.SnowServiceClient(cls._session) + def setUp(self) -> None: + super().setUp() + self._TEST_REPO = "TEST_REPO" + client = snowservice_client.SnowServiceClient(self._session) client.create_image_repo( - identifier.get_schema_level_object_identifier(cls._TEST_DB, cls._TEST_SCHEMA, cls._TEST_REPO) + identifier.get_schema_level_object_identifier(self._test_db, self._test_schema, self._TEST_REPO) ) - @classmethod - def tearDownClass(cls) -> None: - super().tearDownClass() + def tearDown(self) -> None: + super().tearDown() def _get_repo_url(self) -> str: """Retrieve repo url. @@ -29,7 +27,7 @@ def _get_repo_url(self) -> str: """ sql = ( f"SHOW IMAGE REPOSITORIES LIKE '{self._TEST_REPO}' " - f"IN SCHEMA {'.'.join([self._TEST_DB, self._TEST_SCHEMA])}" + f"IN SCHEMA {'.'.join([self._test_db, self._test_schema])}" ) result = ( query_result_checker.SqlResultValidator( diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index f7e8b307..fffbeb68 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -5,8 +5,7 @@ py_library( testonly = True, srcs = ["warehouse_model_integ_test_utils.py"], deps = [ - "//snowflake/ml/model:_deployer", - "//snowflake/ml/model:_model", + "//snowflake/ml/model:_api", "//snowflake/ml/model:deploy_platforms", "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_signatures:snowpark_handler", @@ -99,8 +98,7 @@ py_test( deps = [ ":warehouse_model_integ_test_utils", "//snowflake/ml/_internal/exceptions", - "//snowflake/ml/model:_deployer", - "//snowflake/ml/model:_model", + "//snowflake/ml/model:_api", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", "//snowflake/ml/utils:connection_params", @@ -128,7 +126,7 @@ py_test( timeout = "long", srcs = ["deployment_to_snowservice_integ_test.py"], deps = [ - "//snowflake/ml/model:_model", + "//snowflake/ml/model:_api", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_deploy_client/snowservice:deploy", @@ -153,20 +151,32 @@ py_test( ], ) -#TODO(halu): Needs support of pip package for build & test -#py_test( -# name = "spcs_llm_model_integ_test", -# timeout = "eternal", # 3600s, GPU image takes very long to build.. -# srcs = ["spcs_llm_model_integ_test.py"], -# compatible_with_snowpark = False, -# deps = [ -# ":warehouse_model_integ_test_utils", -# "//snowflake/ml/_internal:env_utils", -# "//snowflake/ml/model:type_hints", -# "//snowflake/ml/model/models:llm_model", -# "//snowflake/ml/utils:connection_params", -# "//tests/integ/snowflake/ml/test_utils:db_manager", -# "//tests/integ/snowflake/ml/test_utils:spcs_integ_test_base", -# "//tests/integ/snowflake/ml/test_utils:test_env_utils", -# ], -#) +py_test( + name = "spcs_llm_model_integ_test", + timeout = "eternal", # 3600s, GPU image takes very long to build.. 
+ srcs = ["spcs_llm_model_integ_test.py"], + compatible_with_snowpark = False, + deps = [ + ":warehouse_model_integ_test_utils", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/models:llm_model", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:spcs_integ_test_base", + "//tests/integ/snowflake/ml/test_utils:test_env_utils", + ], +) + +py_test( + name = "warehouse_model_compat_v1_test", + timeout = "long", + srcs = ["warehouse_model_compat_v1_test.py"], + shard_count = 8, + deps = [ + "//snowflake/ml/model:_api", + "//snowflake/ml/model:deploy_platforms", + "//tests/integ/snowflake/ml/test_utils:common_test_base", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], +) diff --git a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py index 0be59983..37fe8151 100644 --- a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py +++ b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py @@ -12,9 +12,10 @@ # from sklearn import neighbors # from snowflake.ml.model import ( -# _model as model_api, +# _api as model_api, # _model_meta, # custom_model, +# deploy_platforms, # type_hints as model_types, # ) # from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_api @@ -80,11 +81,11 @@ # def _save_model_to_stage( # self, model: custom_model.CustomModel, sample_input: pd.DataFrame # ) -> Tuple[str, _model_meta.ModelMetadata]: -# stage_path = f"@{self.TEST_STAGE}/{self.uid}/model.zip" +# stage_path = f"@{self.TEST_STAGE}/{self.uid}" # meta = model_api.save_model( # type: ignore[call-overload] # name="model", # session=self._session, -# model_stage_file_path=stage_path, +# stage_path=stage_path, # model=model, # sample_input=sample_input, # options={"embed_local_ml_library": True}, @@ -92,7 +93,7 @@ # return stage_path, meta # def test_deployment_workflow(self) -> None: -# model_stage_file_path, meta = self._save_model_to_stage(model=_get_sklearn_model(), sample_input=_IRIS_X) +# stage_path, meta = self._save_model_to_stage(model=_get_sklearn_model(), sample_input=_IRIS_X) # service_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( # self._RUN_ID, f"func_{self.uid}" # ) @@ -103,16 +104,17 @@ # subdomain=constants.DEV_IMAGE_REGISTRY_SUBDOMAIN, repo=self.TEST_IMAGE_REPO # ), # } -# snowservice_api._deploy( -# self._session, -# model_id=uuid.uuid4().hex, -# model_meta=meta, -# service_func_name=service_func_name, -# model_zip_stage_path=model_stage_file_path, -# deployment_stage_path=model_stage_file_path, # use the same stage for testing -# target_method="predict", -# **deployment_options, -# ) +# model_api.deploy( +# name=service_func_name, +# session=self._session, +# stage_path=stage_path, +# platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, +# target_method="predict", +# model_id=uuid.uuid4().hex, +# options={ +# **deployment_options, +# }, # type: ignore[call-overload] +# ) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py index c1c9133b..2c57960b 100644 --- a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py +++ b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py @@ -7,8 +7,7 @@ from snowflake.ml._internal.exceptions import exceptions 
as snowml_exceptions from snowflake.ml.model import ( - _deployer, - _model as model_api, + _api as model_api, custom_model, deploy_platforms, type_hints as model_types, @@ -68,7 +67,7 @@ def test_bad_model_deploy(self) -> None: model_api.save_model( name="custom_bad_model", session=self._session, - model_stage_file_path=posixpath.join(tmp_stage, "custom_bad_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_bad_model"), model=lm, sample_input=pd_df, metadata={"author": "halu", "version": "1"}, @@ -77,10 +76,10 @@ def test_bad_model_deploy(self) -> None: ) function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "custom_bad_model") with self.assertRaises(snowml_exceptions.SnowflakeMLException) as e: - _ = _deployer.deploy( + _ = model_api.deploy( session=self._session, name=function_name, - model_stage_file_path=posixpath.join(tmp_stage, "custom_bad_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_bad_model"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions({"relax_version": False}), @@ -93,10 +92,10 @@ def test_custom_demo_model(self) -> None: arr = np.random.randint(100, size=(10000, 3)) pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - model_metadata = model_api.save_model( + module_model = model_api.save_model( name="custom_demo_model", session=self._session, - model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_demo_model"), model=lm, conda_dependencies=[ test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python") @@ -105,14 +104,14 @@ def test_custom_demo_model(self) -> None: metadata={"author": "halu", "version": "1"}, ) - self.assertTrue(hasattr(model_metadata, "local_ml_library_version")) + self.assertIsNotNone(module_model.packager.meta.env._snowpark_ml_version.local) function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "custom_demo_model") with self.assertRaises(snowml_exceptions.SnowflakeMLException) as e: - deploy_info = _deployer.deploy( + deploy_info = model_api.deploy( session=self._session, name=function_name, - model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_demo_model"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( @@ -124,10 +123,10 @@ def test_custom_demo_model(self) -> None: ) self.assertIsInstance(e.exception.original_exception, ValueError) - deploy_info = _deployer.deploy( + deploy_info = model_api.deploy( session=self._session, name=function_name, - model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_demo_model"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( @@ -137,7 +136,7 @@ def test_custom_demo_model(self) -> None: ), ) assert deploy_info is not None - res = _deployer.predict(session=self._session, deployment=deploy_info, X=pd_df) + res = model_api.predict(session=self._session, deployment=deploy_info, X=pd_df) pd.testing.assert_frame_equal( res, @@ -145,10 +144,10 @@ def test_custom_demo_model(self) -> None: ) with self.assertRaises(snowpark_exceptions.SnowparkSQLException): - deploy_info = _deployer.deploy( + deploy_info = model_api.deploy( session=self._session, name=function_name, - 
model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_demo_model"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( @@ -160,10 +159,10 @@ def test_custom_demo_model(self) -> None: self._db_manager.drop_function(function_name=function_name, args=["OBJECT"]) - deploy_info = _deployer.deploy( + deploy_info = model_api.deploy( session=self._session, name=function_name, - model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + stage_path=posixpath.join(tmp_stage, "custom_demo_model"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( diff --git a/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py b/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py index e86cca8f..c740b4b9 100644 --- a/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py @@ -1,109 +1,90 @@ -# import os -# import tempfile -# import uuid +import os +import tempfile -# import pandas as pd -# from absl.testing import absltest +import pandas as pd +import pytest +from absl.testing import absltest -# from snowflake.ml.model import ( -# _deployer, -# _model as model_api, -# deploy_platforms, -# type_hints as model_types, -# ) -# from snowflake.ml.model.models import llm -# from tests.integ.snowflake.ml.test_utils import ( -# db_manager, -# spcs_integ_test_base, -# test_env_utils, -# ) +from snowflake.ml.model import ( + _api as model_api, + deploy_platforms, + type_hints as model_types, +) +from snowflake.ml.model.models import llm +from tests.integ.snowflake.ml.test_utils import ( + db_manager, + spcs_integ_test_base, + test_env_utils, +) -# class TestSPCSLLMModelInteg(spcs_integ_test_base.SpcsIntegTestBase): -# @classmethod -# def setUpClass(cls) -> None: -# super().setUpClass() -# cls.cache_dir = tempfile.TemporaryDirectory() -# cls._original_hf_home = os.getenv("HF_HOME", None) -# os.environ["HF_HOME"] = cls.cache_dir.name +@pytest.mark.conda_incompatible +class TestSPCSLLMModelInteg(spcs_integ_test_base.SpcsIntegTestBase): + def setUp(self) -> None: + super().setUp() + self.cache_dir = tempfile.TemporaryDirectory() + self._original_hf_home = os.getenv("HF_HOME", None) + os.environ["HF_HOME"] = self.cache_dir.name -# @classmethod -# def tearDownClass(cls) -> None: -# super().tearDownClass() -# if cls._original_hf_home: -# os.environ["HF_HOME"] = cls._original_hf_home -# else: -# del os.environ["HF_HOME"] -# cls.cache_dir.cleanup() + def tearDown(self) -> None: + super().tearDown() + if self._original_hf_home: + os.environ["HF_HOME"] = self._original_hf_home + else: + del os.environ["HF_HOME"] + self.cache_dir.cleanup() -# def setUp(self) -> None: -# # Set up a unique id for each artifact, in addition to the class-level prefix. This is particularly useful -# # when differentiating artifacts generated between different test cases, such as service function names. 
-# self.uid = uuid.uuid4().hex[:4] + def test_text_generation_pipeline( + self, + ) -> None: + model = llm.LLM( + model_id_or_path="facebook/opt-350m", + ) -# def test_text_generation_pipeline( -# self, -# ) -> None: -# import peft + x_df = pd.DataFrame( + [["Hello world"]], + ) -# ft_model = peft.AutoPeftModelForCausalLM.from_pretrained( -# "peft-internal-testing/tiny-OPTForCausalLM-lora", -# device_map="auto", -# ) -# tmpdir = self.create_tempdir().full_path -# ft_model.save_pretrained(tmpdir) -# model = llm.LLM( -# model_id_or_path=tmpdir, -# ) + stage_path = f"@{self._test_stage}/{self._run_id}" + deployment_stage_path = f"@{self._test_stage}/{self._run_id}" + model_api.save_model( # type: ignore[call-overload] + name="model", + session=self._session, + stage_path=stage_path, + model=model, + options={"embed_local_ml_library": True}, + conda_dependencies=[ + test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python"), + ], + ) + svc_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self._run_id, + f"func_{self._run_id}", + ) -# x_df = pd.DataFrame( -# [["Hello world"]], -# ) -# cls = TestSPCSLLMModelInteg -# stage_path = f"@{cls._TEST_STAGE}/{self.uid}/model.zip" -# deployment_stage_path = f"@{cls._TEST_STAGE}/{self.uid}" -# model_api.save_model( # type: ignore[call-overload] -# name="model", -# session=self._session, -# model_stage_file_path=stage_path, -# model=model, -# options={"embed_local_ml_library": True}, -# conda_dependencies=[ -# test_env_utils.get_latest_package_version_spec_in_server(self._session, "snowflake-snowpark-python"), -# ], -# ) -# svc_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( -# self._RUN_ID, -# f"func_{self.uid}", -# ) -# deployment_options: model_types.SnowparkContainerServiceDeployOptions = { -# "compute_pool": cls._TEST_GPU_COMPUTE_POOL, -# "num_gpus": 1, -# # TODO(halu): Create an separate testing registry. -# # Creating new registry for each single test is costly since no cache hit would ever occurs. 
-# "image_repo": "sfengineering-mlplatformtest.registry.snowflakecomputing.com/" -# "regtest_db/regtest_schema/halu_test", -# "enable_remote_image_build": True, -# "model_in_image": True, -# } + deployment_options: model_types.SnowparkContainerServiceDeployOptions = { + "compute_pool": self._TEST_GPU_COMPUTE_POOL, + "num_gpus": 1, + "model_in_image": True, + } -# deploy_info = _deployer.deploy( -# name=svc_func_name, -# session=cls._session, -# model_stage_file_path=stage_path, -# deployment_stage_path=deployment_stage_path, -# model_id=svc_func_name, -# platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, -# options={ -# **deployment_options, # type: ignore[arg-type] -# }, # type: ignore[call-overload] -# ) -# assert deploy_info is not None -# res = _deployer.predict(session=cls._session, deployment=deploy_info, X=x_df) -# self.assertIn("generated_text", res) -# self.assertEqual(len(res["generated_text"]), 1) -# self.assertNotEmpty(res["generated_text"][0]) + deploy_info = model_api.deploy( + name=svc_func_name, + session=self._session, + stage_path=stage_path, + deployment_stage_path=deployment_stage_path, + model_id=svc_func_name, + platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, + options={ + **deployment_options, # type: ignore[arg-type] + }, # type: ignore[call-overload] + ) + assert deploy_info is not None + res = model_api.predict(session=self._session, deployment=deploy_info, X=x_df) + self.assertIn("generated_text", res) + self.assertEqual(len(res["generated_text"]), 1) + self.assertNotEmpty(res["generated_text"][0]) -# if __name__ == "__main__": -# absltest.main() +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py index a57ef8a7..7877b310 100644 --- a/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py @@ -105,7 +105,6 @@ def base_test_case( test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -117,14 +116,12 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_async_model_composition( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: async def _test(self: "TestWarehouseCustomModelInteg") -> None: arr = np.random.randint(100, size=(10000, 3)) @@ -152,16 +149,14 @@ async def _test(self: "TestWarehouseCustomModelInteg") -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) asyncio.get_event_loop().run_until_complete(_test(self)) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) 
-> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -179,14 +174,12 @@ def test_custom_demo_model_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_sp_quote( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModelSPQuote(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -207,14 +200,12 @@ def test_custom_demo_model_sp_quote( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_sp_mix_1( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -233,14 +224,12 @@ def test_custom_demo_model_sp_mix_1( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_sp_mix_2( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] @@ -261,14 +250,12 @@ def test_custom_demo_model_sp_mix_2( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_array( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModelArray(custom_model.ModelContext()) arr = np.array([[1, 2, 3], [4, 2, 5]]) @@ -288,14 +275,12 @@ def test_custom_demo_model_array( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_str( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) @@ -314,14 +299,12 @@ def test_custom_demo_model_str( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_array_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModelArray(custom_model.ModelContext()) arr = np.array([[1, 2, 3], [4, 2, 5]]) @@ -340,14 
+323,12 @@ def test_custom_demo_model_array_sp( ) }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_str_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) @@ -365,14 +346,12 @@ def test_custom_demo_model_str_sp( ) }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_demo_model_array_str( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: lm = DemoModelArray(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) @@ -391,14 +370,12 @@ def test_custom_demo_model_array_str( ) }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_model_with_artifacts( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, "bias"), "w", encoding="utf-8") as f: @@ -423,14 +400,12 @@ def test_custom_model_with_artifacts( ) }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.8"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_custom_model_bool_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, "bias"), "w", encoding="utf-8") as f: @@ -456,7 +431,6 @@ def test_custom_model_bool_sp( ) }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py index fe348187..f5c639ec 100644 --- a/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pytest from absl.testing import absltest, parameterized from packaging import requirements @@ -17,6 +18,7 @@ from tests.integ.snowflake.ml.test_utils import db_manager +@pytest.mark.pip_incompatible class TestWarehouseHuggingFacehModelInteg(parameterized.TestCase): @classmethod def setUpClass(self) -> None: @@ -68,7 +70,6 @@ def base_test_case( Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]], ], permanent_deploy: Optional[bool] = False, - 
test_released_version: Optional[str] = None, additional_dependencies: Optional[List[str]] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( @@ -81,15 +82,13 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, additional_dependencies=additional_dependencies, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_conversational_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: # We have to import here due to cache location issue. # Only by doing so can we make the cache dir setting effective. @@ -128,14 +127,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_fill_mask_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -174,14 +171,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_ner_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -218,14 +213,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_question_answering_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -267,14 +260,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_question_answering_pipeline_multiple_output( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -320,14 +311,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_summarization_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -365,17 +354,15 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, 
additional_dependencies=[ str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("sentencepiece"))) ], ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_table_question_answering_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -433,14 +420,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_text_classification_pair_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -467,14 +452,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_text_classification_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -514,14 +497,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_text_generation_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -554,14 +535,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_text2text_generation_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -589,14 +568,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_translation_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: import transformers @@ -634,14 +611,12 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_zero_shot_classification_pipeline( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> 
None: import transformers @@ -692,7 +667,6 @@ def check_res(res: pd.DataFrame) -> None: ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py index dc74ae47..b72875cb 100644 --- a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py @@ -57,7 +57,6 @@ def base_test_case( test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -69,14 +68,12 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_mlflow_model_deploy_sklearn_df( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: db = datasets.load_diabetes(as_frame=True) X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) @@ -125,14 +122,12 @@ def test_mlflow_model_deploy_sklearn_df( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_mlflow_model_deploy_sklearn( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: db = datasets.load_diabetes() X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) @@ -183,7 +178,6 @@ def test_mlflow_model_deploy_sklearn( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_model_compat_v1_test.py b/tests/integ/snowflake/ml/model/warehouse_model_compat_v1_test.py new file mode 100644 index 00000000..85e2a3d6 --- /dev/null +++ b/tests/integ/snowflake/ml/model/warehouse_model_compat_v1_test.py @@ -0,0 +1,689 @@ +import posixpath +import uuid +from typing import Callable, Tuple + +import numpy as np +import pandas as pd +import tensorflow as tf +import torch +from absl.testing import absltest +from sklearn import datasets + +from snowflake.ml.model import _api as model_api, deploy_platforms +from snowflake.snowpark import session +from tests.integ.snowflake.ml.test_utils import common_test_base, db_manager + + +class TestWarehouseCustomModelCompat(common_test_base.CommonTestBase): + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + super().setUp() + self.run_id = uuid.uuid4().hex + self.session_stage = self.session.get_session_stage() + self.model_stage_path = posixpath.join(self.session_stage, self.run_id) + self.model_stage_file_path = posixpath.join(self.session_stage, self.run_id, f"{self.run_id}.zip") + + def _log_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: 
session.Session, run_id: str, model_stage_file_path: str) -> None: + import pandas as pd + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + custom_model, + ) + + class DemoModel(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"output": input["c1"]}) + + lm = DemoModel(custom_model.ModelContext()) + pd_df = pd.DataFrame([[1, 2, 3], [4, 2, 5]], columns=["c1", "c2", "c3"]) + + model_api.save_model( + name=run_id, + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_model_factory, version_range=">=1.0.8,<=1.0.11" # type: ignore[misc, arg-type] + ) + def test_deploy_custom_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + model_api.predict( + self.session, deployment=deploy_info, X=pd.DataFrame([[1, 2, 3], [4, 2, 5]], columns=["c1", "c2", "c3"]) + ) + + def _log_model_multiple_components_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + import os + import tempfile + + import pandas as pd + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + custom_model, + ) + + class DemoModel(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"output": input["c1"]}) + + class AsyncComposeModel(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + async def predict(self, input: pd.DataFrame) -> pd.DataFrame: + res1 = await self.context.model_ref("m1").predict.async_run(input) + res_sum = res1["output"] + self.context.model_ref("m2").predict(input)["output"] + return pd.DataFrame({"output": res_sum / 2}) + + class DemoModelWithArtifacts(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + with open(context.path("bias"), encoding="utf-8") as f: + v = int(f.read()) + self.bias = v + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"output": (input["c1"] + self.bias) > 12}) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "bias"), "w", encoding="utf-8") as f: + f.write("10") + lm_1 = DemoModelWithArtifacts( + custom_model.ModelContext(models={}, artifacts={"bias": os.path.join(tmpdir, "bias")}) + ) + lm_2 = DemoModel(custom_model.ModelContext()) + model_context = custom_model.ModelContext( + models={ + "m1": lm_1, + "m2": lm_2, + } + ) + acm = AsyncComposeModel(model_context) + pd_df = pd.DataFrame([[1, 2, 3], [4, 2, 5]], columns=["c1", "c2", "c3"]) + + 
model_api.save_model( + name=run_id, + model=acm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_model_multiple_components_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.8,<=1.0.11", + ) + def test_deploy_custom_model_multiple_components_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + model_api.predict( + self.session, deployment=deploy_info, X=pd.DataFrame([[1, 2, 3], [4, 2, 5]], columns=["c1", "c2", "c3"]) + ) + + def _log_sklearn_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + from sklearn import datasets, linear_model + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + iris_X, iris_y = datasets.load_iris(return_X_y=True, as_frame=True) + # LogisticRegression is for classification tasks, such as iris + regr = linear_model.LogisticRegression() + regr.fit(iris_X, iris_y) + + model_api.save_model( + name=run_id, + model=regr, + sample_input=iris_X, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_sklearn_model_factory, version_range=">=1.0.6,<=1.0.11" # type: ignore[misc, arg-type] + ) + def test_deploy_sklearn_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + iris_X, _ = datasets.load_iris(return_X_y=True, as_frame=True) + model_api.predict(self.session, deployment=deploy_info, X=iris_X) + + def _log_xgboost_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + import xgboost + from sklearn import datasets, model_selection + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + cal_data = datasets.load_breast_cancer(as_frame=True) + cal_X = cal_data.data + cal_y = cal_data.target + cal_X_train, cal_X_test, cal_y_train, cal_y_test = model_selection.train_test_split(cal_X, cal_y) + regressor = xgboost.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3) + regressor.fit(cal_X_train, cal_y_train) + + model_api.save_model( + name=run_id, + model=regressor, + sample_input=cal_X_test, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_xgboost_model_factory, version_range=">=1.0.6,<=1.0.11" #
type: ignore[misc, arg-type] + ) + def test_deploy_xgboost_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + cal_data = datasets.load_breast_cancer(as_frame=True) + cal_X = cal_data.data + model_api.predict(self.session, deployment=deploy_info, X=cal_X) + + def _log_xgboost_booster_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + import xgboost + from sklearn import datasets, model_selection + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + cal_data = datasets.load_breast_cancer(as_frame=True) + cal_X = cal_data.data + cal_y = cal_data.target + cal_X_train, cal_X_test, cal_y_train, cal_y_test = model_selection.train_test_split(cal_X, cal_y) + params = dict(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, objective="binary:logistic") + regressor = xgboost.train(params, xgboost.DMatrix(data=cal_X_train, label=cal_y_train)) + + model_api.save_model( + name=run_id, + model=regressor, + sample_input=cal_X_test, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_xgboost_booster_model_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.6,<=1.0.11", + ) + def test_deploy_xgboost_booster_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + cal_data = datasets.load_breast_cancer(as_frame=True) + cal_X = cal_data.data + model_api.predict(self.session, deployment=deploy_info, X=cal_X) + + def _log_snowml_sklearn_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + from sklearn import datasets + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + from snowflake.ml.modeling.linear_model import ( + LogisticRegression, # type: ignore[attr-defined] + ) + + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = LogisticRegression(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + test_features = iris_X + regr.fit(test_features) + + model_api.save_model( + name=run_id, + model=regr, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_snowml_sklearn_model_factory, # type: ignore[misc, 
arg-type] + version_range=">=1.0.8,<=1.0.11", + ) + def test_deploy_snowml_sklearn_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] + + model_api.predict(self.session, deployment=deploy_info, X=iris_X) + + def _log_snowml_xgboost_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + from sklearn import datasets + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + from snowflake.ml.modeling.xgboost import ( + XGBRegressor, # type: ignore[attr-defined] + ) + + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = XGBRegressor(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + test_features = iris_X + regr.fit(test_features) + + model_api.save_model( + name=run_id, + model=regr, + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_snowml_xgboost_model_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.8,<=1.0.11", + ) + def test_deploy_snowml_xgboost_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] + + model_api.predict(self.session, deployment=deploy_info, X=iris_X) + + def _log_pytorch_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + import numpy as np + import torch + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + class TorchModel(torch.nn.Module): + def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: + super().__init__() + self.model = torch.nn.Sequential( + torch.nn.Linear(n_input, n_hidden, dtype=dtype), + torch.nn.ReLU(), + torch.nn.Linear(n_hidden, n_out, dtype=dtype), + torch.nn.Sigmoid(), + ) + + def forward(self, tensor: torch.Tensor) -> torch.Tensor: + return self.model(tensor) # type: ignore[no-any-return] + + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + dtype = torch.float32 + data_x = torch.from_numpy(x).to(dtype=dtype) + data_y = 
(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype) + + model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) + loss_function = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + for _epoch in range(100): + pred_y = model.forward(data_x) + loss = loss_function(pred_y, data_y) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + model_api.save_model( + name=run_id, + model=model, + sample_input=[data_x], + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_pytorch_model_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.6,<=1.0.11", + additional_packages=["pytorch"], + ) + def test_deploy_pytorch_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "forward"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="forward", + options={}, + ) + assert deploy_info + + n_input, batch_size = 10, 100 + x = np.random.rand(batch_size, n_input) + dtype = torch.float32 + data_x = torch.from_numpy(x).to(dtype=dtype) + + model_api.predict(self.session, deployment=deploy_info, X=[data_x]) + + def _log_torchscript_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + import numpy as np + import torch + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + class TorchModel(torch.nn.Module): + def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: + super().__init__() + self.model = torch.nn.Sequential( + torch.nn.Linear(n_input, n_hidden, dtype=dtype), + torch.nn.ReLU(), + torch.nn.Linear(n_hidden, n_out, dtype=dtype), + torch.nn.Sigmoid(), + ) + + def forward(self, tensor: torch.Tensor) -> torch.Tensor: + return self.model(tensor) # type: ignore[no-any-return] + + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + dtype = torch.float32 + data_x = torch.from_numpy(x).to(dtype=dtype) + data_y = (torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype) + + model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) + loss_function = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + for _epoch in range(100): + pred_y = model.forward(data_x) + loss = loss_function(pred_y, data_y) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + model_script = torch.jit.script(model) # type:ignore[attr-defined] + + model_api.save_model( + name=run_id, + model=model_script, + sample_input=[data_x], + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_torchscript_model_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.6,<=1.0.11", + additional_packages=["pytorch"], + ) + def test_deploy_torchscript_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + 
name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "forward"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="forward", + options={}, + ) + assert deploy_info + + n_input, batch_size = 10, 100 + x = np.random.rand(batch_size, n_input) + dtype = torch.float32 + data_x = torch.from_numpy(x).to(dtype=dtype) + + model_api.predict(self.session, deployment=deploy_info, X=[data_x]) + + def _log_tensorflow_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + from typing import Optional + + import tensorflow as tf + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + class SimpleModule(tf.Module): + def __init__(self, name: Optional[str] = None) -> None: + super().__init__(name=name) + self.a_variable = tf.Variable(5.0, name="train_me") + self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me") + + @tf.function(input_signature=[tf.TensorSpec(shape=(None, 1), dtype=tf.float32)]) # type: ignore[misc] + def __call__(self, tensor: tf.Tensor) -> tf.Tensor: + return self.a_variable * tensor + self.non_trainable_variable + + model = SimpleModule(name="simple") + data_x = tf.constant([[5.0], [10.0]]) + + model_api.save_model( + name=run_id, + model=model, + sample_input=[data_x], + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_tensorflow_model_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.6,<=1.0.11", + additional_packages=["tensorflow"], + ) + def test_deploy_tensorflow_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "__call__"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="__call__", + options={}, + ) + assert deploy_info + + data_x = tf.constant([[5.0], [10.0]]) + + model_api.predict(self.session, deployment=deploy_info, X=[data_x]) + + def _log_keras_model_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + def log_model(session: session.Session, run_id: str, model_stage_file_path: str) -> None: + import numpy as np + import tensorflow as tf + + from snowflake.ml.model import ( # type: ignore[attr-defined] + _model as model_api, + ) + + class KerasModel(tf.keras.Model): + def __init__(self, n_hidden: int, n_out: int) -> None: + super().__init__() + self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") + self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") + + def call(self, tensor: tf.Tensor) -> tf.Tensor: + input = tensor + x = self.fc_1(input) + x = self.fc_2(x) + return x + + dtype = tf.float32 + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + data_x = tf.convert_to_tensor(x, dtype=dtype) + raw_data_y = tf.random.uniform((batch_size, 1)) + raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) + data_y = tf.cast(raw_data_y, dtype=dtype) + + model = KerasModel(n_hidden, n_out) + model.compile( + 
optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=tf.keras.losses.MeanSquaredError() + ) + model.fit(data_x, data_y, batch_size=batch_size, epochs=100) + + model_api.save_model( + name=run_id, + model=model, + sample_input=[data_x], + metadata={"author": "halu", "version": "1"}, + session=session, + model_stage_file_path=model_stage_file_path, + ) + + return log_model, (self.run_id, self.model_stage_file_path) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_log_keras_model_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.6,<=1.0.11", + additional_packages=["tensorflow"], + ) + def test_deploy_keras_model_compat_v1(self) -> None: + deploy_info = model_api.deploy( + self.session, + name=db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + stage_path=self.model_stage_path, + target_method="predict", + options={}, + ) + assert deploy_info + + dtype = tf.float32 + n_input, batch_size = 10, 100 + x = np.random.rand(batch_size, n_input) + data_x = tf.convert_to_tensor(x, dtype=dtype) + + model_api.predict(self.session, deployment=deploy_info, X=[data_x]) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py index 9e20b8c5..9fd7e5b1 100644 --- a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py +++ b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py @@ -1,15 +1,12 @@ import posixpath -import unittest from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union import numpy as np import numpy.typing as npt import pandas as pd -from packaging import version from snowflake.ml.model import ( - _deployer, - _model as model_api, + _api as model_api, deploy_platforms, type_hints as model_types, ) @@ -28,18 +25,14 @@ def base_test_case( test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, additional_dependencies: Optional[List[str]] = None, ) -> None: - version_args: Dict[str, Any] = {} tmp_stage = db._session.get_session_stage() conda_dependencies = [ test_env_utils.get_latest_package_version_spec_in_server(db._session, "snowflake-snowpark-python") ] if additional_dependencies: conda_dependencies.extend(additional_dependencies) - # We only test when the test is added before the current version available in the server. - snowml_req_str = test_env_utils.get_latest_package_version_spec_in_server(db._session, "snowflake-ml-python") if permanent_deploy: permanent_deploy_args = {"permanent_udf_stage_location": f"@{full_qual_stage}/"} @@ -48,17 +41,7 @@ def base_test_case( permanent_deploy_args = {} perm_model_name = "temp" - if test_released_version: - if version.parse(test_released_version) <= version.parse(snowml_req_str.split("==")[-1]): - actual_name = f"{name}_{perm_model_name}_released" - conda_dependencies.append(snowml_req_str) - else: - raise unittest.SkipTest( - f"Skip test on released version {test_released_version} which has not been available yet." 
- ) - else: - actual_name = f"{name}_{perm_model_name}_current" - version_args["options"] = {"embed_local_ml_library": True} + actual_name = f"{name}_{perm_model_name}" model_api.save_model( name=actual_name, @@ -67,8 +50,7 @@ def base_test_case( conda_dependencies=conda_dependencies, metadata={"author": "halu", "version": "1"}, session=db._session, - model_stage_file_path=posixpath.join(tmp_stage, f"{actual_name}_{run_id}.zip"), - **version_args, + stage_path=posixpath.join(tmp_stage, f"{actual_name}_{run_id}"), ) for target_method, (additional_deploy_options, check_func) in deploy_params.items(): @@ -80,10 +62,10 @@ def base_test_case( target_method_arg = None else: target_method_arg = target_method - deploy_info = _deployer.deploy( + deploy_info = model_api.deploy( name=function_name, session=db._session, - model_stage_file_path=posixpath.join(tmp_stage, f"{actual_name}_{run_id}.zip"), + stage_path=posixpath.join(tmp_stage, f"{actual_name}_{run_id}"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method=target_method_arg, options={ @@ -93,7 +75,7 @@ def base_test_case( ) assert deploy_info is not None - res = _deployer.predict(session=db._session, deployment=deploy_info, X=test_input) + res = model_api.predict(session=db._session, deployment=deploy_info, X=test_input) check_func(res) diff --git a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py index 2d4e216e..1d435b76 100644 --- a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py @@ -53,7 +53,6 @@ def base_test_case( test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -65,14 +64,12 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_pytorch_tensor_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model() x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -92,14 +89,12 @@ def test_pytorch_tensor_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_pytorch_df_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -119,14 +114,12 @@ def test_pytorch_df_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, 
False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_pytorch_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -149,14 +142,12 @@ def test_pytorch_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_torchscript_tensor_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model() x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -177,14 +168,12 @@ def test_torchscript_tensor_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_torchscript_df_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -205,14 +194,12 @@ def test_torchscript_df_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_torchscript_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model(torch.float64) x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -236,7 +223,6 @@ def test_torchscript_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py index 33702067..ab467dfa 100644 --- a/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py @@ -54,7 +54,6 @@ def base_test_case( test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -66,14 +65,12 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], 
test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_skl_model_deploy( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) # LogisticRegression is for classfication task, such as iris @@ -91,14 +88,12 @@ def test_skl_model_deploy( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_skl_model_proba_deploy( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) model = ensemble.RandomForestClassifier(random_state=42) @@ -119,14 +114,12 @@ def test_skl_model_proba_deploy( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_skl_multiple_output_model_proba_deploy( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) target2 = np.random.randint(0, 6, size=iris_y.shape) @@ -152,14 +145,12 @@ def test_skl_multiple_output_model_proba_deploy( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_xgb( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: cal_data = datasets.load_breast_cancer(as_frame=True) cal_X = cal_data.data @@ -181,14 +172,12 @@ def test_xgb( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_xgb_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: cal_data = datasets.load_breast_cancer(as_frame=True) cal_data_sp_df = self._session.create_dataframe(cal_data.frame) @@ -217,14 +206,12 @@ def test_xgb_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_xgb_booster( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: cal_data = datasets.load_breast_cancer(as_frame=True) cal_X = cal_data.data @@ -245,14 +232,12 @@ def test_xgb_booster( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def 
test_xgb_booster_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: cal_data = datasets.load_breast_cancer(as_frame=True) cal_data_sp_df = self._session.create_dataframe(cal_data.frame) @@ -286,7 +271,6 @@ def test_xgb_booster_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py index ff9ffd56..f1e3d4c6 100644 --- a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py @@ -56,7 +56,6 @@ def base_test_case( test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -68,14 +67,12 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_snowml_model_deploy_snowml_sklearn( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: iris_X = datasets.load_iris(as_frame=True).frame iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] @@ -101,14 +98,12 @@ def test_snowml_model_deploy_snowml_sklearn( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_snowml_model_deploy_xgboost( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: iris_X = datasets.load_iris(as_frame=True).frame iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] @@ -134,14 +129,12 @@ def test_snowml_model_deploy_xgboost( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_snowml_model_deploy_lightgbm( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: iris_X = datasets.load_iris(as_frame=True).frame iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] @@ -167,7 +160,6 @@ def test_snowml_model_deploy_lightgbm( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py index 557c8ad7..d0305125 100644 --- a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py @@ -71,7 +71,6 @@ def base_test_case( test_input: model_types.SupportedDataType, 
deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: warehouse_model_integ_test_utils.base_test_case( self._db_manager, @@ -83,14 +82,12 @@ def base_test_case( test_input=test_input, deploy_params=deploy_params, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_tf_tensor_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") data_x = tf.constant([[5.0], [10.0]]) @@ -112,14 +109,12 @@ def test_tf_tensor_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_tf_df_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") data_x = tf.constant([[5.0], [10.0]]) @@ -141,14 +136,12 @@ def test_tf_df_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_tf_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") data_x = tf.constant([[5.0], [10.0]]) @@ -175,14 +168,12 @@ def test_tf_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_keras_tensor_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -204,14 +195,12 @@ def test_keras_tensor_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: ignore[misc] def test_keras_df_as_sample( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -233,14 +222,12 @@ def test_keras_df_as_sample( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) - @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] + @parameterized.product(permanent_deploy=[True, False]) # type: 
ignore[misc] def test_keras_sp( self, permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) @@ -266,7 +253,6 @@ def test_keras_sp( ), }, permanent_deploy=permanent_deploy, - test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel b/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel index 9f3847d5..04902268 100644 --- a/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel @@ -17,4 +17,7 @@ py_test( py_library( name = "utils", srcs = ["utils.py"], + deps = [ + "//snowflake/ml/_internal/utils:identifier", + ], ) diff --git a/tests/integ/snowflake/ml/modeling/framework/utils.py b/tests/integ/snowflake/ml/modeling/framework/utils.py index cfe5ed98..a7e3c2d5 100644 --- a/tests/integ/snowflake/ml/modeling/framework/utils.py +++ b/tests/integ/snowflake/ml/modeling/framework/utils.py @@ -6,6 +6,7 @@ import pandas as pd from pandas._typing import ArrayLike +from snowflake.ml._internal.utils import identifier from snowflake.snowpark import DataFrame, Session _EqualityFunc = Callable[[Any, Any], bool] @@ -141,7 +142,7 @@ class DataType(Enum): def gen_fuzz_data( rows: int, types: List[DataType], low: Union[int, List[int]] = MIN_INT, high: Union[int, List[int]] = MAX_INT -) -> Tuple[List[Any], List[str], List[str]]: +) -> Tuple[List[Any], List[str]]: """ Generate random data based on input column types and row count. First column in the result data will be an ID column for indexing. @@ -159,7 +160,6 @@ def gen_fuzz_data( ValueError: if data type is not supported """ data: List[npt.NDArray[Any]] = [np.arange(1, rows + 1, 1)] - names = ["ID"] snowflake_identifiers = ["ID"] for idx, t in enumerate(types): @@ -171,11 +171,10 @@ def gen_fuzz_data( data.append(np.random.uniform(_low, _high, rows)) else: raise ValueError(f"Unsupported data type {t}") - names.append(f"col_{idx}") snowflake_identifiers.append(f'"col_{idx}"') - data = np.core.records.fromarrays(data, names=names).tolist() # type: ignore[call-overload] + data = np.core.records.fromarrays(data, names=snowflake_identifiers).tolist() # type: ignore[call-overload] - return data, names, snowflake_identifiers + return data, snowflake_identifiers def get_df( @@ -185,22 +184,27 @@ def get_df( fillna: Optional[Union[object, ArrayLike]] = None, ) -> Tuple[pd.DataFrame, DataFrame]: """Create pandas dataframe and Snowpark dataframes from input data. The schema passed should be - a pandas schema, which will be converted to a schema using snowflake identifiers when `session.create_dataframe` - is called. + a snowflake schema using snowflake identifiers. Args: session: Snowpark session object. data: List of input data to convert to dataframe. - schema: The pandas schema for dataframe to be created. + schema: The schema for dataframe to be created. fillna: Value to fill for NA values in the input data. Returns: A tuple containing a pandas dataframe and a snowpark dataframe. """ - df_pandas = pd.DataFrame(data, columns=schema) + # Change the snowflake schema into the pandas equivalent identifiers. + # This will get converted back to the SF schema in `session.create_dataframe`. 
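# Illustrative example (values taken from gen_fuzz_data above): the quoted
# Snowflake identifier '"col_0"' maps to the pandas column name 'col_0', while an
# identifier that is already unquoted and uppercase, such as ID, is left unchanged.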
+ pd_schema = identifier.get_unescaped_names(schema) + df_pandas = pd.DataFrame(data, columns=pd_schema) + if fillna is not None: df_pandas.fillna(value=fillna, inplace=True) df = session.create_dataframe(df_pandas) + + # Use snowflake identifiers for the pandas dataframe. df_pandas.columns = df.columns return df_pandas, df diff --git a/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py index 95ac3d32..f89a0d75 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py @@ -11,13 +11,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -56,7 +56,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_score = snowml_metrics.accuracy_score( @@ -89,7 +89,7 @@ def test_normalized(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for normalize in params["normalize"]: actual_score = snowml_metrics.accuracy_score( diff --git a/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py b/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py index c48b402a..45191be9 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py @@ -10,7 +10,7 @@ from snowflake.ml.utils import connection_params from tests.integ.snowflake.ml.modeling.framework import utils -_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=100, types=[utils.DataType.INTEGER] * 2 + [utils.DataType.FLOAT], low=-1, @@ -35,7 +35,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _DATA, _SF_SCHEMA) for labels in params["labels"]: actual_cm = snowml_metrics.confusion_matrix( @@ -55,7 +55,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, ) def test_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _DATA, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_cm = snowml_metrics.confusion_matrix( @@ -76,7 +76,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"normalize": ["true", "pred", "all", None]}}, ) def test_normalize(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _DATA, _PD_SCHEMA) + pandas_df, input_df = 
utils.get_df(self._session, _DATA, _SF_SCHEMA) for normalize in params["normalize"]: actual_cm = snowml_metrics.confusion_matrix( diff --git a/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py index b61fef1c..fb9216c5 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/d2_absolute_error_score_test.py @@ -13,13 +13,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -67,7 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.d2_absolute_error_score( @@ -120,7 +120,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_loss = snowml_metrics.d2_absolute_error_score( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py index e36839a7..3ce2f309 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/d2_pinball_score_test.py @@ -13,13 +13,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -68,7 +68,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.d2_pinball_score( @@ -101,7 +101,7 @@ def test_alpha(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for alpha in params["alpha"]: actual_loss = snowml_metrics.d2_pinball_score( @@ -153,7 +153,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_loss = snowml_metrics.d2_pinball_score( df=input_df, diff --git 
a/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py index 9a79db02..e33a2acd 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/explained_variance_score_test.py @@ -13,13 +13,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -67,7 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.explained_variance_score( @@ -120,7 +120,7 @@ def test_force_finite(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for force_finite in params["force_finite"]: actual_loss = snowml_metrics.explained_variance_score( @@ -152,7 +152,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_loss = snowml_metrics.explained_variance_score( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py index 9341bfbe..d2d6bc67 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/f1_score_test.py @@ -12,13 +12,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -45,7 +45,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_f = snowml_metrics.f1_score( @@ -67,7 +67,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: actual_f = snowml_metrics.f1_score( @@ -89,7 +89,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def 
test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for average in params["average"]: actual_f = snowml_metrics.f1_score( @@ -115,7 +115,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -149,7 +149,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_f = snowml_metrics.f1_score( @@ -175,7 +175,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: data = [ [0, 0, 0, 0, 0, 0], ] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py index f40cc576..0e59c6d0 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/fbeta_score_test.py @@ -12,13 +12,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -57,7 +57,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for beta in params["beta"]: actual_f = snowml_metrics.fbeta_score( @@ -79,7 +79,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_f = snowml_metrics.fbeta_score( @@ -103,7 +103,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: actual_f = snowml_metrics.fbeta_score( @@ -127,7 +127,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) 
+ pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for average in params["average"]: actual_f = snowml_metrics.fbeta_score( @@ -155,7 +155,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -191,7 +191,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_f = snowml_metrics.fbeta_score( @@ -219,7 +219,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: data = [ [0, 0, 0, 0, 0, 0], ] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py b/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py index f0242535..d4f96e77 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/log_loss_test.py @@ -11,7 +11,7 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 4 -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -54,7 +54,7 @@ def tearDown(self) -> None: @parameterized.parameters( # type: ignore[misc] { "params": { - "eps": ["auto", 0.1, 0.5, 0.99], + "eps": ["auto", 0.01, 0.1, 0.5, 0.9, 0.99], "values": [ {"data": _BINARY_DATA, "y_true": _BINARY_Y_TRUE_COL, "y_pred": _BINARY_Y_PRED_COL}, {"data": _MULTICLASS_DATA, "y_true": _MULTICLASS_Y_TRUE_COL, "y_pred": _MULTICLASS_Y_PRED_COLS}, @@ -67,7 +67,7 @@ def test_eps(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for eps in params["eps"]: actual_loss = snowml_metrics.log_loss( @@ -99,7 +99,7 @@ def test_normalize(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for normalize in params["normalize"]: actual_loss = snowml_metrics.log_loss( @@ -131,7 +131,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.log_loss( @@ -152,7 +152,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, 
_PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_loss = snowml_metrics.log_loss( diff --git a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py index 44e73cfa..209ee2ee 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_error_test.py @@ -13,13 +13,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -67,7 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.mean_absolute_error( @@ -120,7 +120,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_loss = snowml_metrics.mean_absolute_error( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py index 7b657228..bc608c63 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/mean_absolute_percentage_error_test.py @@ -13,13 +13,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -67,7 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.mean_absolute_percentage_error( @@ -120,7 +120,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_loss = snowml_metrics.mean_absolute_percentage_error( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py b/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py index 4a93650a..a8fbd615 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py +++ 
b/tests/integ/snowflake/ml/modeling/metrics/mean_squared_error_test.py @@ -13,13 +13,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -67,7 +67,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_loss = snowml_metrics.mean_squared_error( @@ -120,7 +120,7 @@ def test_squared(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for squared in params["squared"]: actual_loss = snowml_metrics.mean_squared_error( @@ -152,7 +152,7 @@ def test_multilabel(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_loss = snowml_metrics.mean_squared_error( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py b/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py index 6d638664..20cc3655 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/metrics_utils_test.py @@ -9,7 +9,7 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -37,7 +37,7 @@ def tearDown(self) -> None: normalize=(False, True), ) def test_weighted_sum(self, df, sample_weight_col_name, sample_score_col_name, normalize) -> None: - pandas_df, input_df = utils.get_df(self._session, df, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, df, _SF_SCHEMA) snowpark_weight_col = input_df[sample_weight_col_name] if sample_weight_col_name else None actual_sum = metrics_utils.weighted_sum( diff --git a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py index 8c8b7d2f..500d1e0b 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_curve_test.py @@ -13,7 +13,7 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 2 -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -38,7 +38,7 @@ def tearDown(self) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: 
actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( @@ -60,7 +60,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, ) def test_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( @@ -81,7 +81,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py index f4d1c06c..d0d26c1e 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/precision_recall_fscore_support_test.py @@ -12,13 +12,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -57,7 +57,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for beta in params["beta"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -80,7 +80,7 @@ def test_beta(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -103,7 +103,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -138,7 +138,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_p, actual_r, 
actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -162,7 +162,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for average in params["average"]: actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( @@ -189,7 +189,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: sample_weight_col_name=(None, _SAMPLE_WEIGHT_COL), ) def test_average_binary_samples(self, y_true, y_pred, average, sample_weight_col_name) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( df=input_df, @@ -215,7 +215,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: [0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], ] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": @@ -258,7 +258,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: def test_no_sample(self) -> None: data = [] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py index 5ec6d5a2..a661c5dc 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/precision_score_test.py @@ -12,13 +12,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -45,7 +45,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_p = snowml_metrics.precision_score( @@ -67,7 +67,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: actual_p = snowml_metrics.precision_score( @@ -101,7 +101,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, 
_SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_p = snowml_metrics.precision_score( @@ -124,7 +124,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for average in params["average"]: actual_p = snowml_metrics.precision_score( @@ -150,7 +150,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for idx, average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -176,7 +176,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: [0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], ] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py index ad1b219e..323546f8 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/recall_score_test.py @@ -12,13 +12,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=2, ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -45,7 +45,7 @@ def tearDown(self) -> None: {"params": {"labels": [None, [2, 0, 4]]}}, ) def test_labels(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_r = snowml_metrics.recall_score( @@ -67,7 +67,7 @@ def test_labels(self, params: Dict[str, Any]) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: actual_r = snowml_metrics.recall_score( @@ -89,7 +89,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"average": [None, "micro", "macro", "weighted"]}}, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for average in params["average"]: actual_r = snowml_metrics.recall_score( @@ -115,7 +115,7 @@ def test_average_multiclass(self, params: Dict[str, Any]) -> None: }, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for idx, 
average in enumerate(params["average"]): y_true = params["y_true"][idx] @@ -149,7 +149,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_r = snowml_metrics.recall_score( @@ -176,7 +176,7 @@ def test_zero_division(self, params: Dict[str, Any]) -> None: [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], ] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for zero_division in params["zero_division"]: if zero_division == "warn": diff --git a/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py index 2340c9e7..6becea54 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/roc_auc_score_test.py @@ -13,7 +13,7 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 4 -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -57,7 +57,7 @@ def tearDown(self) -> None: {"params": {"average": ["weighted"]}}, ) def test_average_binary(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for average in params["average"]: actual_auc = snowml_metrics.roc_auc_score( @@ -82,7 +82,7 @@ def test_average_binary(self, params: Dict[str, Any]) -> None: }, ) def test_average_multiclass(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for idx, average in enumerate(params["average"]): multi_class = params["multi_class"][idx] @@ -117,7 +117,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: data = values["data"] y_true = values["y_true"] y_score = values["y_score"] - pandas_df, input_df = utils.get_df(self._session, data, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_auc = snowml_metrics.roc_auc_score( @@ -140,7 +140,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"max_fpr": [None, 0.1, 0.5, 1]}}, ) def test_max_fpr(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for max_fpr in params["max_fpr"]: actual_auc = snowml_metrics.roc_auc_score( @@ -160,7 +160,7 @@ def test_max_fpr(self, params: Dict[str, Any]) -> None: {"params": {"multi_class": ["ovr", "ovo"]}}, ) def test_multi_class(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for multi_class in params["multi_class"]: actual_auc = snowml_metrics.roc_auc_score( @@ -180,7 +180,7 @@ def test_multi_class(self, params: Dict[str, Any]) -> None: {"params": {"labels": [None, [0, 1, 2]]}}, ) def test_labels(self, 
params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for labels in params["labels"]: actual_auc = snowml_metrics.roc_auc_score( diff --git a/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py b/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py index 1b14eba4..231b844a 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/roc_curve_test.py @@ -16,13 +16,13 @@ _ROWS = 100 _TYPES = [utils.DataType.INTEGER] + [utils.DataType.FLOAT] * 2 -_BINARY_DATA, _PD_SCHEMA, _SF_SCHEMA = utils.gen_fuzz_data( +_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, high=[2, 1, 1], ) -_MULTICLASS_DATA, _, _ = utils.gen_fuzz_data( +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( rows=_ROWS, types=_TYPES, low=0, @@ -47,7 +47,7 @@ def tearDown(self) -> None: {"params": {"pos_label": [0, 2, 4]}}, ) def test_pos_label(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _MULTICLASS_DATA, _SF_SCHEMA) for pos_label in params["pos_label"]: actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( @@ -70,7 +70,7 @@ def test_pos_label(self, params: Dict[str, Any]) -> None: {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, ) def test_sample_weight(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for sample_weight_col_name in params["sample_weight_col_name"]: actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( @@ -94,7 +94,7 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: {"params": {"drop_intermediate": [True, False]}}, ) def test_drop_intermediate(self, params: Dict[str, Any]) -> None: - pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) for drop_intermediate in params["drop_intermediate"]: actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( @@ -119,7 +119,7 @@ def test_multi_query_df(self) -> None: self._session.sql(f"create temp stage {stage}").collect() # Load data into the stage. - pandas_df = pd.DataFrame(_BINARY_DATA, columns=_PD_SCHEMA) + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SF_SCHEMA) with tempfile.TemporaryDirectory() as temp_dir: filename = "data.parquet" local_path = os.path.join(temp_dir, filename) @@ -154,7 +154,7 @@ def test_multi_query_df(self) -> None: @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) def test_metric_size_threshold(self) -> None: # TODO: somehow confirm that the stage upload code path was taken. 
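# Note (assumed intent): patching _RESULT_SIZE_THRESHOLD to 0 should push even this
# small result over the size limit, so the metric result is expected to flow through
# the stage-upload path rather than being returned inline.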
- pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _PD_SCHEMA) + pandas_df, input_df = utils.get_df(self._session, _BINARY_DATA, _SF_SCHEMA) actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py b/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py index 34548ca4..8675cb3a 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/k_bins_discretizer_test.py @@ -130,11 +130,11 @@ def test_fit_fuzz_data(self) -> None: N_BINS = [10, 7] ENCODE = "ordinal" - data, pd_schema, sf_schema = utils.gen_fuzz_data( + data, sf_schema = utils.gen_fuzz_data( rows=1000, types=[utils.DataType.INTEGER, utils.DataType.FLOAT], ) - pandas_df, snowpark_df = utils.get_df(self._session, data, pd_schema) + pandas_df, snowpark_df = utils.get_df(self._session, data, sf_schema) for strategy in self._strategies: sklearn_discretizer = SklearnKBinsDiscretizer(n_bins=N_BINS, encode=ENCODE, strategy=strategy) @@ -197,7 +197,7 @@ def test_transform_ordinal_encoding_fuzz_data(self) -> None: ENCODE = "ordinal" OUTPUT_COLS = [f"OUT_{x}" for x in range(len(N_BINS))] - data, pd_schema, sf_schema = utils.gen_fuzz_data( + data, sf_schema = utils.gen_fuzz_data( rows=10000, types=[ utils.DataType.INTEGER, @@ -207,7 +207,7 @@ def test_transform_ordinal_encoding_fuzz_data(self) -> None: low=-999999, high=999999, ) - pandas_df, snowpark_df = utils.get_df(self._session, data, pd_schema) + pandas_df, snowpark_df = utils.get_df(self._session, data, sf_schema) for strategy in self._strategies: # 1. Create OSS SKLearn discretizer diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/ordinal_encoder_test.py b/tests/integ/snowflake/ml/modeling/preprocessing/ordinal_encoder_test.py index ef1c4b4c..015414f7 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/ordinal_encoder_test.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/ordinal_encoder_test.py @@ -861,9 +861,8 @@ def test_same_input_output_cols(self) -> None: def test_double_quoted_same_input_output_cols(self) -> None: data = [["a", "b"]] - schema = ["col1", "col2"] cat_cols = ['"col1"', '"col2"'] - df_pandas, df = framework_utils.get_df(self._session, data, schema) + df_pandas, df = framework_utils.get_df(self._session, data, cat_cols) encoder = OrdinalEncoder(drop_input_cols=True).set_input_cols(cat_cols).set_output_cols(cat_cols) transformed_df = encoder.fit(df).transform(df) @@ -872,9 +871,8 @@ def test_double_quoted_same_input_output_cols(self) -> None: def test_mixed_column_types(self) -> None: data = [["a", 1]] - schema = ["col1", "col2"] cat_cols = ['"col1"', '"col2"'] - df_pandas, df = framework_utils.get_df(self._session, data, schema) + df_pandas, df = framework_utils.get_df(self._session, data, cat_cols) encoder = OrdinalEncoder(drop_input_cols=True).set_input_cols(cat_cols).set_output_cols(cat_cols) transformed_df = encoder.fit(df).transform(df) @@ -882,9 +880,8 @@ def test_mixed_column_types(self) -> None: self.assertEqual(cat_cols, transformed_df.columns) data = [[1.0, True]] - schema = ["col1", "col2"] cat_cols = ['"col1"', '"col2"'] - df_pandas, df = framework_utils.get_df(self._session, data, schema) + df_pandas, df = framework_utils.get_df(self._session, data, cat_cols) encoder = OrdinalEncoder(drop_input_cols=True).set_input_cols(cat_cols).set_output_cols(cat_cols) transformed_df = encoder.fit(df).transform(df) 
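# Illustrative usage (hypothetical values): with the updated framework_utils.get_df
# contract, tests hand the quoted Snowflake identifiers straight through, e.g.
#     df_pandas, df = framework_utils.get_df(self._session, [["a", 1]], ['"col1"', '"col2"'])
# after which both returned frames should carry '"col1"' and '"col2"' as column names.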
@@ -892,15 +889,52 @@ def test_mixed_column_types(self) -> None: self.assertEqual(cat_cols, transformed_df.columns) data = [[True, "a"]] - schema = ["col1", "col2"] cat_cols = ['"col1"', '"col2"'] - df_pandas, df = framework_utils.get_df(self._session, data, schema) + df_pandas, df = framework_utils.get_df(self._session, data, cat_cols) encoder = OrdinalEncoder(drop_input_cols=True).set_input_cols(cat_cols).set_output_cols(cat_cols) transformed_df = encoder.fit(df).transform(df) self.assertEqual(cat_cols, transformed_df.columns) + def test_large_num_cols(self) -> None: + num_cols = 300 + input_cols = [f"COL{i}" for i in range(1, num_cols + 1)] + output_cols = [f"OUT{i}" for i in range(1, num_cols + 1)] + col_cardinality = {col: 2 for col in input_cols} + data = {} + for col, cardinality in col_cardinality.items(): + data[col] = np.random.randint(0, cardinality, size=100) + df = self._session.create_dataframe(pd.DataFrame(data)) + + encoder = OrdinalEncoder(input_cols=input_cols, output_cols=output_cols) + encoder.fit(df) + res = encoder.transform(df) + res.collect() + + def test_large_num_cols_unknown(self) -> None: + num_cols = 300 + input_cols = [f"COL{i}" for i in range(1, num_cols + 1)] + output_cols = [f"OUT{i}" for i in range(1, num_cols + 1)] + col_cardinality = {col: 2 for col in input_cols} + data = {} + for col, cardinality in col_cardinality.items(): + data[col] = np.random.randint(0, cardinality, size=100) + df = self._session.create_dataframe(pd.DataFrame(data)) + + # unknown categories exist + unknown_col_cardinality = {col: 3 for col in input_cols} + unknown_data = {} + for col, cardinality in unknown_col_cardinality.items(): + unknown_data[col] = np.random.randint(0, cardinality, size=100) + unknown_df = self._session.create_dataframe(pd.DataFrame(unknown_data)) + + encoder = OrdinalEncoder(input_cols=input_cols, output_cols=output_cols) + encoder.fit(df) + + with pytest.raises(ValueError): + encoder.transform(unknown_df) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/registry/BUILD.bazel b/tests/integ/snowflake/ml/registry/BUILD.bazel index 50e2f815..933cae6a 100644 --- a/tests/integ/snowflake/ml/registry/BUILD.bazel +++ b/tests/integ/snowflake/ml/registry/BUILD.bazel @@ -29,6 +29,7 @@ py_test( name = "model_registry_compat_test", timeout = "long", srcs = ["model_registry_compat_test.py"], + shard_count = 2, deps = [ "//snowflake/ml/registry:model_registry", "//tests/integ/snowflake/ml/test_utils:common_test_base", diff --git a/tests/integ/snowflake/ml/registry/model_registry_compat_test.py b/tests/integ/snowflake/ml/registry/model_registry_compat_test.py index 5a5a21e4..3adec8ba 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_compat_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_compat_test.py @@ -2,6 +2,7 @@ from typing import Callable, Tuple from absl.testing import absltest +from sklearn import datasets from snowflake.ml.registry import model_registry from snowflake.snowpark import session @@ -57,6 +58,67 @@ def test_open_registry_compat_v0(self) -> None: self.session.use_database(self.current_db) self.session.use_schema(self.current_schema) + def _prepare_registry_and_log_model_fn_factory( + self, + ) -> Tuple[Callable[[session.Session, str, str], None], Tuple[str, str]]: + self.registry_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "registry_db") + + def prepare_registry_and_log_model(session: session.Session, registry_name: str, run_id: str) -> None: + from sklearn import 
datasets, linear_model + + from snowflake.connector.errors import ProgrammingError + from snowflake.ml.registry import model_registry + + try: + model_registry.create_model_registry(session=session, database_name=registry_name) + except ProgrammingError: + # Previous versions of the library issue USE statements even inside the sproc environment, which is not allowed. + # Suppress that error here. + pass + + registry = model_registry.ModelRegistry(session=session, database_name=registry_name) + + iris_X, iris_y = datasets.load_iris(return_X_y=True, as_frame=True) + # LogisticRegression is for a classification task, such as iris + regr = linear_model.LogisticRegression() + regr.fit(iris_X, iris_y) + + registry.log_model( + model_name="model", + model_version=run_id, + model=regr, + sample_input_data=iris_X, + ) + + return prepare_registry_and_log_model, (self.registry_name, self.run_id) + + @common_test_base.CommonTestBase.compatibility_test( + prepare_fn_factory=_prepare_registry_and_log_model_fn_factory, # type: ignore[misc, arg-type] + version_range=">=1.0.6,<=1.0.11", + ) + def test_log_model_compat_v1(self) -> None: + try: + registry = model_registry.ModelRegistry( + session=self.session, database_name=self.registry_name, create_if_not_exists=True + ) + model_ref = model_registry.ModelReference( + registry=registry, + model_name="model", + model_version=self.run_id, + ) + deployment_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "predict") + model_ref.deploy( # type: ignore[attr-defined] + deployment_name=deployment_name, + target_method="predict", + ) + iris_X, iris_y = datasets.load_iris(return_X_y=True, as_frame=True) + model_ref.predict(deployment_name, iris_X) + + finally: + self._db_manager.drop_database(self.registry_name, if_exists=True) + self.session.use_database(self.current_db) + self.session.use_schema(self.current_schema) + if __name__ == "__main__": absltest.main() diff --git a/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py b/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py index 899e5d25..700e502c 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py +++ b/tests/integ/snowflake/ml/registry/model_registry_snowservice_integ_test_base.py @@ -23,19 +23,17 @@ def is_valid_yaml(yaml_string) -> bool: class TestModelRegistryIntegSnowServiceBase(spcs_integ_test_base.SpcsIntegTestBase): - @classmethod - def setUpClass(cls) -> None: - super().setUpClass() + def setUp(self) -> None: + super().setUp() model_registry.create_model_registry( - session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA + session=self._session, database_name=self._test_db, schema_name=self._test_schema ) - cls.registry = model_registry.ModelRegistry( - session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA + self.registry = model_registry.ModelRegistry( + session=self._session, database_name=self._test_db, schema_name=self._test_schema ) - @classmethod - def tearDownClass(cls) -> None: - super().tearDownClass() + def tearDown(self) -> None: + super().tearDown() def _test_snowservice_deployment( self, @@ -98,9 +96,15 @@ def _test_snowservice_deployment( self.assertEqual(model_deployment_list["MODEL_VERSION"][0], model_version) self.assertEqual(model_deployment_list["DEPLOYMENT_NAME"][0], deployment_name) + deployment = self.registry._get_deployment( + model_name=model_name, model_version=model_version, deployment_name=deployment_name + ) +
service_name = f"service_{deployment['MODEL_ID']}" model_ref.delete_deployment(deployment_name=deployment_name) # type: ignore[attr-defined] self.assertEqual(model_ref.list_deployments().to_pandas().shape[0], 0) # type: ignore[attr-defined] + service_lst = self._session.sql(f"SHOW SERVICES LIKE '{service_name}' in account;").collect() + self.assertEqual(len(service_lst), 0, "Service was not deleted successfully") self.assertEqual(self.registry.list_models().to_pandas().shape[0], 1) self.registry.delete_model(model_name=model_name, model_version=model_version, delete_artifact=True) self.assertEqual(self.registry.list_models().to_pandas().shape[0], 0) diff --git a/tests/integ/snowflake/ml/test_utils/BUILD.bazel b/tests/integ/snowflake/ml/test_utils/BUILD.bazel index 2cde419f..cbc6ee04 100644 --- a/tests/integ/snowflake/ml/test_utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/test_utils/BUILD.bazel @@ -72,6 +72,7 @@ py_library( testonly = True, srcs = ["spcs_integ_test_base.py"], deps = [ + "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", ], diff --git a/tests/integ/snowflake/ml/test_utils/common_test_base.py b/tests/integ/snowflake/ml/test_utils/common_test_base.py index fdbe874c..1a37e7a0 100644 --- a/tests/integ/snowflake/ml/test_utils/common_test_base.py +++ b/tests/integ/snowflake/ml/test_utils/common_test_base.py @@ -52,7 +52,7 @@ def tearDown(self) -> None: @classmethod def sproc_test( - kclass: Type[_V], local: bool = True, test_callers_rights=True + kclass: Type[_V], local: bool = True, test_callers_rights: bool = True ) -> Callable[[Callable[Concatenate[_V, _T_args], None]], Callable[Concatenate[_V, _T_args], None]]: def decorator(fn: Callable[Concatenate[_V, _T_args], None]) -> Callable[Concatenate[_V, _T_args], None]: @functools.wraps(fn) @@ -95,7 +95,10 @@ def _in_sproc_test(execute_as: Literal["owner", "caller"] = "owner") -> None: imports = [snowml_zip_module_filename, tests_zip_module_filename] packages = [ - req for req in _snowml_requirements.REQUIREMENTS if "snowflake-connector-python" not in req + req + for req in _snowml_requirements.REQUIREMENTS + # Remove the "_" not in req filter once Snowpark 1.11.0 is available; it works around a Snowpark bug.
+ if "snowflake-connector-python" not in req and "_" not in req ] @F.sproc( # type: ignore[misc] diff --git a/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py b/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py index 7421d355..6d714034 100644 --- a/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py +++ b/tests/integ/snowflake/ml/test_utils/spcs_integ_test_base.py @@ -12,35 +12,36 @@ class SpcsIntegTestBase(absltest.TestCase): _SNOWSERVICE_CONNECTION_NAME = "regtest" _TEST_CPU_COMPUTE_POOL = "REGTEST_INFERENCE_CPU_POOL" _TEST_GPU_COMPUTE_POOL = "REGTEST_INFERENCE_GPU_POOL" - _RUN_ID = uuid.uuid4().hex[:2] - _TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper() - _TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper() - _TEST_STAGE = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "stage").upper() - _TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper() - - @classmethod - def setUpClass(cls) -> None: + + def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" try: - login_options = connection_params.SnowflakeLoginOptions(connection_name=cls._SNOWSERVICE_CONNECTION_NAME) + login_options = connection_params.SnowflakeLoginOptions(connection_name=self._SNOWSERVICE_CONNECTION_NAME) except KeyError: raise SkipTest( "SnowService connection parameters not present: skipping " "TestModelRegistryIntegWithSnowServiceDeployment." ) - cls._session = Session.builder.configs( + + self._run_id = uuid.uuid4().hex[:2] + self._test_db = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self._run_id, "db").upper() + self._test_schema = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self._run_id, "schema" + ).upper() + self._test_stage = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self._run_id, "stage").upper() + + self._session = Session.builder.configs( { **login_options, - **{"database": cls._TEST_DB, "schema": cls._TEST_SCHEMA}, + **{"database": self._test_db, "schema": self._test_schema}, } ).create() - cls._db_manager = db_manager.DBManager(cls._session) - cls._db_manager.create_database(cls._TEST_DB) - cls._db_manager.create_schema(cls._TEST_SCHEMA) - cls._db_manager.create_stage(cls._TEST_STAGE, cls._TEST_SCHEMA, cls._TEST_DB, sse_encrypted=True) - cls._db_manager.cleanup_databases(expire_hours=6) - - @classmethod - def tearDownClass(cls) -> None: - cls._db_manager.drop_database(cls._TEST_DB) - cls._session.close() + self._db_manager = db_manager.DBManager(self._session) + self._db_manager.create_database(self._test_db) + self._db_manager.create_schema(self._test_schema) + self._db_manager.create_stage(self._test_stage, self._test_schema, self._test_db, sse_encrypted=True) + self._db_manager.cleanup_databases(expire_hours=6) + + def tearDown(self) -> None: + self._db_manager.drop_database(self._test_db) + self._session.close() diff --git a/tests/pytest.ini b/tests/pytest.ini index 3d4f2110..a8522207 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -19,3 +19,4 @@ markers = ; against those features, should be labeled using this mark, so that they will be excluded during ; the SnowML Build & Test pipeline. They will still be tested in conda environment. pip_incompatible: mark a test as incompatible with pip environment. + conda_incompatible: mark a test as incompatible with conda environment. 
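The conda_incompatible marker registered in tests/pytest.ini above mirrors the existing pip_incompatible marker. A minimal usage sketch follows; the test class and method are hypothetical, and only the marker name itself comes from this change.

# Hypothetical test module; only the `conda_incompatible` marker name comes from
# the pytest.ini change above.
import pytest
from absl.testing import absltest


class ExampleEnvSpecificTest(absltest.TestCase):
    @pytest.mark.conda_incompatible  # intended to be excluded when the suite runs in a conda environment
    def test_pip_only_behavior(self) -> None:
        self.assertTrue(True)


if __name__ == "__main__":
    absltest.main()

Such tests would presumably be filtered out with something like pytest -m "not conda_incompatible" (or the equivalent exclusion wired into the CI scripts) when running against a conda environment.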
diff --git a/third_party/rules_python/BUILD.bazel b/third_party/rules_python/BUILD.bazel new file mode 100644 index 00000000..8b331b35 --- /dev/null +++ b/third_party/rules_python/BUILD.bazel @@ -0,0 +1,3 @@ +exports_files([ + "packaging.patch", +]) diff --git a/third_party/rules_python/packaging.patch b/third_party/rules_python/packaging.patch new file mode 100644 index 00000000..61acecfa --- /dev/null +++ b/third_party/rules_python/packaging.patch @@ -0,0 +1,12 @@ +diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel +index 51bd56d..fd951d9 100644 +--- a/tools/BUILD.bazel ++++ b/tools/BUILD.bazel +@@ -21,7 +21,6 @@ licenses(["notice"]) + py_binary( + name = "wheelmaker", + srcs = ["wheelmaker.py"], +- deps = ["@pypi__packaging//:lib"], + ) + + filegroup(
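The new third_party/rules_python/packaging.patch removes wheelmaker's dependency on @pypi__packaging//:lib, and the new BUILD.bazel exports the patch file so it can be referenced from other packages. The WORKSPACE edit that consumes the patch is not included in this excerpt, so the Starlark (Python-syntax) snippet below is only an assumption of the usual wiring; the URL and sha256 are placeholders rather than the repository's real values.

# Hypothetical WORKSPACE excerpt showing how an exported patch file is commonly
# applied to the rules_python http_archive. The urls and sha256 below are placeholders.
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

http_archive(
    name = "rules_python",
    patch_args = ["-p1"],  # the patch paths use the a/ and b/ prefixes
    patches = ["//third_party/rules_python:packaging.patch"],
    sha256 = "<release-archive-sha256>",
    urls = ["https://github.com/bazelbuild/rules_python/releases/download/<version>/rules_python-<version>.tar.gz"],
)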