From 9eec61fab5fd108c9b02157468a682737bfc724e Mon Sep 17 00:00:00 2001 From: Snowflake Provisioner <58576687+snowflake-provisioner@users.noreply.github.com> Date: Thu, 27 Jul 2023 20:11:59 -0700 Subject: [PATCH] Project import generated by Copybara. (#31) --- CHANGELOG.md | 12 + README.md | 8 +- bazel/get_affected_targets.sh | 6 +- bazel/mypy/CREDITS.md | 3 + bazel/mypy/mypy.bzl | 263 +-- bazel/mypy/mypy.sh.tpl | 30 +- bazel/mypy/rules.bzl | 38 + bazel/requirements/BUILD.bazel | 134 +- .../parse_and_generate_requirements.py | 44 +- bazel/requirements/requirements.schema.json | 12 +- bazel/requirements/rules.bzl | 129 ++ ci/build_and_run_tests.sh | 6 +- ci/conda_recipe/meta.yaml | 6 +- codegen/codegen_rules.bzl | 1 + codegen/sklearn_wrapper_template.py_template | 36 +- conda-env-snowflake.yml | 3 +- conda-env.yml | 3 +- requirements.yml | 10 +- snowflake/ml/_internal/BUILD.bazel | 2 + snowflake/ml/_internal/env_utils.py | 60 +- snowflake/ml/_internal/env_utils_test.py | 47 +- snowflake/ml/_internal/exceptions/BUILD.bazel | 42 + .../ml/_internal/exceptions/error_codes.py | 69 + .../ml/_internal/exceptions/error_messages.py | 1 + .../ml/_internal/exceptions/exceptions.py | 41 + .../_internal/exceptions/exceptions_test.py | 21 + .../exceptions/fileset_error_messages.py | 8 + .../exceptions}/fileset_errors.py | 0 .../exceptions/modeling_error_messages.py | 6 + .../_internal/exceptions/modeling_errors.py | 4 + snowflake/ml/_internal/file_utils.py | 12 +- snowflake/ml/_internal/telemetry.py | 20 +- snowflake/ml/_internal/telemetry_test.py | 33 +- snowflake/ml/_internal/utils/parallelize.py | 2 +- .../_internal/utils/query_result_checker.py | 43 - .../utils/query_result_checker_test.py | 17 +- snowflake/ml/fileset/BUILD.bazel | 17 +- snowflake/ml/fileset/fileset.py | 122 +- snowflake/ml/fileset/fileset_test.py | 3 +- snowflake/ml/fileset/stage_fs.py | 52 +- snowflake/ml/model/BUILD.bazel | 21 +- .../_deploy_client/image_builds/BUILD.bazel | 6 +- .../image_builds/client_image_builder.py | 84 +- .../image_builds/client_image_builder_test.py | 22 +- .../image_builds/docker_context.py | 25 +- .../image_builds/docker_context_test.py | 56 +- .../image_builds/inference_server/BUILD.bazel | 3 +- .../image_builds/inference_server/main.py | 58 +- .../inference_server/main_test.py | 162 +- .../templates/dockerfile_template | 9 +- .../test_fixtures/dockerfile_test_fixture | 7 +- .../test_fixtures/dockerfile_test_gpu_fixture | 29 + .../_deploy_client/snowservice/BUILD.bazel | 6 +- .../_deploy_client/snowservice/deploy.py | 129 +- .../snowservice/deploy_options.py | 73 +- .../_deploy_client/snowservice/deploy_test.py | 144 +- .../templates/service_spec_template | 4 +- .../model/_deploy_client/utils/constants.py | 7 +- .../utils/snowservice_client.py | 11 +- .../utils/snowservice_client_test.py | 2 +- .../_deploy_client/warehouse/BUILD.bazel | 1 - .../model/_deploy_client/warehouse/deploy.py | 68 +- .../_deploy_client/warehouse/deploy_test.py | 155 +- snowflake/ml/model/_deployer.py | 90 +- snowflake/ml/model/_env.py | 28 +- snowflake/ml/model/_env_test.py | 68 +- snowflake/ml/model/_handlers/BUILD.bazel | 43 + snowflake/ml/model/_handlers/_base.py | 2 +- snowflake/ml/model/_handlers/mlflow.py | 310 +++ snowflake/ml/model/_handlers/pytorch.py | 14 +- snowflake/ml/model/_handlers/sklearn.py | 5 +- snowflake/ml/model/_handlers/snowmlmodel.py | 5 +- snowflake/ml/model/_handlers/tensorflow.py | 196 ++ snowflake/ml/model/_handlers/torchscript.py | 14 +- snowflake/ml/model/_handlers/xgboost.py | 5 +- 
snowflake/ml/model/_model.py | 44 +- snowflake/ml/model/_model_meta.py | 64 +- snowflake/ml/model/_model_test.py | 599 +++++- snowflake/ml/model/_signatures/BUILD.bazel | 168 ++ .../ml/model/_signatures/base_handler.py | 47 + .../ml/model/_signatures/builtins_handler.py | 46 + .../ml/model/_signatures/builtins_test.py | 68 + snowflake/ml/model/_signatures/core.py | 470 +++++ snowflake/ml/model/_signatures/core_test.py | 164 ++ .../ml/model/_signatures/numpy_handler.py | 123 ++ snowflake/ml/model/_signatures/numpy_test.py | 184 ++ .../ml/model/_signatures/pandas_handler.py | 136 ++ snowflake/ml/model/_signatures/pandas_test.py | 257 +++ .../ml/model/_signatures/pytorch_handler.py | 93 + .../ml/model/_signatures/pytorch_test.py | 367 ++++ .../ml/model/_signatures/snowpark_handler.py | 126 ++ .../ml/model/_signatures/snowpark_test.py | 126 ++ .../model/_signatures/tensorflow_handler.py | 125 ++ .../ml/model/_signatures/tensorflow_test.py | 555 ++++++ snowflake/ml/model/_signatures/utils.py | 88 + snowflake/ml/model/_signatures/utils_test.py | 49 + snowflake/ml/model/model_signature.py | 1218 +----------- snowflake/ml/model/model_signature_test.py | 1715 ----------------- snowflake/ml/model/type_hints.py | 83 +- snowflake/ml/modeling/framework/BUILD.bazel | 3 + snowflake/ml/modeling/framework/_utils.py | 53 +- snowflake/ml/modeling/framework/base.py | 155 +- snowflake/ml/modeling/impute/BUILD_NATIVE.bzl | 1 + .../ml/modeling/impute/simple_imputer.py | 54 +- .../ml/modeling/metrics/classification.py | 28 +- snowflake/ml/modeling/metrics/correlation.py | 4 +- snowflake/ml/modeling/metrics/covariance.py | 4 +- .../ml/modeling/metrics/metrics_utils.py | 14 +- snowflake/ml/modeling/metrics/ranking.py | 18 +- snowflake/ml/modeling/metrics/regression.py | 44 +- snowflake/ml/modeling/pipeline/BUILD.bazel | 1 + snowflake/ml/modeling/pipeline/pipeline.py | 49 +- .../modeling/preprocessing/BUILD_NATIVE.bzl | 5 + .../ml/modeling/preprocessing/binarizer.py | 21 +- .../preprocessing/k_bins_discretizer.py | 85 +- .../modeling/preprocessing/label_encoder.py | 31 +- .../modeling/preprocessing/max_abs_scaler.py | 26 +- .../modeling/preprocessing/min_max_scaler.py | 30 +- .../ml/modeling/preprocessing/normalizer.py | 39 +- .../modeling/preprocessing/one_hot_encoder.py | 146 +- .../modeling/preprocessing/ordinal_encoder.py | 23 +- .../modeling/preprocessing/robust_scaler.py | 38 +- .../modeling/preprocessing/standard_scaler.py | 27 +- snowflake/ml/registry/model_registry.py | 108 +- snowflake/ml/registry/model_registry_test.py | 3 +- ...t to Snowpark Container Service Demo.ipynb | 644 +++++++ snowflake/ml/requirements.bzl | 4 +- snowflake/ml/utils/BUILD.bazel | 2 +- snowflake/ml/version.bzl | 2 +- tests/integ/snowflake/ml/fileset/BUILD.bazel | 1 + .../ml/fileset/fileset_integ_test.py | 9 +- .../snowflake/ml/fileset/sfcfs_integ_test.py | 3 +- tests/integ/snowflake/ml/model/BUILD.bazel | 74 +- .../deployment_to_snowservice_integ_test.py | 34 +- .../ml/model/model_badcase_integ_test.py | 4 +- .../warehouse_mlflow_model_integ_test.py | 201 ++ .../warehouse_pytorch_model_integ_test.py | 35 +- .../warehouse_tensorflow_model_integ_test.py | 355 ++++ .../ml/modeling/framework/BUILD.bazel | 1 + .../ml/modeling/framework/test_base.py | 7 +- .../ml/modeling/impute/test_simple_imputer.py | 3 +- .../ml/modeling/metrics/test_roc_curve.py | 39 + .../preprocessing/test_k_bins_discretizer.py | 12 +- .../preprocessing/test_one_hot_encoder.py | 49 +- tests/integ/snowflake/ml/registry/BUILD.bazel | 18 +- 
.../ml/registry/model_registry_integ_test.py | 103 +- ...el_registry_integ_test_with_snowservice.py | 224 +++ .../integ/snowflake/ml/test_utils/BUILD.bazel | 18 +- .../snowflake/ml/test_utils/db_manager.py | 16 + .../snowflake/ml/test_utils/model_factory.py | 160 ++ 150 files changed, 8551 insertions(+), 4620 deletions(-) create mode 100644 bazel/mypy/CREDITS.md create mode 100644 bazel/mypy/rules.bzl create mode 100644 bazel/requirements/rules.bzl create mode 100644 snowflake/ml/_internal/exceptions/BUILD.bazel create mode 100644 snowflake/ml/_internal/exceptions/error_codes.py create mode 100644 snowflake/ml/_internal/exceptions/error_messages.py create mode 100644 snowflake/ml/_internal/exceptions/exceptions.py create mode 100644 snowflake/ml/_internal/exceptions/exceptions_test.py create mode 100644 snowflake/ml/_internal/exceptions/fileset_error_messages.py rename snowflake/ml/{fileset => _internal/exceptions}/fileset_errors.py (100%) create mode 100644 snowflake/ml/_internal/exceptions/modeling_error_messages.py create mode 100644 snowflake/ml/_internal/exceptions/modeling_errors.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture create mode 100644 snowflake/ml/model/_handlers/mlflow.py create mode 100644 snowflake/ml/model/_handlers/tensorflow.py create mode 100644 snowflake/ml/model/_signatures/BUILD.bazel create mode 100644 snowflake/ml/model/_signatures/base_handler.py create mode 100644 snowflake/ml/model/_signatures/builtins_handler.py create mode 100644 snowflake/ml/model/_signatures/builtins_test.py create mode 100644 snowflake/ml/model/_signatures/core.py create mode 100644 snowflake/ml/model/_signatures/core_test.py create mode 100644 snowflake/ml/model/_signatures/numpy_handler.py create mode 100644 snowflake/ml/model/_signatures/numpy_test.py create mode 100644 snowflake/ml/model/_signatures/pandas_handler.py create mode 100644 snowflake/ml/model/_signatures/pandas_test.py create mode 100644 snowflake/ml/model/_signatures/pytorch_handler.py create mode 100644 snowflake/ml/model/_signatures/pytorch_test.py create mode 100644 snowflake/ml/model/_signatures/snowpark_handler.py create mode 100644 snowflake/ml/model/_signatures/snowpark_test.py create mode 100644 snowflake/ml/model/_signatures/tensorflow_handler.py create mode 100644 snowflake/ml/model/_signatures/tensorflow_test.py create mode 100644 snowflake/ml/model/_signatures/utils.py create mode 100644 snowflake/ml/model/_signatures/utils_test.py create mode 100644 snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb create mode 100644 tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py create mode 100644 tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py create mode 100644 tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py create mode 100644 tests/integ/snowflake/ml/test_utils/model_factory.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2266574b..68e0d5fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Release History +## 1.0.4 + +### New Features +- Model Registry: Added support save/load/deploy Tensorflow models (`tensorflow.Module`). +- Model Registry: Added support save/load/deploy MLFlow PyFunc models (`mlflow.pyfunc.PyFuncModel`). +- Model Development: Input dataframes can now be joined against data loaded from staged files. +- Model Development: Added support for non-English languages. 
+ +### Bug Fixes + +- Model Registry: Fix an issue that model dependencies are incorrectly reported as unresolvable on certain platforms. + ## 1.0.3 (2023-07-14) ### Behavior Changes diff --git a/README.md b/README.md index 85c2e78e..4bed5f68 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Snowpark ML is a set of tools including SDKs and underlying infrastructure to build and deploy machine learning models. With Snowpark ML, you can pre-process data, train, manage and deploy ML models all within Snowflake, using a single SDK, and benefit from Snowflake’s proven performance, scalability, stability and governance at every stage of the Machine Learning workflow. ## Key Components of Snowpark ML + The Snowpark ML Python SDK provides a number of APIs to support each stage of an end-to-end Machine Learning development and deployment process, and includes two key components. ### Snowpark ML Development [Public Preview] @@ -16,6 +17,7 @@ A collection of python APIs to enable efficient model development directly in Sn ### Snowpark ML Ops [Private Preview] Snowpark MLOps complements the Snowpark ML Development API, and provides model management capabilities along with integrated deployment into Snowflake. Currently, the API consists of + 1. FileSet API: FileSet provides a Python fsspec-compliant API for materializing data into a Snowflake internal stage from a query or Snowpark Dataframe along with a number of convenience APIs. 1. Model Registry: A python API for managing models within Snowflake which also supports deployment of ML models into Snowflake Warehouses as vectorized UDFs. @@ -25,15 +27,19 @@ During PrPr, we are iterating on API without backward compatibility guarantees. - [Documentation](https://docs.snowflake.com/developer-guide/snowpark-ml) ## Getting started + ### Have your Snowflake account ready + If you don't have a Snowflake account yet, you can [sign up for a 30-day free trial account](https://signup.snowflake.com/). ### Create a Python virtual environment -Python 3.8 is required. You can use [miniconda](https://docs.conda.io/en/latest/miniconda.html), [anaconda](https://www.anaconda.com/), or [virtualenv](https://docs.python.org/3/tutorial/venv.html) to create a Python 3.8 virtual environment. + +Python version 3.8, 3.9 & 3.10 are supported. You can use [miniconda](https://docs.conda.io/en/latest/miniconda.html), [anaconda](https://www.anaconda.com/), or [virtualenv](https://docs.python.org/3/tutorial/venv.html) to create a virtual environment. To have the best experience when using this library, [creating a local conda environment with the Snowflake channel](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#local-development-and-testing) is recommended. 
### Install the library to the Python virtual environment + ``` pip install snowflake-ml-python ``` diff --git a/bazel/get_affected_targets.sh b/bazel/get_affected_targets.sh index 45f95cbb..c3e0f38d 100755 --- a/bazel/get_affected_targets.sh +++ b/bazel/get_affected_targets.sh @@ -28,8 +28,10 @@ help() { echo "Running ${PROG}" bazel="bazel" -current_revision=$(git rev-parse HEAD) -pr_revision=${current_revision} +current_revision=$(git symbolic-ref --short -q HEAD \ + || git describe --tags --exact-match 2> /dev/null \ + || git rev-parse --short HEAD) +pr_revision=$(git rev-parse HEAD) output_path="/tmp/affected_targets/targets" workspace_path=$(pwd) diff --git a/bazel/mypy/CREDITS.md b/bazel/mypy/CREDITS.md new file mode 100644 index 00000000..b549741a --- /dev/null +++ b/bazel/mypy/CREDITS.md @@ -0,0 +1,3 @@ +Special thanks to [bazel-mypy-integration](https://github.com/bazel-contrib/bazel-mypy-integration). + +This package has been forked from that repo and modified to cater specific need of this Snowflake repo. diff --git a/bazel/mypy/mypy.bzl b/bazel/mypy/mypy.bzl index 041bdc50..ea36c1d2 100644 --- a/bazel/mypy/mypy.bzl +++ b/bazel/mypy/mypy.bzl @@ -1,54 +1,52 @@ +"Public API" + load("@bazel_skylib//lib:shell.bzl", "shell") load("@bazel_skylib//lib:sets.bzl", "sets") +load("//bazel/mypy:rules.bzl", "MyPyStubsInfo") MyPyAspectInfo = provider( + "TODO: documentation", fields = { - "out": "mypy output.", - "cache": "cache generated by mypy.", + "exe": "Used to pass the rule implementation built exe back to calling aspect.", + "out": "Used to pass the dummy output file back to calling aspect.", }, ) -# We don't support stubs (pyi) yet. -PY_EXTENSIONS = ["py"] -PY_RULES = ["py_binary", "py_library", "py_test", "py_wheel", "py_package"] +# Switch to True only during debugging and development. +# All releases should have this as False. +DEBUG = False + +VALID_EXTENSIONS = ["py", "pyi"] DEFAULT_ATTRS = { - "_mypy_sh": attr.label( + "_template": attr.label( default = Label("//bazel/mypy:mypy.sh.tpl"), allow_single_file = True, ), - "_mypy": attr.label( + "_mypy_cli": attr.label( default = Label("//bazel/mypy:mypy"), executable = True, - cfg = "host", + cfg = "exec", ), "_mypy_config": attr.label( default = Label("//:mypy.ini"), allow_single_file = True, ), - "_debug": attr.bool( - default = False, - ) } -# See https://github.com/python/mypy/pull/4759 for what `cache_map_triples` mean. 
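# For context on `cache_map_triples` (the mypy PR linked above): with `--bazel`,
# mypy's `--cache-map` flag takes consecutive (source, meta, data) path triples so
# that Bazel-declared files can act as its incremental cache. As a rough sketch,
# for a hypothetical source `snowflake/ml/foo.py` the reworked
# `_sources_to_cache_map_triples(srcs)` below emits the quoted triple:
#   snowflake/ml/foo.py snowflake/ml/foo.py.meta.json snowflake/ml/foo.py.data.json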
-def _sources_to_cache_map_triples(cache_files, dep_cache_files): +def _sources_to_cache_map_triples(srcs): triples_as_flat_list = [] - for d in (cache_files, dep_cache_files): - for src, (meta, data) in d.items(): - triples_as_flat_list.extend([ - shell.quote(src.path), - shell.quote(meta.path), - shell.quote(data.path), - ]) + for f in srcs: + f_path = f.path + triples_as_flat_list.extend([ + shell.quote(f_path), + shell.quote("{}.meta.json".format(f_path)), + shell.quote("{}.data.json".format(f_path)), + ]) return triples_as_flat_list -def _flatten_cache_dict(cache_files): - result = [] - for meta, data in cache_files.values(): - result.append(meta) - result.append(data) - return result +def _is_external_dep(dep): + return dep.label.workspace_root.startswith("external/") def _is_external_src(src_file): return src_file.path.startswith("external/") @@ -57,127 +55,142 @@ def _extract_srcs(srcs): direct_src_files = [] for src in srcs: for f in src.files.to_list(): - if f.extension in PY_EXTENSIONS and not _is_external_src(f): + if f.extension in VALID_EXTENSIONS: direct_src_files.append(f) return direct_src_files -# Overview -# This aspect does the following: -# - Create an action to run mypy against the sources of `target` -# - input of this action: -# - source files of `target` and source files of all its deps. -# - cache files produced by checking its deps. -# - output of this action: -# - mypy stderr+stdout in a file -# - cache files produced by checking the source files of `target` -# - this action depends on actions created for the deps, so that it always -# has access to cache files produced by those actions. -# - Propagate the output of this action along the `deps` edge of the build graph. -# - Produces a OutputGroup which contains the output of all the actions created -# along the build graph so that one can use bazel commandline to mark all those -# actions as required and to make them run. -def _mypy_aspect_impl(target, ctx): - if (ctx.rule.kind not in PY_RULES or - ctx.label.workspace_root.startswith("external")): - return [] +def _extract_transitive_deps(deps): + transitive_deps = [] + for dep in deps: + if MyPyStubsInfo not in dep and PyInfo in dep and not _is_external_dep(dep): + transitive_deps.append(dep[PyInfo].transitive_sources) + return transitive_deps + +def _extract_stub_deps(deps): + # Need to add the .py files AND the .pyi files that are + # deps of the rule + stub_files = [] + for dep in deps: + if MyPyStubsInfo in dep: + for stub_srcs_target in dep[MyPyStubsInfo].srcs: + for src_f in stub_srcs_target.files.to_list(): + if src_f.extension == "pyi": + stub_files.append(src_f) + return stub_files + +def _extract_imports(imports, label): + # NOTE: Bazel's implementation of this for py_binary, py_test is at + # src/main/java/com/google/devtools/build/lib/bazel/rules/python/BazelPythonSemantics.java + mypypath_parts = [] + for import_ in imports: + if import_.startswith("/"): + # buildifier: disable=print + print("ignoring invalid absolute path '{}'".format(import_)) + elif import_ in ["", "."]: + mypypath_parts.append(label.package) + else: + mypypath_parts.append("{}/{}".format(label.package, import_)) + return mypypath_parts + +def _mypy_rule_impl(ctx): base_rule = ctx.rule - debug = ctx.attr._debug - mypy_config_file = ctx.file._mypy_config - # Get the cache files generated by running mypy against the deps. 
- dep_cache_files = {} - for dep in ctx.rule.attr.deps: - if MyPyAspectInfo in dep: - dep_cache_files.update(dep[MyPyAspectInfo].cache) + mypy_config_file = ctx.file._mypy_config + mypypath_parts = [] direct_src_files = [] + transitive_srcs_depsets = [] + stub_files = [] + if hasattr(base_rule.attr, "srcs"): direct_src_files = _extract_srcs(base_rule.attr.srcs) - # It's possible that this target does not have srcs (py_wheel for example). - # However, if the user requests to type check a py_wheel, we should make sure - # its python transitive deps get checked. - if direct_src_files: - # There are source files in this target to check. The check will result in - # cache files. Request bazel to allocate those files now. - cache_files = {} - for src in direct_src_files: - meta_file = ctx.actions.declare_file("{}.meta.json".format(src.basename)) - data_file = ctx.actions.declare_file("{}.data.json".format(src.basename)) - cache_files[src] = (meta_file, data_file) - - - # The mypy stdout, which is expected to be produced by mypy_script. - mypy_out = ctx.actions.declare_file("%s_mypy_out" % ctx.rule.attr.name) - # The script to invoke mypy against this target. - mypy_script = ctx.actions.declare_file( - "%s_mypy_script" % ctx.rule.attr.name, - ) - - # Generated files are located in a different root dir than source files - # Thus we need to let mypy know where to find both kinds in case in one analysis - # both kinds are present. - src_root_paths = sets.to_list( - sets.make( - [f.root.path for f in dep_cache_files.keys()] + - [f.root.path for f in cache_files.keys()]), - ) - - all_src_files = direct_src_files + list(dep_cache_files.keys()) + if hasattr(base_rule.attr, "deps"): + transitive_srcs_depsets = _extract_transitive_deps(base_rule.attr.deps) + stub_files = _extract_stub_deps(base_rule.attr.deps) + + if hasattr(base_rule.attr, "imports"): + mypypath_parts = _extract_imports(base_rule.attr.imports, ctx.label) + + final_srcs_depset = depset(transitive = transitive_srcs_depsets + + [depset(direct = direct_src_files)]) + src_files = [f for f in final_srcs_depset.to_list() if not _is_external_src(f)] + if not src_files: + return None + + mypypath_parts += [src_f.dirname for src_f in stub_files] + mypypath = ":".join(mypypath_parts) + + out = ctx.actions.declare_file("%s_dummy_out" % ctx.rule.attr.name) + exe = ctx.actions.declare_file( + "%s_mypy_exe" % ctx.rule.attr.name, + ) + + # Compose a list of the files needed for use. Note that aspect rules can use + # the project version of mypy however, other rules should fall back on their + # relative runfiles. 
+ runfiles = ctx.runfiles(files = src_files + stub_files + [mypy_config_file]) + + src_root_paths = sets.to_list( + sets.make([f.root.path for f in src_files]), + ) + + ctx.actions.expand_template( + template = ctx.file._template, + output = exe, substitutions = { - "{MYPY_BIN}": ctx.executable._mypy.path, - "{CACHE_MAP_TRIPLES}": " ".join(_sources_to_cache_map_triples(cache_files, dep_cache_files)), + "{MYPY_EXE}": ctx.executable._mypy_cli.path, + "{MYPY_ROOT}": ctx.executable._mypy_cli.root.path, + "{CACHE_MAP_TRIPLES}": " ".join(_sources_to_cache_map_triples(src_files)), "{PACKAGE_ROOTS}": " ".join([ "--package-root " + shell.quote(path or ".") for path in src_root_paths ]), "{SRCS}": " ".join([ shell.quote(f.path) - for f in all_src_files + for f in src_files ]), - "{VERBOSE_OPT}": "--verbose" if debug else "", - "{VERBOSE_BASH}": "set -x" if debug else "", - "{OUTPUT}": mypy_out.path, - "{ADDITIONAL_MYPYPATH}": ":".join([p for p in src_root_paths if p]), - "{MYPY_INI}": mypy_config_file.path, - } - ctx.actions.expand_template( - template = ctx.file._mypy_sh, - output = mypy_script, - substitutions = substitutions, - is_executable = True, - ) - - # We want mypy to follow imports, so all the source files of the dependencies - # are need altoghther to check this target. - ctx.actions.run( - outputs = [mypy_out] + _flatten_cache_dict(cache_files), - inputs = depset( - all_src_files + - [mypy_config_file] + - _flatten_cache_dict(dep_cache_files) # cache generated by analyzing deps - ), - tools = [ctx.executable._mypy], - executable = mypy_script, - mnemonic = "MyPy", - progress_message = "Type-checking %s" % ctx.label, - use_default_shell_env = True, - ) - dep_cache_files.update(cache_files) - transitive_mypy_outs = [] - for dep in ctx.rule.attr.deps: - if OutputGroupInfo in dep: - if hasattr(dep[OutputGroupInfo], "mypy"): - transitive_mypy_outs.append(dep[OutputGroupInfo].mypy) + "{VERBOSE_OPT}": "--verbose" if DEBUG else "", + "{VERBOSE_BASH}": "set -x" if DEBUG else "", + "{OUTPUT}": out.path if out else "", + "{MYPYPATH_PATH}": mypypath if mypypath else "", + "{MYPY_INI_PATH}": mypy_config_file.path, + }, + is_executable = True, + ) + + return [ + DefaultInfo(executable = exe, runfiles = runfiles), + MyPyAspectInfo(exe = exe, out = out), + ] +def _mypy_aspect_impl(_, ctx): + if (ctx.rule.kind not in ["py_binary", "py_library", "py_test", "mypy_test"] or + ctx.label.workspace_root.startswith("external")): + return [] + + providers = _mypy_rule_impl( + ctx + ) + if not providers: + return [] + + info = providers[0] + aspect_info = providers[1] + + ctx.actions.run( + outputs = [aspect_info.out], + inputs = info.default_runfiles.files, + tools = [ctx.executable._mypy_cli], + executable = aspect_info.exe, + mnemonic = "MyPy", + progress_message = "Type-checking %s" % ctx.label, + use_default_shell_env = True, + ) return [ OutputGroupInfo( - # We may not need to run mypy against this target, but we request - # all its dependencies to be checked, recursively, but demanding the output - # of those checks. 
- mypy = depset([mypy_out] if direct_src_files else [], transitive=transitive_mypy_outs), + mypy = depset([aspect_info.out]), ), - MyPyAspectInfo(out = mypy_out if direct_src_files else None, cache = dep_cache_files), ] mypy_aspect = aspect( diff --git a/bazel/mypy/mypy.sh.tpl b/bazel/mypy/mypy.sh.tpl index 57cdb09c..855853f9 100644 --- a/bazel/mypy/mypy.sh.tpl +++ b/bazel/mypy/mypy.sh.tpl @@ -1,29 +1,47 @@ #!/usr/bin/env bash {VERBOSE_BASH} -set -u +set -o errexit +set -o nounset set -o pipefail main() { + local output local report_file local status + local root local mypy report_file="{OUTPUT}" - mypy="{MYPY_BIN}" + root="{MYPY_ROOT}/" + mypy="{MYPY_EXE}" - export MYPYPATH="$(pwd):{ADDITIONAL_MYPYPATH}" + # TODO(Jonathon): Consider UX improvements using https://mypy.readthedocs.io/en/stable/command_line.html#configuring-error-messages - # --enable-incomplete-features is specified to support unpacking features for precise TypedDict typing. Can be changed to --enable-incomplete-features=Unpack with mypy version >= 1.0 - $mypy {VERBOSE_OPT} --bazel {PACKAGE_ROOTS} --config-file {MYPY_INI} --cache-map {CACHE_MAP_TRIPLES} --enable-incomplete-features -- {SRCS} > "${report_file}" 2>&1 + export MYPYPATH="$(pwd):{MYPYPATH_PATH}" + + # Workspace rules run in a different location from aspect rules. Here we + # normalize if the external source isn't found. + if [ ! -f $mypy ]; then + mypy=${mypy#${root}} + fi + + set +o errexit + output=$($mypy {VERBOSE_OPT} --bazel {PACKAGE_ROOTS} --config-file {MYPY_INI_PATH} --enable-incomplete-features --cache-map {CACHE_MAP_TRIPLES} -- {SRCS} 2>&1) status=$? + set -o errexit + + if [ ! -z "$report_file" ]; then + echo "${output}" > "${report_file}" + fi if [[ $status -ne 0 ]]; then printf "\033[0;31m======== BEGIN MYPY ERROR ========\033[0m\n" - cat "${report_file}" # Show MyPy's error to end-user via Bazel's console logging + echo "${output}" # Show MyPy's error to end-user via Bazel's console logging printf "\033[0;31m======== END MYPY ERROR ========\033[0m\n" exit 1 fi + } main "$@" diff --git a/bazel/mypy/rules.bzl b/bazel/mypy/rules.bzl new file mode 100644 index 00000000..58f7c531 --- /dev/null +++ b/bazel/mypy/rules.bzl @@ -0,0 +1,38 @@ +"mypy_stubs rule" +MyPyStubsInfo = provider( + "TODO: docs", + fields = { + "srcs": ".pyi stub files", + }, +) + +def _mypy_stubs_impl(ctx): + pyi_srcs = [] + for target in ctx.attr.srcs: + pyi_srcs.extend(target.files.to_list()) + transitive_srcs = depset(direct = pyi_srcs) + + return [ + MyPyStubsInfo( + srcs = ctx.attr.srcs, + ), + PyInfo( + # TODO(Jonathon): Stub files only for Py3 right? 
+ has_py2_only_sources = False, + has_py3_only_sources = True, + uses_shared_libraries = False, + transitive_sources = transitive_srcs, + ), + ] + +mypy_stubs = rule( + implementation = _mypy_stubs_impl, + attrs = { + "srcs": attr.label_list( + allow_empty = False, + mandatory = True, + doc = "TODO(Jonathon)", + allow_files = [".pyi"], + ), + }, +) diff --git a/bazel/requirements/BUILD.bazel b/bazel/requirements/BUILD.bazel index aca69d96..ef0758ef 100644 --- a/bazel/requirements/BUILD.bazel +++ b/bazel/requirements/BUILD.bazel @@ -1,8 +1,6 @@ load("//bazel:py_rules.bzl", "py_binary") -load("@bazel_skylib//rules:diff_test.bzl", "diff_test") -load("@bazel_skylib//rules:write_file.bzl", "write_file") -load("@aspect_bazel_lib//lib:yq.bzl", "yq") load("//snowflake/ml:version.bzl", "VERSION") +load("//bazel/requirements:rules.bzl", "generate_requirement_file", "generate_requirement_file_yaml", "sync_target") package(default_visibility = ["//visibility:public"]) @@ -15,19 +13,14 @@ py_binary( _SRC_REQUIREMENT_FILE = "//:requirements.yml" -_SCHEMA_FILE = ":requirements.schema.json" +_CURRENT_PATH = "bazel/requirements" -_GENERATE_TOOL = ":parse_and_generate_requirements" +_SYNC_RULE_NAME = "sync_requirements" -_GENERATE_COMMAND = "$(location " + _GENERATE_TOOL + ") $(location " + _SRC_REQUIREMENT_FILE + ") --schema $(location " + _SCHEMA_FILE + ") {options} > $@" - -_AUTOGEN_HEADERS = """# DO NOT EDIT! -# Generate by running 'bazel run //bazel/requirements:sync_requirements' -""" - -# "---" is a document start marker, which is legit but optional (https://yaml.org/spec/1.1/#c-document-start). This -# is needed for conda meta.yaml to bypass some bug from conda side. -_YAML_START_DOCUMENT_MARKER = "---" +_SYNC_BAZEL_CMD = "bazel run //{current_path}:{sync_rule}".format( + current_path = _CURRENT_PATH, + sync_rule = _SYNC_RULE_NAME, +) _GENERATED_REQUIREMENTS_FILES = { "requirements_txt": { @@ -58,108 +51,33 @@ _GENERATED_REQUIREMENTS_FILES = { } [ - genrule( - name = "gen_{name}_body".format(name = name), - srcs = [ - _SRC_REQUIREMENT_FILE, - _SCHEMA_FILE, - ], - outs = ["{generated}.body".format(generated = value["generated"])], - cmd = _GENERATE_COMMAND.format(options = value["cmd"]), - tools = [_GENERATE_TOOL], - ) - for name, value in _GENERATED_REQUIREMENTS_FILES.items() - if name != "conda_meta" -] - -[ - genrule( - name = "gen_{name}".format(name = name), - srcs = [ - "{generated}.body".format(generated = value["generated"]), - ], - outs = [value["generated"]], - cmd = "(echo -e \"" + _AUTOGEN_HEADERS + "\" ; cat $(location :{generated}.body) ) > $@".format( - generated = value["generated"], - ), - tools = [_GENERATE_TOOL], + generate_requirement_file( + name = name, + cmd = value["cmd"], + generated_file = value["generated"], + generation_cmd = _SYNC_BAZEL_CMD, + src_requirement_file = _SRC_REQUIREMENT_FILE, + target = value["target"], ) for name, value in _GENERATED_REQUIREMENTS_FILES.items() if name != "conda_meta" ] # Generate ci/conda-recipe/meta.yaml -genrule( - name = "gen_conda_meta_body", - srcs = [ - _SRC_REQUIREMENT_FILE, - _SCHEMA_FILE, - ], - outs = ["meta.body.yaml"], - cmd = _GENERATE_COMMAND.format(options = "--mode version_requirements --format conda_meta --version " + VERSION), - tools = [_GENERATE_TOOL], -) - -yq( - name = "gen_conda_meta_body_format", - srcs = [ - ":meta.body.yaml", - "//bazel/requirements/templates:meta.tpl.yaml", - ], - outs = ["meta.body.formatted.yaml"], - expression = ". as $item ireduce ({}; . 
* $item ) | sort_keys(..)", -) - -genrule( - name = "gen_conda_meta", - srcs = [ - ":meta.body.formatted.yaml", - ], - outs = ["meta.yaml"], - cmd = "(echo -e \"" + _AUTOGEN_HEADERS + "\" ; echo \"" + _YAML_START_DOCUMENT_MARKER + "\"; cat $(location :meta.body.formatted.yaml) ) > $@", +generate_requirement_file_yaml( + name = "conda_meta", + cmd = _GENERATED_REQUIREMENTS_FILES["conda_meta"]["cmd"], + generated_file = _GENERATED_REQUIREMENTS_FILES["conda_meta"]["generated"], + generation_cmd = _SYNC_BAZEL_CMD, + src_requirement_file = _SRC_REQUIREMENT_FILE, + target = _GENERATED_REQUIREMENTS_FILES["conda_meta"]["target"], + template_file = "//bazel/requirements/templates:meta.tpl.yaml", ) -# Create a test target for each file that Bazel should -# write to the source tree. -[ - diff_test( - name = "check_{name}".format(name = name), - # Make it trivial for devs to understand that if - # this test fails, they just need to run the updater - # Note, you need bazel-skylib version 1.1.1 or greater - # to get the failure_message attribute - failure_message = "Please run: bazel run //bazel/requirements:sync_requirements", - file1 = ":{generated}".format(generated = value["generated"]), - file2 = value["target"], - ) - for name, value in _GENERATED_REQUIREMENTS_FILES.items() -] - # Generate the updater script so there's only one target for devs to run, # even if many generated files are in the source folder. -write_file( - name = "gen_sync_requirements", - out = "sync_requirements.sh", - content = [ - # This depends on bash, would need tweaks for Windows - "#!/usr/bin/env sh", - # Bazel gives us a way to access the source folder! - "cd $BUILD_WORKSPACE_DIRECTORY", - ] + [ - # Paths are now relative to the workspace. - # We can copy files from bazel-bin to the sources - "cp -fv bazel-bin/bazel/requirements/{generated} {target}".format( - generated = value["generated"], - # Convert label to path - target = value["target"].lstrip("//").lstrip(":").replace(":", "/"), - ) - for name, value in _GENERATED_REQUIREMENTS_FILES.items() - ], -) - -# This is what you can `bazel run` and it can write to the source folder -sh_binary( - name = "sync_requirements", - srcs = ["sync_requirements.sh"], - data = [":{generated}".format(generated = value["generated"]) for value in _GENERATED_REQUIREMENTS_FILES.values()], +sync_target( + name = _SYNC_RULE_NAME, + root_path = _CURRENT_PATH, + targets = _GENERATED_REQUIREMENTS_FILES.values(), ) diff --git a/bazel/requirements/parse_and_generate_requirements.py b/bazel/requirements/parse_and_generate_requirements.py index d97f8b67..15aff83d 100644 --- a/bazel/requirements/parse_and_generate_requirements.py +++ b/bazel/requirements/parse_and_generate_requirements.py @@ -7,12 +7,14 @@ import sys from typing import ( Generator, + List, Literal, MutableMapping, Optional, Sequence, Set, TypedDict, + Union, cast, ) @@ -80,7 +82,7 @@ def filter_by_extras(req_info: RequirementInfo, extras: bool, no_extras: bool) - ) -def get_req_name(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> Optional[str]: +def get_req_name(req_info: RequirementInfo, env: Literal["conda", "pip", "conda-only", "pip-only"]) -> Optional[str]: """Get the name of the requirement in the given env. For each env, env specific name will be chosen, if not presented, common name will be chosen. 
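# A minimal, simplified sketch of the semantics introduced in the hunks below (the
# real code works on RequirementInfo and raises when a pinned version is missing):
# "conda-only" / "pip-only" select requirements that exist in exactly one ecosystem,
# and an empty dev version now means "leave the package unpinned".
from typing import Dict, Optional


def req_name(req: Dict[str, str], env: str) -> Optional[str]:
    if env == "conda-only":
        # A package that also has a PyPI or common name is not conda-only.
        return None if ("name_pypi" in req or "name" in req) else req.get("name_conda")
    if env == "pip-only":
        return None if ("name_conda" in req or "name" in req) else req.get("name_pypi")
    if env == "conda":
        return req.get("name_conda", req.get("name"))
    return req.get("name_pypi", req.get("name"))


def dev_pin(req: Dict[str, str], env: str) -> Optional[str]:
    name = req_name(req, env)
    if name is None:
        return None
    key = "dev_version_conda" if env.startswith("conda") else "dev_version_pypi"
    version = req.get(key, req.get("dev_version"))
    return name if version == "" else f"{name}=={version}"


# dev_pin({"name_conda": "conda-libmamba-solver", "dev_version_conda": "23.3.0"}, "conda-only")
#   -> "conda-libmamba-solver==23.3.0"
# dev_pin({"name": "cloudpickle", "dev_version": "2.0.0"}, "conda-only")
#   -> None (it has a common name, so it is not conda-only)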
@@ -96,13 +98,23 @@ def get_req_name(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> Opt """ if env == "conda": return req_info.get("name_conda", req_info.get("name", None)) + elif env == "conda-only": + if "name_pypi" in req_info or "name" in req_info: + return None + return req_info.get("name_conda", None) elif env == "pip": return req_info.get("name_pypi", req_info.get("name", None)) + elif env == "pip-only": + if "name_conda" in req_info or "name" in req_info: + return None + return req_info.get("name_pypi", None) else: raise ValueError("Unreachable") -def generate_dev_pinned_string(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> Optional[str]: +def generate_dev_pinned_string( + req_info: RequirementInfo, env: Literal["conda", "pip", "conda-only", "pip-only"] +) -> Optional[str]: """Get the pinned version for dev environment of the requirement in the given env. For each env, env specific pinned version will be chosen, if not presented, common pinned version will be chosen. @@ -121,19 +133,27 @@ def generate_dev_pinned_string(req_info: RequirementInfo, env: Literal["conda", name = get_req_name(req_info, env) if name is None: return None - if env == "conda": + if env.startswith("conda"): version = req_info.get("dev_version_conda", req_info.get("dev_version", None)) if version is None: raise ValueError("No pinned version exists.") from_channel = req_info.get("from_channel", None) + if version == "": + version_str = "" + else: + version_str = f"=={version}" if from_channel: - return f"{from_channel}::{name}=={version}" - return f"{name}=={version}" - elif env == "pip": + return f"{from_channel}::{name}{version_str}" + return f"{name}{version_str}" + elif env.startswith("pip"): version = req_info.get("dev_version_pypi", req_info.get("dev_version", None)) if version is None: raise ValueError("No pinned version exists.") - return f"{name}=={version}" + if version == "": + version_str = "" + else: + version_str = f"=={version}" + return f"{name}{version_str}" else: raise ValueError("Unreachable") @@ -302,11 +322,17 @@ def generate_requirements( ) ) ) - extended_env = list( + extended_env_conda = list( sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "conda"), requirements))) ) resolve_conda_environment(snowflake_only_env, channels=channels_to_use) - resolve_conda_environment(extended_env, channels=channels_to_use) + resolve_conda_environment(extended_env_conda, channels=channels_to_use) + extended_env: List[Union[str, MutableMapping[str, Sequence[str]]]] = extended_env_conda # type: ignore[assignment] + pip_only_reqs = list( + sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "pip-only"), requirements))) + ) + if pip_only_reqs: + extended_env.extend(["pip", {"pip": pip_only_reqs}]) if (mode, format) == ("dev_version", "text"): results = list( diff --git a/bazel/requirements/requirements.schema.json b/bazel/requirements/requirements.schema.json index 02fb8682..d055fe51 100644 --- a/bazel/requirements/requirements.schema.json +++ b/bazel/requirements/requirements.schema.json @@ -20,17 +20,17 @@ "dev_version": { "type": "string", "description": "The version to use in the development environment.", - "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" + "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" }, "dev_version_pypi": { 
"type": "string", "description": "The version to use in the development environment in PyPI, set if differs.", - "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" + "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" }, "dev_version_conda": { "type": "string", "description": "The version to use in the development environment in conda, set if differs.", - "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" + "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" }, "from_channel": { "type": "string", @@ -40,17 +40,17 @@ "version_requirements": { "type": "string", "description": "The version requirements of this package as a dependency when released.", - "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" + "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" }, "version_requirements_pypi": { "type": "string", "description": "The version requirements of this package as a dependency when released in PyPI, set if differs.", - "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" + "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" }, "version_requirements_conda": { "type": "string", "description": "The version requirements of this package as a dependency when released.", - "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" + "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc|alpha|beta)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" }, "requirements_extra_tags": { "type": "array", diff --git a/bazel/requirements/rules.bzl b/bazel/requirements/rules.bzl new file mode 100644 index 00000000..1082a08a --- /dev/null +++ b/bazel/requirements/rules.bzl @@ -0,0 +1,129 @@ 
+load("@bazel_skylib//rules:diff_test.bzl", "diff_test") +load("@bazel_skylib//rules:write_file.bzl", "write_file") +load("@aspect_bazel_lib//lib:yq.bzl", "yq") + +_AUTOGEN_HEADERS = """# DO NOT EDIT! +# Generate by running '{generation_cmd}' +""" + +_SCHEMA_FILE = "//bazel/requirements:requirements.schema.json" + +_GENERATE_TOOL = "//bazel/requirements:parse_and_generate_requirements" + +_GENERATE_COMMAND = "$(location " + _GENERATE_TOOL + ") $(location {src_requirement_file} ) --schema $(location " + _SCHEMA_FILE + ") {options} > $@" + +# "---" is a document start marker, which is legit but optional (https://yaml.org/spec/1.1/#c-document-start). This +# is needed for conda meta.yaml to bypass some bug from conda side. +_YAML_START_DOCUMENT_MARKER = "---" + +def generate_requirement_file( + name, + generated_file, + target, + cmd, + src_requirement_file, + generation_cmd): + native.genrule( + name = "gen_{name}_body".format(name = name), + srcs = [ + src_requirement_file, + _SCHEMA_FILE, + ], + outs = ["{generated}.body".format(generated = generated_file)], + cmd = _GENERATE_COMMAND.format(src_requirement_file = src_requirement_file, options = cmd), + tools = [_GENERATE_TOOL], + ) + native.genrule( + name = "gen_{name}".format(name = name), + srcs = [ + "{generated}.body".format(generated = generated_file), + ], + outs = [generated_file], + cmd = "(echo -e \"" + _AUTOGEN_HEADERS.format(generation_cmd = generation_cmd) + "\" ; cat $(location :{generated}.body) ) > $@".format( + generated = generated_file, + ), + tools = [_GENERATE_TOOL], + ) + diff_test( + name = "check_{name}".format(name = name), + failure_message = "Please run: bazel run {generation_cmd}".format(generation_cmd = generation_cmd), + file1 = ":{generated}".format(generated = generated_file), + file2 = target, + ) + +def generate_requirement_file_yaml( + name, + template_file, + generated_file, + target, + cmd, + src_requirement_file, + generation_cmd): + native.genrule( + name = "gen_{name}_body".format(name = name), + srcs = [ + src_requirement_file, + _SCHEMA_FILE, + ], + outs = ["{generated_file}.body.yaml".format(generated_file = generated_file)], + cmd = _GENERATE_COMMAND.format(src_requirement_file = src_requirement_file, options = cmd), + tools = [_GENERATE_TOOL], + ) + + yq( + name = "gen_{name}_body_format".format(name = name), + srcs = [ + "{generated_file}.body.yaml".format(generated_file = generated_file), + template_file, + ], + outs = ["{generated_file}.body.formatted.yaml".format(generated_file = generated_file)], + expression = ". as $item ireduce ({}; . 
* $item ) | sort_keys(..)", + ) + + native.genrule( + name = "gen_{name}".format(name = name), + srcs = [ + ":{generated_file}.body.formatted.yaml".format(generated_file = generated_file), + ], + outs = [generated_file], + cmd = "(echo -e \"" + _AUTOGEN_HEADERS.format(generation_cmd = generation_cmd) + "\" ; echo \"" + _YAML_START_DOCUMENT_MARKER + "\"; cat $(location :{generated_file}.body.formatted.yaml) ) > $@".format(generated_file = generated_file), + ) + + diff_test( + name = "check_{name}".format(name = name), + failure_message = "Please run: bazel run {generation_cmd}".format(generation_cmd = generation_cmd), + file1 = ":{generated}".format(generated = generated_file), + file2 = target, + ) + +def sync_target( + name, + root_path, + targets): + write_file( + name = "gen_{name}".format(name = name), + out = "{name}.sh".format(name = name), + content = [ + # This depends on bash, would need tweaks for Windows + "#!/usr/bin/env sh", + # Bazel gives us a way to access the source folder! + "cd $BUILD_WORKSPACE_DIRECTORY", + ] + [ + # Paths are now relative to the workspace. + # We can copy files from bazel-bin to the sources + "cp -fv bazel-bin/{root_path}/{generated} {target}".format( + root_path = root_path, + generated = value["generated"], + # Convert label to path + target = value["target"].lstrip("//").lstrip(":").replace(":", "/"), + ) + for value in targets + ], + ) + + # This is what you can `bazel run` and it can write to the source folder + native.sh_binary( + name = name, + srcs = ["{name}.sh".format(name = name)], + data = [":{generated}".format(generated = value["generated"]) for value in targets], + ) diff --git a/ci/build_and_run_tests.sh b/ci/build_and_run_tests.sh index 139b7699..39db4936 100755 --- a/ci/build_and_run_tests.sh +++ b/ci/build_and_run_tests.sh @@ -141,7 +141,7 @@ if [ "${ENV}" = "pip" ]; then python3.8 -m pip install -U pip setuptools wheel echo "Building snowpark wheel from main:$(git rev-parse HEAD)." pip wheel . --no-deps - cp snowflake_snowpark_python-*.whl "${WORKSPACE}" + cp "$(find . -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" "${WORKSPACE}" deactivate popd fi @@ -149,7 +149,7 @@ if [ "${ENV}" = "pip" ]; then # Build SnowML pushd ${SNOWML_DIR} "${BAZEL}" build //snowflake/ml:wheel - cp bazel-bin/snowflake/ml/snowflake_ml_python-*.whl "${WORKSPACE}" + cp "$(${BAZEL} info bazel-bin)/snowflake/ml/snowflake_ml_python-${VERSION}-py3-none-any.whl" "${WORKSPACE}" popd else which conda @@ -194,7 +194,7 @@ if [ "${ENV}" = "pip" ]; then python3.8 -m pip list python3.8 -m pip install "snowflake_ml_python-${VERSION}-py3-none-any.whl[all]" pytest-xdist inflection --no-cache-dir --force-reinstall if [ "${WITH_SNOWPARK}" = true ]; then - cp "${WORKSPACE}/snowflake_snowpark_python-*.whl" "${TEMP_TEST_DIR}" + cp "$(find "${WORKSPACE}" -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" "${TEMP_TEST_DIR}" python3.8 -m pip install "$(find . 
-maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" --force-reinstall fi python3.8 -m pip list diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 4654d18b..0f45d49c 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -17,7 +17,7 @@ build: noarch: python package: name: snowflake-ml-python - version: 1.0.3 + version: 1.0.4 requirements: build: - python @@ -26,7 +26,6 @@ requirements: - absl-py>=0.15,<2 - anyio>=3.5.0,<4 - cloudpickle - - conda-libmamba-solver>=23.1.0,<24 - fsspec>=2022.11,<=2023.1 - numpy>=1.23,<2 - packaging>=20.9,<24 @@ -36,12 +35,13 @@ requirements: - scikit-learn>=1.2.1,<1.3 - scipy>=1.9,<2 - snowflake-connector-python>=3.0.3,<4 - - snowflake-snowpark-python>=1.4.0,<2 + - snowflake-snowpark-python>=1.5.1,<2 - sqlparse>=0.4,<1 - typing-extensions>=4.1.0,<5 - xgboost>=1.7.3,<2 run_constrained: - lightgbm==3.3.5 + - mlflow>=2.1.0,<3 - tensorflow>=2.9,<3 - torchdata>=0.4,<1 source: diff --git a/codegen/codegen_rules.bzl b/codegen/codegen_rules.bzl index 1dcd7895..437a01e3 100644 --- a/codegen/codegen_rules.bzl +++ b/codegen/codegen_rules.bzl @@ -91,6 +91,7 @@ def autogen_estimators(module, estimator_info_list): "//snowflake/ml/_internal/utils:pkg_version_utils", "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/model:model_signature", + "//snowflake/ml/model/_signatures:utils", ], tags = ["skip_mypy_check"], ) diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template index 78aa1b85..77512e34 100644 --- a/codegen/sklearn_wrapper_template.py_template +++ b/codegen/sklearn_wrapper_template.py_template @@ -29,8 +29,8 @@ from snowflake.ml.model.model_signature import ( FeatureSpec, ModelSignature, _infer_signature, - _rename_features, ) +from snowflake.ml.model._signatures import utils as model_signature_utils _PROJECT = "ModelDevelopment" # Derive subproject from module name by removing "sklearn" @@ -220,7 +220,7 @@ class {transform.original_class_name}(BaseTransformer): dataset = dataset.select(selected_cols) # Extract query that generated the datafrome. We will need to pass it to the fit procedure. - query = str(dataset.queries["queries"][0]) + queries = dataset.queries["queries"] # Create a temp file and dump the transform to that file. local_transform_file_name = get_temp_file_path() @@ -235,10 +235,6 @@ class {transform.original_class_name}(BaseTransformer): query=stage_creation_query ).has_dimensions( expected_rows=1, expected_cols=1 - ).has_value_match( - row_idx=0, - col_idx=0, - expected_value=f"Stage area {{transform_stage_name}} successfully created." ).validate() # Use posixpath to construct stage paths @@ -276,7 +272,7 @@ class {transform.original_class_name}(BaseTransformer): ) def fit_wrapper_sproc( session: Session, - sql_query: str, + sql_queries: List[str], stage_transform_file_name: str, stage_result_file_name: str, input_cols: List[str], @@ -292,9 +288,11 @@ class {transform.original_class_name}(BaseTransformer): import inspect {transform.fit_sproc_imports} - # Execute snowpark query and obtain the results as pandas dataframe + # Execute snowpark queries and obtain the results as pandas dataframe # NB: this implies that the result data must fit into memory. 
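# Why the query list (see the change just below): a Snowpark DataFrame can be backed
# by more than one statement (for example, setup statements created when the input
# was joined against data loaded from staged files), so every statement except the
# last is executed via collect() first, and only the final SELECT is fetched into
# pandas with to_pandas().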
- df = session.sql(sql_query).to_pandas(statement_params=statement_params) + for query in sql_queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(sql_queries[-1]).to_pandas(statement_params=statement_params) local_transform_file = tempfile.NamedTemporaryFile(delete=True) local_transform_file_name = local_transform_file.name @@ -351,7 +349,7 @@ class {transform.original_class_name}(BaseTransformer): ) sproc_export_file_name = fit_wrapper_sproc( session, - query, + queries, stage_transform_file_name, stage_result_file_name, identifier.get_unescaped_names(self.input_cols), @@ -931,8 +929,8 @@ class {transform.original_class_name}(BaseTransformer): if len(selected_cols) > 0: dataset = dataset.select(selected_cols) - # Extract query that generated the dataframe. We will need to pass it to score procedure. - query = str(dataset.queries["queries"][0]) + # Extract queries that generated the dataframe. We will need to pass it to score procedure. + queries = dataset.queries["queries"] # Create a temp file and dump the score to that file. local_score_file_name = get_temp_file_path() @@ -948,10 +946,6 @@ class {transform.original_class_name}(BaseTransformer): query=stage_creation_query ).has_dimensions( expected_rows=1, expected_cols=1 - ).has_value_match( - row_idx=0, - col_idx=0, - expected_value=f"Stage area {{score_stage_name}} successfully created." ).validate() # Use posixpath to construct stage paths @@ -986,7 +980,7 @@ class {transform.original_class_name}(BaseTransformer): ) def score_wrapper_sproc( session: Session, - sql_query: str, + sql_queries: List[str], stage_score_file_name: str, input_cols: List[str], label_cols: List[str], @@ -1001,7 +995,9 @@ class {transform.original_class_name}(BaseTransformer): import inspect {transform.fit_sproc_imports} - df = session.sql(sql_query).to_pandas(statement_params=statement_params) + for query in sql_queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(sql_queries[-1]).to_pandas(statement_params=statement_params) local_score_file = tempfile.NamedTemporaryFile(delete=True) local_score_file_name = local_score_file.name @@ -1043,7 +1039,7 @@ class {transform.original_class_name}(BaseTransformer): ) score = score_wrapper_sproc( session, - query, + queries, stage_score_file_name, identifier.get_unescaped_names(self.input_cols), identifier.get_unescaped_names(self.label_cols), @@ -1065,7 +1061,7 @@ class {transform.original_class_name}(BaseTransformer): # For classifier, the type of predict is the same as the type of label if self._sklearn_object._estimator_type == 'classifier': outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output - outputs = _rename_features(outputs, self.output_cols) # rename the output columns + outputs = model_signature_utils.rename_features(outputs, self.output_cols) # rename the output columns self._model_signature_dict["predict"] = ModelSignature(inputs, ([] if self._drop_input_cols else inputs) + outputs) # For regressor, the type of predict is float64 diff --git a/conda-env-snowflake.yml b/conda-env-snowflake.yml index d8639f76..250e318a 100644 --- a/conda-env-snowflake.yml +++ b/conda-env-snowflake.yml @@ -19,6 +19,7 @@ dependencies: - joblib==1.1.1 - jsonschema==3.2.0 - lightgbm==3.3.5 +- mlflow==2.3.1 - moto==4.0.11 - mypy==0.981 - networkx==2.8.4 @@ -34,7 +35,7 @@ dependencies: - scikit-learn==1.2.2 - scipy==1.9.3 - snowflake-connector-python==3.0.3 -- 
snowflake-snowpark-python==1.4.0 +- snowflake-snowpark-python==1.5.1 - sqlparse==0.4.3 - tensorflow==2.11.0 - torchdata==0.5.1 diff --git a/conda-env.yml b/conda-env.yml index 367f40c3..ed189434 100644 --- a/conda-env.yml +++ b/conda-env.yml @@ -21,6 +21,7 @@ dependencies: - joblib==1.1.1 - jsonschema==3.2.0 - lightgbm==3.3.5 +- mlflow==2.3.1 - moto==4.0.11 - mypy==0.981 - networkx==2.8.4 @@ -36,7 +37,7 @@ dependencies: - scikit-learn==1.2.2 - scipy==1.9.3 - snowflake-connector-python==3.0.3 -- snowflake-snowpark-python==1.4.0 +- snowflake-snowpark-python==1.5.1 - sqlparse==0.4.3 - tensorflow==2.11.0 - torchdata==0.5.1 diff --git a/requirements.yml b/requirements.yml index 2a50e07a..b7f47b29 100644 --- a/requirements.yml +++ b/requirements.yml @@ -50,7 +50,6 @@ dev_version: "1.24.28" - name_conda: conda-libmamba-solver dev_version_conda: "23.3.0" - version_requirements_conda: ">=23.1.0,<24" - name: cloudpickle dev_version: "2.0.0" version_requirements: "" @@ -79,6 +78,11 @@ version_requirements: "==3.3.5" requirements_extra_tags: - lightgbm +- name: mlflow + dev_version: "2.3.1" + version_requirements: ">=2.1.0,<3" + requirements_extra_tags: + - mlflow - name: moto dev_version: "4.0.11" - name: mypy @@ -128,8 +132,8 @@ dev_version: "3.0.3" version_requirements: ">=3.0.3,<4" - name: snowflake-snowpark-python - dev_version: "1.4.0" - version_requirements: ">=1.4.0,<2" + dev_version: "1.5.1" + version_requirements: ">=1.5.1,<2" tags: - deployment_core - name: starlette diff --git a/snowflake/ml/_internal/BUILD.bazel b/snowflake/ml/_internal/BUILD.bazel index 2e84a63f..3403618e 100644 --- a/snowflake/ml/_internal/BUILD.bazel +++ b/snowflake/ml/_internal/BUILD.bazel @@ -58,6 +58,7 @@ py_library( srcs = ["telemetry.py"], deps = [ "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal/exceptions:exceptions", ], ) @@ -67,5 +68,6 @@ py_test( deps = [ ":telemetry", "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal/exceptions:exceptions", ], ) diff --git a/snowflake/ml/_internal/env_utils.py b/snowflake/ml/_internal/env_utils.py index c0712b8b..1e26d3d2 100644 --- a/snowflake/ml/_internal/env_utils.py +++ b/snowflake/ml/_internal/env_utils.py @@ -1,5 +1,6 @@ import collections import copy +import re import textwrap import warnings from importlib import metadata as importlib_metadata @@ -16,6 +17,8 @@ _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION: Optional[bool] = None _SNOWFLAKE_CONDA_PACKAGE_CACHE: Dict[str, List[version.Version]] = {} +DEFAULT_CHANNEL_NAME = "" + def _validate_pip_requirement_string(req_str: str) -> requirements.Requirement: """Validate the input pip requirement string according to PEP 508. @@ -225,42 +228,6 @@ def relax_requirement_version(req: requirements.Requirement) -> requirements.Req return new_req -def resolve_conda_environment( - packages: List[requirements.Requirement], channels: List[str], python_version: str -) -> Optional[List[str]]: - """Use conda api to check if given packages are resolvable in given channels. Only work when conda is - locally installed. - - Args: - packages: Packages to be installed. - channels: Anaconda channels (name or url) where conda should search into. - python_version: A string of python version where model is run. - - Returns: - List of frozen dependencies represented in PEP 508 form if resolvable, None otherwise. 
- """ - from conda import exceptions as conda_exceptions - from conda_libmamba_solver import solver - - package_names = list(map(lambda x: x.name, packages)) - specs = list(map(str, packages)) + [f"python=={python_version}"] - - conda_solver = solver.LibMambaSolver("snow-env", channels=channels, specs_to_add=specs) - try: - solve_result = conda_solver.solve_final_state() - except ( - conda_exceptions.ResolvePackageNotFound, - conda_exceptions.UnsatisfiableError, - conda_exceptions.PackagesNotFoundError, - solver.LibMambaUnsatisfiableError, - ): - return None - - return sorted( - f"{pkg_record.name}=={pkg_record.version}" for pkg_record in solve_result if pkg_record.name in package_names - ) - - def _check_runtime_version_column_existence(session: session.Session) -> bool: sql = textwrap.dedent( """ @@ -351,3 +318,24 @@ def validate_requirements_in_snowflake_conda_channel( else: ret_list.append(str(req)) return sorted(ret_list) + + +# We have to use re to support MLFlow generated python string, which use = rather than == +PYTHON_VERSION_PATTERN = re.compile(r"python(?:(?P=|==|>|<|>=|<=|~=|===)(?P\d(?:\.\d+)+))?") + + +def parse_python_version_string(dep: str) -> Optional[str]: + if dep.startswith("python"): + m = PYTHON_VERSION_PATTERN.search(dep) + if m is None: + return None + op = m.group("op") + if op and (op != "=" and op != "=="): + raise ValueError("Unsupported operator for python version specifier.") + ver = m.group("ver") + if ver: + return ver + else: + # "python" only, no specifier + return "" + return None diff --git a/snowflake/ml/_internal/env_utils_test.py b/snowflake/ml/_internal/env_utils_test.py index 14d18aa7..0a21e25f 100644 --- a/snowflake/ml/_internal/env_utils_test.py +++ b/snowflake/ml/_internal/env_utils_test.py @@ -274,42 +274,6 @@ def test_relax_requirement_version(self) -> None: self.assertEqual(env_utils.relax_requirement_version(r), requirements.Requirement("python-package")) self.assertIsNot(env_utils.relax_requirement_version(r), r) - def test_resolve_conda_environment(self) -> None: - _SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake" - rl = [requirements.Requirement("numpy")] - self.assertIsNotNone( - env_utils.resolve_conda_environment( - rl, [_SNOWFLAKE_CONDA_CHANNEL_URL], python_version=snowml_env.PYTHON_VERSION - ) - ) - - rl = [requirements.Requirement("numpy==1.22.4")] - self.assertIsNone( - env_utils.resolve_conda_environment( - rl, [_SNOWFLAKE_CONDA_CHANNEL_URL], python_version=snowml_env.PYTHON_VERSION - ) - ) - - rl = [requirements.Requirement(f"numpy=={importlib_metadata.version('numpy')}")] - self.assertListEqual( - env_utils.resolve_conda_environment( - rl, - ["defaults"], - python_version=snowml_env.PYTHON_VERSION, - ), - [f"numpy=={importlib_metadata.version('numpy')}"], - ) - - rl = [requirements.Requirement(f"numpy<={importlib_metadata.version('numpy')}")] - self.assertListEqual( - env_utils.resolve_conda_environment( - rl, - ["defaults"], - python_version=snowml_env.PYTHON_VERSION, - ), - [f"numpy=={importlib_metadata.version('numpy')}"], - ) - def test_validate_requirements_in_snowflake_conda_channel(self) -> None: m_session = mock_session.MockSession(conn=None, test_case=self) m_session.add_mock_sql( @@ -613,6 +577,17 @@ def test_validate_requirements_in_snowflake_conda_channel(self) -> None: sorted(["xgboost", "pytorch"]), ) + def test_parse_python_version_string(self) -> None: + self.assertIsNone(env_utils.parse_python_version_string("not_python")) + 
self.assertEqual(env_utils.parse_python_version_string("python"), "") + self.assertEqual(env_utils.parse_python_version_string("python==3.8.13"), "3.8.13") + self.assertEqual(env_utils.parse_python_version_string("python=3.11"), "3.11") + with self.assertRaises(ValueError): + env_utils.parse_python_version_string("python<=3.11") + + with self.assertRaises(ValueError): + env_utils.parse_python_version_string("python>2.7.16") + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/_internal/exceptions/BUILD.bazel b/snowflake/ml/_internal/exceptions/BUILD.bazel new file mode 100644 index 00000000..bad11106 --- /dev/null +++ b/snowflake/ml/_internal/exceptions/BUILD.bazel @@ -0,0 +1,42 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "exceptions", + srcs = [ + "exceptions.py", + "error_codes.py", + ], +) + +py_test( + name = "exceptions_test", + srcs = ["exceptions_test.py"], + deps = [":exceptions"], +) + +py_library( + name = "error_messages", + srcs = ["error_messages.py"], +) + +py_library( + name = "fileset_errors", + srcs = ["fileset_errors.py"], +) + +py_library( + name = "fileset_error_messages", + srcs = ["fileset_error_messages.py"], +) + +py_library( + name = "modeling_errors", + srcs = ["modeling_errors.py"], +) + +py_library( + name = "modeling_error_messages", + srcs = ["modeling_error_messages.py"], +) diff --git a/snowflake/ml/_internal/exceptions/error_codes.py b/snowflake/ml/_internal/exceptions/error_codes.py new file mode 100644 index 00000000..dd52525f --- /dev/null +++ b/snowflake/ml/_internal/exceptions/error_codes.py @@ -0,0 +1,69 @@ +""" +- *XXX: category + - 0XXX: undefined + - 1XXX: system + - 2XXX: user + - 9XXX: internal test +- X*XX: source + - X0XX: undefined + - X1XX: Python built-in + - X2XX: snowml (e.g. FileSetError) + - X3XX: Snowpark (e.g. SnowparkClientException) + - X4XX: Python connector (e.g. DatabaseError) + - X5XX: 3p dependency +- XX**: cause +""" + +# INTERNAL +# Indicates an intentional error for internal error handling testing. +INTERNAL_TEST = "9000" + +# UNDEFINED +# Indicates a failure that is not raised by Snowpark ML is caught by telemetry, and therefore undefined to the library, +# which can be caused by dependency APIs, unknown internal errors, etc. +UNDEFINED = "0000" + +# SYSTEM +# Indicates an internal failure raising a Python built-in error with an ambiguous cause, such as invoking an unexpected +# private API, catching an error with an unknown cause, etc. +INTERNAL_PYTHON_ERROR = "1100" +# Indicates an internal failure raising a Snowpark ML error with an ambiguous cause, such as invoking an unexpected +# private API, catching an error with an unknown cause, etc. +INTERNAL_SNOWML_ERROR = "1200" + +# USER +# Indicates the incompatibility of local dependency versions with the target requirements. For example, an API added in +# a later version is called with an older dependency installed. +DEPENDENCY_VERSION_ERROR = "2100" +# Indicates the resource is missing: not whether the absence is temporary or permanent. +NOT_FOUND = "2101" +# The method is known but is not supported by the target resource. For example, calling `to_xgboost` is not allowed by +# Snowpark ML models based on scikit-learn. +METHOD_NOT_ALLOWED = "2102" +# Calling an API with unsupported keywords/values. +INVALID_ARGUMENT = "2110" +# Object has invalid attributes caused by invalid/unsupported value, unsupported data type, size mismatch, etc. 
+INVALID_ATTRIBUTE = "2111" +# Missing and invalid data caused by null value, unexpected value (e.g. division by 0), out of range value, etc. +INVALID_DATA = "2112" +# Invalid data type in the processed data. For example, an API handling numeric columns gets a string column. +INVALID_DATA_TYPE = "2113" + +# Indicates the creation of underlying resources (files, stages, tables, etc) failed, which can be caused by duplicated +# name, invalid permission, etc. +SNOWML_CREATE_FAILED = "2200" +# Indicates the read of underlying resources (files, stages, tables, etc) failed, which can be caused by duplicated +# name, invalid permission, etc. +SNOWML_READ_FAILED = "2201" +# Indicates the update of underlying resources (files, stages, tables, etc) failed, which can be caused by duplicated +# name, invalid permission, etc. +SNOWML_UPDATE_FAILED = "2202" +# Indicates the deletion of underlying resources (files, stages, tables, etc) failed, which can be caused by duplicated +# name, invalid permission, etc. +SNOWML_DELETE_FAILED = "2203" +# Indicates the Snowflake resource is missing: not whether the absence is temporary or permanent. +SNOWML_NOT_FOUND = "2204" +# Indicates the access of a stage failed, which can be caused by invalid name, invalid permission, etc. +SNOWML_INVALID_STAGE = "2210" +# Invalid query caused by syntax error, invalid source, etc. +SNOWML_INVALID_QUERY = "2211" diff --git a/snowflake/ml/_internal/exceptions/error_messages.py b/snowflake/ml/_internal/exceptions/error_messages.py new file mode 100644 index 00000000..f338f584 --- /dev/null +++ b/snowflake/ml/_internal/exceptions/error_messages.py @@ -0,0 +1 @@ +UNEXPECTED_KEYWORD = "Unexpected keyword: {}." diff --git a/snowflake/ml/_internal/exceptions/exceptions.py b/snowflake/ml/_internal/exceptions/exceptions.py new file mode 100644 index 00000000..77d49dc1 --- /dev/null +++ b/snowflake/ml/_internal/exceptions/exceptions.py @@ -0,0 +1,41 @@ +class SnowflakeMLException(Exception): + """Base Snowflake ML exception class""" + + def __init__( + self, + error_code: str, + original_exception: Exception, + ) -> None: + """ + Args: + error_code: Error code. + original_exception: Original exception. This is the exception raised to users by telemetry. + + Attributes: + error_code: Error code. + original_exception: Original exception with an error code in its message. + + Raises: + ValueError: Null error_code or original_exception. + + Examples: + raise exceptions.SnowflakeMLException(error_code=ERROR_CODE, original_exception=ValueError("Message.")) + + Internal error: + SnowflakeMLException("ValueError('(ERROR_CODE) Message.')") + + User error info: + ValueError: (ERROR_CODE) Message. 
+ """ + if not (error_code and original_exception): + raise ValueError("Must provide non-empty error_code and original_exception.") + + self.error_code = error_code + self.original_exception = type(original_exception)(f"({self.error_code}) {str(original_exception)}") + self._pretty_msg = repr(self.original_exception) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self._pretty_msg!r})" + + def __str__(self) -> str: + return self._pretty_msg diff --git a/snowflake/ml/_internal/exceptions/exceptions_test.py b/snowflake/ml/_internal/exceptions/exceptions_test.py new file mode 100644 index 00000000..8bdd799b --- /dev/null +++ b/snowflake/ml/_internal/exceptions/exceptions_test.py @@ -0,0 +1,21 @@ +from absl.testing import absltest, parameterized + +from snowflake.ml._internal.exceptions import error_codes, exceptions + + +class ExceptionsTest(parameterized.TestCase): + """Testing exceptions.""" + + def test_message(self) -> None: + message = "Error message." + error_code_message = f"({error_codes.INTERNAL_TEST}) {message}" + expected_exception = Exception(error_code_message) + actual_exception = exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_TEST, original_exception=Exception(message) + ) + + self.assertEqual(repr(expected_exception), str(actual_exception)) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/_internal/exceptions/fileset_error_messages.py b/snowflake/ml/_internal/exceptions/fileset_error_messages.py new file mode 100644 index 00000000..d4113f0b --- /dev/null +++ b/snowflake/ml/_internal/exceptions/fileset_error_messages.py @@ -0,0 +1,8 @@ +BOTH_SF_CONNECTION_AND_SNOWPARK_SESSION_SPECIFIED = ( + "sf_connection and snowpark_session cannot be specified at the same time." +) +NO_SF_CONNECTION_OR_SNOWPARK_SESSION = "sf_connection or snowpark_session must be provided" +NO_SF_CONNECTION_OR_SNOWPARK_DATAFRAME = "Either snowpark_dataframe or sf_connection should be non-empty." + +# FileSetError +FILESET_ALREADY_EXISTS = "FileSet with name {} has already existed." diff --git a/snowflake/ml/fileset/fileset_errors.py b/snowflake/ml/_internal/exceptions/fileset_errors.py similarity index 100% rename from snowflake/ml/fileset/fileset_errors.py rename to snowflake/ml/_internal/exceptions/fileset_errors.py diff --git a/snowflake/ml/_internal/exceptions/modeling_error_messages.py b/snowflake/ml/_internal/exceptions/modeling_error_messages.py new file mode 100644 index 00000000..d407162a --- /dev/null +++ b/snowflake/ml/_internal/exceptions/modeling_error_messages.py @@ -0,0 +1,6 @@ +ATTRIBUTE_NOT_SET = "{} is not set." +SIZE_MISMATCH = "Size mismatch: {}={}, {}={}." +INVALID_MODEL_PARAM = "Invalid parameter {} for model {}. Valid parameters: {}." +UNSUPPORTED_MODEL_CONVERSION = "Object doesn't support {}. Please use {}." +INCOMPATIBLE_NEW_SKLEARN_PARAM = "Incompatible scikit-learn version: {} requires scikit-learn>={}. Installed: {}." +REMOVED_SKLEARN_PARAM = "Incompatible scikit-learn version: {} is removed in scikit-learn>={}. Installed: {}." 
diff --git a/snowflake/ml/_internal/exceptions/modeling_errors.py b/snowflake/ml/_internal/exceptions/modeling_errors.py new file mode 100644 index 00000000..91d79b69 --- /dev/null +++ b/snowflake/ml/_internal/exceptions/modeling_errors.py @@ -0,0 +1,4 @@ +class ModelingError(Exception): + """Base class for modeling exceptions.""" + + pass diff --git a/snowflake/ml/_internal/file_utils.py b/snowflake/ml/_internal/file_utils.py index 82d48bdf..2781bed7 100644 --- a/snowflake/ml/_internal/file_utils.py +++ b/snowflake/ml/_internal/file_utils.py @@ -153,11 +153,15 @@ def _update_hash_from_dir(directory: Union[str, pathlib.Path], hash: "hashlib._H return _update_hash_from_dir(directory, hashlib.sha1()).hexdigest() -def get_all_modules(dirname: str, prefix: str = "") -> List[pkgutil.ModuleInfo]: +def get_all_modules(dirname: str, prefix: str = "") -> List[str]: + modules = [mod.name for mod in pkgutil.iter_modules([dirname], prefix=prefix)] subdirs = [f.path for f in os.scandir(dirname) if f.is_dir()] - modules = list(pkgutil.iter_modules(subdirs, prefix=prefix)) - for dirname in subdirs: - modules.extend(get_all_modules(dirname, prefix=f"{prefix}.{dirname}" if prefix else dirname)) + for sub_dirname in subdirs: + basename = os.path.basename(sub_dirname) + sub_dir_namespace = f"{prefix}{basename}" + if sub_dir_namespace not in modules: + modules.append(sub_dir_namespace) + modules.extend(get_all_modules(sub_dirname, prefix=f"{sub_dir_namespace}.")) return modules diff --git a/snowflake/ml/_internal/telemetry.py b/snowflake/ml/_internal/telemetry.py index e53f8f15..1e03ed5e 100644 --- a/snowflake/ml/_internal/telemetry.py +++ b/snowflake/ml/_internal/telemetry.py @@ -25,7 +25,11 @@ from snowflake import connector from snowflake.connector import telemetry as connector_telemetry, time_util from snowflake.ml._internal import env -from snowflake.snowpark import dataframe, exceptions, session +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, +) +from snowflake.snowpark import dataframe, exceptions as snowpark_exceptions, session from snowflake.snowpark._internal import utils _log_counter = 0 @@ -47,6 +51,7 @@ class TelemetryField(enum.Enum): KEY_FUNC_NAME = "func_name" KEY_FUNC_PARAMS = "func_params" KEY_ERROR_INFO = "error_info" + KEY_ERROR_CODE = "error_code" KEY_VERSION = "version" KEY_PYTHON_VERSION = "python_version" KEY_OS = "operating_system" @@ -260,7 +265,7 @@ def wrap(*args: Any, **kwargs: Any) -> _ReturnValue: try: active_session = next(iter(session._get_active_sessions())) # server no default session - except exceptions.SnowparkSessionException: + except snowpark_exceptions.SnowparkSessionException: try: return func(*args, **kwargs) except Exception as e: @@ -299,9 +304,11 @@ def wrap(*args: Any, **kwargs: Any) -> _ReturnValue: try: res = func(*args, **kwargs) except Exception as e: - error = repr(e) - telemetry_args["error"] = error - raise + if not isinstance(e, snowml_exceptions.SnowflakeMLException): + e = snowml_exceptions.SnowflakeMLException(error_code=error_codes.UNDEFINED, original_exception=e) + telemetry_args["error"] = repr(e) + telemetry_args["error_code"] = e.error_code + raise e.original_exception else: return res finally: @@ -525,6 +532,7 @@ def send_function_usage_telemetry( sfqids: Optional[List[Any]] = None, custom_tags: Optional[Dict[str, Union[bool, int, str, float]]] = None, error: Optional[str] = None, + error_code: Optional[str] = None, ) -> None: """ Send function usage telemetry message. 
@@ -537,6 +545,7 @@ def send_function_usage_telemetry( sfqids: Snowflake query IDs. custom_tags: Custom tags. error: Error. + error_code: Error code. """ data: Dict[str, Any] = { TelemetryField.KEY_FUNC_NAME.value: func_name, @@ -559,6 +568,7 @@ def send_function_usage_telemetry( if error: message[TelemetryField.KEY_ERROR_INFO.value] = error + message[TelemetryField.KEY_ERROR_CODE.value] = error_code self._send(message) diff --git a/snowflake/ml/_internal/telemetry_test.py b/snowflake/ml/_internal/telemetry_test.py index 925c5172..32cc79a0 100644 --- a/snowflake/ml/_internal/telemetry_test.py +++ b/snowflake/ml/_internal/telemetry_test.py @@ -3,12 +3,12 @@ from typing import Any, Dict, Optional from unittest import mock -import pytest from absl.testing import absltest, parameterized from snowflake import connector from snowflake.connector import telemetry as connector_telemetry from snowflake.ml._internal import env, telemetry as utils_telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.snowpark import dataframe, session from snowflake.snowpark._internal import error_message, server_connection @@ -155,6 +155,7 @@ def foo(self) -> None: def test_client_error(self, mock_get_active_sessions: mock.MagicMock) -> None: """Test send_api_usage_telemetry when the decorated function raises an error.""" mock_get_active_sessions.return_value = {self.mock_session} + message = "foo error" class DummyObject: @utils_telemetry.send_api_usage_telemetry( @@ -163,16 +164,18 @@ class DummyObject: func_params_to_log=["param"], ) def foo(self) -> None: - raise RuntimeError("foo error") + raise RuntimeError(message) test_obj = DummyObject() - with pytest.raises(RuntimeError): + with self.assertRaises(RuntimeError): test_obj.foo() self.mock_telemetry.try_add_log_to_batch.assert_called() self.mock_telemetry.send_batch.assert_called() - message = self.mock_telemetry.try_add_log_to_batch.call_args.args[0].to_dict()["message"] - self.assertEqual(repr(RuntimeError("foo error")), message[utils_telemetry.TelemetryField.KEY_ERROR_INFO.value]) + telemetry_message = self.mock_telemetry.try_add_log_to_batch.call_args.args[0].to_dict()["message"] + expected_error = exceptions.SnowflakeMLException(error_codes.UNDEFINED, RuntimeError(message)) + self.assertEqual(error_codes.UNDEFINED, telemetry_message[utils_telemetry.TelemetryField.KEY_ERROR_CODE.value]) + self.assertEqual(repr(expected_error), telemetry_message[utils_telemetry.TelemetryField.KEY_ERROR_INFO.value]) def test_get_statement_params_full_func_name(self) -> None: """Test get_statement_params_full_func_name.""" @@ -352,6 +355,26 @@ def foo(self) -> None: test_obj.foo() self.mock_telemetry.send_batch.assert_called() + @mock.patch("snowflake.snowpark.session._get_active_sessions") + def test_snowml_error(self, mock_get_active_sessions: mock.MagicMock) -> None: + """Test send_api_usage_telemetry when the decorated function raises a snowml error.""" + mock_get_active_sessions.return_value = {self.mock_session} + + class DummyObject: + @utils_telemetry.send_api_usage_telemetry( + project=_PROJECT, + ) + def foo(self) -> None: + raise exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_TEST, + original_exception=RuntimeError("foo error"), + ) + + test_obj = DummyObject() + with self.assertRaises(RuntimeError) as ex: + test_obj.foo() + self.assertIn(error_codes.INTERNAL_TEST, str(ex.exception)) + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/_internal/utils/parallelize.py 
b/snowflake/ml/_internal/utils/parallelize.py index eaceb9b9..e67598da 100644 --- a/snowflake/ml/_internal/utils/parallelize.py +++ b/snowflake/ml/_internal/utils/parallelize.py @@ -80,7 +80,7 @@ def map_dataframe_by_column( else: if n_output_cols != len(mapped_df.columns): raise Exception("All partitions must contain the same number of columns.") - mapped_df = mapped_df.with_column(partition_id_col, F.lit(partition_id)) # type: ignore + mapped_df = mapped_df.with_column(partition_id_col, F.lit(partition_id)) unioned_df = mapped_df if unioned_df is None else unioned_df.union_all(mapped_df) # Store results in a list of size |n_partitions| x |n_rows| x |n_output_cols| diff --git a/snowflake/ml/_internal/utils/query_result_checker.py b/snowflake/ml/_internal/utils/query_result_checker.py index 551c39f9..b572319a 100644 --- a/snowflake/ml/_internal/utils/query_result_checker.py +++ b/snowflake/ml/_internal/utils/query_result_checker.py @@ -119,37 +119,9 @@ def cell_value_by_column_matcher( return True -def cell_value_partial_matcher( - row_idx: int, col_idx: int, expected_value: Any, result: list[snowpark.Row], sql: str | None = None -) -> bool: - """Returns true if `expected_value` is found in `result[row_idx, col_idx]` cell. Raise exception otherwise.""" - if len(result) <= row_idx or len(result[row_idx]) <= col_idx: - raise connector.DataError( - formatting.unwrap( - f"""Query Result did not have required number of rows x col [{row_idx}][{col_idx}]. Result from - operation was: {result}.{_query_log(sql)}""" - ) - ) - validated = False - if isinstance(expected_value, str): - validated = expected_value in result[row_idx][col_idx] - else: - validated = expected_value == result[row_idx][col_idx] - if not validated: - raise connector.DataError( - formatting.unwrap( - f"""Query Result did not have the expected value '{expected_value}' at expected position - [{row_idx}][{col_idx}]. Actual value at position [{row_idx}][{col_idx}] was - '{result[row_idx][col_idx]}'.{_query_log(sql)}""" - ) - ) - return True - - _DEFAULT_MATCHERS = [ partial(result_dimension_matcher, 1, 1), partial(column_name_matcher, "status"), - partial(cell_value_partial_matcher, 0, 0, "successfully"), ] @@ -199,21 +171,6 @@ def has_column(self, expected_col_name: str) -> ResultValidator: self._success_matchers.append(partial(column_name_matcher, expected_col_name)) return self - def has_value_match(self, row_idx: int, col_idx: int, expected_value: Any) -> ResultValidator: - """Validate that the a column with the name `expected_column_name` exists in the result. - - Args: - row_idx: Row index of the cell that needs to match. - col_idx: Column index of the cell that needs to match. - expected_value: Value that the cell needs to match. For strings it is treated as a substring match, all - other types will expect an exact match. - - Returns: - ResultValidator object (self) - """ - self._success_matchers.append(partial(cell_value_partial_matcher, row_idx, col_idx, expected_value)) - return self - def has_named_value_match(self, row_idx: int, col_name: str, expected_value: Any) -> ResultValidator: """Validate that the column `col_name` in row `row_idx` of ther results exists and matches `expected_value`. 
diff --git a/snowflake/ml/_internal/utils/query_result_checker_test.py b/snowflake/ml/_internal/utils/query_result_checker_test.py index 76fa0f5c..de44002a 100644 --- a/snowflake/ml/_internal/utils/query_result_checker_test.py +++ b/snowflake/ml/_internal/utils/query_result_checker_test.py @@ -28,28 +28,18 @@ def test_column_name_matcher(self) -> None: self.assertRaises(DataError, query_result_checker.column_name_matcher, "name1", []) self.assertRaises(DataError, query_result_checker.column_name_matcher, "name3", [row1, row2]) - def test_cell_value_partial_matcher(self) -> None: - """Test cell_value_partial_matcher().""" - row1 = Row(name1=1, name2="foo") - row2 = Row(name1=2, name2="bar") - self.assertTrue(query_result_checker.cell_value_partial_matcher(0, 0, 1, [row1, row2])) - self.assertRaises(DataError, query_result_checker.cell_value_partial_matcher, 0, 0, 2, [row1, row2]) - self.assertTrue(query_result_checker.cell_value_partial_matcher(0, 1, "foo", [row1, row2])) - self.assertRaises(DataError, query_result_checker.cell_value_partial_matcher, 1, 1, "foo", [row1, row2]) - def test_result_validator_dimensions_partial_ok(self) -> None: - """Use the base ResultValidator to verify the dimensions and value match of an operation result.""" + """Use the base ResultValidator to verify the dimensions of an operation result.""" expected_result = [Row("number of rows updated=1, number of multi-joined rows updated=0")] actual_result = ( query_result_checker.ResultValidator(result=expected_result) .has_dimensions(expected_rows=1, expected_cols=1) - .has_value_match(row_idx=0, col_idx=0, expected_value="number of rows updated=1") .validate() ) self.assertEqual(expected_result, actual_result) def test_sql_result_validator_dimensions_partial_ok(self) -> None: - """Use SqlResultValidator to check dimension and value match of the result.""" + """Use SqlResultValidator to check dimension of the result.""" session = mock_session.MockSession(conn=None, test_case=self) query = "UPDATE TABLE SET COL = 'value'" sql_result = [Row("number of rows updated=1, number of multi-joined rows updated=0")] @@ -57,7 +47,6 @@ def test_sql_result_validator_dimensions_partial_ok(self) -> None: actual_result = ( query_result_checker.SqlResultValidator(session=session, query=query) .has_dimensions(expected_rows=1, expected_cols=1) - .has_value_match(row_idx=0, col_idx=0, expected_value="number of rows updated=1") .validate() ) self.assertEqual(sql_result, actual_result) @@ -88,7 +77,7 @@ def test_sql_result_validator_dimensions_ok_partial_fail(self) -> None: actual_result = ( query_result_checker.SqlResultValidator(session=session, query=query) .has_dimensions(expected_rows=1, expected_cols=1) - .has_value_match(row_idx=0, col_idx=0, expected_value="number of rows updated=1") + .has_column(expected_col_name="fake_name") .validate() ) self.assertEqual(actual_result, sql_result) diff --git a/snowflake/ml/fileset/BUILD.bazel b/snowflake/ml/fileset/BUILD.bazel index afb39ec1..d7166b99 100644 --- a/snowflake/ml/fileset/BUILD.bazel +++ b/snowflake/ml/fileset/BUILD.bazel @@ -7,8 +7,10 @@ py_library( name = "stage_fs", srcs = ["stage_fs.py"], deps = [ - ":fileset_errors", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", + "//snowflake/ml/_internal/exceptions:fileset_errors", + "//snowflake/ml/_internal/exceptions:fileset_error_messages", ], ) @@ -27,10 +29,9 @@ py_library( name = "sfcfs", srcs = ["sfcfs.py"], deps = [ - ":fileset_errors", ":stage_fs", - 
"//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/utils:identifier", ], ) @@ -57,11 +58,13 @@ py_library( srcs = ["fileset.py"], compatible_with_snowpark = False, deps = [ - ":fileset_errors", ":sfcfs", ":tf_dataset", ":torch_datapipe", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", + "//snowflake/ml/_internal/exceptions:fileset_errors", + "//snowflake/ml/_internal/exceptions:fileset_error_messages", "//snowflake/ml/_internal/utils:import_utils", ], ) @@ -71,6 +74,7 @@ py_test( srcs = ["fileset_test.py"], deps = [ ":fileset", + "//snowflake/ml/_internal/exceptions:fileset_errors", ], ) @@ -121,11 +125,6 @@ py_test( ], ) -py_library( - name = "fileset_errors", - srcs = ["fileset_errors.py"], -) - py_library( name = "parquet_test_util", testonly = True, diff --git a/snowflake/ml/fileset/fileset.py b/snowflake/ml/fileset/fileset.py index cfce1675..4c5ca848 100644 --- a/snowflake/ml/fileset/fileset.py +++ b/snowflake/ml/fileset/fileset.py @@ -6,9 +6,15 @@ from snowflake import snowpark from snowflake.connector import connection from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, + fileset_error_messages, + fileset_errors, +) from snowflake.ml._internal.utils import identifier, import_utils -from snowflake.ml.fileset import fileset_errors, sfcfs -from snowflake.snowpark import exceptions, functions, types +from snowflake.ml.fileset import sfcfs +from snowflake.snowpark import exceptions as snowpark_exceptions, functions, types # The max file size for data loading. TARGET_FILE_SIZE = 32 * 2**20 @@ -26,7 +32,10 @@ def _raise_if_deleted(func: Callable[..., Any]) -> Callable[..., Any]: @functools.wraps(func) def raise_if_deleted_helper(self: Any, *args: Any, **kwargs: Any) -> Any: if self._is_deleted: - raise fileset_errors.FileSetAlreadyDeletedError("The FileSet has already been deleted.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_DELETE_FAILED, + original_exception=fileset_errors.FileSetAlreadyDeletedError("The FileSet has already been deleted."), + ) return func(self, *args, **kwargs) return raise_if_deleted_helper @@ -35,6 +44,7 @@ def raise_if_deleted_helper(self: Any, *args: Any, **kwargs: Any) -> Any: class FileSet: """A FileSet represents an immutable snapshot of the result of a query in the form of files.""" + @telemetry.send_api_usage_telemetry(project=_PROJECT) def __init__( self, *, @@ -80,9 +90,9 @@ def __init__( ['sfc://@mydb.myschema.mystage/mydir/helloworld/data_0_0_0.snappy.parquet'] """ if sf_connection and snowpark_session: - raise ValueError("sf_connection and snowpark_session cannot be specified at the same time.") + raise ValueError(fileset_error_messages.BOTH_SF_CONNECTION_AND_SNOWPARK_SESSION_SPECIFIED) if not sf_connection and not snowpark_session: - raise ValueError("sf_connection or snowpark_session must be provided") + raise ValueError(fileset_error_messages.NO_SF_CONNECTION_OR_SNOWPARK_SESSION) self._snowpark_session = ( snowpark_session if snowpark_session @@ -138,6 +148,8 @@ def make( FileSetExistError: An error occured whern a FileSet with the same name exists in the given path. FileSetError: An error occured when the SQL query/dataframe is not able to get materialized. + # noqa: DAR401 + Note: During the generation of stage files, data casting will occur. 
The casting rules are as follows:: - Data casting: - DecimalType(NUMBER): @@ -188,11 +200,11 @@ def make( ['sfc://@mydb.myschema.mystage/helloworld/data_0_0_0.snappy.parquet'] """ if snowpark_dataframe and sf_connection: - raise ValueError("sf_connection and snowpark_session cannot be specified at the same time.") + raise ValueError(fileset_error_messages.BOTH_SF_CONNECTION_AND_SNOWPARK_SESSION_SPECIFIED) if not snowpark_dataframe: if not sf_connection: - raise ValueError("Either snowpark_dataframe or sf_connection should be non-empty.") + raise ValueError(fileset_error_messages.NO_SF_CONNECTION_OR_SNOWPARK_DATAFRAME) if not query: raise ValueError("Please use non-empty query to generate meaningful result.") snowpark_session = snowpark.Session.builder.config("connection", sf_connection).create() @@ -203,10 +215,13 @@ def make( snowpark_session = snowpark_dataframe._session casted_df = _cast_snowpark_dataframe(snowpark_dataframe) - _validate_target_stage_loc(snowpark_session, target_stage_loc) + try: + _validate_target_stage_loc(snowpark_session, target_stage_loc) + except snowml_exceptions.SnowflakeMLException as e: + raise e.original_exception target_stage_exists = snowpark_session.sql(f"List {_fileset_absoluate_path(target_stage_loc, name)}").collect() if target_stage_exists: - raise fileset_errors.FileSetExistError(f"FileSet with name {name} has already existed.") + raise fileset_errors.FileSetExistError(fileset_error_messages.FILESET_ALREADY_EXISTS.format(name)) if shuffle: casted_df = casted_df.order_by(functions.random()) @@ -232,10 +247,10 @@ def make( api_calls=[snowpark.DataFrameWriter.copy_into_location], ), ) - except exceptions.SnowparkClientException as e: + except snowpark_exceptions.SnowparkClientException as e: # Snowpark wraps the Python Connector error code in the head of the error message. 
if e.message.startswith(fileset_errors.ERRNO_FILE_EXIST_IN_STAGE): - raise fileset_errors.FileSetExistError(f"FileSet with name {name} has already existed.") + raise fileset_errors.FileSetExistError(fileset_error_messages.FILESET_ALREADY_EXISTS.format(name)) else: raise fileset_errors.FileSetError(str(e)) @@ -246,6 +261,17 @@ def name(self) -> str: """Get the name of the FileSet.""" return self._name + def _list_files(self) -> List[str]: + """Private helper function that lists all files in this fileset and caches the results for subsequent use.""" + if self._files: + return self._files + loc = self._fileset_absolute_path() + + # TODO(zzhu)[SNOW-703491]: We could use manifest file to speed up file listing + files = self._fs.ls(loc) + self._files = [f"sfc://{file}" for file in files] + return self._files + def _fileset_absolute_path(self) -> str: """Get the Snowflake absoluate path to this FileSet directory.""" return _fileset_absoluate_path(self._target_stage_loc, self.name) @@ -270,14 +296,7 @@ def files(self) -> List[str]: ["sfc://@mydb.myschema.mystage/test/hello_world_0_0_0.snappy.parquet", "sfc://@mydb.myschema.mystage/test/hello_world_0_0_1.snappy.parquet"] """ - if self._files: - return self._files - loc = self._fileset_absolute_path() - - # TODO(zzhu)[SNOW-703491]: We could use manifest file to speed up file listing - files = self._fs.ls(loc) - self._files = [f"sfc://{file}" for file in files] - return self._files + return self._list_files() @telemetry.send_api_usage_telemetry( project=_PROJECT, @@ -337,9 +356,9 @@ def to_torch_datapipe(self, *, batch_size: int, shuffle: bool = False, drop_last IterableWrapper, _ = import_utils.import_or_get_dummy("torchdata.datapipes.iter.IterableWrapper") torch_datapipe_module, _ = import_utils.import_or_get_dummy("snowflake.ml.fileset.torch_datapipe") - self._fs.optimize_read(self.files()) + self._fs.optimize_read(self._list_files()) - input_dp = IterableWrapper(self.files()) + input_dp = IterableWrapper(self._list_files()) return torch_datapipe_module.ReadAndParseParquet(input_dp, self._fs, batch_size, shuffle, drop_last_batch) @telemetry.send_api_usage_telemetry( @@ -376,9 +395,11 @@ def to_tf_dataset(self, *, batch_size: int, shuffle: bool = False, drop_last_bat """ tf_dataset_module, _ = import_utils.import_or_get_dummy("snowflake.ml.fileset.tf_dataset") - self._fs.optimize_read(self.files()) + self._fs.optimize_read(self._list_files()) - return tf_dataset_module.read_and_parse_parquet(self.files(), self._fs, batch_size, shuffle, drop_last_batch) + return tf_dataset_module.read_and_parse_parquet( + self._list_files(), self._fs, batch_size, shuffle, drop_last_batch + ) @telemetry.send_api_usage_telemetry( project=_PROJECT, @@ -399,7 +420,7 @@ def to_snowpark_dataframe(self) -> snowpark.DataFrame: - Unsupported types (see comments of :func:`~FileSet.fileset.make`) will not have any guarantee. For example, an OBJECT column may be scanned back as a STRING column. """ - query_id = _get_fileset_query_id_or_raise(self.files(), self._fileset_absolute_path()) + query_id = _get_fileset_query_id_or_raise(self._list_files(), self._fileset_absolute_path()) file_path_pattern = f".*data_{query_id}.*[.]parquet" df = self._snowpark_session.read.option("pattern", file_path_pattern).parquet(self._fileset_absolute_path()) assert isinstance(df, snowpark.DataFrame) @@ -416,7 +437,7 @@ def delete(self) -> None: If not called, the FileSet and all its stage files will stay in Snowflake stage. 
Raises: - FileSetCannotDeleteError: An error occured when the FileSet cannot get deleted. + SnowflakeMLException: An error occured when the FileSet cannot get deleted. """ delete_sql = f"remove {self._fileset_absolute_path()}" try: @@ -430,8 +451,11 @@ def delete(self) -> None: ) self._files = [] self._is_deleted = True - except exceptions.SnowparkClientException as e: - raise fileset_errors.FileSetCannotDeleteError(e) + except snowpark_exceptions.SnowparkClientException as e: + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_DELETE_FAILED, + original_exception=fileset_errors.FileSetCannotDeleteError(str(e)), + ) return @@ -448,7 +472,7 @@ def _get_fileset_query_id_or_raise(files: List[str], fileset_absolute_path: str) The query id of the sql query which is used to generate the stage files. Raises: - MoreThanOneQuerySourceError: If the input files are not generated by the same query. + SnowflakeMLException: If the input files are not generated by the same query. """ if not files: return None @@ -471,7 +495,12 @@ def _get_fileset_query_id_or_raise(files: List[str], fileset_absolute_path: str) query_id = truncatred_filename[:idx] if not valid: - raise fileset_errors.MoreThanOneQuerySourceError("This FileSet contains files generated by the other queries.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_INVALID_QUERY, + original_exception=fileset_errors.MoreThanOneQuerySourceError( + "This FileSet contains files generated by the other queries." + ), + ) return query_id @@ -490,23 +519,34 @@ def _validate_target_stage_loc(snowpark_session: snowpark.Session, target_stage_ A Boolean value about whether the input target stage location is a valid path in an internal SSE stage. Raises: - FileSetLocationError: An error occured when the input stage path is invalid. + SnowflakeMLException: The input stage path does not start with '@'. + SnowflakeMLException: No valid stages found. + SnowflakeMLException: An error occured when the input stage path is invalid. """ if not target_stage_loc.startswith("@"): - raise fileset_errors.FileSetLocationError('FileSet location should start with "@".') + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_INVALID_STAGE, + original_exception=fileset_errors.FileSetLocationError('FileSet location should start with "@".'), + ) try: db, schema, stage, _ = identifier.parse_schema_level_object_identifier(target_stage_loc[1:]) df_stages = snowpark_session.sql(f"Show stages like '{stage}' in SCHEMA {db}.{schema}") - df_stages = df_stages.filter(functions.col('"type"').like(f"%{_FILESET_STAGE_TYPE}%")) # type:ignore[arg-type] + df_stages = df_stages.filter(functions.col('"type"').like(f"%{_FILESET_STAGE_TYPE}%")) valid_stage = df_stages.collect() if not valid_stage: - raise fileset_errors.FileSetLocationError( - "A FileSet requires its location to be in an existing server-side-encrypted internal stage." - "See https://docs.snowflake.com/en/sql-reference/sql/create-stage#internal-stage-parameters-internalstageparams " # noqa: E501 - "on how to create such a stage." + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_NOT_FOUND, + original_exception=fileset_errors.FileSetLocationError( + "A FileSet requires its location to be in an existing server-side-encrypted internal stage." + "See https://docs.snowflake.com/en/sql-reference/sql/create-stage#internal-stage-parameters-internalstageparams " # noqa: E501 + "on how to create such a stage." 
+ ), ) except ValueError as e: - raise fileset_errors.FileSetLocationError(e) + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_INVALID_STAGE, + original_exception=fileset_errors.FileSetLocationError(str(e)), + ) return True @@ -552,19 +592,19 @@ def _cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame: dest: types.DataType = types.FloatType() else: dest = types.LongType() - selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) # type:ignore[arg-type] + selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) elif isinstance(field.datatype, types.DoubleType): dest = types.FloatType() - selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) # type:ignore[arg-type] + selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) elif isinstance(field.datatype, types.ByteType): # Snowpark maps ByteType to BYTEINT, which will not do the casting job when unloading to parquet files. # We will use SMALLINT instead until this issue got fixed. # Investigate JIRA filed: SNOW-725041 dest = types.ShortType() - selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) # type:ignore[arg-type] + selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) elif field.datatype in (types.ShortType(), types.IntegerType(), types.LongType()): dest = field.datatype - selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) # type:ignore[arg-type] + selected_cols.append(functions.cast(functions.col(src), dest).alias(src)) else: if field.datatype in (types.DateType(), types.TimestampType(), types.TimeType()): logging.warning( diff --git a/snowflake/ml/fileset/fileset_test.py b/snowflake/ml/fileset/fileset_test.py index 1a5e6a76..3f2a10d7 100644 --- a/snowflake/ml/fileset/fileset_test.py +++ b/snowflake/ml/fileset/fileset_test.py @@ -4,7 +4,8 @@ from snowflake import snowpark from snowflake.connector import connection -from snowflake.ml.fileset import fileset, fileset_errors +from snowflake.ml._internal.exceptions import fileset_errors +from snowflake.ml.fileset import fileset from snowflake.snowpark import types MockResultMetaData = collections.namedtuple("MockResultMetaData", ["name", "type_code", "precision", "scale"]) diff --git a/snowflake/ml/fileset/stage_fs.py b/snowflake/ml/fileset/stage_fs.py index a7a495b8..7541fce5 100644 --- a/snowflake/ml/fileset/stage_fs.py +++ b/snowflake/ml/fileset/stage_fs.py @@ -10,8 +10,13 @@ from snowflake import snowpark from snowflake.connector import connection from snowflake.ml._internal import telemetry -from snowflake.ml.fileset import fileset_errors -from snowflake.snowpark import exceptions +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions as snowml_exceptions, + fileset_error_messages, + fileset_errors, +) +from snowflake.snowpark import exceptions as snowpark_exceptions # The default length of how long a presigned url stays active in seconds. # Presigned url here is used to fetch file objects from Snowflake when SFStageFileSystem.open() is called. @@ -97,9 +102,9 @@ def __init__( ValueError: An error occured when not exactly one of sf_connection and snowpark_session is given. 
""" if sf_connection and snowpark_session: - raise ValueError("sf_connection and snowpark_session cannot be specified at the same time.") + raise ValueError(fileset_error_messages.BOTH_SF_CONNECTION_AND_SNOWPARK_SESSION_SPECIFIED) if not sf_connection and not snowpark_session: - raise ValueError("sf_connection or snowpark_session must be provided") + raise ValueError(fileset_error_messages.NO_SF_CONNECTION_OR_SNOWPARK_SESSION) if sf_connection: self._session = snowpark.Session.builder.config("connection", sf_connection).create() else: @@ -148,17 +153,25 @@ def ls(self, path: str, detail: bool = False) -> Union[List[str], List[Dict[str, A list of filename if `detail` is false, or a list of dict if `detail` is true. Raises: - StageNotFoundError: An error occured when the given path points to a stage that cannot be found. - FileSetError: An error occured when Snowflake cannot list files in the given stage path. + SnowflakeMLException: An error occured when the given path points to a stage that cannot be found. + SnowflakeMLException: An error occured when Snowflake cannot list files in the given stage path. """ try: loc = self.stage_name objects = self._session.sql(f"LIST {loc}/{path}").collect() - except exceptions.SnowparkClientException as e: + except snowpark_exceptions.SnowparkClientException as e: if e.message.startswith(fileset_errors.ERRNO_DOMAIN_NOT_EXIST): - raise fileset_errors.StageNotFoundError(f"Stage {loc} does not exist or is not authorized.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_NOT_FOUND, + original_exception=fileset_errors.StageNotFoundError( + f"Stage {loc} does not exist or is not authorized." + ), + ) else: - raise fileset_errors.FileSetError(str(e)) + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_SNOWML_ERROR, + original_exception=fileset_errors.FileSetError(str(e)), + ) files = self._parse_list_result(objects, path) if detail: return files @@ -211,7 +224,7 @@ def _open(self, path: str, mode: str = "rb", **kwargs: Any) -> fsspec.spec.Abstr A fsspec file-like object. Raises: - StageFileNotFoundError: An error occured when the given path points to a file that cannot be found. + SnowflakeMLException: An error occured when the given path points to a file that cannot be found. """ path = path.lstrip("/") cached_presigned_url = self._url_cache.get(path, None) @@ -229,7 +242,10 @@ def _open(self, path: str, mode: str = "rb", **kwargs: Any) -> fsspec.spec.Abstr try: return self._fs._open(url, mode=mode, **kwargs) except FileNotFoundError: - raise fileset_errors.StageFileNotFoundError(f"Stage file {path} doesn't exist.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_NOT_FOUND, + original_exception=fileset_errors.StageFileNotFoundError(f"Stage file {path} doesn't exist."), + ) def _parse_list_result( self, list_result: List[Tuple[str, int, str, str]], search_path: str @@ -328,9 +344,17 @@ def _fetch_presigned_urls( api_calls=[snowpark.DataFrame.collect], ), ) - except exceptions.SnowparkClientException as e: + except snowpark_exceptions.SnowparkClientException as e: if e.message.startswith(fileset_errors.ERRNO_STAGE_NOT_EXIST): - raise fileset_errors.StageNotFoundError(f"Stage {self.stage_name} does not exist or is not authorized.") + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.SNOWML_NOT_FOUND, + original_exception=fileset_errors.StageNotFoundError( + f"Stage {self.stage_name} does not exist or is not authorized." 
+ ), + ) else: - raise fileset_errors.FileSetError(str(e)) + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_SNOWML_ERROR, + original_exception=fileset_errors.FileSetError(str(e)), + ) return presigned_urls diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index bf2038bf..03394896 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -32,9 +32,18 @@ py_library( deps = [ ":type_hints", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/model/_deploy_client/warehouse:infer_template", "//snowflake/ml/_internal/utils:formatting", "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/model/_deploy_client/warehouse:infer_template", + "//snowflake/ml/model/_signatures:base_handler", + "//snowflake/ml/model/_signatures:core", + "//snowflake/ml/model/_signatures:builtins_handler", + "//snowflake/ml/model/_signatures:numpy_handler", + "//snowflake/ml/model/_signatures:pandas_handler", + "//snowflake/ml/model/_signatures:pytorch_handler", + "//snowflake/ml/model/_signatures:snowpark_handler", + "//snowflake/ml/model/_signatures:tensorflow_handler", + "//snowflake/ml/model/_signatures:utils", ], ) @@ -67,6 +76,7 @@ py_library( "//snowflake/ml/_internal:env", "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model/_signatures:snowpark_handler", ], ) @@ -77,9 +87,10 @@ py_library( ":model_signature", ":type_hints", "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/model/_deploy_client/snowservice:deploy", "//snowflake/ml/model/_deploy_client/warehouse:deploy", "//snowflake/ml/model/_deploy_client/warehouse:infer_template", - "//snowflake/ml/model/_deploy_client/snowservice:deploy", + "//snowflake/ml/model/_signatures:snowpark_handler", ], ) @@ -105,9 +116,11 @@ py_library( "//snowflake/ml/_internal:file_utils", "//snowflake/ml/_internal:type_utils", "//snowflake/ml/model/_handlers:custom", + "//snowflake/ml/model/_handlers:mlflow", "//snowflake/ml/model/_handlers:pytorch", "//snowflake/ml/model/_handlers:sklearn", "//snowflake/ml/model/_handlers:snowmlmodel", + "//snowflake/ml/model/_handlers:tensorflow", "//snowflake/ml/model/_handlers:torchscript", "//snowflake/ml/model/_handlers:xgboost", "//snowflake/ml/modeling/framework", @@ -136,7 +149,6 @@ py_test( srcs = ["model_signature_test.py"], deps = [ ":model_signature", - "//snowflake/ml/utils:connection_params", ], ) @@ -164,5 +176,8 @@ py_test( ":type_hints", "//snowflake/ml/modeling/linear_model:linear_regression", "//snowflake/ml/test_utils:mock_session", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:pytorch_handler", + "//snowflake/ml/model/_signatures:tensorflow_handler", ], ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel index f3dd5a24..df3e9816 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel @@ -44,10 +44,12 @@ py_test( name = "docker_context_test", srcs = ["docker_context_test.py"], deps = [ - ":docker_context" + ":docker_context", + "//snowflake/ml/model:_model" ], data = [ - "test_fixtures/dockerfile_test_fixture" + "test_fixtures/dockerfile_test_fixture", + "test_fixtures/dockerfile_test_gpu_fixture" ] ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py index 
bdd58738..a223c4ab 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py @@ -2,22 +2,17 @@ import json import logging import os -import posixpath import subprocess import tempfile -import zipfile from enum import Enum from typing import List -import yaml - from snowflake import snowpark from snowflake.ml._internal.utils import query_result_checker from snowflake.ml.model._deploy_client.image_builds import ( base_image_builder, docker_context, ) -from snowflake.ml.model._deploy_client.utils import constants class Platform(Enum): @@ -36,20 +31,20 @@ class ClientImageBuilder(base_image_builder.ImageBuilder): """ def __init__( - self, *, id: str, image_repo: str, model_zip_stage_path: str, session: snowpark.Session, use_gpu: bool = False + self, *, id: str, image_repo: str, model_dir: str, session: snowpark.Session, use_gpu: bool = False ) -> None: """Initialization Args: id: A hexadecimal string used for naming the image tag. image_repo: Path to image repository. - model_zip_stage_path: Path to model zip file in stage. + model_dir: Local model directory, downloaded form stage and extracted. use_gpu: Boolean flag for generating the CPU or GPU base image. session: Snowpark session """ self.image_tag = "/".join([image_repo.rstrip("/"), id]) + ":latest" self.image_repo = image_repo - self.model_zip_stage_path = model_zip_stage_path + self.model_dir = model_dir self.use_gpu = use_gpu self.session = session @@ -76,6 +71,20 @@ def _setup_docker_config(docker_config_dir: str) -> None: with open(config_path, "w", encoding="utf-8") as file: json.dump(content, file) + def _cleanup_local_image() -> None: + try: + image_exist_command = f"docker image inspect {self.image_tag}" + subprocess.check_call( + image_exist_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True + ) + except subprocess.CalledProcessError: + # Image does not exist, probably due to failed build step + pass + else: + commands = ["docker", "--config", config_dir, "rmi", self.image_tag] + logging.info(f"Removing local image: {self.image_tag}") + self._run_docker_commands(commands) + self.validate_docker_client_env() query_result = ( @@ -95,12 +104,17 @@ def _setup_docker_config(docker_config_dir: str) -> None: self.session.sql("ALTER SESSION SET PYTHON_CONNECTOR_QUERY_RESULT_FORMAT = 'json'").collect() _setup_docker_config(config_dir) self._build(config_dir) - self._upload(config_dir) + except Exception as e: + raise RuntimeError(f"Failed to build docker image: {str(e)}") + else: + try: + self._upload(config_dir) + except Exception as e: + raise RuntimeError(f"Failed to upload docker image to registry: {str(e)}") + finally: + _cleanup_local_image() finally: self.session.sql(f"ALTER SESSION SET PYTHON_CONNECTOR_QUERY_RESULT_FORMAT = '{prev_format}'").collect() - commands = ["docker", "--config", config_dir, "rmi", self.image_tag] - logging.info(f"Removing local image: {self.image_tag}") - self._run_docker_commands(commands) return self.image_tag def validate_docker_client_env(self) -> None: @@ -129,40 +143,6 @@ def validate_docker_client_env(self) -> None: "https://docs.docker.com/build/buildkit/#getting-started" ) - def _extract_model_zip(self, context_dir: str) -> str: - """Extract a zip file into the specified directory. - - Args: - context_dir: Directory to extract the zip to. - - Returns: - The extracted model directory. 
- """ - - local_model_zip_path = os.path.join(context_dir, posixpath.basename(self.model_zip_stage_path)) - if zipfile.is_zipfile(local_model_zip_path): - extracted_model_dir = os.path.join(context_dir, constants.MODEL_DIR) - with zipfile.ZipFile(local_model_zip_path, "r") as model_zip: - if len(model_zip.namelist()) > 1: - model_zip.extractall(extracted_model_dir) - conda_path = os.path.join(extracted_model_dir, "env", "conda.yaml") - - def remove_snowml_from_conda() -> None: - with open(conda_path, encoding="utf-8") as file: - conda_yaml = yaml.safe_load(file) - - dependencies = conda_yaml["dependencies"] - dependencies = [dep for dep in dependencies if not dep.startswith("snowflake-ml-python")] - - conda_yaml["dependencies"] = dependencies - - with open(conda_path, "w", encoding="utf-8") as file: - yaml.dump(conda_yaml, file) - - # TODO(shchen): Remove once SNOW-840411 is landed. - remove_snowml_from_conda() - return extracted_model_dir - def _build(self, docker_config_dir: str) -> None: """Constructs the Docker context directory and then builds a Docker image based on that context. @@ -171,15 +151,7 @@ def _build(self, docker_config_dir: str) -> None: """ with tempfile.TemporaryDirectory() as context_dir: - # Download the model zip file that is already uploaded to stage during model registry log_model step. - # This is needed in order to obtain the conda and requirement file inside the model zip. - self.session.file.get(self.model_zip_stage_path, context_dir) - - extracted_model_dir = self._extract_model_zip(context_dir) - - dc = docker_context.DockerContext( - context_dir=context_dir, model_dir=extracted_model_dir, use_gpu=self.use_gpu - ) + dc = docker_context.DockerContext(context_dir=context_dir, model_dir=self.model_dir, use_gpu=self.use_gpu) dc.build() self._build_image_from_context(context_dir=context_dir, docker_config_dir=docker_config_dir) @@ -201,7 +173,7 @@ def _run_docker_commands(self, commands: List[str]) -> None: logging.info(line) if proc.wait(): - raise RuntimeError(f"Docker build failed: {''.join(output_lines)}") + raise RuntimeError(f"Docker commands failed: \n {''.join(output_lines)}") def _build_image_from_context( self, context_dir: str, docker_config_dir: str, *, platform: Platform = Platform.LINUX_AMD64 diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py index 00768af3..907e8ab6 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py @@ -15,13 +15,13 @@ def setUp(self) -> None: self.m_session = cast(snowpark.session.Session, mock_session.MockSession(conn=None, test_case=self)) self.unique_id = "mock_id" self.image_repo = "mock_image_repo" - self.model_zip_stage_path = "db.schema.stage/dir/model.zip" - self.use_gpu = False + self.model_dir = "local/dir/model.zip" + self.use_gpu = True self.client_image_builder = client_image_builder.ClientImageBuilder( id=self.unique_id, image_repo=self.image_repo, - model_zip_stage_path=self.model_zip_stage_path, + model_dir=self.model_dir, session=self.m_session, use_gpu=self.use_gpu, ) @@ -43,26 +43,14 @@ def test_build(self, m_tempdir: mock.MagicMock, m_docker_context_class: mock.Mag m_context_dir = "mock_context_dir" # Modify the m_tempdir mock to return the desired TemporaryDirectory object m_tempdir.return_value.__enter__.return_value = m_context_dir - m_file = absltest.mock.Mock() - 
m_extracted_model_dir = "mock_extracted_model_dir" m_docker_config_dir = "mock_docker_config_dir" with mock.patch.object(m_docker_context, "build") as m_build, mock.patch.object( self.client_image_builder, "_build_image_from_context" - ) as m_build_image_from_context, mock.patch.object( - self.client_image_builder, "_extract_model_zip" - ) as m_extract_model_zip: - - self.m_session.__setattr__("file", m_file) - - m_extract_model_zip.return_value = m_extracted_model_dir - + ) as m_build_image_from_context: self.client_image_builder._build(m_docker_config_dir) - - m_file.get.assert_called_once_with(self.model_zip_stage_path, m_context_dir) - m_extract_model_zip.assert_called_once_with(m_context_dir) m_docker_context_class.assert_called_once_with( - context_dir=m_context_dir, model_dir=m_extracted_model_dir, use_gpu=self.use_gpu + context_dir=m_context_dir, model_dir=self.model_dir, use_gpu=True ) m_build.assert_called_once() m_build_image_from_context.assert_called_once_with( diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py index 60de61b1..21d7c785 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -1,4 +1,3 @@ -import importlib import os import shutil import string @@ -22,7 +21,6 @@ def __init__(self, context_dir: str, model_dir: str, *, use_gpu: bool = False) - """ self.context_dir = context_dir self.model_dir = model_dir - # TODO(shchen): SNOW-825995, Define dockerfile template used for model deployment. use_gpu will be used. self.use_gpu = use_gpu def build(self) -> None: @@ -32,29 +30,20 @@ def build(self) -> None: """ self._generate_inference_code() self._copy_entrypoint_script_to_docker_context() - self._copy_snowml_source_code_to_docker_context() + self._copy_model_env_dependency_to_docker_context() self._generate_docker_file() - def _copy_snowml_source_code_to_docker_context(self) -> None: - """Copy the entire snowflake/ml source code to docker context. This will be particularly useful for CI tests - against latest changes. - - Note that we exclude the experimental directory mainly for development scenario; as experimental directory won't - be included in the release. - """ - snow_ml_source_dir = list(importlib.import_module("snowflake.ml").__path__)[0] - shutil.copytree( - snow_ml_source_dir, - os.path.join(self.context_dir, "snowflake", "ml"), - ignore=shutil.ignore_patterns("*.pyc", "experimental"), - ) - def _copy_entrypoint_script_to_docker_context(self) -> None: """Copy gunicorn_run.sh entrypoint to docker context directory.""" path = os.path.join(os.path.dirname(__file__), constants.ENTRYPOINT_SCRIPT) assert os.path.exists(path), f"Run script file missing at path: {path}" shutil.copy(path, os.path.join(self.context_dir, constants.ENTRYPOINT_SCRIPT)) + def _copy_model_env_dependency_to_docker_context(self) -> None: + path = os.path.join(self.model_dir, constants.MODEL_ENV_FOLDER) + assert os.path.exists(path), f"Model env folder missing at path: {path}" + shutil.copytree(path, os.path.join(self.context_dir, constants.MODEL_ENV_FOLDER)) + def _generate_docker_file(self) -> None: """ Generates dockerfile based on dockerfile template. 
@@ -71,7 +60,7 @@ def _generate_docker_file(self) -> None: "base_image": "mambaorg/micromamba:focal-cuda-11.7.1" if self.use_gpu else "mambaorg/micromamba:1.4.3", - "model_dir": constants.MODEL_DIR, + "model_env_folder": constants.MODEL_ENV_FOLDER, "inference_server_dir": constants.INFERENCE_SERVER_DIR, "entrypoint_script": constants.ENTRYPOINT_SCRIPT, } diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py index 94d7e9e6..68dfd823 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py @@ -3,16 +3,38 @@ import shutil import tempfile +import sklearn.base +import sklearn.datasets as datasets from absl.testing import absltest +from sklearn import neighbors +from snowflake.ml.model import _model as model_api from snowflake.ml.model._deploy_client.image_builds import docker_context from snowflake.ml.model._deploy_client.utils import constants +_IRIS = datasets.load_iris(as_frame=True) +_IRIS_X = _IRIS.data +_IRIS_Y = _IRIS.target + + +def _get_sklearn_model() -> "sklearn.base.BaseEstimator": + knn_model = neighbors.KNeighborsClassifier() + knn_model.fit(_IRIS_X, _IRIS_Y) + return knn_model + class DockerContextTest(absltest.TestCase): def setUp(self) -> None: self.context_dir = tempfile.mkdtemp() self.model_dir = tempfile.mkdtemp() + + model_api.save_model( + name="model", + model_dir_path=self.model_dir, + model=_get_sklearn_model(), + sample_input=_IRIS_X, + ) + self.use_gpu = False self.docker_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir, use_gpu=False) @@ -21,12 +43,7 @@ def tearDown(self) -> None: shutil.rmtree(self.context_dir) def test_build_results_in_correct_docker_context_file_structure(self) -> None: - expected_files = [ - "Dockerfile", - constants.INFERENCE_SERVER_DIR, - constants.ENTRYPOINT_SCRIPT, - "snowflake", - ] + expected_files = ["Dockerfile", constants.INFERENCE_SERVER_DIR, constants.ENTRYPOINT_SCRIPT, "env"] self.docker_context.build() generated_files = os.listdir(self.context_dir) self.assertCountEqual(expected_files, generated_files) @@ -34,14 +51,8 @@ def test_build_results_in_correct_docker_context_file_structure(self) -> None: actual_inference_files = os.listdir(os.path.join(self.context_dir, constants.INFERENCE_SERVER_DIR)) self.assertCountEqual(["main.py"], actual_inference_files) - snow_ml_dir = os.path.join(self.context_dir, "snowflake", "ml") - self.assertTrue(os.path.exists(snow_ml_dir)) - - snow_ml_model_dir = os.path.join(self.context_dir, "snowflake", "ml", "model") - self.assertTrue(os.path.exists(snow_ml_model_dir)) - - experimental_dir = os.path.join(self.context_dir, "snowflake", "ml", "experimental") - self.assertFalse(os.path.exists(experimental_dir)) + model_env_dir = os.path.join(self.context_dir, "env") + self.assertTrue(os.path.exists(model_env_dir)) def test_docker_file_content(self) -> None: self.docker_context.build() @@ -59,6 +70,23 @@ def test_docker_file_content(self) -> None: actual = re.sub(comment_pattern, "", actual, flags=re.MULTILINE) self.assertEqual(actual, expected, "Generated dockerfile is not aligned with the docker template") + def test_docker_file_content_with_gpu(self) -> None: + gpu_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir, use_gpu=True) + gpu_context.build() + dockerfile_path = os.path.join(self.context_dir, "Dockerfile") + dockerfile_fixture_path = 
os.path.join( + os.path.dirname(__file__), "test_fixtures", "dockerfile_test_gpu_fixture" + ) + with open(dockerfile_path) as dockerfile, open(dockerfile_fixture_path) as expected_dockerfile: + actual = dockerfile.read() + expected = expected_dockerfile.read() + + # Define a regular expression pattern to match comment lines + comment_pattern = r"\s*#.*$" + # Remove comments + actual = re.sub(comment_pattern, "", actual, flags=re.MULTILINE) + self.assertEqual(actual, expected, "Generated dockerfile is not aligned with the docker template") + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel index 113c6be1..93d0e7e5 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel @@ -15,6 +15,7 @@ py_test( name = "main_test", srcs = ["main_test.py"], deps = [ - ":main" + ":main", + "//snowflake/ml/model:_model_meta", ] ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py index dd38d35a..cd7aa438 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py @@ -1,13 +1,18 @@ import logging import os +import sys import tempfile import zipfile +from typing import List, cast import pandas as pd from starlette import applications, requests, responses, routing logger = logging.getLogger(__name__) -loaded_model = None +_LOADED_MODEL = None +_LOADED_META = None +TARGET_METHOD = "predict" +MODEL_CODE_DIR = "code" def _run_setup() -> None: @@ -17,9 +22,8 @@ def _run_setup() -> None: logger.handlers = gunicorn_logger.handlers logger.setLevel(gunicorn_logger.level) - from snowflake.ml.model import _model as model_api - - global loaded_model + global _LOADED_MODEL + global _LOADED_META MODEL_ZIP_STAGE_PATH = os.getenv("MODEL_ZIP_STAGE_PATH") assert MODEL_ZIP_STAGE_PATH, "Missing environment variable MODEL_ZIP_STAGE_PATH" @@ -36,7 +40,11 @@ def _run_setup() -> None: else: raise RuntimeError(f"No model zip found at stage path: {model_zip_stage_path}") logger.info(f"Loading model from {extracted_dir} into memory") - loaded_model, _ = model_api._load_model_for_deploy(model_dir_path=extracted_dir) + + sys.path.insert(0, os.path.join(extracted_dir, MODEL_CODE_DIR)) + from snowflake.ml.model import _model as model_api + + _LOADED_MODEL, _LOADED_META = model_api._load_model_for_deploy(model_dir_path=extracted_dir) logger.info("Successfully loaded model into memory") @@ -52,36 +60,54 @@ async def predict(request: requests.Request) -> responses.JSONResponse: request: The input data is expected to be in the following JSON format: { "data": [ - [0, 5.1, 3.5, 4.2, 1.3], - [1, 4.7, 3.2, 4.1, 4.2] + [0, {'_ID': 0, 'input_feature_0': 0.0, 'input_feature_1': 1.0}], + [1, {'_ID': 1, 'input_feature_0': 2.0, 'input_feature_1': 3.0}], } Each row is represented as a list, where the first element denotes the index of the row. Returns: Two possible responses: - For success, return a JSON response {"data": [[0, 1], [1, 2]]}, where the first element of each resulting list - denotes the index of the row, and the rest of the elements represent the prediction results for that row. 
+ For success, return a JSON response + { + "data": [ + [0, {'_ID': 0, 'output': 1}], + [1, {'_ID': 1, 'output': 2}] + ] + }, + The first element of each resulting list denotes the index of the row, and the rest of the elements + represent the prediction results for that row. For an error, return {"error": error_message, "status_code": http_response_status_code}. """ + assert _LOADED_MODEL, "model is not loaded" + assert _LOADED_META, "model metadata is not loaded" + from snowflake.ml.model.model_signature import FeatureSpec + try: input = await request.json() + features = cast(List[FeatureSpec], _LOADED_META.signatures[TARGET_METHOD].inputs) + dtype_map = {feature.name: feature.as_dtype() for feature in features} + input_cols = [spec.name for spec in features] + output_cols = [spec.name for spec in _LOADED_META.signatures[TARGET_METHOD].outputs] assert "data" in input, "missing data field in the request input" # The expression x[1:] is used to exclude the index of the data row. - input_data = [x[1:] for x in input.get("data")] - x = pd.DataFrame(input_data) + input_data = [x[1] for x in input.get("data")] + df = pd.json_normalize(input_data).astype(dtype=dtype_map) + x = df[input_cols] assert len(input_data) != 0 and not all(not row for row in input_data), "empty data" except Exception as e: error_message = f"Input data malformed: {str(e)}" return responses.JSONResponse({"error": error_message}, status_code=400) - assert loaded_model - try: # TODO(shchen): SNOW-835369, Support target method in inference server (Multi-task model). # Mypy ignore will be fixed along with the above ticket. - predictions = loaded_model.predict(x) # type: ignore[attr-defined] - result = predictions.to_records(index=True).tolist() - response = {"data": result} + predictions_df = _LOADED_MODEL.predict(x) # type: ignore[attr-defined] + predictions_df.columns = output_cols + # Use _ID to keep the order of prediction result and associated features. 
+ _KEEP_ORDER_COL_NAME = "_ID" + if _KEEP_ORDER_COL_NAME in df.columns: + predictions_df[_KEEP_ORDER_COL_NAME] = df[_KEEP_ORDER_COL_NAME] + response = {"data": [[i, row] for i, row in enumerate(predictions_df.to_dict(orient="records"))]} return responses.JSONResponse(response) except Exception as e: error_message = f"Prediction failed: {str(e)}" diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py index 853f1a41..63630742 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py @@ -1,3 +1,6 @@ +import os +from typing import Tuple + import pandas as pd import sklearn.datasets as datasets import sklearn.neighbors as neighbors @@ -5,7 +8,7 @@ from absl.testing.absltest import mock from starlette import testclient -from snowflake.ml.model import custom_model +from snowflake.ml.model import _model as model_api, _model_meta, custom_model class MainTest(absltest.TestCase): @@ -19,15 +22,12 @@ class MainTest(absltest.TestCase): def setUp(self) -> None: super().setUp() - from main import app self.client = testclient.TestClient(app) + self.loaded_sklearn_model, self.loaded_sklearn_meta = self.get_custom_sklearn_model() - self.loaded_model = self.get_custom_model() - - def get_custom_model(self) -> custom_model.CustomModel: - # Set up a mock model + def get_custom_sklearn_model(self) -> Tuple[custom_model.CustomModel, _model_meta.ModelMetadata]: iris = datasets.load_iris(as_frame=True) x = iris.data y = iris.target @@ -42,74 +42,140 @@ def __init__(self, context: custom_model.ModelContext) -> None: def predict(self, input: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame(knn_model.predict(input)) - return TestCustomModel(custom_model.ModelContext()) + model = TestCustomModel(custom_model.ModelContext()) + tmpdir = self.create_tempdir() + model_name = "model_name" + model_api.save_model( + name=model_name, + model_dir_path=os.path.join(tmpdir.full_path, model_name), + model=model, + sample_input=x, + metadata={"author": "halu", "version": "1"}, + ) + return model_api._load_model_for_deploy(model_dir_path=os.path.join(tmpdir, model_name)) def test_ready_endpoint(self) -> None: - with mock.patch("main.loaded_model", return_value=self.loaded_model): - response = self.client.get("/health") - self.assertEqual(response.status_code, 200) - self.assertEqual(response.json(), {"status": "ready"}) + response = self.client.get("/health") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json(), {"status": "ready"}) def test_predict_endpoint_happy_path(self) -> None: + loaded_model, loaded_meta = self.get_custom_sklearn_model() + + # Construct data input based on external function data input format data = { - "data": [[0, 5.1, 3.5, 4.2, 1.3], [1, 4.7, 3.2, 4.1, 4.2], [2, 5.1, 3.5, 4.2, 4.6], [3, 4.7, 3.2, 4.1, 5.1]] + "data": [ + [ + 0, + { + "_ID": 0, + "sepal length (cm)": 5.1, + "sepal width (cm)": 3.5, + "petal length (cm)": 4.2, + "petal width (cm)": 1.3, + }, + ], + [ + 1, + { + "_ID": 1, + "sepal length (cm)": 4.7, + "sepal width (cm)": 3.2, + "petal length (cm)": 4.1, + "petal width (cm)": 4.2, + }, + ], + ] } - with mock.patch("main.loaded_model", self.loaded_model): + with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json=data) - 
self.assertEqual(response.status_code, 200) - expected_response = {"data": [[0, 1], [1, 2], [2, 2], [3, 2]]} + expected_response = {"data": [[0, {"output_feature_0": 1, "_ID": 0}], [1, {"output_feature_0": 2, "_ID": 1}]]} self.assertEqual(response.json(), expected_response) + # def test_predict_endpoint_with_invalid_input(self) -> None: - response = self.client.post("/predict", json={}) - self.assertEqual(response.status_code, 400) - self.assertRegex(response.text, "Input data malformed: missing data field in the request input") - - response = self.client.post("/predict", json={"data": []}) - self.assertEqual(response.status_code, 400) - self.assertRegex(response.text, "Input data malformed: empty data") - - # Input data with indexes only. - response = self.client.post("/predict", json={"data": [[0], [1]]}) - self.assertEqual(response.status_code, 400) - self.assertRegex(response.text, "Input data malformed: empty data") - - response = self.client.post( - "/predict", - json={ - "foo": [ - [1, 2], - [2, 3], - ] - }, - ) - self.assertEqual(response.status_code, 400) - self.assertRegex(response.text, "Input data malformed: missing data field in the request input") + loaded_model, loaded_meta = self.get_custom_sklearn_model() + with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", loaded_meta): + response = self.client.post("/predict", json={}) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed: missing data field in the request input") + response = self.client.post("/predict", json={"data": []}) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed") + + # Input data with indexes only. + response = self.client.post("/predict", json={"data": [[0], [1]]}) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed") + + response = self.client.post( + "/predict", + json={ + "foo": [ + [1, 2], + [2, 3], + ] + }, + ) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed: missing data field in the request input") + + # def test_predict_with_misshaped_data(self) -> None: - data = {"data": [[0, 5.1, 3.5, 4.2], [1, 4.7, 3.2, 4.1], [2, 5.1, 3.5, 4.2], [3, 4.7, 3.2, 4.1]]} + loaded_model, loaded_meta = self.get_custom_sklearn_model() + + data = { + "data": [ + [ + 0, + { + "_ID": 0, + "sepal length (cm)": 5.1, + "sepal width (cm)": 3.5, + "petal length (cm)": 4.2, + }, + ], + [ + 1, + { + "_ID": 1, + "sepal length (cm)": 4.7, + "sepal width (cm)": 3.2, + "petal length (cm)": 4.1, + }, + ], + ] + } - with mock.patch("main.loaded_model", self.loaded_model): + with mock.patch("main._LOADED_MODEL", loaded_model), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json=data) self.assertEqual(response.status_code, 400) - self.assertRegex( - response.text, - "Prediction failed: X has 3 features, but KNeighborsClassifier is " "expecting 4 features as input", - ) + self.assertRegex(response.text, r"Input data malformed: .*dtype mappings argument.*") def test_predict_with_incorrect_data_type(self) -> None: + loaded_model, loaded_meta = self.get_custom_sklearn_model() data = { "data": [ - [0, "a", "b", "c", "d"], + [ + 0, + { + "_ID": 0, + "sepal length (cm)": "a", + "sepal width (cm)": "b", + "petal length (cm)": "c", + "petal width (cm)": "d", + }, + ] ] } - with mock.patch("main.loaded_model", self.loaded_model): + with mock.patch("main._LOADED_MODEL", 
loaded_model), mock.patch("main._LOADED_META", loaded_meta): response = self.client.post("/predict", json=data) self.assertEqual(response.status_code, 400) - self.assertRegex(response.text, "Prediction failed: could not convert string to float") + self.assertRegex(response.text, "Input data malformed: could not convert string to float") if __name__ == "__main__": diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template index c0cc7091..b9faa3a0 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template @@ -1,14 +1,14 @@ FROM $base_image as build -COPY $model_dir/env/conda.yaml conda.yaml -COPY $model_dir/env/requirements.txt requirements.txt +COPY $model_env_folder/conda.yaml conda.yaml +COPY $model_env_folder/requirements.txt requirements.txt # Set MAMBA_DOCKERFILE_ACTIVATE=1 to activate the conda environment during build time. ARG MAMBA_DOCKERFILE_ACTIVATE=1 # The micromamba image comes with an empty environment named base. RUN --mount=type=cache,target=/opt/conda/pkgs micromamba install -y -n base -f conda.yaml && \ - python -m pip install "uvicorn[standard]" gunicorn starlette && \ + python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt FROM debian:buster-slim AS runtime @@ -25,9 +25,6 @@ RUN adduser --disabled-password \ COPY $inference_server_dir ./$inference_server_dir COPY $entrypoint_script ./$entrypoint_script RUN chmod +x /$entrypoint_script - # Copy Snowflake/ml source code -# TODO: not needed as source code is either in model, or pulled from conda -COPY snowflake ./snowflake # The mamba root prefix by default is set to /opt/conda, in which the base conda environment is built at. 
COPY --from=build /opt/conda /opt/conda diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture index e877df48..aa44a8f6 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture @@ -1,10 +1,10 @@ FROM mambaorg/micromamba:1.4.3 as build -COPY model_dir/env/conda.yaml conda.yaml -COPY model_dir/env/requirements.txt requirements.txt +COPY env/conda.yaml conda.yaml +COPY env/requirements.txt requirements.txt ARG MAMBA_DOCKERFILE_ACTIVATE=1 RUN --mount=type=cache,target=/opt/conda/pkgs micromamba install -y -n base -f conda.yaml && \ - python -m pip install "uvicorn[standard]" gunicorn starlette && \ + python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ python -m pip install -r requirements.txt FROM debian:buster-slim AS runtime @@ -21,7 +21,6 @@ RUN adduser --disabled-password \ COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh RUN chmod +x /gunicorn_run.sh -COPY snowflake ./snowflake COPY --from=build /opt/conda /opt/conda EXPOSE 5000 diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture new file mode 100644 index 00000000..6c76264a --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_gpu_fixture @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:focal-cuda-11.7.1 as build + +COPY env/conda.yaml conda.yaml +COPY env/requirements.txt requirements.txt +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +RUN --mount=type=cache,target=/opt/conda/pkgs micromamba install -y -n base -f conda.yaml && \ + python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ + python -m pip install -r requirements.txt + +FROM debian:buster-slim AS runtime + +ENV USER nonrootuser +ENV UID 1000 +ENV HOME /home/$USER +RUN adduser --disabled-password \ + --gecos "A non-root user for running inference server" \ + --uid $UID \ + --home $HOME \ + $USER + +COPY inference_server ./inference_server +COPY gunicorn_run.sh ./gunicorn_run.sh +RUN chmod +x /gunicorn_run.sh +COPY --from=build /opt/conda /opt/conda +EXPOSE 5000 + +USER nonrootuser + +CMD ["/gunicorn_run.sh"] diff --git a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel index c575cd21..65bde879 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel @@ -14,10 +14,14 @@ py_library( name = "deploy", srcs = ["deploy.py"], deps = [ + "//snowflake/ml/model:_model", + "//snowflake/ml/model:_model_meta", + "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_deploy_client/image_builds:base_image_builder", "//snowflake/ml/model/_deploy_client/image_builds:client_image_builder", ":deploy_options", - "//snowflake/ml/model/_deploy_client/utils:snowservice_client" + "//snowflake/ml/model/_deploy_client/utils:snowservice_client", + "//snowflake/ml/_internal:file_utils" ], data = [ "templates/service_spec_template" diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy.py b/snowflake/ml/model/_deploy_client/snowservice/deploy.py index 92148bea..83a16269 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy.py +++ 
b/snowflake/ml/model/_deploy_client/snowservice/deploy.py @@ -4,17 +4,16 @@ import string import tempfile from abc import ABC -from typing import Any, Dict, cast +from typing import Any, Dict, Optional, cast from typing_extensions import Unpack -from snowflake.ml.model._deploy_client.image_builds import ( - base_image_builder, - client_image_builder, -) +from snowflake.ml._internal import file_utils +from snowflake.ml.model import _model, _model_meta, type_hints +from snowflake.ml.model._deploy_client.image_builds import client_image_builder from snowflake.ml.model._deploy_client.snowservice import deploy_options from snowflake.ml.model._deploy_client.utils import constants, snowservice_client -from snowflake.snowpark import Session +from snowflake.snowpark import FileOperation, Session def _deploy( @@ -23,8 +22,9 @@ def _deploy( model_id: str, service_func_name: str, model_zip_stage_path: str, - **kwargs: Unpack[deploy_options.SnowServiceDeployOptionsTypedHint], -) -> None: + deployment_stage_path: str, + **kwargs: Unpack[type_hints.SnowparkContainerServiceDeployOptions], +) -> _model_meta.ModelMetadata: """Entrypoint for model deployment to SnowService. This function will trigger a docker image build followed by workflow deployment to SnowService. @@ -33,12 +33,16 @@ def _deploy( model_id: Unique hex string of length 32, provided by model registry. service_func_name: The service function name in SnowService associated with the created service. model_zip_stage_path: Path to model zip file in stage. Note that this path has a "@" prefix. + deployment_stage_path: Path to stage containing deployment artifacts. **kwargs: various SnowService deployment options. Raises: ValueError: Raised when model_id is empty. ValueError: Raised when service_func_name is empty. ValueError: Raised when model_stage_file_path is empty. + + Returns: + The metadata of the model that has been deployed. 
""" snowpark_logger = logging.getLogger("snowflake.snowpark") snowflake_connector_logger = logging.getLogger("snowflake.connector") @@ -57,31 +61,79 @@ def _deploy( raise ValueError( 'Must provide a non-empty string for "model_stage_file_path" when deploying to SnowService' ) + if not deployment_stage_path: + raise ValueError( + 'Must provide a non-empty string for "deployment_stage_path" when deploying to SnowService' + ) + + # Remove full qualified name to avoid double quotes corrupting the service spec + model_zip_stage_path = model_zip_stage_path.replace('"', "") + deployment_stage_path = deployment_stage_path.replace('"', "") + assert model_zip_stage_path.startswith("@"), f"stage path should start with @, actual: {model_zip_stage_path}" + assert deployment_stage_path.startswith("@"), f"stage path should start with @, actual: {deployment_stage_path}" options = deploy_options.SnowServiceDeployOptions.from_dict(cast(Dict[str, Any], kwargs)) - image_builder = client_image_builder.ClientImageBuilder( - id=model_id, image_repo=options.image_repo, model_zip_stage_path=model_zip_stage_path, session=session - ) - ss_deployment = SnowServiceDeployment( - session=session, - model_id=model_id, - service_func_name=service_func_name, - model_zip_stage_path=model_zip_stage_path, - image_builder=image_builder, - options=options, - ) - ss_deployment.deploy() + + # TODO[shchen]: SNOW-863701, Explore ways to prevent entire model zip being downloaded during deploy step + # (for both warehouse and snowservice deployment) + # One alternative is for model registry to duplicate the model metadata and env dependency storage from model + # zip so that we don't have to pull down the entire model zip. + fo = FileOperation(session=session) + zf = fo.get_stream(model_zip_stage_path) + with file_utils.unzip_stream_in_temp_dir(stream=zf) as temp_local_model_dir_path: + # Download the model zip file that is already uploaded to stage during model registry log_model step. + # This is needed in order to obtain the conda and requirement file inside the model zip, as well as to + # return the model object needed for deployment info tracking. + ss_deployment = SnowServiceDeployment( + session=session, + model_id=model_id, + service_func_name=service_func_name, + model_zip_stage_path=model_zip_stage_path, # Pass down model_zip_stage_path for service spec file + deployment_stage_path=deployment_stage_path, + model_dir=temp_local_model_dir_path, + options=options, + ) + ss_deployment.deploy() + meta = _model.load_model(model_dir_path=temp_local_model_dir_path, meta_only=True) + return meta finally: # Preserve the original logging level. snowpark_logger.setLevel(snowpark_log_level) snowflake_connector_logger.setLevel(snowflake_connector_log_level) +def _get_or_create_image_repo(session: Session, *, image_repo: Optional[str]) -> str: + def _sanitize_dns_url(url: str) -> str: + # Align with existing SnowService image registry url standard. 
+ return url.lower()
+
+ if image_repo:
+ return _sanitize_dns_url(image_repo)
+
+ try:
+ conn = session._conn._conn
+ org = conn.host.split(".")[1]
+ account = conn.account
+ db = conn._database
+ schema = conn._schema
+ subdomain = constants.PROD_IMAGE_REGISTRY_SUBDOMAIN
+ sanitized_url = _sanitize_dns_url(
+ f"{org}-{account}.{subdomain}.{constants.PROD_IMAGE_REGISTRY_DOMAIN}/{db}/"
+ f"{schema}/{constants.SNOWML_IMAGE_REPO}"
+ )
+ client = snowservice_client.SnowServiceClient(session)
+ client.create_image_repo(constants.SNOWML_IMAGE_REPO)
+ return sanitized_url
+ except Exception:
+ raise RuntimeError(
+ "Failed to construct image repo URL, please ensure the following connection "
+ "parameters are set in your session: ['host', 'account', 'database', 'schema']"
+ )
+
+
 class SnowServiceDeployment(ABC):
 """
 Class implementation that encapsulates image build and workflow deployment to SnowService
-
- #TODO[shchen], SNOW-830093 GPU support on model deployment to SnowService
 """

 def __init__(
@@ -89,8 +141,9 @@ def __init__(
 session: Session,
 model_id: str,
 service_func_name: str,
+ model_dir: str,
 model_zip_stage_path: str,
- image_builder: base_image_builder.ImageBuilder,
+ deployment_stage_path: str,
 options: deploy_options.SnowServiceDeployOptions,
 ) -> None:
 """Initialization

@@ -100,8 +153,9 @@ def __init__(
 model_id: Unique hex string of length 32, provided by model registry; if not provided, auto-generate one for
 resource naming.The model_id serves as an idempotent key throughout the deployment workflow.
 service_func_name: The service function name in SnowService associated with the created service.
+ model_dir: Local model directory, downloaded from stage and extracted.
 model_zip_stage_path: Path to model zip file in stage.
- image_builder: InferenceImageBuilder instance that handles image build and upload to image registry.
+ deployment_stage_path: Path to stage containing deployment artifacts.
 options: A SnowServiceDeployOptions object containing deployment options.
 """

@@ -109,11 +163,11 @@ def __init__(
 self.id = model_id
 self.service_func_name = service_func_name
 self.model_zip_stage_path = model_zip_stage_path
- self.image_builder = image_builder
+ self.model_dir = model_dir
 self.options = options
 self._service_name = f"service_{model_id}"
 # Spec file and future deployment related artifacts will be stored under {stage}/models/{model_id}
- self._model_artifact_stage_location = posixpath.join(options.stage, "models", self.id)
+ self._model_artifact_stage_location = posixpath.join(deployment_stage_path, "models", self.id)

 def deploy(self) -> None:
 """
@@ -121,9 +175,18 @@
 """
 if self.options.prebuilt_snowflake_image:
 image = self.options.prebuilt_snowflake_image
- logging.info(f"Skipped image build. Use Snowflake prebuilt image: {self.options.prebuilt_snowflake_image}")
+ logging.warning(f"Skipped image build. Use prebuilt image: {self.options.prebuilt_snowflake_image}")
 else:
+ logging.warning(
+ "Building the Docker image and deploying to Snowpark Container Service. "
+ "This process may take a few minutes."
+ )
 image = self._build_and_upload_image()
+
+ logging.warning(
+ f"Image successfully built! 
To prevent the need for rebuilding the Docker image in future deployments, " + f"simply specify 'prebuilt_snowflake_image': '{image}' in the options field of the deploy() function" + ) self._deploy_workflow(image) def _build_and_upload_image(self) -> str: @@ -132,7 +195,15 @@ def _build_and_upload_image(self) -> str: Returns: Path to the image in the remote image repository. """ - return self.image_builder.build_and_upload_image() + image_repo = _get_or_create_image_repo(self.session, image_repo=self.options.image_repo) + image_builder = client_image_builder.ClientImageBuilder( + id=self.id, + image_repo=image_repo, + model_dir=self.model_dir, + session=self.session, + use_gpu=True if self.options.use_gpu else False, + ) + return image_builder.build_and_upload_image() def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: """Constructs and upload service spec to stage. @@ -152,7 +223,7 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: { "image": image, "predict_endpoint_name": constants.PREDICT, - "stage": self.options.stage, + "model_stage": self.model_zip_stage_path[1:].split("/")[0], # Reserve only the stage name "model_zip_stage_path": self.model_zip_stage_path[1:], # Remove the @ prefix "inference_server_container_name": constants.INFERENCE_SERVER_CONTAINER, } diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py index 3de24a53..08cd018b 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py @@ -1,70 +1,49 @@ -from typing import Any, Dict, Optional, TypedDict - -from typing_extensions import NotRequired +import inspect +from typing import Any, Dict, Optional from snowflake.ml.model._deploy_client.utils import constants -class SnowServiceDeployOptionsTypedHint(TypedDict): - """Deployment options for deploying to SnowService. - - stage: the name of the stage for uploading artifacts. - compute_pool: SnowService compute pool name. - image_repo: SnowService image repo path. e.g. "///" - min_instances: Minimum number of service replicas. - max_instances: Maximum number of service replicas. - endpoint: The specific name of the endpoint that the service function will communicate with. Default to - "predict". This option is useful when service has multiple endpoints. - overridden_base_image: When provided, it will override the base image. - """ - - stage: str - compute_pool: str - image_repo: str - min_instances: NotRequired[int] - max_instances: NotRequired[int] - endpoint: NotRequired[str] - overridden_base_image: NotRequired[str] - - class SnowServiceDeployOptions: def __init__( self, - stage: str, compute_pool: str, - image_repo: str, *, - min_instances: int = 1, - max_instances: int = 1, - endpoint: str = constants.PREDICT, - overridden_base_image: Optional[str] = None, + image_repo: Optional[str] = None, + min_instances: Optional[int] = 1, + max_instances: Optional[int] = 1, + endpoint: Optional[str] = constants.PREDICT, prebuilt_snowflake_image: Optional[str] = None, + use_gpu: Optional[bool] = False, ) -> None: """Initialization + When updated, please ensure the type hint is updated accordingly at: //snowflake/ml/model/type_hints + Args: - stage: the name of the stage for uploading artifacts. - compute_pool: SnowService compute pool name. - image_repo: SnowService image repo path. e.g. "///" - min_instances: Minimum number of service replicas. 
- max_instances: Maximum number of service replicas.
- endpoint: The specific name of the endpoint that the service function will communicate with. Default to
- "predict". This option is useful when service has multiple endpoints.
- overridden_base_image: When provided, it will override the base image.
- prebuilt_snowflake_image: When provided, the image building step is skipped, and the pre-built image from
+ compute_pool: SnowService compute pool name. Please refer to official doc for how to create a
+ compute pool: https://docs.snowflake.com/LIMITEDACCESS/snowpark-containers/reference/compute-pool
+ image_repo: SnowService image repo path. e.g. "///". Defaults to a repo auto-inferred
+ from the session information.
+ min_instances: Minimum number of service replicas. Defaults to 1.
+ max_instances: Maximum number of service replicas. Defaults to 1.
+ endpoint: The specific name of the endpoint that the service function will communicate with. This option is
+ useful when the service has multiple endpoints. Defaults to "predict".
+ prebuilt_snowflake_image: When provided, the image-building step is skipped, and the pre-built image from
 Snowflake is used as is. This option is for users who consistently use the same image for multiple use
 cases, allowing faster deployment. The snowflake image used for deployment is logged to the console for
- future use.
+ future use. Defaults to None.
+ use_gpu: When set to True, a CUDA-enabled Docker image will be used to provide a runtime CUDA environment.
+ Defaults to False.
 """
- self.stage = stage
 self.compute_pool = compute_pool
 self.image_repo = image_repo
 self.min_instances = min_instances
 self.max_instances = max_instances
 self.endpoint = endpoint
- self.overridden_base_image = overridden_base_image
 self.prebuilt_snowflake_image = prebuilt_snowflake_image
+ self.use_gpu = use_gpu

 @classmethod
 def from_dict(cls, options_dict: Dict[str, Any]) -> "SnowServiceDeployOptions":
@@ -79,10 +58,10 @@ def from_dict(cls, options_dict: Dict[str, Any]) -> "SnowServiceDeployOptions":
 Returns:
 A SnowServiceDeployOptions object
 """
- required_options = [constants.STAGE, constants.COMPUTE_POOL, constants.IMAGE_REPO]
+ required_options = [constants.COMPUTE_POOL]
 missing_keys = [key for key in required_options if options_dict.get(key) is None]
 if missing_keys:
 raise ValueError(f"Must provide options when deploying to SnowService: {', '.join(missing_keys)}")
- # SnowService image repo cannot handle upper case repo name. 
- options_dict[constants.IMAGE_REPO] = options_dict[constants.IMAGE_REPO].lower() - return cls(**options_dict) + supported_options_keys = inspect.signature(cls.__init__).parameters.keys() + filtered_options = {k: v for k, v in options_dict.items() if k in supported_options_keys} + return cls(**filtered_options) diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py index aea0b528..c8bdeb9e 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py @@ -3,14 +3,23 @@ from absl.testing import absltest from absl.testing.absltest import mock -from snowflake.ml.model._deploy_client.image_builds import client_image_builder from snowflake.ml.model._deploy_client.snowservice import deploy_options from snowflake.ml.model._deploy_client.snowservice.deploy import ( SnowServiceDeployment, _deploy, + _get_or_create_image_repo, ) +from snowflake.ml.model._deploy_client.utils import constants from snowflake.ml.test_utils import mock_session -from snowflake.snowpark import session +from snowflake.snowpark import FileOperation, session + + +class Connection: + def __init__(self, host: str, account: str, database: str, schema: str) -> None: + self.host = host + self.account = account + self._database = database + self._schema = schema class DeployTestCase(absltest.TestCase): @@ -18,32 +27,47 @@ def setUp(self) -> None: super().setUp() self.m_session = cast(session.Session, mock_session.MockSession(conn=None, test_case=self)) self.options: Dict[str, Any] = { - "stage": "mock_stage", "compute_pool": "mock_compute_pool", "image_repo": "mock_image_repo", } + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.file_utils") # type: ignore @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore - def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock) -> None: + def test_deploy_with_model_id( + self, m_deployment_class: mock.MagicMock, m_file_utils_class: mock.MagicMock, m_model_class: mock.MagicMock + ) -> None: m_deployment = m_deployment_class.return_value - - _deploy( - session=self.m_session, - model_id="provided_model_id", - service_func_name="mock_service_func", - model_zip_stage_path="@mock_model_zip_stage_path", - **self.options, - ) - - m_deployment_class.assert_called_once_with( - session=self.m_session, - model_id="provided_model_id", - service_func_name="mock_service_func", - model_zip_stage_path="@mock_model_zip_stage_path", - image_builder=mock.ANY, - options=mock.ANY, - ) - m_deployment.deploy.assert_called_once() + m_file_utils = m_file_utils_class.return_value + + m_extracted_model_dir = "mock_extracted_model_dir" + m_model_zip_stage_path = "@mock_model_zip_stage_path/model.zip" + m_deployment_stage_path = "@mock_model_deployment_stage_path" + + with mock.patch.object(FileOperation, "get_stream", return_value=None): + with mock.patch.object(m_file_utils, "unzip_stream_in_temp_dir", return_value=m_extracted_model_dir): + _deploy( + session=self.m_session, + model_id="provided_model_id", + service_func_name="mock_service_func", + model_zip_stage_path=m_model_zip_stage_path, + deployment_stage_path=m_deployment_stage_path, + **self.options, + ) + + # TODO: for some reason mock is not wired up properly + # 
m_model.load_model.assert_called_once_with(model_dir_path=m_extracted_model_dir, meta_only=True) + + m_deployment_class.assert_called_once_with( + session=self.m_session, + model_id="provided_model_id", + service_func_name="mock_service_func", + model_zip_stage_path=m_model_zip_stage_path, + deployment_stage_path=m_deployment_stage_path, + model_dir=mock.ANY, + options=mock.ANY, + ) + m_deployment.deploy.assert_called_once() @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> None: @@ -52,7 +76,8 @@ def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> session=self.m_session, service_func_name="mock_service_func", model_id="", - model_zip_stage_path="mock_model_zip_stage_path", + model_zip_stage_path="@mock_model_zip_stage_path/model.zip", + deployment_stage_path="@mock_model_deployment_stage_path", **self.options, ) @@ -60,47 +85,55 @@ def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore def test_deploy_with_missing_required_options(self, m_deployment_class: mock.MagicMock) -> None: - with self.assertRaisesRegex(ValueError, "stage, image_repo"): - options: Dict[str, Any] = {"compute_pool": "mock_compute_pool"} + with self.assertRaisesRegex(ValueError, "compute_pool"): + options: Dict[str, Any] = {} _deploy( session=self.m_session, service_func_name="mock_service_func", model_id="mock_model_id", - model_zip_stage_path="@mock_model_zip_stage_path", + model_zip_stage_path="@mock_model_zip_stage_path/model.zip", + deployment_stage_path="@mock_model_deployment_stage_path", **options, ) + m_deployment_class.assert_not_called() - with self.assertRaisesRegex(ValueError, "stage"): - options = {"compute_pool": "mock_compute_pool", "image_repo": "mock_image_repo"} - _deploy( - session=self.m_session, - service_func_name="mock_service_func", - model_id="mock_model_id", - model_zip_stage_path="@mock_model_zip_stage_path", - **options, - ) + @mock.patch( + "snowflake.ml.model._deploy_client.snowservice.deploy." "snowservice_client.SnowServiceClient" + ) # type: ignore + def test_get_or_create_image_repo(self, m_snowservice_client_class: mock.MagicMock) -> None: + # Test when image repo url is provided. 
+ self.assertEqual( + _get_or_create_image_repo( + self.m_session, image_repo="org-account.registry-dev.snowflakecomputing.com/DB/SCHEMA/REPO" + ), + "org-account.registry-dev.snowflakecomputing.com/db/schema/repo", + ) - with self.assertRaisesRegex(ValueError, "image_repo"): - options = {"stage": "mock_stage", "compute_pool": "mock_compute_pool"} - _deploy( - session=self.m_session, - service_func_name="mock_service_func", - model_id="mock_model_id", - model_zip_stage_path="@mock_model_zip_stage_path", - **options, - ) + # Test when session is missing component(db/schema etc) in order to construct image repo url + with self.assertRaises(RuntimeError): + _get_or_create_image_repo(self.m_session, image_repo=None) - m_deployment_class.assert_not_called() + # Test constructing image repo from session object + self.m_session._conn = mock.MagicMock() + self.m_session._conn._conn = Connection( + host="account.org.us-west-2.aws.snowflakecomputing.com", account="account", database="DB", schema="SCHEMA" + ) # type: ignore + + m_snowservice_client = m_snowservice_client_class.return_value + expected = f"org-account.registry.snowflakecomputing.com/db/schema/{constants.SNOWML_IMAGE_REPO}" + self.assertEqual(_get_or_create_image_repo(self.m_session, image_repo=None), expected) + m_snowservice_client.create_image_repo.assert_called_with(constants.SNOWML_IMAGE_REPO) class SnowServiceDeploymentTestCase(absltest.TestCase): def setUp(self) -> None: super().setUp() self.m_session = cast(session.Session, mock_session.MockSession(conn=None, test_case=self)) - self.m_image_builder = mock.create_autospec(client_image_builder.ClientImageBuilder) self.m_model_id = "provided_model_id" self.m_service_func_name = "provided_service_func_name" - self.m_model_zip_stage_path = "@provided_model_zip_stage_path" + self.m_model_zip_stage_path = "@provided_model_zip_stage_path/model.zip" + self.m_deployment_stage_path = "@mock_model_deployment_stage_path" + self.m_model_dir = "tmp/local_model.zip" self.m_options = { "stage": "mock_stage", "compute_pool": "mock_compute_pool", @@ -111,8 +144,9 @@ def setUp(self) -> None: self.m_session, model_id=self.m_model_id, service_func_name=self.m_service_func_name, + model_dir=self.m_model_dir, model_zip_stage_path=self.m_model_zip_stage_path, - image_builder=self.m_image_builder, + deployment_stage_path=self.m_deployment_stage_path, options=deploy_options.SnowServiceDeployOptions.from_dict(self.m_options), ) @@ -124,9 +158,17 @@ def test_deploy(self) -> None: m_build_and_upload_image.assert_called_once() m_deploy_workflow.assert_called_once() - def test_build_and_upload_image(self) -> None: - self.deployment._build_and_upload_image() - self.m_image_builder.build_and_upload_image.assert_called_once() + @mock.patch( + "snowflake.ml.model._deploy_client.snowservice.deploy.client_image_builder" ".ClientImageBuilder" + ) # type: ignore + def test_build_and_upload_image(self, client_image_builder_class: mock.MagicMock) -> None: + m_image_builder = client_image_builder_class.return_value + with mock.patch.object( + m_image_builder, "build_and_upload_image", return_value="image_path" + ) as mock_build_and_upload: + res = self.deployment._build_and_upload_image() + mock_build_and_upload.assert_called_once() + self.assertEqual(res, "image_path") if __name__ == "__main__": diff --git a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template index 634d4245..39994ed0 100644 --- 
a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +++ b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template @@ -11,7 +11,7 @@ spec: - name: vol1 mountPath: /local/user/vol1 - name: stage - mountPath: ${stage} + mountPath: ${model_stage} endpoint: - name: ${predict_endpoint_name} port: 5000 @@ -19,6 +19,6 @@ spec: - name: vol1 source: local # only local emptyDir volume is supported - name: stage - source: "@${stage}" + source: "@${model_stage}" uid: 1000 gid: 1000 diff --git a/snowflake/ml/model/_deploy_client/utils/constants.py b/snowflake/ml/model/_deploy_client/utils/constants.py index 8bb88666..1404db08 100644 --- a/snowflake/ml/model/_deploy_client/utils/constants.py +++ b/snowflake/ml/model/_deploy_client/utils/constants.py @@ -32,7 +32,6 @@ class ResourceStatus(Enum): PREDICT = "predict" STAGE = "stage" COMPUTE_POOL = "compute_pool" -IMAGE_REPO = "image_repo" MIN_INSTANCES = "min_instances" MAX_INSTANCES = "max_instances" GPU_COUNT = "gpu" @@ -42,6 +41,12 @@ class ResourceStatus(Enum): INFERENCE_SERVER_CONTAINER = "inference-server" """Image build related constants""" +SNOWML_IMAGE_REPO = "snowml_repo" MODEL_DIR = "model_dir" INFERENCE_SERVER_DIR = "inference_server" ENTRYPOINT_SCRIPT = "gunicorn_run.sh" +PROD_IMAGE_REGISTRY_DOMAIN = "snowflakecomputing.com" +PROD_IMAGE_REGISTRY_SUBDOMAIN = "registry" +DEV_IMAGE_REGISTRY_SUBDOMAIN = "registry-dev" +MODEL_ENV_FOLDER = "env" +CONDA_FILE = "conda.yaml" diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py index 4902e48c..4300bc13 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py @@ -20,14 +20,17 @@ def __init__(self, session: Session) -> None: """ self.session = session + def create_image_repo(self, repo_name: str) -> None: + self.session.sql(f"CREATE OR REPLACE IMAGE REPOSITORY {repo_name}").collect() + def create_or_replace_service( self, service_name: str, compute_pool: str, spec_stage_location: str, *, - min_instances: int = 1, - max_instances: int = 1, + min_instances: Optional[int] = 1, + max_instances: Optional[int] = 1, ) -> None: """Create or replace service. Since SnowService doesn't support the CREATE OR REPLACE service syntax, we will first attempt to drop the service if it exists, and then create the service. Please note that this approach may @@ -40,13 +43,14 @@ def create_or_replace_service( compute_pool: Name of the compute pool. spec_stage_location: Stage path for the service spec. 
""" + assert spec_stage_location.startswith("@"), f"stage path should start with @, actual: {spec_stage_location}" self._drop_service_if_exists(service_name) sql = f""" CREATE SERVICE {service_name} MIN_INSTANCES={min_instances} MAX_INSTANCES={max_instances} COMPUTE_POOL={compute_pool} - SPEC=@{spec_stage_location} + SPEC={spec_stage_location} """ logging.info(f"Create service with SQL: \n {sql}") self.session.sql(sql).collect() @@ -87,6 +91,7 @@ def create_or_replace_service_function( """ logging.info(f"Create service function with SQL: \n {sql}") self.session.sql(sql).collect() + logging.info(f"Successfully created service function: {service_func_name}") def block_until_resource_is_ready( self, diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py index 09cbc5a1..93124595 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py @@ -22,7 +22,7 @@ def test_create_or_replace_service(self) -> None: m_min_instances = 1 m_max_instances = 2 m_compute_pool = "mock_compute_pool" - m_spec_storgae_location = "mock_spec_storage_location" + m_spec_storgae_location = "@mock_spec_storage_location" self.m_session.add_mock_sql( query="drop service if exists mock_service_name", result=mock_data_frame.MockDataFrame(collect_result=[]) diff --git a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel index a6b00a27..e408b590 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel @@ -16,7 +16,6 @@ py_library( "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/model:_env", "//snowflake/ml/model:_model", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:type_hints", diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy.py b/snowflake/ml/model/_deploy_client/warehouse/deploy.py index 32ca40dd..8a43ae88 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy.py @@ -1,7 +1,6 @@ import os import posixpath import tempfile -import warnings from types import ModuleType from typing import IO, List, Optional, Tuple, TypedDict, Union @@ -9,12 +8,7 @@ from snowflake.ml._internal import env_utils, file_utils from snowflake.ml._internal.utils import identifier -from snowflake.ml.model import ( - _env as model_env, - _model, - _model_meta, - type_hints as model_types, -) +from snowflake.ml.model import _model, _model_meta, type_hints as model_types from snowflake.ml.model._deploy_client.warehouse import infer_template from snowflake.snowpark import session as snowpark_session, types as st @@ -68,14 +62,10 @@ def _deploy_to_warehouse( relax_version = kwargs.get("relax_version", False) - disable_local_conda_resolver = kwargs.get("disable_local_conda_resolver", False) - if target_method not in meta.signatures.keys(): raise ValueError(f"Target method {target_method} does not exist in model.") - final_packages = _get_model_final_packages( - meta, session, relax_version=relax_version, disable_local_conda_resolver=disable_local_conda_resolver - ) + final_packages = _get_model_final_packages(meta, session, relax_version=relax_version) stage_location = kwargs.get("permanent_udf_stage_location", None) if stage_location: @@ -152,7 +142,6 @@ 
def _get_model_final_packages( meta: _model_meta.ModelMetadata, session: snowpark_session.Session, relax_version: Optional[bool] = False, - disable_local_conda_resolver: Optional[bool] = False, ) -> List[str]: """Generate final packages list of dependency of a model to be deployed to warehouse. @@ -161,8 +150,6 @@ def _get_model_final_packages( session: Snowpark connection session. relax_version: Whether or not relax the version restriction when fail to resolve dependencies. Defaults to False. - disable_local_conda_resolver: Set to disable use local conda resolver to do pre-check on environment and rely on - the information schema only. Defaults to False. Raises: RuntimeError: Raised when PIP requirements and dependencies from non-Snowflake anaconda channel found. @@ -173,49 +160,38 @@ def _get_model_final_packages( """ final_packages = None if ( - any(channel.lower() not in ["", "snowflake"] for channel in meta._conda_dependencies.keys()) + any( + channel.lower() not in [env_utils.DEFAULT_CHANNEL_NAME, "snowflake"] + for channel in meta._conda_dependencies.keys() + ) or meta.pip_requirements ): raise RuntimeError("PIP requirements and dependencies from non-Snowflake anaconda channel is not supported.") - deps = meta._conda_dependencies[""] + deps = meta._conda_dependencies[env_utils.DEFAULT_CHANNEL_NAME] - try: - if disable_local_conda_resolver: - raise ImportError("Raise to disable local conda resolver. Should be captured.") - final_packages = env_utils.resolve_conda_environment( - deps, [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], python_version=meta.python_version - ) - if final_packages is None and relax_version: - final_packages = env_utils.resolve_conda_environment( - list(map(env_utils.relax_requirement_version, deps)), - [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], - python_version=meta.python_version, - ) - except ImportError: - warnings.warn( - "Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.", - category=RuntimeWarning, - ) + final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( + session=session, + reqs=deps, + python_version=meta.python_version, + ) + if final_packages is None and relax_version: final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( session=session, - reqs=deps, + reqs=list(map(env_utils.relax_requirement_version, deps)), python_version=meta.python_version, ) - if final_packages is None and relax_version: - final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( - session=session, - reqs=list(map(env_utils.relax_requirement_version, deps)), - python_version=meta.python_version, - ) - finally: + if final_packages is None: + relax_version_info_str = "" if relax_version else "Try to set relax_version as True in the options. " + required_deps = list(map(env_utils.relax_requirement_version, deps)) if relax_version else deps if final_packages is None: raise RuntimeError( "The model's dependency cannot fit into Snowflake Warehouse. " - + "Trying to set relax_version as True in the options. 
Required packages are:\n" - + '"' - + " ".join(map(str, meta._conda_dependencies[""])) - + '"' + + relax_version_info_str + + "Required packages are:\n" + + " ".join(map(lambda x: f'"{x}"', required_deps)) + + "\n Required Python version is: " + + meta.python_version ) return final_packages diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py b/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py index e08abf69..e5005e1c 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py @@ -1,9 +1,6 @@ -import sys -import tempfile import textwrap from importlib import metadata as importlib_metadata from typing import Dict, List, cast -from unittest import mock from absl.testing import absltest from packaging import requirements @@ -37,10 +34,6 @@ class TestFinalPackagesWithoutConda(absltest.TestCase): @classmethod def setUpClass(cls) -> None: - cls._temp_conda = None - if sys.modules.get("conda"): - cls._temp_conda = sys.modules["conda"] - sys.modules["conda"] = None # type: ignore[assignment] env_utils._INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION = None cls.m_session = mock_session.MockSession(conn=None, test_case=None) cls.m_session.add_mock_sql( @@ -68,10 +61,7 @@ def setUp(self) -> None: @classmethod def tearDownClass(cls) -> None: - if cls._temp_conda: - sys.modules["conda"] = cls._temp_conda - else: - del sys.modules["conda"] + pass def add_packages(self, packages_dicts: Dict[str, List[str]]) -> None: pkg_names_str = " OR ".join(f"package_name = '{pkg}'" for pkg in sorted(packages_dicts.keys())) @@ -97,12 +87,9 @@ def test_get_model_final_packages(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} meta = _model_meta.ModelMetadata(name="model1", model_type="custom", signatures=_DUMMY_SIG) c_session = cast(session.Session, self.m_session) - with self.assertWarnsRegex( - RuntimeWarning, - "Cannot find conda resolver", - ): - final_packages = deploy._get_model_final_packages(meta, c_session) - self.assertListEqual(final_packages, list(map(str, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) + + final_packages = deploy._get_model_final_packages(meta, c_session) + self.assertListEqual(final_packages, list(map(str, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) def test_get_model_final_packages_no_relax(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -110,12 +97,8 @@ def test_get_model_final_packages_no_relax(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] ) c_session = cast(session.Session, self.m_session) - with self.assertWarnsRegex( - RuntimeWarning, - "Cannot find conda resolver", - ): - with self.assertRaises(RuntimeError): - deploy._get_model_final_packages(meta, c_session) + with self.assertRaises(RuntimeError): + deploy._get_model_final_packages(meta, c_session) def test_get_model_final_packages_relax(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -123,14 +106,9 @@ def test_get_model_final_packages_relax(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] ) c_session = cast(session.Session, self.m_session) - with self.assertWarnsRegex( - RuntimeWarning, - "Cannot find conda resolver", - ): - final_packages = deploy._get_model_final_packages(meta, c_session, relax_version=True) - self.assertListEqual( - final_packages, sorted(list(map(lambda x: x.name, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) - ) + + final_packages = 
deploy._get_model_final_packages(meta, c_session, relax_version=True) + self.assertListEqual(final_packages, sorted(list(map(lambda x: x.name, _BASIC_DEPENDENCIES_FINAL_PACKAGES)))) def test_get_model_final_packages_with_pip(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -170,120 +148,9 @@ def test_get_model_final_packages_with_non_exist_package(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["python-package"] ) c_session = cast(session.Session, self.m_session) - with self.assertWarnsRegex( - RuntimeWarning, - "Cannot find conda resolver", - ): - with self.assertRaises(RuntimeError): - deploy._get_model_final_packages(meta, c_session) - -class TestFinalPackagesWithCondaWIthoutSnowML(absltest.TestCase): - def setUp(self) -> None: - self.m_session = mock_session.MockSession(conn=None, test_case=self) - - def tearDown(self) -> None: - pass - - def test_get_model_final_packages_disable_conda(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - with mock.patch.object( - env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=None - ) as mock_validate_requirements_in_snowflake_conda_channel: - with self.assertRaises(RuntimeError): - _ = deploy._get_model_final_packages(meta, c_session, disable_local_conda_resolver=True) - mock_validate_requirements_in_snowflake_conda_channel.assert_called_once() - - def test_get_model_final_packages(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - final_packages = deploy._get_model_final_packages(meta, c_session, relax_version=True) - self.assertIsNotNone(final_packages) - - def test_get_model_final_packages_no_relax(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["pandas<1"], - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - with self.assertRaises(RuntimeError): - deploy._get_model_final_packages(meta, c_session) - - def test_get_model_final_packages_relax(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["pandas<1"], - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - final_packages = deploy._get_model_final_packages(meta, c_session, relax_version=True) - self.assertIsNotNone(final_packages) - - def test_get_model_final_packages_with_pip(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - pip_requirements=["python_package"], - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - with self.assertRaises(RuntimeError): - deploy._get_model_final_packages(meta, c_session) - - def 
test_get_model_final_packages_with_other_channel(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["conda-forge::python_package"], - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - with self.assertRaises(RuntimeError): - deploy._get_model_final_packages(meta, c_session) - - def test_get_model_final_packages_with_non_exist_package(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - with _model_meta._create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, - conda_dependencies=["python_package"], - embed_local_ml_library=True, - ) as meta: - c_session = cast(session.Session, self.m_session) - with self.assertRaises(RuntimeError): - deploy._get_model_final_packages(meta, c_session) + with self.assertRaises(RuntimeError): + deploy._get_model_final_packages(meta, c_session) if __name__ == "__main__": diff --git a/snowflake/ml/model/_deployer.py b/snowflake/ml/model/_deployer.py index 7fce1fbd..4e262375 100644 --- a/snowflake/ml/model/_deployer.py +++ b/snowflake/ml/model/_deployer.py @@ -1,21 +1,37 @@ import traceback from enum import Enum -from typing import Optional, TypedDict, Union, overload +from typing import Optional, TypedDict, Union, cast, overload import pandas as pd from typing_extensions import Required from snowflake.ml._internal.utils import identifier from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_deploy +from snowflake.ml.model._deploy_client.utils import constants as snowservice_constants from snowflake.ml.model._deploy_client.warehouse import ( deploy as warehouse_deploy, infer_template, ) +from snowflake.ml.model._signatures import snowpark_handler from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session, functions as F class TargetPlatform(Enum): WAREHOUSE = "warehouse" + SNOWPARK_CONTAINER_SERVICE = "snowpark_container_service" + + def __repr__(self) -> str: + """Construct a string format that works with the "ModelReference" in model_registry.py. Fundamentally, + ModelReference uses the TargetPlatform enum type when constructing the "deploy" function through exec(). + Since "exec" in Python takes input as a string, we need to dynamically construct a full path so that the + enum can be loaded successfully. + + Returns: + A enum string representation. + """ + + return f"{__name__.split('.')[-1]}.{self.__class__.__name__}.{self.name}" class Deployment(TypedDict): @@ -82,6 +98,34 @@ def deploy( ... +@overload +def deploy( + session: Session, + *, + model_id: str, + name: str, + platform: TargetPlatform, + target_method: str, + model_stage_file_path: str, + deployment_stage_path: str, + options: Optional[model_types.DeployOptions], +) -> Optional[Deployment]: + """Create a deployment from a model in a local directory and deploy it to remote platform. + + Args: + session: Snowpark Connection Session. + model_id: Internal model ID string. + name: Name of the deployment for the model. + platform: Target platform to deploy the model. + target_method: The name of the target method to be deployed. + model_stage_file_path: Model file in the stage to be deployed. Must be a file with .zip extension. 
+ deployment_stage_path: Path to stage containing snowpark container service deployment artifacts. + options: Additional options when deploying the model. + Each target platform will have their own specifications of options. + """ + ... + + def deploy( session: Session, *, @@ -90,18 +134,22 @@ def deploy( target_method: str, model_dir_path: Optional[str] = None, model_stage_file_path: Optional[str] = None, + deployment_stage_path: Optional[str] = None, + model_id: Optional[str] = None, options: Optional[model_types.DeployOptions], ) -> Optional[Deployment]: """Create a deployment from a model and deploy it to remote platform. Args: session: Snowpark Connection Session. + model_id: Internal model ID string. name: Name of the deployment for the model. platform: Target platform to deploy the model. target_method: The name of the target method to be deployed. model_dir_path: Directory of the model. Exclusive with `model_stage_dir_path`. model_stage_file_path: Model file in the stage to be deployed. Exclusive with `model_dir_path`. Must be a file with .zip extension. + deployment_stage_path: Path to stage containing deployment artifacts. options: Additional options when deploying the model. Each target platform will have their own specifications of options. @@ -136,8 +184,28 @@ def deploy( ) except Exception: raise RuntimeError("Error happened when deploying to the warehouse: " + traceback.format_exc()) + + elif platform == TargetPlatform.SNOWPARK_CONTAINER_SERVICE: + options = cast(model_types.SnowparkContainerServiceDeployOptions, options) + assert model_id, "Require 'model_id' for Snowpark container service deployment" + assert model_stage_file_path, "Require 'model_stage_file_path' for Snowpark container service deployment" + assert deployment_stage_path, "Require 'deployment_stage_path' for Snowpark container service deployment" + if snowservice_constants.COMPUTE_POOL not in options: + raise ValueError("Missing 'compute_pool' in options field for Snowpark container service deployment") + try: + meta = snowservice_deploy._deploy( + session=session, + model_id=model_id, + service_func_name=name, + model_zip_stage_path=model_stage_file_path, + deployment_stage_path=deployment_stage_path, + **options, + ) + except Exception: + raise RuntimeError(f"Failed to deploy to Snowpark Container Service: {traceback.format_exc()}") + else: - raise ValueError("Unsupported target Platform.") + raise ValueError(f"Unsupported target Platform: {platform}") signature = meta.signatures.get(target_method, None) if not signature: raise ValueError(f"Target method {target_method} does not exist in model.") @@ -192,11 +260,12 @@ def predict( sig = deployment["signature"] keep_order = deployment["options"].get("keep_order", True) output_with_input_features = deployment["options"].get("output_with_input_features", False) + platform = deployment["platform"] # Validate and prepare input if not isinstance(X, SnowparkDataFrame): df = model_signature._convert_and_validate_local_data(X, sig.inputs) - s_df = model_signature._SnowparkDataFrameHandler.convert_from_df(session, df, keep_order=keep_order) + s_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(session, df, keep_order=keep_order) else: model_signature._validate_snowpark_data(X, sig.inputs) s_df = X @@ -216,13 +285,20 @@ def predict( literal_col_name = identifier.get_unescaped_names(col_name) input_cols.extend( [ - F.lit(literal_col_name), # type:ignore[arg-type] + F.lit(literal_col_name), F.col(col_name), ] ) - output_obj = F.call_udf( - 
identifier.get_inferred_name(deployment["name"]), F.object_construct(*input_cols) # type:ignore[arg-type] + + # TODO[shchen]: SNOW-870032, For SnowService, external function name cannot be double quoted, else it results in + # external function no found. + udf_name = ( + deployment["name"] + if platform == TargetPlatform.SNOWPARK_CONTAINER_SERVICE + else identifier.get_inferred_name(deployment["name"]) ) + output_obj = F.call_udf(udf_name, F.object_construct(*input_cols)) + if output_with_input_features: df_res = s_df.with_column(INTERMEDIATE_OBJ_NAME, output_obj) else: @@ -248,6 +324,6 @@ def predict( # Get final result if not isinstance(X, SnowparkDataFrame): - return model_signature._SnowparkDataFrameHandler.convert_to_df(df_res, features=sig.outputs) + return snowpark_handler.SnowparkDataFrameHandler.convert_to_df(df_res, features=sig.outputs) else: return df_res diff --git a/snowflake/ml/model/_env.py b/snowflake/ml/model/_env.py index b67c726f..aa19df6c 100644 --- a/snowflake/ml/model/_env.py +++ b/snowflake/ml/model/_env.py @@ -10,6 +10,7 @@ _CONDA_ENV_FILE_NAME = "conda.yaml" _SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake" +_NODEFAULTS = "nodefaults" _REQUIREMENTS_FILE_NAME = "requirements.txt" @@ -31,7 +32,11 @@ def save_conda_env_file( path = os.path.join(dir_path, _CONDA_ENV_FILE_NAME) env: Dict[str, Any] = dict() env["name"] = "snow-env" - env["channels"] = [_SNOWFLAKE_CONDA_CHANNEL_URL, "nodefaults"] + env["channels"] = ( + [_SNOWFLAKE_CONDA_CHANNEL_URL] + + [channel_name for channel_name, channel_deps in deps.items() if len(channel_deps) == 0] + + [_NODEFAULTS] + ) env["dependencies"] = [f"python=={python_version}"] for chan, reqs in deps.items(): env["dependencies"].extend([f"{chan}::{str(req)}" if chan else str(req) for req in reqs]) @@ -78,16 +83,27 @@ def load_conda_env_file(path: str) -> Tuple[DefaultDict[str, List[requirements.R python_version = None + channels = env["channels"] + channels.remove(_SNOWFLAKE_CONDA_CHANNEL_URL) + channels.remove(_NODEFAULTS) + for dep in env["dependencies"]: if isinstance(dep, str): - if dep.startswith("python=="): - hd, _, ver = dep.partition("==") - assert hd == "python" - python_version = ver + ver = env_utils.parse_python_version_string(dep) + # ver is None: not python, ver is "": python w/o specifier, ver is str: python w/ specifier + if ver is not None: + if ver: + python_version = ver else: deps.append(dep) - return env_utils.validate_conda_dependency_string_list(deps), python_version + conda_dep_dict = env_utils.validate_conda_dependency_string_list(deps) + + if len(channels) > 0: + for channel in channels: + conda_dep_dict[channel] = [] + + return conda_dep_dict, python_version def load_requirements_file(path: str) -> List[requirements.Requirement]: diff --git a/snowflake/ml/model/_env_test.py b/snowflake/ml/model/_env_test.py index dd7baa16..3e7e7697 100644 --- a/snowflake/ml/model/_env_test.py +++ b/snowflake/ml/model/_env_test.py @@ -7,7 +7,7 @@ from absl.testing import absltest from packaging import requirements -from snowflake.ml._internal import env as snowml_env +from snowflake.ml._internal import env as snowml_env, env_utils from snowflake.ml.model import _env @@ -22,14 +22,14 @@ def test_conda_env_file(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: cd = collections.defaultdict(list) - cd[""] = [requirements.Requirement("numpy")] + cd[env_utils.DEFAULT_CHANNEL_NAME] = [requirements.Requirement("numpy")] env_file_path = _env.save_conda_env_file(tmpdir, cd) loaded_cd, _ = 
_env.load_conda_env_file(env_file_path) self.assertEqual(cd, loaded_cd) with tempfile.TemporaryDirectory() as tmpdir: cd = collections.defaultdict(list) - cd[""] = [requirements.Requirement("numpy>=1.22.4")] + cd[env_utils.DEFAULT_CHANNEL_NAME] = [requirements.Requirement("numpy>=1.22.4")] env_file_path = _env.save_conda_env_file(tmpdir, cd) loaded_cd, _ = _env.load_conda_env_file(env_file_path) self.assertEqual(cd, loaded_cd) @@ -38,7 +38,7 @@ def test_conda_env_file(self) -> None: cd = collections.defaultdict(list) cd.update( { - "": [requirements.Requirement("numpy>=1.22.4")], + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], "conda-forge": [requirements.Requirement("pytorch!=2.0")], } ) @@ -46,13 +46,65 @@ def test_conda_env_file(self) -> None: loaded_cd, _ = _env.load_conda_env_file(env_file_path) self.assertEqual(cd, loaded_cd) + with tempfile.TemporaryDirectory() as tmpdir: + cd = collections.defaultdict(list) + cd.update( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "apple": [], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + } + ) + env_file_path = _env.save_conda_env_file(tmpdir, cd) + with open(env_file_path, encoding="utf-8") as f: + writed_yaml = yaml.safe_load(f) + self.assertDictEqual( + writed_yaml, + { + "name": "snow-env", + "channels": ["https://repo.anaconda.com/pkgs/snowflake", "apple", "nodefaults"], + "dependencies": [ + f"python=={snowml_env.PYTHON_VERSION}", + "numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + ], + }, + ) + loaded_cd, _ = _env.load_conda_env_file(env_file_path) + self.assertEqual(cd, loaded_cd) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME), "w", encoding="utf-8") as f: + yaml.safe_dump( + stream=f, + data={ + "name": "snow-env", + "channels": ["https://repo.anaconda.com/pkgs/snowflake", "nodefaults"], + "dependencies": [ + f"python=={snowml_env.PYTHON_VERSION}", + "::numpy>=1.22.4", + "conda-forge::pytorch!=2.0", + {"pip": "python-package"}, + ], + }, + ) + loaded_cd, python_ver = _env.load_conda_env_file(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME)) + self.assertEqual( + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], + "conda-forge": [requirements.Requirement("pytorch!=2.0")], + }, + loaded_cd, + ) + self.assertEqual(python_ver, snowml_env.PYTHON_VERSION) + with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME), "w", encoding="utf-8") as f: yaml.safe_dump( stream=f, data={ "name": "snow-env", - "chanels": ["https://repo.anaconda.com/pkgs/snowflake", "nodefaults"], + "channels": ["https://repo.anaconda.com/pkgs/snowflake", "apple", "nodefaults"], "dependencies": [ f"python=={snowml_env.PYTHON_VERSION}", "::numpy>=1.22.4", @@ -61,14 +113,16 @@ def test_conda_env_file(self) -> None: ], }, ) - loaded_cd, _ = _env.load_conda_env_file(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME)) + loaded_cd, python_ver = _env.load_conda_env_file(os.path.join(tmpdir, _env._CONDA_ENV_FILE_NAME)) self.assertEqual( { - "": [requirements.Requirement("numpy>=1.22.4")], + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("numpy>=1.22.4")], "conda-forge": [requirements.Requirement("pytorch!=2.0")], + "apple": [], }, loaded_cd, ) + self.assertEqual(python_ver, snowml_env.PYTHON_VERSION) def test_generate_requirements_file(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: diff --git a/snowflake/ml/model/_handlers/BUILD.bazel 
b/snowflake/ml/model/_handlers/BUILD.bazel index d2f5207d..7fea5971 100644 --- a/snowflake/ml/model/_handlers/BUILD.bazel +++ b/snowflake/ml/model/_handlers/BUILD.bazel @@ -35,6 +35,8 @@ py_library( "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:numpy_handler", ], ) @@ -49,6 +51,8 @@ py_library( "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", "//snowflake/ml/modeling/framework", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:numpy_handler", ], ) @@ -61,6 +65,8 @@ py_library( "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:numpy_handler", ], ) @@ -74,6 +80,8 @@ py_library( "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:pytorch_handler", ], ) @@ -87,5 +95,40 @@ py_library( "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:pytorch_handler", + ], +) + +py_library( + name = "tensorflow", + srcs = ["tensorflow.py"], + deps = [ + ":_base", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/model:_model_meta", + "//snowflake/ml/model:custom_model", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:utils", + "//snowflake/ml/model/_signatures:numpy_handler", + "//snowflake/ml/model/_signatures:tensorflow_handler", + ], +) + +py_library( + name = "mlflow", + srcs = ["mlflow.py"], + deps = [ + ":_base", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/model:_env", + "//snowflake/ml/model:_model_meta", + "//snowflake/ml/model:custom_model", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:utils", ], ) diff --git a/snowflake/ml/model/_handlers/_base.py b/snowflake/ml/model/_handlers/_base.py index 67dac3e5..50177ad7 100644 --- a/snowflake/ml/model/_handlers/_base.py +++ b/snowflake/ml/model/_handlers/_base.py @@ -43,7 +43,7 @@ def _save_model( model_blobs_dir_path: str, sample_input: Optional[model_types.SupportedDataType] = None, is_sub_model: Optional[bool] = False, - **kwargs: Unpack[model_types.ModelSaveOption], + **kwargs: Unpack[model_types.BaseModelSaveOption], ) -> None: """Save the model. 
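Note on the new MLflow handler added in the next file: because an mlflow.pyfunc.PyFuncModel carries its own signature and conda environment, it can be saved without signatures or sample_input; the handler ports both into the model metadata and warns on non-Snowflake channels and pip requirements. A minimal usage sketch, assuming mlflow and scikit-learn are installed; the run, the paths, and the model_api import alias are illustrative and not part of the patch.

    import mlflow
    import mlflow.sklearn
    import numpy as np
    from mlflow.models import infer_signature
    from sklearn.linear_model import LinearRegression

    from snowflake.ml.model import _model as model_api  # alias assumed, mirrors the tests

    # Log a tiny PyFunc-flavored model so the sketch is self-contained.
    with mlflow.start_run() as run:
        skl = LinearRegression().fit(np.array([[1.0], [2.0]]), np.array([1.0, 2.0]))
        mlflow.sklearn.log_model(
            skl,
            artifact_path="model",
            signature=infer_signature(np.array([[1.0]]), np.array([1.0])),
        )

    pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/model")

    # No signatures or sample_input: the handler reuses the MLflow signature and merges
    # the model's conda dependencies into the saved metadata.
    meta = model_api.save_model(
        name="mlflow_model",
        model=pyfunc_model,
        model_dir_path="/tmp/mlflow_model",  # illustrative path
    )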
diff --git a/snowflake/ml/model/_handlers/mlflow.py b/snowflake/ml/model/_handlers/mlflow.py new file mode 100644 index 00000000..c0393e00 --- /dev/null +++ b/snowflake/ml/model/_handlers/mlflow.py @@ -0,0 +1,310 @@ +import itertools +import os +import tempfile +import warnings +from typing import TYPE_CHECKING, Callable, Optional, Type, cast + +import pandas as pd +import yaml +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml._internal import env_utils, file_utils, type_utils +from snowflake.ml.model import ( + _model_meta as model_meta_api, + custom_model, + model_signature, + type_hints as model_types, +) +from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import utils as model_signature_utils + +if TYPE_CHECKING: + import mlflow + + +def _parse_mlflow_env(model_uri: str, model_meta: model_meta_api.ModelMetadata) -> model_meta_api.ModelMetadata: + """Parse MLFlow env file and modify model meta based on MLFlow env. + + Args: + model_uri: Model uri where the env file could be downloaded + model_meta: model meta to be modified + + Raises: + ValueError: Raised when cannot download MLFlow model dependencies file. + + Returns: + Modified model metadata. + """ + import mlflow + + try: + conda_env_file_path = mlflow.pyfunc.get_model_dependencies(model_uri, format="conda") + + with open(conda_env_file_path, encoding="utf-8") as f: + env = yaml.safe_load(stream=f) + except (mlflow.MlflowException, OSError): + raise ValueError("Cannot load MLFlow model dependencies.") + + assert isinstance(env, dict) + + mlflow_conda_deps = [] + mlflow_pip_deps = [] + mlflow_python_version = None + + mlflow_conda_channels = env.get("channels", []) + + for dep in env["dependencies"]: + if isinstance(dep, str): + ver = env_utils.parse_python_version_string(dep) + # ver is None: not python, ver is "": python w/o specifier, ver is str: python w/ specifier + if ver is not None: + if ver: + mlflow_python_version = ver + else: + mlflow_conda_deps.append(dep) + elif isinstance(dep, dict) and "pip" in dep: + mlflow_pip_deps.extend(dep["pip"]) + + if mlflow_python_version: + model_meta.python_version = mlflow_python_version + + mlflow_conda_deps_dict = env_utils.validate_conda_dependency_string_list(mlflow_conda_deps) + mlflow_pip_deps_list = env_utils.validate_pip_requirement_string_list(mlflow_pip_deps) + + for mlflow_channel, mlflow_channel_dependencies in mlflow_conda_deps_dict.items(): + if mlflow_channel != env_utils.DEFAULT_CHANNEL_NAME: + warnings.warn( + ( + "Found dependencies from MLflow specified from non-Snowflake channel." + + " This may prevent model deploying to Snowflake Warehouse." + ), + category=UserWarning, + ) + for mlflow_channel_dependency in mlflow_channel_dependencies: + try: + env_utils.append_conda_dependency( + model_meta._conda_dependencies, (mlflow_channel, mlflow_channel_dependency) + ) + except env_utils.DuplicateDependencyError: + pass + except env_utils.DuplicateDependencyInMultipleChannelsError: + warnings.warn( + ( + f"Dependency {mlflow_channel_dependency.name} appeared in multiple channels." + + " This may be unintentional." + ), + category=UserWarning, + ) + + if mlflow_conda_channels: + warnings.warn( + ( + "Found conda channels specified from MLflow." + + " This may prevent model deploying to Snowflake Warehouse." 
+ ), + category=UserWarning, + ) + for channel_name in mlflow_conda_channels: + model_meta._conda_dependencies[channel_name] = [] + + if mlflow_pip_deps_list: + warnings.warn( + ( + "Found dependencies from MLflow specified as pip requirements." + + " This may prevent model deploying to Snowflake Warehouse." + ), + category=UserWarning, + ) + for mlflow_pip_dependency in mlflow_pip_deps_list: + if any( + mlflow_channel_dependency.name == mlflow_pip_dependency.name + for mlflow_channel_dependency in itertools.chain(*mlflow_conda_deps_dict.values()) + ): + continue + env_utils.append_requirement_list(model_meta._pip_requirements, mlflow_pip_dependency) + + return model_meta + + +class _MLFlowHandler(_base._ModelHandler["mlflow.pyfunc.PyFuncModel"]): + """Handler for MLFlow based model. + + Currently mlflow.pyfunc.PyFuncModel based classes are supported. + """ + + handler_type = "mlflow" + MODEL_BLOB_FILE = "model" + _DEFAULT_TARGET_METHOD = "predict" + DEFAULT_TARGET_METHODS = [_DEFAULT_TARGET_METHOD] + + @staticmethod + def can_handle( + model: model_types.SupportedModelType, + ) -> TypeGuard["mlflow.pyfunc.PyFuncModel"]: + return type_utils.LazyType("mlflow.pyfunc.PyFuncModel").isinstance(model) + + @staticmethod + def cast_model( + model: model_types.SupportedModelType, + ) -> "mlflow.pyfunc.PyFuncModel": + import mlflow + + assert isinstance(model, mlflow.pyfunc.PyFuncModel) + + return cast(mlflow.pyfunc.PyFuncModel, model) + + @staticmethod + def _save_model( + name: str, + model: "mlflow.pyfunc.PyFuncModel", + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.MLFlowSaveOptions], + ) -> None: + import mlflow + + assert isinstance(model, mlflow.pyfunc.PyFuncModel) + + model_info = model.metadata.get_model_info() + model_uri = kwargs.get("model_uri", model_info.model_uri) + + pyfunc_flavor_info = model_info.flavors.get(mlflow.pyfunc.FLAVOR_NAME, None) + if pyfunc_flavor_info is None: + raise ValueError("Cannot save MLFlow model that does not have PyFunc flavor.") + + # Port MLFlow signature + if not is_sub_model: + if model_meta._signatures is not None: + model_meta_api._validate_target_methods(model, list(model_meta.signatures.keys())) + else: + model_meta_api._validate_target_methods(model, _MLFlowHandler.DEFAULT_TARGET_METHODS) + model_meta._signatures = { + _MLFlowHandler._DEFAULT_TARGET_METHOD: model_signature.ModelSignature.from_mlflow_sig( + model_info.signature + ) + } + + # Port MLFlow metadata + mlflow_model_metadata = model_info.metadata + if mlflow_model_metadata and not kwargs.get("ignore_mlflow_metadata", False): + if not model_meta.metadata: + model_meta.metadata = {} + model_meta.metadata.update(mlflow_model_metadata) + + # Port MLFlow dependencies + if kwargs.get("ignore_mlflow_dependencies", False): + model_meta._include_if_absent([model_meta_api.Dependency(conda_name="mlflow", pip_name="mlflow")]) + else: + model_meta = _parse_mlflow_env(model_uri, model_meta) + + model_blob_path = os.path.join(model_blobs_dir_path, name) + + os.makedirs(model_blob_path, exist_ok=True) + with tempfile.TemporaryDirectory() as tmpdir: + try: + local_path = mlflow.artifacts.download_artifacts(model_uri, dst_path=tmpdir) + except (mlflow.MlflowException, OSError): + raise ValueError("Cannot load MLFlow model artifacts.") + + file_utils.copy_file_or_tree(local_path, os.path.join(model_blob_path, _MLFlowHandler.MODEL_BLOB_FILE)) + + 
base_meta = model_meta_api._ModelBlobMetadata( + name=name, + model_type=_MLFlowHandler.handler_type, + path=_MLFlowHandler.MODEL_BLOB_FILE, + options={"artifact_path": model_info.artifact_path}, + ) + model_meta.models[name] = base_meta + + @staticmethod + def _load_model( + name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + ) -> "mlflow.pyfunc.PyFuncModel": + import mlflow + + model_blob_path = os.path.join(model_blobs_dir_path, name) + if not hasattr(model_meta, "models"): + raise ValueError("Ill model metadata found.") + model_blobs_metadata = model_meta.models + if name not in model_blobs_metadata: + raise ValueError(f"Blob of model {name} does not exist.") + model_blob_metadata = model_blobs_metadata[name] + + model_blob_options = model_blob_metadata.options + + model_artifact_path = model_blob_options.get("artifact_path", None) + if model_artifact_path is None: + raise ValueError("Cannot find a place to load the MLFlow model.") + + model_blob_filename = model_blob_metadata.path + + # This is to make sure the loaded model can be saved again. + with mlflow.start_run() as run: + mlflow.log_artifacts( + os.path.join(model_blob_path, model_blob_filename, model_artifact_path), + artifact_path=model_artifact_path, + ) + m = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/{model_artifact_path}") + m.metadata.run_id = run.info.run_id + return m + + @staticmethod + def _load_as_custom_model( + name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + ) -> custom_model.CustomModel: + """Create a custom model class wrap for unified interface when being deployed. The predict method will be + re-targeted based on target_method metadata. + + Args: + name: Name of the model. + model_meta: The model metadata. + model_blobs_dir_path: Directory path to the whole model. + + Returns: + The model object as a custom model. + """ + import mlflow + + from snowflake.ml.model import custom_model + + # We need to redirect the mlruns folder to a writable location in the sandbox. 
+ tmpdir = tempfile.TemporaryDirectory(dir="/tmp") + mlflow.set_tracking_uri(f"file://{tmpdir}") + + def _create_custom_model( + raw_model: "mlflow.pyfunc.PyFuncModel", + model_meta: model_meta_api.ModelMetadata, + ) -> Type[custom_model.CustomModel]: + def fn_factory( + raw_model: "mlflow.pyfunc.PyFuncModel", + signature: model_signature.ModelSignature, + target_method: str, + ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]: + @custom_model.inference_api + def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: + res = raw_model.predict(X) + return model_signature_utils.rename_pandas_df( + model_signature._convert_local_data_to_df(res), features=signature.outputs + ) + + return fn + + type_method_dict = {} + for target_method_name, sig in model_meta.signatures.items(): + type_method_dict[target_method_name] = fn_factory(raw_model, sig, target_method_name) + + _MLFlowModel = type( + "_MLFlowModel", + (custom_model.CustomModel,), + type_method_dict, + ) + + return _MLFlowModel + + raw_model = _MLFlowHandler._load_model(name, model_meta, model_blobs_dir_path) + _MLFlowModel = _create_custom_model(raw_model, model_meta) + mlflow_model = _MLFlowModel(custom_model.ModelContext()) + + return mlflow_model diff --git a/snowflake/ml/model/_handlers/pytorch.py b/snowflake/ml/model/_handlers/pytorch.py index 022ce221..da371492 100644 --- a/snowflake/ml/model/_handlers/pytorch.py +++ b/snowflake/ml/model/_handlers/pytorch.py @@ -14,6 +14,10 @@ type_hints as model_types, ) from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import ( + pytorch_handler, + utils as model_signature_utils, +) if TYPE_CHECKING: import torch @@ -71,8 +75,8 @@ def _save_model( def get_prediction( target_method_name: str, sample_input: "model_types.SupportedLocalDataType" ) -> model_types.SupportedLocalDataType: - if not model_signature._SeqOfPyTorchTensorHandler.can_handle(sample_input): - sample_input = model_signature._SeqOfPyTorchTensorHandler.convert_from_df( + if not pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(sample_input): + sample_input = pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( model_signature._convert_local_data_to_df(sample_input) ) @@ -157,12 +161,12 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: raise ValueError("Tensor cannot handle null values.") raw_model.eval() - t = model_signature._SeqOfPyTorchTensorHandler.convert_from_df(X, signature.inputs) + t = pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(X, signature.inputs) with torch.no_grad(): res = getattr(raw_model, target_method)(t) - return model_signature._rename_pandas_df( - data=model_signature._SeqOfPyTorchTensorHandler.convert_to_df(res), features=signature.outputs + return model_signature_utils.rename_pandas_df( + data=pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(res), features=signature.outputs ) return fn diff --git a/snowflake/ml/model/_handlers/sklearn.py b/snowflake/ml/model/_handlers/sklearn.py index 542a9c8c..3e87ddd5 100644 --- a/snowflake/ml/model/_handlers/sklearn.py +++ b/snowflake/ml/model/_handlers/sklearn.py @@ -14,6 +14,7 @@ type_hints as model_types, ) from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils if TYPE_CHECKING: import sklearn.base @@ -161,11 +162,11 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: if isinstance(res, list) and len(res) > 0 and isinstance(res[0], 
np.ndarray): # In case of multi-output estimators, predict_proba(), decision_function(), etc., functions # return a list of ndarrays. We need to deal them seperately - df = model_signature._SeqOfNumpyArrayHandler.convert_to_df(res) + df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(res) else: df = pd.DataFrame(res) - return model_signature._rename_pandas_df(df, signature.outputs) + return model_signature_utils.rename_pandas_df(df, signature.outputs) return fn diff --git a/snowflake/ml/model/_handlers/snowmlmodel.py b/snowflake/ml/model/_handlers/snowmlmodel.py index 3a823f19..fe8f47bf 100644 --- a/snowflake/ml/model/_handlers/snowmlmodel.py +++ b/snowflake/ml/model/_handlers/snowmlmodel.py @@ -14,6 +14,7 @@ type_hints as model_types, ) from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils if TYPE_CHECKING: from snowflake.ml.modeling.framework.base import BaseEstimator @@ -164,11 +165,11 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: if isinstance(res, list) and len(res) > 0 and isinstance(res[0], np.ndarray): # In case of multi-output estimators, predict_proba(), decision_function(), etc., functions # return a list of ndarrays. We need to deal them seperately - df = model_signature._SeqOfNumpyArrayHandler.convert_to_df(res) + df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(res) else: df = pd.DataFrame(res) - return model_signature._rename_pandas_df(df, signature.outputs) + return model_signature_utils.rename_pandas_df(df, signature.outputs) return fn diff --git a/snowflake/ml/model/_handlers/tensorflow.py b/snowflake/ml/model/_handlers/tensorflow.py new file mode 100644 index 00000000..a8d985cd --- /dev/null +++ b/snowflake/ml/model/_handlers/tensorflow.py @@ -0,0 +1,196 @@ +import os +from typing import TYPE_CHECKING, Callable, Optional, Type, cast + +import numpy as np +import pandas as pd +from typing_extensions import TypeGuard, Unpack + +from snowflake.ml._internal import type_utils +from snowflake.ml.model import ( + _model_meta as model_meta_api, + custom_model, + model_signature, + type_hints as model_types, +) +from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import ( + numpy_handler, + tensorflow_handler, + utils as model_signature_utils, +) + +if TYPE_CHECKING: + import tensorflow + + +class _TensorFlowHandler(_base._ModelHandler["tensorflow.Module"]): + """Handler for TensorFlow based model. + + Currently tensorflow.Module based classes are supported. 
+ """ + + handler_type = "tensorflow" + MODEL_BLOB_FILE = "model" + DEFAULT_TARGET_METHODS = ["__call__"] + + @staticmethod + def can_handle( + model: model_types.SupportedModelType, + ) -> TypeGuard["tensorflow.nn.Module"]: + return type_utils.LazyType("tensorflow.Module").isinstance(model) + + @staticmethod + def cast_model( + model: model_types.SupportedModelType, + ) -> "tensorflow.Module": + import tensorflow + + assert isinstance(model, tensorflow.Module) + + return cast(tensorflow.Module, model) + + @staticmethod + def _save_model( + name: str, + model: "tensorflow.Module", + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + sample_input: Optional[model_types.SupportedDataType] = None, + is_sub_model: Optional[bool] = False, + **kwargs: Unpack[model_types.TensorflowSaveOptions], + ) -> None: + import tensorflow + + assert isinstance(model, tensorflow.Module) + + if isinstance(model, tensorflow.keras.Model): + default_target_methods = ["predict"] + else: + default_target_methods = _TensorFlowHandler.DEFAULT_TARGET_METHODS + + if not is_sub_model: + target_methods = model_meta_api._get_target_methods( + model=model, + target_methods=kwargs.pop("target_methods", None), + default_target_methods=default_target_methods, + ) + + def get_prediction( + target_method_name: str, sample_input: "model_types.SupportedLocalDataType" + ) -> model_types.SupportedLocalDataType: + if not tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(sample_input): + sample_input = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + model_signature._convert_local_data_to_df(sample_input) + ) + + target_method = getattr(model, target_method_name, None) + assert callable(target_method) + for tensor in sample_input: + tensorflow.stop_gradient(tensor) + predictions_df = target_method(sample_input) + return predictions_df + + model_meta = model_meta_api._validate_signature( + model=model, + model_meta=model_meta, + target_methods=target_methods, + sample_input=sample_input, + get_prediction_fn=get_prediction, + ) + + model_blob_path = os.path.join(model_blobs_dir_path, name) + os.makedirs(model_blob_path, exist_ok=True) + if isinstance(model, tensorflow.keras.Model): + tensorflow.keras.models.save_model(model, os.path.join(model_blob_path, _TensorFlowHandler.MODEL_BLOB_FILE)) + else: + tensorflow.saved_model.save(model, os.path.join(model_blob_path, _TensorFlowHandler.MODEL_BLOB_FILE)) + + base_meta = model_meta_api._ModelBlobMetadata( + name=name, model_type=_TensorFlowHandler.handler_type, path=_TensorFlowHandler.MODEL_BLOB_FILE + ) + model_meta.models[name] = base_meta + model_meta._include_if_absent([model_meta_api.Dependency(conda_name="tensorflow", pip_name="tensorflow")]) + + @staticmethod + def _load_model( + name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + ) -> "tensorflow.Module": + import tensorflow + + model_blob_path = os.path.join(model_blobs_dir_path, name) + if not hasattr(model_meta, "models"): + raise ValueError("Ill model metadata found.") + model_blobs_metadata = model_meta.models + if name not in model_blobs_metadata: + raise ValueError(f"Blob of model {name} does not exist.") + model_blob_metadata = model_blobs_metadata[name] + model_blob_filename = model_blob_metadata.path + m = tensorflow.keras.models.load_model(os.path.join(model_blob_path, model_blob_filename), compile=False) + if isinstance(m, tensorflow.keras.Model): + return m + return cast(tensorflow.Module, m) + + @staticmethod + def _load_as_custom_model( + 
name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + ) -> custom_model.CustomModel: + """Create a custom model class wrap for unified interface when being deployed. The predict method will be + re-targeted based on target_method metadata. + + Args: + name: Name of the model. + model_meta: The model metadata. + model_blobs_dir_path: Directory path to the whole model. + + Returns: + The model object as a custom model. + """ + import tensorflow + + from snowflake.ml.model import custom_model + + def _create_custom_model( + raw_model: "tensorflow.Module", + model_meta: model_meta_api.ModelMetadata, + ) -> Type[custom_model.CustomModel]: + def fn_factory( + raw_model: "tensorflow.Module", + signature: model_signature.ModelSignature, + target_method: str, + ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]: + @custom_model.inference_api + def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: + if X.isnull().any(axis=None): + raise ValueError("Tensor cannot handle null values.") + + t = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(X, signature.inputs) + + for tensor in t: + tensorflow.stop_gradient(tensor) + res = getattr(raw_model, target_method)(t) + if isinstance(res, list) and len(res) > 0 and isinstance(res[0], np.ndarray): + # In case of running on CPU, it will return numpy array + df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(res) + else: + df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(res) + return model_signature_utils.rename_pandas_df(df, signature.outputs) + + return fn + + type_method_dict = {} + for target_method_name, sig in model_meta.signatures.items(): + type_method_dict[target_method_name] = fn_factory(raw_model, sig, target_method_name) + + _TensorFlowModel = type( + "_TensorFlowModel", + (custom_model.CustomModel,), + type_method_dict, + ) + + return _TensorFlowModel + + raw_model = _TensorFlowHandler()._load_model(name, model_meta, model_blobs_dir_path) + _TensorFlowModel = _create_custom_model(raw_model, model_meta) + tf_model = _TensorFlowModel(custom_model.ModelContext()) + + return tf_model diff --git a/snowflake/ml/model/_handlers/torchscript.py b/snowflake/ml/model/_handlers/torchscript.py index b68d9b1c..27654496 100644 --- a/snowflake/ml/model/_handlers/torchscript.py +++ b/snowflake/ml/model/_handlers/torchscript.py @@ -12,6 +12,10 @@ type_hints as model_types, ) from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import ( + pytorch_handler, + utils as model_signature_utils, +) if TYPE_CHECKING: import torch @@ -67,8 +71,8 @@ def _save_model( def get_prediction( target_method_name: str, sample_input: "model_types.SupportedLocalDataType" ) -> model_types.SupportedLocalDataType: - if not model_signature._SeqOfPyTorchTensorHandler.can_handle(sample_input): - sample_input = model_signature._SeqOfPyTorchTensorHandler.convert_from_df( + if not pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(sample_input): + sample_input = pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( model_signature._convert_local_data_to_df(sample_input) ) @@ -151,12 +155,12 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: raw_model.eval() - t = model_signature._SeqOfPyTorchTensorHandler.convert_from_df(X, signature.inputs) + t = pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(X, signature.inputs) with torch.no_grad(): res = getattr(raw_model, target_method)(t) - return model_signature._rename_pandas_df( 
- data=model_signature._SeqOfPyTorchTensorHandler.convert_to_df(res), features=signature.outputs + return model_signature_utils.rename_pandas_df( + data=pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(res), features=signature.outputs ) return fn diff --git a/snowflake/ml/model/_handlers/xgboost.py b/snowflake/ml/model/_handlers/xgboost.py index c24d4fd9..27d6936a 100644 --- a/snowflake/ml/model/_handlers/xgboost.py +++ b/snowflake/ml/model/_handlers/xgboost.py @@ -14,6 +14,7 @@ type_hints as model_types, ) from snowflake.ml.model._handlers import _base +from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils if TYPE_CHECKING: import xgboost @@ -162,11 +163,11 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: if isinstance(res, list) and len(res) > 0 and isinstance(res[0], np.ndarray): # In case of multi-output estimators, predict_proba(), decision_function(), etc., functions # return a list of ndarrays. We need to deal them seperately - df = model_signature._SeqOfNumpyArrayHandler.convert_to_df(res) + df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(res) else: df = pd.DataFrame(res) - return model_signature._rename_pandas_df(df, signature.outputs) + return model_signature_utils.rename_pandas_df(df, signature.outputs) return fn diff --git a/snowflake/ml/model/_model.py b/snowflake/ml/model/_model.py index 12f89c10..d46808c6 100644 --- a/snowflake/ml/model/_model.py +++ b/snowflake/ml/model/_model.py @@ -3,7 +3,7 @@ import tempfile import warnings from types import ModuleType -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union, overload +from typing import Dict, List, Literal, Optional, Tuple, Union, overload from snowflake.ml._internal import file_utils, type_utils from snowflake.ml.model import ( @@ -16,9 +16,6 @@ ) from snowflake.snowpark import FileOperation, Session -if TYPE_CHECKING: - from snowflake.ml.modeling.framework import base - MODEL_BLOBS_DIR = "models" @@ -26,7 +23,7 @@ def save_model( *, name: str, - model: "base.BaseEstimator", + model: model_types.SupportedNoSignatureRequirementsModelType, model_dir_path: str, metadata: Optional[Dict[str, str]] = None, conda_dependencies: Optional[List[str]] = None, @@ -36,11 +33,11 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a SnowML modeling model under `dir_path`. + """Save a model that does not require a signature under `dir_path`. Args: name: Name of the model. - model: SnowML modeling model object. + model: Model object. model_dir_path: Directory to save the model. metadata: Model metadata. conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify @@ -62,7 +59,7 @@ def save_model( def save_model( *, name: str, - model: model_types.SupportedLocalModelType, + model: model_types.SupportedRequireSignatureModelType, model_dir_path: str, signatures: Dict[str, model_signature.ModelSignature], metadata: Optional[Dict[str, str]] = None, @@ -73,7 +70,7 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a local model with user provided signatures under `dir_path`. + """Save a model that requires a external signature with user provided signatures under `dir_path`. Args: name: Name of the model. 
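The reworked save_model overloads above split supported models into two groups: models that require an external signature (saved with either signatures or sample_input, never both) and models that carry their own (SnowML estimators and MLflow PyFunc models, saved with neither). A minimal sketch of the signature-required path with a toy custom model; the class, data, and paths are illustrative, not from the patch.

    import pandas as pd

    from snowflake.ml.model import _model as model_api, custom_model

    class EchoModel(custom_model.CustomModel):
        @custom_model.inference_api
        def predict(self, X: pd.DataFrame) -> pd.DataFrame:
            # Echo a single input column back as the output.
            return pd.DataFrame({"output": X["c1"]})

    model = EchoModel(custom_model.ModelContext(models={}, artifacts={}))
    sample = pd.DataFrame({"c1": [1.0, 2.0]})

    # Custom models need exactly one of sample_input or signatures.
    meta = model_api.save_model(
        name="echo",
        model=model,
        model_dir_path="/tmp/echo_model",  # illustrative path
        sample_input=sample,
    )
    # SnowML estimators and MLflow PyFunc models can be passed with neither argument.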
@@ -100,7 +97,7 @@ def save_model( def save_model( *, name: str, - model: model_types.SupportedLocalModelType, + model: model_types.SupportedRequireSignatureModelType, model_dir_path: str, sample_input: model_types.SupportedDataType, metadata: Optional[Dict[str, str]] = None, @@ -111,7 +108,8 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a local model under `dir_path` with signature inferred from a sample_input_data. + """Save a model that requires a external signature under `dir_path` with signature + inferred from a sample_input_data. Args: name: Name of the model. @@ -138,7 +136,7 @@ def save_model( def save_model( *, name: str, - model: "base.BaseEstimator", + model: model_types.SupportedNoSignatureRequirementsModelType, session: Session, model_stage_file_path: str, metadata: Optional[Dict[str, str]] = None, @@ -149,11 +147,11 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a SnowML modeling model to a zip file whose path is the provided stage file path. + """Save a model that does not require a signature to a zip file whose path is the provided stage file path. Args: name: Name of the model. - model: SnowML modeling model object. + model: Model object. session: Snowpark connection session. model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. Must be a file with .zip extension. @@ -177,7 +175,7 @@ def save_model( def save_model( *, name: str, - model: model_types.SupportedLocalModelType, + model: model_types.SupportedRequireSignatureModelType, session: Session, model_stage_file_path: str, signatures: Dict[str, model_signature.ModelSignature], @@ -189,7 +187,8 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a local model with user provided signatures to a zip file whose path is the provided stage file path. + """Save a model that requires a external signature with user provided signatures + to a zip file whose path is the provided stage file path. Args: name: Name of the model. @@ -218,7 +217,7 @@ def save_model( def save_model( *, name: str, - model: model_types.SupportedLocalModelType, + model: model_types.SupportedRequireSignatureModelType, session: Session, model_stage_file_path: str, sample_input: model_types.SupportedDataType, @@ -230,8 +229,8 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a local model to a zip file whose path is the provided stage file path with signature inferred from a - sample_input_data. + """Save a model that requires a external signature to a zip file whose path is the + provided stage file path with signature inferred from a sample_input_data. Args: name: Name of the model. 
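For the stage-file overloads above, the model is written to a zip file in a Snowflake stage rather than a local directory. A minimal sketch, assuming an existing Snowpark session, a pre-created stage named MODEL_STAGE, and the EchoModel-style custom model from the earlier sketch (all illustrative).

    import pandas as pd

    from snowflake.ml.model import _model as model_api, custom_model
    from snowflake.snowpark import Session

    def save_to_stage(session: Session, model: custom_model.CustomModel, sample: pd.DataFrame) -> None:
        # model_stage_file_path must point at a file with a .zip extension inside a stage.
        model_api.save_model(
            name="echo",
            model=model,
            session=session,
            model_stage_file_path="@MODEL_STAGE/echo.zip",
            sample_input=sample,
        )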
@@ -328,7 +327,10 @@ def save_model( if ( (signatures is None) and (sample_input is None) - and not type_utils.LazyType("snowflake.ml.modeling.framework.base.BaseEstimator").isinstance(model) + and not ( + type_utils.LazyType("snowflake.ml.modeling.framework.base.BaseEstimator").isinstance(model) + or type_utils.LazyType("mlflow.pyfunc.PyFuncModel").isinstance(model) + ) ) or ((signatures is not None) and (sample_input is not None)): raise ValueError( "Signatures and sample_input both cannot be " @@ -336,7 +338,7 @@ def save_model( ) if not options: - options = {} + options = model_types.BaseModelSaveOption() if model_dir_path: if os.path.exists(model_dir_path): diff --git a/snowflake/ml/model/_model_meta.py b/snowflake/ml/model/_model_meta.py index b656823e..9d81e6fb 100644 --- a/snowflake/ml/model/_model_meta.py +++ b/snowflake/ml/model/_model_meta.py @@ -20,10 +20,13 @@ model_signature, type_hints as model_types, ) +from snowflake.ml.model._signatures import snowpark_handler from snowflake.snowpark import DataFrame as SnowparkDataFrame MODEL_METADATA_VERSION = 1 _BASIC_DEPENDENCIES = _core_requirements.REQUIREMENTS +_SNOWFLAKE_PKG_NAME = "snowflake" +_SNOWFLAKE_ML_PKG_NAME = f"{_SNOWFLAKE_PKG_NAME}.ml" Dependency = namedtuple("Dependency", ["conda_name", "pip_name"]) @@ -81,13 +84,18 @@ def _create_model_metadata( current version would be captured. Defaults to None. **kwargs: Dict of attributes and values of the metadata. Used when loading from file. + Raises: + ValueError: Raised when the code path contains reserved file or directory. + Yields: A model metadata object. """ model_dir_path = os.path.normpath(model_dir_path) embed_local_ml_library = kwargs.pop("embed_local_ml_library", False) + # Use the last one which is loaded first, that is mean, it is loaded from site-packages. + # We could make sure that user does not overwrite our library with their code follow the same naming. + snowml_path = list(importlib.import_module(_SNOWFLAKE_ML_PKG_NAME).__path__)[-1] if embed_local_ml_library: - snowml_path = list(importlib.import_module("snowflake.ml").__path__)[0] kwargs["local_ml_library_version"] = f"{snowml_env.VERSION}+{file_utils.hash_directory(snowml_path)}" model_meta = ModelMetadata( @@ -100,19 +108,25 @@ def _create_model_metadata( signatures=signatures, **kwargs, ) - if code_paths: - code_dir_path = os.path.join(model_dir_path, ModelMetadata.MODEL_CODE_DIR) + + code_dir_path = os.path.join(model_dir_path, ModelMetadata.MODEL_CODE_DIR) + if embed_local_ml_library or code_paths: os.makedirs(code_dir_path, exist_ok=True) - for code_path in code_paths: - file_utils.copy_file_or_tree(code_path, code_dir_path) if embed_local_ml_library: - code_dir_path = os.path.join(model_dir_path, ModelMetadata.MODEL_CODE_DIR) - snowml_path = list(importlib.import_module("snowflake.ml").__path__)[0] - snowml_path_in_code = os.path.join(code_dir_path, "snowflake") + snowml_path_in_code = os.path.join(code_dir_path, _SNOWFLAKE_PKG_NAME) os.makedirs(snowml_path_in_code, exist_ok=True) file_utils.copy_file_or_tree(snowml_path, snowml_path_in_code) + if code_paths: + for code_path in code_paths: + # This part is to prevent users from providing code following our naming and overwrite our code. 
+ if ( + os.path.isfile(code_path) and os.path.splitext(os.path.basename(code_path))[0] == _SNOWFLAKE_PKG_NAME + ) or (os.path.isdir(code_path) and os.path.basename(code_path) == _SNOWFLAKE_PKG_NAME): + raise ValueError("`snowflake` is a reserved name and you cannot contain that into code path.") + file_utils.copy_file_or_tree(code_path, code_dir_path) + try: imported_modules = [] if ext_modules: @@ -146,9 +160,23 @@ def _load_model_metadata(model_dir_path: str) -> "ModelMetadata": if code_path in sys.path: sys.path.remove(code_path) sys.path.insert(0, code_path) - modules = file_utils.get_all_modules(code_path) - for module in modules: - sys.modules.pop(module.name, None) + module_names = file_utils.get_all_modules(code_path) + # If the module_name starts with snowflake, then do not replace it. + # When deploying, we would add them beforehand. + # When in the local, they should not be added. We already prevent user from overwriting us. + module_names = [ + module_name + for module_name in module_names + if not (module_name.startswith(f"{_SNOWFLAKE_PKG_NAME}.") or module_name == _SNOWFLAKE_PKG_NAME) + ] + for module_name in module_names: + actual_module = sys.modules.pop(module_name, None) + if actual_module is not None: + sys.modules[module_name] = importlib.import_module(module_name) + + assert code_path in sys.path + sys.path.remove(code_path) + return meta @@ -244,7 +272,7 @@ def _include_if_absent(self, pkgs: List[Dependency]) -> None: pip_reqs = env_utils.validate_pip_requirement_string_list(list(pip_reqs_str)) conda_reqs = env_utils.validate_conda_dependency_string_list(list(conda_reqs_str)) - for conda_req, pip_req in zip(conda_reqs[""], pip_reqs): + for conda_req, pip_req in zip(conda_reqs[env_utils.DEFAULT_CHANNEL_NAME], pip_reqs): req_to_add = env_utils.get_local_installed_version_of_pip_package(pip_req) req_to_add.name = conda_req.name for added_pip_req in self._pip_requirements: @@ -257,7 +285,9 @@ def _include_if_absent(self, pkgs: List[Dependency]) -> None: category=UserWarning, ) try: - env_utils.append_conda_dependency(self._conda_dependencies, ("", req_to_add)) + env_utils.append_conda_dependency( + self._conda_dependencies, (env_utils.DEFAULT_CHANNEL_NAME, req_to_add) + ) except env_utils.DuplicateDependencyError: pass except env_utils.DuplicateDependencyInMultipleChannelsError: @@ -373,12 +403,12 @@ def load_model_metadata(cls, path: str) -> "ModelMetadata": return meta -def _is_callable(model: model_types.SupportedLocalModelType, method_name: str) -> bool: +def _is_callable(model: model_types.SupportedModelType, method_name: str) -> bool: return callable(getattr(model, method_name, None)) def _validate_signature( - model: model_types.SupportedLocalModelType, + model: model_types.SupportedRequireSignatureModelType, model_meta: ModelMetadata, target_methods: Sequence[str], sample_input: Optional[model_types.SupportedDataType], @@ -397,7 +427,7 @@ def _validate_signature( if isinstance(sample_input, SnowparkDataFrame): # Added because of Any from missing stubs. 
trunc_sample_input = cast(SnowparkDataFrame, trunc_sample_input) - local_sample_input = model_signature._SnowparkDataFrameHandler.convert_to_df(trunc_sample_input) + local_sample_input = snowpark_handler.SnowparkDataFrameHandler.convert_to_df(trunc_sample_input) else: local_sample_input = trunc_sample_input for target_method in target_methods: @@ -408,7 +438,7 @@ def _validate_signature( def _get_target_methods( - model: model_types.SupportedLocalModelType, + model: model_types.SupportedModelType, target_methods: Optional[Sequence[str]], default_target_methods: Sequence[str], ) -> Sequence[str]: diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py index bb19d0de..319aefe8 100644 --- a/snowflake/ml/model/_model_test.py +++ b/snowflake/ml/model/_model_test.py @@ -1,12 +1,17 @@ import asyncio +import importlib import os +import sys import tempfile +import uuid import warnings from typing import List, Tuple, cast from unittest import mock +import mlflow import numpy as np import pandas as pd +import tensorflow as tf import torch import xgboost from absl.testing import absltest @@ -18,6 +23,11 @@ model_signature, type_hints as model_types, ) +from snowflake.ml.model._signatures import ( + pytorch_handler, + tensorflow_handler, + utils as model_signature_utils, +) from snowflake.ml.modeling.linear_model import LinearRegression from snowflake.ml.test_utils import mock_session from snowflake.snowpark import FileOperation, Session @@ -125,6 +135,141 @@ def _prepare_torch_model( return model, data_x, data_y +class SimpleModule(tf.Module): + def __init__(self, name: str = None) -> None: + super().__init__(name=name) + self.a_variable = tf.Variable(5.0, name="train_me") + self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me") + + @tf.function # type: ignore[misc] + def __call__(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: + return [self.a_variable * tensors[0] + self.non_trainable_variable] + + +class KerasModel(tf.keras.Model): + def __init__(self, n_hidden: int, n_out: int) -> None: + super().__init__() + self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") + self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") + + def call(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: + input = tensors[0] + x = self.fc_1(input) + x = self.fc_2(x) + return [x] + + +def _prepare_keras_model( + dtype: tf.dtypes.DType = tf.float32, +) -> Tuple[tf.keras.Model, List[tf.Tensor], List[tf.Tensor]]: + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + data_x = [tf.convert_to_tensor(x, dtype=dtype)] + raw_data_y = tf.random.uniform((batch_size, 1)) + raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) + data_y = [tf.cast(raw_data_y, dtype=dtype)] + + def loss_fn(y_true: List[tf.Tensor], y_pred: List[tf.Tensor]) -> tf.Tensor: + return tf.keras.losses.mse(y_true[0], y_pred[0]) + + model = KerasModel(n_hidden, n_out) + model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=loss_fn) + model.fit(data_x, data_y, batch_size=batch_size, epochs=100) + return model, data_x, data_y + + +PY_SRC = """\ +def get_name(): + return __name__ +def get_file(): + return __file__ +""" + + +class ModelLoadHygieneTest(absltest.TestCase): + def test_model_load_hygiene(self) -> None: + with tempfile.TemporaryDirectory() as workspace: + with tempfile.TemporaryDirectory() as src_path: + fake_mod_dirpath = 
os.path.join(src_path, "fake", "fake_module") + os.makedirs(fake_mod_dirpath) + + py_file_path = os.path.join(fake_mod_dirpath, "p.py") + with open(py_file_path, "w", encoding="utf-8") as f: + f.write(PY_SRC) + f.flush() + + sys.path.insert(0, src_path) + + from fake.fake_module import p + + self.assertEqual(p.__file__, py_file_path) + + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + model_api.save_model( + name="model1", + model_dir_path=os.path.join(workspace, "model1"), + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + code_paths=[os.path.join(src_path, "fake")], + ) + + print(list(os.walk(os.path.join(workspace, "model1")))) + _ = model_api.load_model(model_dir_path=os.path.join(workspace, "model1")) + from fake.fake_module import p + + self.assertEqual(p.__file__, os.path.join(workspace, "model1", "code", "fake", "fake_module", "p.py")) + + importlib.reload(p) + self.assertEqual(p.__file__, py_file_path) + sys.path.remove(src_path) + + def test_model_save_validation(self) -> None: + with tempfile.TemporaryDirectory() as workspace: + with tempfile.TemporaryDirectory() as src_path: + fake_mod_dirpath = os.path.join(src_path, "snowflake", "fake_module") + os.makedirs(fake_mod_dirpath) + + py_file_path = os.path.join(fake_mod_dirpath, "p.py") + with open(py_file_path, "w", encoding="utf-8") as f: + f.write(PY_SRC) + f.flush() + + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + with self.assertRaises(ValueError): + model_api.save_model( + name="model1", + model_dir_path=os.path.join(workspace, "model1"), + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + code_paths=[os.path.join(src_path, "snowflake")], + ) + + with tempfile.TemporaryDirectory() as src_path: + py_file_path = os.path.join(src_path, "snowflake.py") + with open(py_file_path, "w", encoding="utf-8") as f: + f.write(PY_SRC) + f.flush() + + lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + with self.assertRaises(ValueError): + model_api.save_model( + name="model1", + model_dir_path=os.path.join(workspace, "model1"), + model=lm, + sample_input=d, + metadata={"author": "halu", "version": "1"}, + code_paths=[py_file_path], + ) + + class ModelInterfaceTest(absltest.TestCase): def test_save_interface(self) -> None: m_session = mock_session.MockSession(conn=None, test_case=self) @@ -269,7 +414,7 @@ def test_save_interface(self) -> None: with mock.patch.object(model_api, "_save", return_value=None) as mock_save: with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - model_api.save_model( + model_api.save_model( # type:ignore[call-overload] name="model1", model=linear_model.LinearRegression(), session=c_session, @@ -907,8 +1052,8 @@ def test_pytorch(self) -> None: model.eval() y_pred = model.forward(data_x)[0].detach() - x_df = model_signature._rename_pandas_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), + x_df = model_signature_utils.rename_pandas_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), s["forward"].inputs, ) @@ -922,7 +1067,7 @@ def test_pytorch(self) -> None: predict_method = 
getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( predict_method(x_df), s["forward"].outputs )[0], y_pred, @@ -945,7 +1090,7 @@ def test_pytorch(self) -> None: predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ 0 ], y_pred, @@ -977,8 +1122,8 @@ def test_torchscript(self) -> None: model_script.eval() y_pred = model_script.forward(data_x)[0].detach() - x_df = model_signature._rename_pandas_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), + x_df = model_signature_utils.rename_pandas_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), s["forward"].inputs, ) @@ -992,7 +1137,7 @@ def test_torchscript(self) -> None: predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( predict_method(x_df), s["forward"].outputs )[0], y_pred, @@ -1015,7 +1160,7 @@ def test_torchscript(self) -> None: predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df), s["forward"].outputs)[ 0 ], y_pred, @@ -1030,8 +1175,8 @@ def test_torch_df_sample_input(self) -> None: model.eval() y_pred = model.forward(data_x)[0].detach() - x_df = model_signature._rename_pandas_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), + x_df = model_signature_utils.rename_pandas_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), s["forward"].inputs, ) model_api.save_model( @@ -1050,7 +1195,7 @@ def test_torch_df_sample_input(self) -> None: predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred ) model_script.eval() @@ -1072,7 +1217,435 @@ def test_torch_df_sample_input(self) -> None: predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(predict_method(x_df))[0], y_pred + ) + + def test_tensorflow(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + simple_module = SimpleModule(name="simple") + x = [tf.constant([[5.0], [10.0]])] + y_pred = simple_module(x) + s = {"__call__": model_signature.infer_signature(x, y_pred)} + with self.assertRaises(ValueError): + model_api.save_model( + name="model1", + 
model_dir_path=os.path.join(tmpdir, "model1"), + model=simple_module, + signatures={**s, "another_forward": s["__call__"]}, + metadata={"author": "halu", "version": "1"}, + ) + + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=simple_module, + signatures=s, + metadata={"author": "halu", "version": "1"}, + ) + + x_df = model_signature_utils.rename_pandas_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(x, ensure_serializable=False), + s["__call__"].inputs, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + assert callable(m) + tf.assert_equal(m.__call__(x)[0], y_pred[0]) + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + assert callable(m_udf) + tf.assert_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[ + 0 + ], + y_pred[0], + ) + + model_api.save_model( + name="model1_no_sig_1", + model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + model=simple_module, + sample_input=x, + metadata={"author": "halu", "version": "1"}, + ) + + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + assert callable(m) + tf.assert_equal(m(x)[0], y_pred[0]) + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + assert callable(m_udf) + tf.assert_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[0], + y_pred[0], + ) + + model_api.save_model( + name="model1_no_sig_2", + model_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), + model=simple_module, + sample_input=x_df, + metadata={"author": "halu", "version": "1"}, + ) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_2")) + assert callable(m_udf) + tf.assert_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[0], + y_pred[0], + ) + + def test_tensorflow_keras(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + model, data_x, data_y = _prepare_keras_model() + s = {"predict": model_signature.infer_signature(data_x, data_y)} + with self.assertRaises(ValueError): + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=model, + signatures={**s, "another_forward": s["predict"]}, + metadata={"author": "halu", "version": "1"}, + ) + + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=model, + signatures=s, + metadata={"author": "halu", "version": "1"}, + ) + + y_pred = model.predict(data_x)[0] + + x_df = model_signature_utils.rename_pandas_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False), + s["predict"].inputs, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + assert isinstance(m, tf.keras.Model) + tf.debugging.assert_near(m.predict(data_x)[0], y_pred) + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + tf.debugging.assert_near( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + predict_method(x_df), s["predict"].outputs + )[0], + y_pred, + ) + + model_api.save_model( + name="model1_no_sig_1", + 
model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + model=model, + sample_input=data_x, + metadata={"author": "halu", "version": "1"}, + ) + + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + assert isinstance(m, tf.keras.Model) + tf.debugging.assert_near(m.predict(data_x)[0], y_pred) + self.assertEqual(s["predict"], meta.signatures["predict"]) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + tf.debugging.assert_near( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + predict_method(x_df), s["predict"].outputs + )[0], + y_pred, + ) + + def test_mlflow_model(self) -> None: + db = datasets.load_diabetes() + X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) + with mlflow.start_run() as run: + rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3) + rf.fit(X_train, y_train) + + # Use the model to make predictions on the test dataset. + predictions = rf.predict(X_test) + signature = mlflow.models.signature.infer_signature(X_test, predictions) + mlflow.sklearn.log_model( + rf, + "model", + conda_env={ + "channels": ["conda-forge"], + "dependencies": [ + "python=3.8.13", + "pip<=23.0.1", + { + "pip": [ + "mlflow<3,>=2.3", + "cloudpickle==2.0.0", + "numpy==1.23.4", + "psutil==5.9.0", + "scikit-learn==1.2.2", + "scipy==1.9.3", + "typing-extensions==4.5.0", + ] + }, + ], + "name": "mlflow-env", + }, + signature=signature, + metadata={"author": "halu", "version": "1"}, + ) + + run_id = run.info.run_id + + with tempfile.TemporaryDirectory() as tmpdir: + mlflow_pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") + saved_meta = model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=mlflow_pyfunc_model, + ) + + self.assertEqual(saved_meta.python_version, "3.8.13") + self.assertDictEqual(saved_meta.metadata, {"author": "halu", "version": "1"}) + self.assertDictEqual( + saved_meta.signatures, + { + "predict": model_signature.ModelSignature( + inputs=[ + model_signature.FeatureSpec( + name="input_feature_0", dtype=model_signature.DataType.DOUBLE, shape=(10,) + ) + ], + outputs=[ + model_signature.FeatureSpec(name="output_feature_0", dtype=model_signature.DataType.DOUBLE) + ], + ) + }, + ) + self.assertListEqual( + sorted(saved_meta.pip_requirements), + sorted( + [ + "mlflow<3,>=2.3", + "cloudpickle==2.0.0", + "numpy==1.23.4", + "psutil==5.9.0", + "scikit-learn==1.2.2", + "scipy==1.9.3", + "typing-extensions==4.5.0", + ] + ), + ) + self.assertIn("pip<=23.0.1", saved_meta.conda_dependencies) + + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + assert isinstance(m, mlflow.pyfunc.PyFuncModel) + self.assertNotEqual(m.metadata.run_id, run_id) + + _ = model_api.save_model( + name="model1_again", + model_dir_path=os.path.join(tmpdir, "model1_again"), + model=m, + ) + + self.assertEqual(meta.python_version, "3.8.13") + self.assertDictEqual(meta.metadata, {"author": "halu", "version": "1"}) + self.assertDictEqual( + meta.signatures, + { + "predict": model_signature.ModelSignature( + inputs=[ + model_signature.FeatureSpec( + name="input_feature_0", dtype=model_signature.DataType.DOUBLE, shape=(10,) + ) + ], + outputs=[ + model_signature.FeatureSpec(name="output_feature_0", dtype=model_signature.DataType.DOUBLE) + ], + ) + }, + ) + self.assertListEqual( + 
sorted(meta.pip_requirements), + sorted( + [ + "mlflow<3,>=2.3", + "cloudpickle==2.0.0", + "numpy==1.23.4", + "psutil==5.9.0", + "scikit-learn==1.2.2", + "scipy==1.9.3", + "typing-extensions==4.5.0", + ] + ), + ) + self.assertIn("pip<=23.0.1", meta.conda_dependencies) + + np.testing.assert_allclose(predictions, m.predict(X_test)) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + X_df = pd.DataFrame(X_test) + np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_df).to_numpy()) + + def test_mlflow_model_df_inputs(self) -> None: + db = datasets.load_diabetes(as_frame=True) + X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) + with mlflow.start_run() as run: + rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3) + rf.fit(X_train, y_train) + + # Use the model to make predictions on the test dataset. + predictions = rf.predict(X_test) + signature = mlflow.models.signature.infer_signature(X_test, predictions) + mlflow.sklearn.log_model( + rf, + "model", + signature=signature, + ) + + run_id = run.info.run_id + + with tempfile.TemporaryDirectory() as tmpdir: + mlflow_pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") + _ = model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=mlflow_pyfunc_model, + ) + + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + assert isinstance(m, mlflow.pyfunc.PyFuncModel) + self.assertNotEqual(m.metadata.run_id, run_id) + + np.testing.assert_allclose(predictions, m.predict(X_test)) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_test).to_numpy()) + + def test_mlflow_model_bad_case(self) -> None: + db = datasets.load_diabetes(as_frame=True) + X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) + with mlflow.start_run() as run: + rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3) + rf.fit(X_train, y_train) + + # Use the model to make predictions on the test dataset. 
+ predictions = rf.predict(X_test) + signature = mlflow.models.signature.infer_signature(X_test, predictions) + mlflow.sklearn.log_model( + rf, + "model", + signature=signature, + metadata={"author": "halu", "version": "1"}, + ) + + run_id = run.info.run_id + + with tempfile.TemporaryDirectory() as tmpdir: + local_path = mlflow.artifacts.download_artifacts(f"runs:/{run_id}/model", dst_path=tmpdir) + mlflow_pyfunc_model = mlflow.pyfunc.load_model(local_path) + mlflow_pyfunc_model.metadata.run_id = uuid.uuid4().hex.lower() + with self.assertRaisesRegex(ValueError, "Cannot load MLFlow model artifacts."): + _ = model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=mlflow_pyfunc_model, + options={"ignore_mlflow_dependencies": True}, + ) + + saved_meta = model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=mlflow_pyfunc_model, + options={"model_uri": local_path, "ignore_mlflow_dependencies": True}, + ) + + self.assertEmpty(saved_meta.pip_requirements) + + with self.assertRaisesRegex(ValueError, "Cannot load MLFlow model dependencies."): + _ = model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=mlflow_pyfunc_model, + ) + + saved_meta = model_api.save_model( + name="model2", + model_dir_path=os.path.join(tmpdir, "model2"), + model=mlflow_pyfunc_model, + options={"model_uri": local_path, "ignore_mlflow_metadata": True}, + ) + + self.assertIsNone(saved_meta.metadata) + + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model2")) + assert isinstance(m, mlflow.pyfunc.PyFuncModel) + self.assertNotEqual(m.metadata.run_id, run_id) + + np.testing.assert_allclose(predictions, m.predict(X_test)) + + _ = model_api.save_model( + name="model2_again", + model_dir_path=os.path.join(tmpdir, "model2_again"), + model=m, + ) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model2")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_test).to_numpy()) + + def test_mlflow_model_pytorch(self) -> None: + net = torch.nn.Linear(6, 1) + loss_function = torch.nn.L1Loss() + optimizer = torch.optim.Adam(net.parameters(), lr=1e-4) + + X = torch.randn(6) + y = torch.randn(1) + + epochs = 5 + for _ in range(epochs): + optimizer.zero_grad() + outputs = net(X) + + loss = loss_function(outputs, y) + loss.backward() + + optimizer.step() + + with mlflow.start_run(): + signature = mlflow.models.infer_signature(X.numpy(), net(X).detach().numpy()) + model_info = mlflow.pytorch.log_model(net, "model", signature=signature) + + pytorch_pyfunc = mlflow.pyfunc.load_model(model_uri=model_info.model_uri) + input_x = torch.randn(6).numpy() + predictions = pytorch_pyfunc.predict(input_x) + + with tempfile.TemporaryDirectory() as tmpdir: + _ = model_api.save_model( + name="model1", + model_dir_path=os.path.join(tmpdir, "model1"), + model=pytorch_pyfunc, + ) + + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + assert isinstance(m, mlflow.pyfunc.PyFuncModel) + + np.testing.assert_allclose(predictions, m.predict(input_x)) + + m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + predict_method = getattr(m_udf, "predict", None) + assert callable(predict_method) + np.testing.assert_allclose( + np.expand_dims(predictions, axis=1), predict_method(pd.DataFrame(input_x)).to_numpy() ) diff --git 
a/snowflake/ml/model/_signatures/BUILD.bazel b/snowflake/ml/model/_signatures/BUILD.bazel new file mode 100644 index 00000000..6c775c66 --- /dev/null +++ b/snowflake/ml/model/_signatures/BUILD.bazel @@ -0,0 +1,168 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "base_handler", + srcs = ["base_handler.py"], + deps = [ + ":core", + "//snowflake/ml/model:type_hints", + ], +) + +py_library( + name = "core", + srcs = ["core.py"], +) + +py_test( + name = "core_test", + srcs = ["core_test.py"], + deps = [ + ":core", + ], +) + +py_library( + name = "pandas_handler", + srcs = ["pandas_handler.py"], + deps = [ + ":core", + ":base_handler", + ":utils", + "//snowflake/ml/model:type_hints", + ], +) + +py_test( + name = "pandas_test", + srcs = ["pandas_test.py"], + deps = [ + ":core", + ":pandas_handler", + ], +) + +py_library( + name = "numpy_handler", + srcs = ["numpy_handler.py"], + deps = [ + ":core", + ":base_handler", + "//snowflake/ml/model:type_hints", + ], +) + +py_test( + name = "numpy_test", + srcs = ["numpy_test.py"], + deps = [ + ":core", + ":numpy_handler", + ], +) + +py_library( + name = "pytorch_handler", + srcs = ["pytorch_handler.py"], + deps = [ + ":core", + ":base_handler", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/model:type_hints", + ], +) + +py_test( + name = "pytorch_test", + srcs = ["pytorch_test.py"], + deps = [ + ":core", + ":pytorch_handler", + ":utils", + ], +) + +py_library( + name = "tensorflow_handler", + srcs = ["tensorflow_handler.py"], + deps = [ + ":core", + ":base_handler", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/model:type_hints", + ], +) + +py_test( + name = "tensorflow_test", + srcs = ["tensorflow_test.py"], + deps = [ + ":core", + ":tensorflow_handler", + ":utils", + ], +) + +py_library( + name = "builtins_handler", + srcs = ["builtins_handler.py"], + deps = [ + ":core", + ":base_handler", + ":pandas_handler", + "//snowflake/ml/model:type_hints", + ], +) + +py_test( + name = "builtins_test", + srcs = ["builtins_test.py"], + deps = [ + ":core", + ":builtins_handler", + ], +) + +py_library( + name = "utils", + srcs = ["utils.py"], + deps = [ + ":core", + ], +) + +py_test( + name = "utils_test", + srcs = ["utils_test.py"], + deps = [ + ":core", + ":utils", + ], +) + + +py_library( + name = "snowpark_handler", + srcs = ["snowpark_handler.py"], + deps = [ + ":core", + ":base_handler", + ":pandas_handler", + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/model/_deploy_client/warehouse:infer_template", + "//snowflake/ml/model:type_hints", + ], +) + +py_test( + name = "snowpark_test", + srcs = ["snowpark_test.py"], + deps = [ + ":core", + ":snowpark_handler", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/utils:connection_params", + ], +) diff --git a/snowflake/ml/model/_signatures/base_handler.py b/snowflake/ml/model/_signatures/base_handler.py new file mode 100644 index 00000000..fa83041d --- /dev/null +++ b/snowflake/ml/model/_signatures/base_handler.py @@ -0,0 +1,47 @@ +from abc import ABC, abstractmethod +from typing import Final, Generic, Literal, Sequence + +import pandas as pd +from typing_extensions import TypeGuard + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import core + + +class BaseDataHandler(ABC, Generic[model_types._DataType]): + FEATURE_PREFIX: Final[str] = "feature" + INPUT_PREFIX: Final[str] = "input" + OUTPUT_PREFIX: Final[str] = "output" + 
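A rough sketch of how these handler hooks are typically chained when inferring a signature from sample data; the driver function below is illustrative only and not part of the package:

def infer_from_sample(handler, data, role="input"):
    # Illustrative driver: the handler that claims the data validates it, truncates it
    # to SIG_INFER_ROWS_COUNT_LIMIT rows, and infers one feature spec per column.
    assert handler.can_handle(data)
    handler.validate(data)
    sample = handler.truncate(data)
    return handler.infer_signature(sample, role=role)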
SIG_INFER_ROWS_COUNT_LIMIT: Final[int] = 10 + + @staticmethod + @abstractmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._DataType]: + ... + + @staticmethod + @abstractmethod + def count(data: model_types._DataType) -> int: + ... + + @staticmethod + @abstractmethod + def truncate(data: model_types._DataType) -> model_types._DataType: + ... + + @staticmethod + @abstractmethod + def validate(data: model_types._DataType) -> None: + ... + + @staticmethod + @abstractmethod + def infer_signature( + data: model_types._DataType, role: Literal["input", "output"] + ) -> Sequence[core.BaseFeatureSpec]: + ... + + @staticmethod + @abstractmethod + def convert_to_df(data: model_types._DataType, ensure_serializable: bool = True) -> pd.DataFrame: + ... diff --git a/snowflake/ml/model/_signatures/builtins_handler.py b/snowflake/ml/model/_signatures/builtins_handler.py new file mode 100644 index 00000000..9129059c --- /dev/null +++ b/snowflake/ml/model/_signatures/builtins_handler.py @@ -0,0 +1,46 @@ +from typing import Literal, Sequence + +import pandas as pd +from typing_extensions import TypeGuard + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import base_handler, core, pandas_handler + + +class ListOfBuiltinHandler(base_handler.BaseDataHandler[model_types._SupportedBuiltinsList]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedBuiltinsList]: + return ( + isinstance(data, list) + and len(data) > 0 + and all(isinstance(data_col, (int, float, bool, str, bytes, list)) for data_col in data) + ) + + @staticmethod + def count(data: model_types._SupportedBuiltinsList) -> int: + return len(data) + + @staticmethod + def truncate(data: model_types._SupportedBuiltinsList) -> model_types._SupportedBuiltinsList: + return data[: min(ListOfBuiltinHandler.count(data), ListOfBuiltinHandler.SIG_INFER_ROWS_COUNT_LIMIT)] + + @staticmethod + def validate(data: model_types._SupportedBuiltinsList) -> None: + if not all(isinstance(data_row, type(data[0])) for data_row in data): + raise ValueError(f"Data Validation Error: Inconsistent type of object found in data {data}.") + df = pd.DataFrame(data) + if df.isnull().values.any(): + raise ValueError(f"Data Validation Error: Ill-shaped list data {data} confronted.") + + @staticmethod + def infer_signature( + data: model_types._SupportedBuiltinsList, role: Literal["input", "output"] + ) -> Sequence[core.BaseFeatureSpec]: + return pandas_handler.PandasDataFrameHandler.infer_signature(pd.DataFrame(data), role) + + @staticmethod + def convert_to_df( + data: model_types._SupportedBuiltinsList, + ensure_serializable: bool = True, + ) -> pd.DataFrame: + return pd.DataFrame(data) diff --git a/snowflake/ml/model/_signatures/builtins_test.py b/snowflake/ml/model/_signatures/builtins_test.py new file mode 100644 index 00000000..0e4e7a6f --- /dev/null +++ b/snowflake/ml/model/_signatures/builtins_test.py @@ -0,0 +1,68 @@ +import pandas as pd +from absl.testing import absltest + +from snowflake.ml.model._signatures import builtins_handler, core + + +class ListOfBuiltinsHandlerTest(absltest.TestCase): + def test_validate_list_builtins(self) -> None: + lt6 = ["Hello", [2, 3]] + with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in data"): + builtins_handler.ListOfBuiltinHandler.validate(lt6) # type:ignore[arg-type] + + lt7 = [[1], [2, 3]] + with self.assertRaisesRegex(ValueError, "Ill-shaped list data"): + 
builtins_handler.ListOfBuiltinHandler.validate(lt7) + + lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] + self.assertFalse(builtins_handler.ListOfBuiltinHandler.can_handle(lt8)) + + def test_infer_signature_list_builtins(self) -> None: + lt1 = [1, 2, 3, 4] + self.assertListEqual( + builtins_handler.ListOfBuiltinHandler.infer_signature(lt1, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64)], + ) + + lt2 = ["a", "b", "c", "d"] + self.assertListEqual( + builtins_handler.ListOfBuiltinHandler.infer_signature(lt2, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.STRING)], + ) + + lt3 = [ele.encode() for ele in lt2] + self.assertListEqual( + builtins_handler.ListOfBuiltinHandler.infer_signature(lt3, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.BYTES)], + ) + + lt4 = [[1, 2], [3, 4]] + self.assertListEqual( + builtins_handler.ListOfBuiltinHandler.infer_signature(lt4, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64), + core.FeatureSpec("input_feature_1", core.DataType.INT64), + ], + ) + + lt5 = [[1, 2.0], [3, 4]] # This is not encouraged and will have type error, but we support it. + self.assertListEqual( + builtins_handler.ListOfBuiltinHandler.infer_signature(lt5, role="input"), # type:ignore[arg-type] + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64), + core.FeatureSpec("input_feature_1", core.DataType.DOUBLE), + ], + ) + + lt6 = [[[1, 1], [2, 2]], [[3, 3], [4, 4]]] + self.assertListEqual( + builtins_handler.ListOfBuiltinHandler.infer_signature(lt6, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64, shape=(2,)), + core.FeatureSpec("input_feature_1", core.DataType.INT64, shape=(2,)), + ], + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_signatures/core.py b/snowflake/ml/model/_signatures/core.py new file mode 100644 index 00000000..1f07b5c0 --- /dev/null +++ b/snowflake/ml/model/_signatures/core.py @@ -0,0 +1,470 @@ +import textwrap +from abc import ABC, abstractmethod +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + final, +) + +import numpy as np +import numpy.typing as npt + +import snowflake.snowpark.types as spt + +if TYPE_CHECKING: + import mlflow + import torch + + +class DataType(Enum): + def __init__(self, value: str, snowpark_type: Type[spt.DataType], numpy_type: npt.DTypeLike) -> None: + self._value = value + self._snowpark_type = snowpark_type + self._numpy_type = numpy_type + + INT8 = ("int8", spt.ByteType, np.int8) + INT16 = ("int16", spt.ShortType, np.int16) + INT32 = ("int32", spt.IntegerType, np.int32) + INT64 = ("int64", spt.LongType, np.int64) + + FLOAT = ("float", spt.FloatType, np.float32) + DOUBLE = ("double", spt.DoubleType, np.float64) + + UINT8 = ("uint8", spt.ByteType, np.uint8) + UINT16 = ("uint16", spt.ShortType, np.uint16) + UINT32 = ("uint32", spt.IntegerType, np.uint32) + UINT64 = ("uint64", spt.LongType, np.uint64) + + BOOL = ("bool", spt.BooleanType, np.bool_) + STRING = ("string", spt.StringType, np.str_) + BYTES = ("bytes", spt.BinaryType, np.bytes_) + + def as_snowpark_type(self) -> spt.DataType: + """Convert to corresponding Snowpark Type. + + Returns: + A Snowpark type. 
+ """ + return self._snowpark_type() + + def __repr__(self) -> str: + return f"DataType.{self.name}" + + @classmethod + def from_numpy_type(cls, np_type: npt.DTypeLike) -> "DataType": + """Translate numpy dtype to DataType for signature definition. + + Args: + np_type: The numpy dtype. + + Raises: + NotImplementedError: Raised when the given numpy type is not supported. + + Returns: + Corresponding DataType. + """ + np_to_snowml_type_mapping = {i._numpy_type: i for i in DataType} + for potential_type in np_to_snowml_type_mapping.keys(): + if np.can_cast(np_type, potential_type, casting="no"): + # This is used since the same dtype might represented in different ways. + return np_to_snowml_type_mapping[potential_type] + raise NotImplementedError(f"Type {np_type} is not supported as a DataType.") + + @classmethod + def from_torch_type(cls, torch_type: "torch.dtype") -> "DataType": + import torch + + """Translate torch dtype to DataType for signature definition. + + Args: + torch_type: The torch dtype. + + Returns: + Corresponding DataType. + """ + torch_dtype_to_numpy_dtype_mapping = { + torch.uint8: np.uint8, + torch.int8: np.int8, + torch.int16: np.int16, + torch.int32: np.int32, + torch.int64: np.int64, + torch.float32: np.float32, + torch.float64: np.float64, + torch.bool: np.bool_, + } + return cls.from_numpy_type(torch_dtype_to_numpy_dtype_mapping[torch_type]) + + @classmethod + def from_snowpark_type(cls, snowpark_type: spt.DataType) -> "DataType": + """Translate snowpark type to DataType for signature definition. + + Args: + snowpark_type: The snowpark type. + + Raises: + NotImplementedError: Raised when the given numpy type is not supported. + + Returns: + Corresponding DataType. + """ + if isinstance(snowpark_type, spt.ArrayType): + actual_sp_type = snowpark_type.element_type + else: + actual_sp_type = snowpark_type + + snowpark_to_snowml_type_mapping: Dict[Type[spt.DataType], DataType] = { + i._snowpark_type: i + for i in DataType + # We by default infer as signed integer. + if i not in [DataType.UINT8, DataType.UINT16, DataType.UINT32, DataType.UINT64] + } + for potential_type in snowpark_to_snowml_type_mapping.keys(): + if isinstance(actual_sp_type, potential_type): + return snowpark_to_snowml_type_mapping[potential_type] + # Fallback for decimal type. + if isinstance(snowpark_type, spt.DecimalType): + if snowpark_type.scale == 0: + return DataType.INT64 + raise NotImplementedError(f"Type {snowpark_type} is not supported as a DataType.") + + def is_same_snowpark_type(self, incoming_snowpark_type: spt.DataType) -> bool: + """Check if provided snowpark type is the same as Data Type. + + Args: + incoming_snowpark_type: The snowpark type. + + Raises: + NotImplementedError: Raised when the given numpy type is not supported. + + Returns: + If the provided snowpark type is the same as the DataType. + """ + # Special handle for Decimal Type. 
+ if isinstance(incoming_snowpark_type, spt.DecimalType): + if incoming_snowpark_type.scale == 0: + return self == DataType.INT64 or self == DataType.UINT64 + raise NotImplementedError(f"Type {incoming_snowpark_type} is not supported as a DataType.") + + return isinstance(incoming_snowpark_type, self._snowpark_type) + + +class BaseFeatureSpec(ABC): + """Abstract Class for specification of a feature.""" + + def __init__(self, name: str) -> None: + self._name = name + + @final + @property + def name(self) -> str: + """Name of the feature.""" + return self._name + + @abstractmethod + def as_snowpark_type(self) -> spt.DataType: + """Convert to corresponding Snowpark Type.""" + pass + + @abstractmethod + def to_dict(self) -> Dict[str, Any]: + """Serialization""" + pass + + @classmethod + @abstractmethod + def from_dict(self, input_dict: Dict[str, Any]) -> "BaseFeatureSpec": + """Deserialization""" + pass + + +class FeatureSpec(BaseFeatureSpec): + """Specification of a feature in Snowflake native model packaging.""" + + def __init__( + self, + name: str, + dtype: DataType, + shape: Optional[Tuple[int, ...]] = None, + ) -> None: + """Initialize a feature. + + Args: + name: Name of the feature. + dtype: Type of the elements in the feature. + shape: Used to represent scalar feature, 1-d feature list or n-d tensor. + -1 is used to represent variable length.Defaults to None. + + E.g. + None: scalar + (2,): 1d list with fixed len of 2. + (-1,): 1d list with variable length. Used for ragged tensor representation. + (d1, d2, d3): 3d tensor. + + Raises: + TypeError: Raised when the dtype input type is incorrect. + TypeError: Raised when the shape input type is incorrect. + """ + super().__init__(name=name) + + if not isinstance(dtype, DataType): + raise TypeError("dtype should be a model signature datatype.") + self._dtype = dtype + + if shape and not isinstance(shape, tuple): + raise TypeError("Shape should be a tuple if presented.") + self._shape = shape + + def as_snowpark_type(self) -> spt.DataType: + result_type = self._dtype.as_snowpark_type() + if not self._shape: + return result_type + for _ in range(len(self._shape)): + result_type = spt.ArrayType(result_type) + return result_type + + def as_dtype(self) -> npt.DTypeLike: + """Convert to corresponding local Type.""" + if not self._shape: + return self._dtype._numpy_type + return np.object_ + + def __eq__(self, other: object) -> bool: + if isinstance(other, FeatureSpec): + return self._name == other._name and self._dtype == other._dtype and self._shape == other._shape + else: + return False + + def __repr__(self) -> str: + shape_str = f", shape={repr(self._shape)}" if self._shape else "" + return f"FeatureSpec(dtype={repr(self._dtype)}, name={repr(self._name)}{shape_str})" + + def to_dict(self) -> Dict[str, Any]: + """Serialize the feature group into a dict. + + Returns: + A dict that serializes the feature group. + """ + base_dict: Dict[str, Any] = { + "type": self._dtype.name, + "name": self._name, + } + if self._shape is not None: + base_dict["shape"] = self._shape + return base_dict + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> "FeatureSpec": + """Deserialize the feature specification from a dict. + + Args: + input_dict: The dict containing information of the feature specification. + + Returns: + A feature specification instance deserialized and created from the dict. 
+ """ + name = input_dict["name"] + shape = input_dict.get("shape", None) + if shape: + shape = tuple(shape) + type = DataType[input_dict["type"]] + return FeatureSpec(name=name, dtype=type, shape=shape) + + @classmethod + def from_mlflow_spec( + cls, input_spec: Union["mlflow.types.ColSpec", "mlflow.types.TensorSpec"], feature_name: str + ) -> "FeatureSpec": + import mlflow + + if isinstance(input_spec, mlflow.types.ColSpec): + name = input_spec.name + if name is None: + name = feature_name + return FeatureSpec(name=name, dtype=DataType.from_numpy_type(input_spec.type.to_numpy())) + elif isinstance(input_spec, mlflow.types.TensorSpec): + if len(input_spec.shape) == 1: + shape = None + else: + shape = tuple(input_spec.shape[1:]) + + name = input_spec.name + if name is None: + name = feature_name + return FeatureSpec(name=name, dtype=DataType.from_numpy_type(input_spec.type), shape=shape) + else: + raise NotImplementedError(f"MLFlow schema type {type(input_spec)} is not supported.") + + +class FeatureGroupSpec(BaseFeatureSpec): + """Specification of a group of features in Snowflake native model packaging.""" + + def __init__(self, name: str, specs: List[FeatureSpec]) -> None: + """Initialize a feature group. + + Args: + name: Name of the feature group. + specs: A list of feature specifications that composes the group. All children feature specs have to have + name. And all of them should have the same type. + """ + super().__init__(name=name) + self._specs = specs + self._validate() + + def _validate(self) -> None: + if len(self._specs) == 0: + raise ValueError("No children feature specs.") + # each has to have name, and same type + if not all(s._name is not None for s in self._specs): + raise ValueError("All children feature specs have to have name.") + if not (all(s._shape is None for s in self._specs) or all(s._shape is not None for s in self._specs)): + raise ValueError("All children feature specs have to have same shape.") + first_type = self._specs[0]._dtype + if not all(s._dtype == first_type for s in self._specs): + raise ValueError("All children feature specs have to have same type.") + + def as_snowpark_type(self) -> spt.DataType: + first_type = self._specs[0].as_snowpark_type() + return spt.MapType(spt.StringType(), first_type) + + def __eq__(self, other: object) -> bool: + if isinstance(other, FeatureGroupSpec): + return self._specs == other._specs + else: + return False + + def __repr__(self) -> str: + spec_strs = ",\n\t\t".join(repr(spec) for spec in self._specs) + return textwrap.dedent( + f"""FeatureGroupSpec( + name={repr(self._name)}, + specs=[ + {spec_strs} + ] + ) + """ + ) + + def to_dict(self) -> Dict[str, Any]: + """Serialize the feature group into a dict. + + Returns: + A dict that serializes the feature group. + """ + return {"feature_group": {"name": self._name, "specs": [s.to_dict() for s in self._specs]}} + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> "FeatureGroupSpec": + """Deserialize the feature group from a dict. + + Args: + input_dict: The dict containing information of the feature group. + + Returns: + A feature group instance deserialized and created from the dict. 
+ """ + specs = [] + for e in input_dict["feature_group"]["specs"]: + spec = FeatureSpec.from_dict(e) + specs.append(spec) + return FeatureGroupSpec(name=input_dict["feature_group"]["name"], specs=specs) + + +class ModelSignature: + """Signature of a model that specifies the input and output of a model.""" + + def __init__(self, inputs: Sequence[BaseFeatureSpec], outputs: Sequence[BaseFeatureSpec]) -> None: + """Initialize a model signature + + Args: + inputs: A sequence of feature specifications and feature group specifications that will compose the + input of the model. + outputs: A sequence of feature specifications and feature group specifications that will compose the + output of the model. + """ + self._inputs = inputs + self._outputs = outputs + + @property + def inputs(self) -> Sequence[BaseFeatureSpec]: + """Inputs of the model, containing a sequence of feature specifications and feature group specifications.""" + return self._inputs + + @property + def outputs(self) -> Sequence[BaseFeatureSpec]: + """Outputs of the model, containing a sequence of feature specifications and feature group specifications.""" + return self._outputs + + def __eq__(self, other: object) -> bool: + if isinstance(other, ModelSignature): + return self._inputs == other._inputs and self._outputs == other._outputs + else: + return False + + def to_dict(self) -> Dict[str, Any]: + """Generate a dict to represent the whole signature. + + Returns: + A dict that serializes the signature. + """ + + return { + "inputs": [spec.to_dict() for spec in self._inputs], + "outputs": [spec.to_dict() for spec in self._outputs], + } + + @classmethod + def from_dict(cls, loaded: Dict[str, Any]) -> "ModelSignature": + """Create a signature given the dict containing specifications of children features and feature groups. + + Args: + loaded: The dict to be deserialized. + + Returns: + A signature deserialized and created from the dict. 
+ """ + sig_outs = loaded["outputs"] + sig_inputs = loaded["inputs"] + + deserialize_spec: Callable[[Dict[str, Any]], BaseFeatureSpec] = ( + lambda sig_spec: FeatureGroupSpec.from_dict(sig_spec) + if "feature_group" in sig_spec + else FeatureSpec.from_dict(sig_spec) + ) + + return ModelSignature( + inputs=[deserialize_spec(s) for s in sig_inputs], outputs=[deserialize_spec(s) for s in sig_outs] + ) + + def __repr__(self) -> str: + inputs_spec_strs = ",\n\t\t".join(repr(spec) for spec in self._inputs) + outputs_spec_strs = ",\n\t\t".join(repr(spec) for spec in self._outputs) + return textwrap.dedent( + f"""ModelSignature( + inputs=[ + {inputs_spec_strs} + ], + outputs=[ + {outputs_spec_strs} + ] + )""" + ) + + @classmethod + def from_mlflow_sig(cls, mlflow_sig: "mlflow.models.ModelSignature") -> "ModelSignature": + return ModelSignature( + inputs=[ + FeatureSpec.from_mlflow_spec(spec, f"input_feature_{idx}") for idx, spec in enumerate(mlflow_sig.inputs) + ], + outputs=[ + FeatureSpec.from_mlflow_spec(spec, f"output_feature_{idx}") + for idx, spec in enumerate(mlflow_sig.outputs) + ], + ) diff --git a/snowflake/ml/model/_signatures/core_test.py b/snowflake/ml/model/_signatures/core_test.py new file mode 100644 index 00000000..a6f065b8 --- /dev/null +++ b/snowflake/ml/model/_signatures/core_test.py @@ -0,0 +1,164 @@ +import numpy as np +from absl.testing import absltest + +import snowflake.snowpark.types as spt +from snowflake.ml.model._signatures import core + + +class DataTypeTest(absltest.TestCase): + def test_numpy_type(self) -> None: + data = np.array([1, 2, 3, 4]) + self.assertEqual(core.DataType.INT64, core.DataType.from_numpy_type(data.dtype)) + + data = np.array(["a", "b", "c", "d"]) + self.assertEqual(core.DataType.STRING, core.DataType.from_numpy_type(data.dtype)) + + def test_snowpark_type(self) -> None: + self.assertEqual(core.DataType.INT8, core.DataType.from_snowpark_type(spt.ByteType())) + self.assertEqual(core.DataType.INT16, core.DataType.from_snowpark_type(spt.ShortType())) + self.assertEqual(core.DataType.INT32, core.DataType.from_snowpark_type(spt.IntegerType())) + self.assertEqual(core.DataType.INT64, core.DataType.from_snowpark_type(spt.LongType())) + + self.assertEqual(core.DataType.INT64, core.DataType.from_snowpark_type(spt.DecimalType(38, 0))) + + self.assertEqual(core.DataType.FLOAT, core.DataType.from_snowpark_type(spt.FloatType())) + self.assertEqual(core.DataType.DOUBLE, core.DataType.from_snowpark_type(spt.DoubleType())) + + with self.assertRaises(NotImplementedError): + core.DataType.from_snowpark_type(spt.DecimalType(38, 6)) + + self.assertEqual(core.DataType.BOOL, core.DataType.from_snowpark_type(spt.BooleanType())) + self.assertEqual(core.DataType.STRING, core.DataType.from_snowpark_type(spt.StringType())) + self.assertEqual(core.DataType.BYTES, core.DataType.from_snowpark_type(spt.BinaryType())) + + self.assertTrue(core.DataType.INT64.is_same_snowpark_type(spt.LongType())) + self.assertTrue(core.DataType.INT32.is_same_snowpark_type(spt.IntegerType())) + self.assertTrue(core.DataType.INT16.is_same_snowpark_type(spt.ShortType())) + self.assertTrue(core.DataType.INT8.is_same_snowpark_type(spt.ByteType())) + self.assertTrue(core.DataType.UINT64.is_same_snowpark_type(spt.LongType())) + self.assertTrue(core.DataType.UINT32.is_same_snowpark_type(spt.IntegerType())) + self.assertTrue(core.DataType.UINT16.is_same_snowpark_type(spt.ShortType())) + self.assertTrue(core.DataType.UINT8.is_same_snowpark_type(spt.ByteType())) + + 
self.assertTrue(core.DataType.FLOAT.is_same_snowpark_type(spt.FloatType())) + self.assertTrue(core.DataType.DOUBLE.is_same_snowpark_type(spt.DoubleType())) + + self.assertTrue(core.DataType.INT64.is_same_snowpark_type(incoming_snowpark_type=spt.DecimalType(38, 0))) + self.assertTrue(core.DataType.UINT64.is_same_snowpark_type(incoming_snowpark_type=spt.DecimalType(38, 0))) + + +class FeatureSpecTest(absltest.TestCase): + def test_feature_spec(self) -> None: + ft = core.FeatureSpec(name="feature", dtype=core.DataType.INT64) + self.assertEqual(ft, eval(repr(ft), core.__dict__)) + self.assertEqual(ft, core.FeatureSpec.from_dict(ft.to_dict())) + self.assertEqual(ft.as_snowpark_type(), spt.LongType()) + + ft = core.FeatureSpec(name="feature", dtype=core.DataType.INT64, shape=(2,)) + self.assertEqual(ft, eval(repr(ft), core.__dict__)) + self.assertEqual(ft, core.FeatureSpec.from_dict(input_dict=ft.to_dict())) + self.assertEqual(ft.as_snowpark_type(), spt.ArrayType(spt.LongType())) + + +class FeatureGroupSpecTest(absltest.TestCase): + def test_feature_group_spec(self) -> None: + with self.assertRaisesRegex(ValueError, "No children feature specs."): + _ = core.FeatureGroupSpec(name="features", specs=[]) + + with self.assertRaisesRegex(ValueError, "All children feature specs have to have name."): + ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) + ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.INT64) + ft2._name = None # type: ignore[assignment] + _ = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) + + with self.assertRaisesRegex(ValueError, "All children feature specs have to have same type."): + ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) + ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.FLOAT) + _ = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) + + with self.assertRaisesRegex(ValueError, "All children feature specs have to have same shape."): + ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) + ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.INT64, shape=(2,)) + fts = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) + + ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64) + ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.INT64) + fts = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) + self.assertEqual(fts, eval(repr(fts), core.__dict__)) + self.assertEqual(fts, core.FeatureGroupSpec.from_dict(fts.to_dict())) + self.assertEqual(fts.as_snowpark_type(), spt.MapType(spt.StringType(), spt.LongType())) + + ft1 = core.FeatureSpec(name="feature1", dtype=core.DataType.INT64, shape=(3,)) + ft2 = core.FeatureSpec(name="feature2", dtype=core.DataType.INT64, shape=(2,)) + fts = core.FeatureGroupSpec(name="features", specs=[ft1, ft2]) + self.assertEqual(fts, eval(repr(fts), core.__dict__)) + self.assertEqual(fts, core.FeatureGroupSpec.from_dict(fts.to_dict())) + self.assertEqual(fts.as_snowpark_type(), spt.MapType(spt.StringType(), spt.ArrayType(spt.LongType()))) + + +class ModelSignatureTest(absltest.TestCase): + def test_1(self) -> None: + s = core.ModelSignature( + inputs=[ + core.FeatureSpec(dtype=core.DataType.FLOAT, name="c1"), + core.FeatureGroupSpec( + name="cg1", + specs=[ + core.FeatureSpec( + dtype=core.DataType.FLOAT, + name="cc1", + ), + core.FeatureSpec( + dtype=core.DataType.FLOAT, + name="cc2", + ), + ], + ), + core.FeatureSpec(dtype=core.DataType.FLOAT, name="c2", shape=(-1,)), + ], + outputs=[core.FeatureSpec(name="output", 
dtype=core.DataType.FLOAT)], + ) + target = { + "inputs": [ + {"type": "FLOAT", "name": "c1"}, + { + "feature_group": { + "name": "cg1", + "specs": [{"type": "FLOAT", "name": "cc1"}, {"type": "FLOAT", "name": "cc2"}], + } + }, + {"type": "FLOAT", "name": "c2", "shape": (-1,)}, + ], + "outputs": [{"type": "FLOAT", "name": "output"}], + } + self.assertDictEqual(s.to_dict(), target) + self.assertEqual(s, eval(repr(s), core.__dict__)) + self.assertEqual(s, core.ModelSignature.from_dict(s.to_dict())) + + def test_2(self) -> None: + s = core.ModelSignature( + inputs=[ + core.FeatureSpec(dtype=core.DataType.FLOAT, name="c1"), + core.FeatureGroupSpec( + name="cg1", + specs=[ + core.FeatureSpec( + dtype=core.DataType.FLOAT, + name="cc1", + ), + core.FeatureSpec( + dtype=core.DataType.FLOAT, + name="cc2", + ), + ], + ), + core.FeatureSpec(dtype=core.DataType.FLOAT, name="c2", shape=(-1,)), + ], + outputs=[core.FeatureSpec(name="output", dtype=core.DataType.FLOAT)], + ) + self.assertEqual(s, eval(repr(s), core.__dict__)) + self.assertEqual(s, core.ModelSignature.from_dict(s.to_dict())) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_signatures/numpy_handler.py b/snowflake/ml/model/_signatures/numpy_handler.py new file mode 100644 index 00000000..73347474 --- /dev/null +++ b/snowflake/ml/model/_signatures/numpy_handler.py @@ -0,0 +1,123 @@ +from typing import List, Literal, Sequence + +import numpy as np +import pandas as pd +from typing_extensions import TypeGuard + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import base_handler, core + + +class NumpyArrayHandler(base_handler.BaseDataHandler[model_types._SupportedNumpyArray]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedNumpyArray]: + return isinstance(data, np.ndarray) + + @staticmethod + def count(data: model_types._SupportedNumpyArray) -> int: + return data.shape[0] + + @staticmethod + def truncate(data: model_types._SupportedNumpyArray) -> model_types._SupportedNumpyArray: + return data[: min(NumpyArrayHandler.count(data), NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)] + + @staticmethod + def validate(data: model_types._SupportedNumpyArray) -> None: + if data.shape == (0,): + # Empty array + raise ValueError("Data Validation Error: Empty data is found.") + + if data.shape == (): + # scalar + raise ValueError("Data Validation Error: Scalar data is found.") + + @staticmethod + def infer_signature( + data: model_types._SupportedNumpyArray, role: Literal["input", "output"] + ) -> Sequence[core.BaseFeatureSpec]: + feature_prefix = f"{NumpyArrayHandler.FEATURE_PREFIX}_" + dtype = core.DataType.from_numpy_type(data.dtype) + role_prefix = (NumpyArrayHandler.INPUT_PREFIX if role == "input" else NumpyArrayHandler.OUTPUT_PREFIX) + "_" + if len(data.shape) == 1: + return [core.FeatureSpec(dtype=dtype, name=f"{role_prefix}{feature_prefix}0")] + else: + # For high-dimension array, 0-axis is for batch, 1-axis is for column, further more is details of columns. 
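Concretely, and mirroring the accompanying numpy_test case, a (2, 2, 2) array is read as two rows with two columns, each column holding a length-2 tensor:

import numpy as np
from snowflake.ml.model._signatures import core, numpy_handler

arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])
numpy_handler.NumpyArrayHandler.infer_signature(arr, role="input")
# -> [core.FeatureSpec("input_feature_0", core.DataType.INT64, shape=(2,)),
#     core.FeatureSpec("input_feature_1", core.DataType.INT64, shape=(2,))]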
+ features = [] + n_cols = data.shape[1] + ft_names = [f"{role_prefix}{feature_prefix}{i}" for i in range(n_cols)] + for col_data, ft_name in zip(data[0], ft_names): + if isinstance(col_data, np.ndarray): + ft_shape = np.shape(col_data) + features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) + else: + features.append(core.FeatureSpec(dtype=dtype, name=ft_name)) + return features + + @staticmethod + def convert_to_df(data: model_types._SupportedNumpyArray, ensure_serializable: bool = True) -> pd.DataFrame: + if len(data.shape) == 1: + data = np.expand_dims(data, axis=1) + n_cols = data.shape[1] + if len(data.shape) == 2: + return pd.DataFrame(data) + else: + n_rows = data.shape[0] + if ensure_serializable: + return pd.DataFrame(data={i: [data[k, i].tolist() for k in range(n_rows)] for i in range(n_cols)}) + return pd.DataFrame(data={i: [list(data[k, i]) for k in range(n_rows)] for i in range(n_cols)}) + + +class SeqOfNumpyArrayHandler(base_handler.BaseDataHandler[Sequence[model_types._SupportedNumpyArray]]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Sequence[model_types._SupportedNumpyArray]]: + if not isinstance(data, list): + return False + if len(data) == 0: + return False + if isinstance(data[0], np.ndarray): + return all(isinstance(data_col, np.ndarray) for data_col in data) + return False + + @staticmethod + def count(data: Sequence[model_types._SupportedNumpyArray]) -> int: + return min(NumpyArrayHandler.count(data_col) for data_col in data) + + @staticmethod + def truncate(data: Sequence[model_types._SupportedNumpyArray]) -> Sequence[model_types._SupportedNumpyArray]: + return [ + data_col[: min(SeqOfNumpyArrayHandler.count(data), SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)] + for data_col in data + ] + + @staticmethod + def validate(data: Sequence[model_types._SupportedNumpyArray]) -> None: + for data_col in data: + NumpyArrayHandler.validate(data_col) + + @staticmethod + def infer_signature( + data: Sequence[model_types._SupportedNumpyArray], role: Literal["input", "output"] + ) -> Sequence[core.BaseFeatureSpec]: + feature_prefix = f"{SeqOfNumpyArrayHandler.FEATURE_PREFIX}_" + features: List[core.BaseFeatureSpec] = [] + role_prefix = ( + SeqOfNumpyArrayHandler.INPUT_PREFIX if role == "input" else SeqOfNumpyArrayHandler.OUTPUT_PREFIX + ) + "_" + + for i, data_col in enumerate(data): + dtype = core.DataType.from_numpy_type(data_col.dtype) + ft_name = f"{role_prefix}{feature_prefix}{i}" + if len(data_col.shape) == 1: + features.append(core.FeatureSpec(dtype=dtype, name=ft_name)) + else: + ft_shape = tuple(data_col.shape[1:]) + features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) + return features + + @staticmethod + def convert_to_df( + data: Sequence[model_types._SupportedNumpyArray], ensure_serializable: bool = True + ) -> pd.DataFrame: + if ensure_serializable: + return pd.DataFrame(data={i: data_col.tolist() for i, data_col in enumerate(data)}) + return pd.DataFrame(data={i: list(data_col) for i, data_col in enumerate(data)}) diff --git a/snowflake/ml/model/_signatures/numpy_test.py b/snowflake/ml/model/_signatures/numpy_test.py new file mode 100644 index 00000000..db0d27ee --- /dev/null +++ b/snowflake/ml/model/_signatures/numpy_test.py @@ -0,0 +1,184 @@ +import numpy as np +import pandas as pd +from absl.testing import absltest + +from snowflake.ml.model._signatures import core, numpy_handler + + +class NumpyArrayHandlerTest(absltest.TestCase): + def test_validate_np_ndarray(self) -> 
None: + arr = np.array([]) + with self.assertRaisesRegex(ValueError, "Empty data is found."): + numpy_handler.NumpyArrayHandler.validate(arr) + + arr = np.array(1) + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + numpy_handler.NumpyArrayHandler.validate(arr) + + def test_trunc_np_ndarray(self) -> None: + arr = np.array([1] * (numpy_handler.NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1)) + + np.testing.assert_equal( + np.array([1] * (numpy_handler.NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)), + numpy_handler.NumpyArrayHandler.truncate(arr), + ) + + arr = np.array([1] * (numpy_handler.NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)) + + np.testing.assert_equal( + arr, + numpy_handler.NumpyArrayHandler.truncate(arr), + ) + + def test_infer_schema_np_ndarray(self) -> None: + arr = np.array([1, 2, 3, 4]) + self.assertListEqual( + numpy_handler.NumpyArrayHandler.infer_signature(arr, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64)], + ) + + arr = np.array([[1, 2], [3, 4]]) + self.assertListEqual( + numpy_handler.NumpyArrayHandler.infer_signature(arr, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64), + core.FeatureSpec("input_feature_1", core.DataType.INT64), + ], + ) + + arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + self.assertListEqual( + numpy_handler.NumpyArrayHandler.infer_signature(arr, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64, shape=(2,)), + core.FeatureSpec("input_feature_1", core.DataType.INT64, shape=(2,)), + ], + ) + + arr = np.array([1, 2, 3, 4]) + self.assertListEqual( + numpy_handler.NumpyArrayHandler.infer_signature(arr, role="output"), + [core.FeatureSpec("output_feature_0", core.DataType.INT64)], + ) + + arr = np.array([[1, 2], [3, 4]]) + self.assertListEqual( + numpy_handler.NumpyArrayHandler.infer_signature(arr, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64), + core.FeatureSpec("output_feature_1", core.DataType.INT64), + ], + ) + + arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + self.assertListEqual( + numpy_handler.NumpyArrayHandler.infer_signature(arr, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64, shape=(2,)), + core.FeatureSpec("output_feature_1", core.DataType.INT64, shape=(2,)), + ], + ) + + def test_convert_to_df_numpy_array(self) -> None: + arr1 = np.array([1, 2, 3, 4]) + pd.testing.assert_frame_equal( + numpy_handler.NumpyArrayHandler.convert_to_df(arr1), + pd.DataFrame([1, 2, 3, 4]), + ) + + arr2 = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) + pd.testing.assert_frame_equal( + numpy_handler.NumpyArrayHandler.convert_to_df(arr2), + pd.DataFrame([[1, 1], [2, 2], [3, 3], [4, 4]]), + ) + + arr3 = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + pd.testing.assert_frame_equal( + numpy_handler.NumpyArrayHandler.convert_to_df(arr3), + pd.DataFrame(data={0: [np.array([1, 1]), np.array([3, 3])], 1: [np.array([2, 2]), np.array([4, 4])]}), + ) + + +class SeqOfNumpyArrayHandlerTest(absltest.TestCase): + def test_validate_list_of_numpy_array(self) -> None: + lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] + self.assertFalse(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt8)) + + def test_trunc_np_ndarray(self) -> None: + arrs = [np.array([1] * (numpy_handler.SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 + + for arr in numpy_handler.SeqOfNumpyArrayHandler.truncate(arrs): + np.testing.assert_equal( + np.array([1] * 
(numpy_handler.SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)), arr + ) + + arrs = [ + np.array([1]), + np.array([1] * (numpy_handler.SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), + ] + + for arr in numpy_handler.SeqOfNumpyArrayHandler.truncate(arrs): + np.testing.assert_equal(np.array([1]), arr) + + def test_infer_signature_list_of_numpy_array(self) -> None: + arr = np.array([1, 2, 3, 4]) + lt = [arr, arr] + self.assertListEqual( + numpy_handler.SeqOfNumpyArrayHandler.infer_signature(lt, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64), + core.FeatureSpec("input_feature_1", core.DataType.INT64), + ], + ) + + arr = np.array([[1, 2], [3, 4]]) + lt = [arr, arr] + self.assertListEqual( + numpy_handler.SeqOfNumpyArrayHandler.infer_signature(lt, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64, shape=(2,)), + core.FeatureSpec("input_feature_1", core.DataType.INT64, shape=(2,)), + ], + ) + + arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + lt = [arr, arr] + self.assertListEqual( + numpy_handler.SeqOfNumpyArrayHandler.infer_signature(lt, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64, shape=(2, 2)), + core.FeatureSpec("output_feature_1", core.DataType.INT64, shape=(2, 2)), + ], + ) + + def test_convert_to_df_list_of_numpy_array(self) -> None: + arr1 = np.array([1, 2, 3, 4]) + lt = [arr1, arr1] + pd.testing.assert_frame_equal( + numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(lt), + pd.DataFrame([[1, 1], [2, 2], [3, 3], [4, 4]]), + check_names=False, + ) + + arr2 = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) + lt = [arr1, arr2] + pd.testing.assert_frame_equal( + numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(lt), + pd.DataFrame([[1, [1, 1]], [2, [2, 2]], [3, [3, 3]], [4, [4, 4]]]), + ) + + arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + lt = [arr, arr] + pd.testing.assert_frame_equal( + numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(lt), + pd.DataFrame( + data={ + 0: [[[1, 1], [2, 2]], [[3, 3], [4, 4]]], + 1: [[[1, 1], [2, 2]], [[3, 3], [4, 4]]], + } + ), + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_signatures/pandas_handler.py b/snowflake/ml/model/_signatures/pandas_handler.py new file mode 100644 index 00000000..43a80639 --- /dev/null +++ b/snowflake/ml/model/_signatures/pandas_handler.py @@ -0,0 +1,136 @@ +from typing import Literal, Sequence + +import numpy as np +import pandas as pd +from typing_extensions import TypeGuard + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import base_handler, core, utils + + +class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[pd.DataFrame]: + return isinstance(data, pd.DataFrame) + + @staticmethod + def count(data: pd.DataFrame) -> int: + return len(data.index) + + @staticmethod + def truncate(data: pd.DataFrame) -> pd.DataFrame: + return data.head(min(PandasDataFrameHandler.count(data), PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT)) + + @staticmethod + def validate(data: pd.DataFrame) -> None: + df_cols = data.columns + + if df_cols.has_duplicates: # Rule out categorical index with duplicates + raise ValueError("Data Validation Error: Duplicate column index is found.") + + assert all(hasattr(data[col], "dtype") for col in data.columns), f"Unknown column confronted in {data}" + + if len(df_cols) == 0: + raise ValueError("Data Validation 
Error: Empty data is found.") + + if df_cols.dtype not in [ + np.int64, + np.uint64, + np.float64, + np.object_, + ]: # To keep compatibility with Pandas 2.x and 1.x + raise ValueError("Data Validation Error: Unsupported column index type is found.") + + df_col_dtypes = [data[col].dtype for col in data.columns] + for df_col, df_col_dtype in zip(df_cols, df_col_dtypes): + if df_col_dtype == np.dtype("O"): + # Check if all objects have the same type + if not all(isinstance(data_row, type(data[df_col][0])) for data_row in data[df_col]): + raise ValueError( + f"Data Validation Error: Inconsistent type of object found in column data {data[df_col]}." + ) + + if isinstance(data[df_col][0], list): + arr = utils.convert_list_to_ndarray(data[df_col][0]) + arr_dtype = core.DataType.from_numpy_type(arr.dtype) + + converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data[df_col]] + + if not all( + core.DataType.from_numpy_type(converted_data.dtype) == arr_dtype + for converted_data in converted_data_list + ): + raise ValueError( + "Data Validation Error: " + + f"Inconsistent type of element in object found in column data {data[df_col]}." + ) + + elif isinstance(data[df_col][0], np.ndarray): + arr_dtype = core.DataType.from_numpy_type(data[df_col][0].dtype) + + if not all(core.DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]): + raise ValueError( + "Data Validation Error: " + + f"Inconsistent type of element in object found in column data {data[df_col]}." + ) + elif not isinstance(data[df_col][0], (str, bytes)): + raise ValueError(f"Data Validation Error: Unsupported type confronted in {data[df_col]}") + + @staticmethod + def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[core.BaseFeatureSpec]: + feature_prefix = f"{PandasDataFrameHandler.FEATURE_PREFIX}_" + df_cols = data.columns + role_prefix = ( + PandasDataFrameHandler.INPUT_PREFIX if role == "input" else PandasDataFrameHandler.OUTPUT_PREFIX + ) + "_" + if df_cols.dtype in [np.int64, np.uint64, np.float64]: + ft_names = [f"{role_prefix}{feature_prefix}{i}" for i in df_cols] + else: + ft_names = list(map(str, data.columns.to_list())) + + df_col_dtypes = [data[col].dtype for col in data.columns] + + specs = [] + for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names): + if df_col_dtype == np.dtype("O"): + if isinstance(data[df_col][0], list): + arr = utils.convert_list_to_ndarray(data[df_col][0]) + arr_dtype = core.DataType.from_numpy_type(arr.dtype) + ft_shape = np.shape(data[df_col][0]) + + converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data[df_col]] + + if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list): + ft_shape = (-1,) + + specs.append(core.FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) + elif isinstance(data[df_col][0], np.ndarray): + arr_dtype = core.DataType.from_numpy_type(data[df_col][0].dtype) + ft_shape = np.shape(data[df_col][0]) + + if not all(np.shape(data_row) == ft_shape for data_row in data[df_col]): + ft_shape = (-1,) + + specs.append(core.FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) + elif isinstance(data[df_col][0], str): + specs.append(core.FeatureSpec(dtype=core.DataType.STRING, name=ft_name)) + elif isinstance(data[df_col][0], bytes): + specs.append(core.FeatureSpec(dtype=core.DataType.BYTES, name=ft_name)) + else: + specs.append(core.FeatureSpec(dtype=core.DataType.from_numpy_type(df_col_dtype), name=ft_name)) + 
return specs + + @staticmethod + def convert_to_df(data: pd.DataFrame, ensure_serializable: bool = True) -> pd.DataFrame: + if not ensure_serializable: + return data + # This convert is necessary since numpy dataframe cannot be correctly handled when provided as an element of + # a list when creating Snowpark Dataframe. + df = data.copy() + df_cols = df.columns + df_col_dtypes = [df[col].dtype for col in df.columns] + for df_col, df_col_dtype in zip(df_cols, df_col_dtypes): + if df_col_dtype == np.dtype("O"): + if isinstance(df[df_col][0], np.ndarray): + df[df_col] = df[df_col].map(np.ndarray.tolist) + return df diff --git a/snowflake/ml/model/_signatures/pandas_test.py b/snowflake/ml/model/_signatures/pandas_test.py new file mode 100644 index 00000000..fc38ef43 --- /dev/null +++ b/snowflake/ml/model/_signatures/pandas_test.py @@ -0,0 +1,257 @@ +import numpy as np +import pandas as pd +from absl.testing import absltest + +from snowflake.ml.model._signatures import core, pandas_handler + + +class PandasDataFrameHandlerTest(absltest.TestCase): + def test_validate_pd_DataFrame(self) -> None: + df = pd.DataFrame([]) + with self.assertRaisesRegex(ValueError, "Empty data is found."): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) + with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + pandas_handler.PandasDataFrameHandler.validate(df) + + sub_df = pd.DataFrame([2.5, 6.8]) + df = pd.DataFrame([[1, sub_df], [2, sub_df]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Unsupported type confronted in"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame( + [[1, 2.0, 1, 2.0, 1, 2.0], [2, 4.0, 2, 4.0, 2, 4.0]], + columns=pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), + ) + with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) + with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, "Hello"], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, 2], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, [2, [6]]], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, [2, 6]], [2, [2, [6]]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2, 6])]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): + pandas_handler.PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, 6]], columns=["a", "b"]) + with 
self.assertRaisesRegex(ValueError, "Inconsistent type of object found in column data"): + pandas_handler.PandasDataFrameHandler.validate(df) + + def test_trunc_pd_DataFrame(self) -> None: + df = pd.DataFrame([1] * (pandas_handler.PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1)) + + pd.testing.assert_frame_equal( + pd.DataFrame([1] * (pandas_handler.PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT)), + pandas_handler.PandasDataFrameHandler.truncate(df), + ) + + df = pd.DataFrame([1] * (pandas_handler.PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)) + + pd.testing.assert_frame_equal( + df, + pandas_handler.PandasDataFrameHandler.truncate(df), + ) + + def test_infer_signature_pd_DataFrame(self) -> None: + df = pd.DataFrame([1, 2, 3, 4]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64)], + ) + + df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [core.FeatureSpec("a", core.DataType.INT64)], + ) + + df = pd.DataFrame(["a", "b", "c", "d"], columns=["a"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [core.FeatureSpec("a", core.DataType.STRING)], + ) + + df = pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [core.FeatureSpec("a", core.DataType.BYTES)], + ) + + df = pd.DataFrame([[1, 2.0], [2, 4.0]]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64), + core.FeatureSpec("input_feature_1", core.DataType.DOUBLE), + ], + ) + + df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.DOUBLE, shape=(2,)), + ], + ) + + df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5]]], columns=["a", "b"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.DOUBLE, shape=(-1,)), + ], + ) + + df = pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.DOUBLE, shape=(2, 1)), + ], + ) + + a = np.array([2.5, 6.8]) + df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.DOUBLE, shape=(2,)), + ], + ) + + df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5])]], columns=["a", "b"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.DOUBLE, shape=(-1,)), + ], + ) + + a = np.array([[2, 5], [6, 8]]) + df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + 
core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.INT64, shape=(2, 2)), + ], + ) + + df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3])) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("2000Q1", core.DataType.INT64), + core.FeatureSpec("2002Q3", core.DataType.DOUBLE), + ], + ) + + df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.date_range("2020-01-06", "2020-03-03", freq="MS")) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("2020-02-01 00:00:00", core.DataType.INT64), + core.FeatureSpec("2020-03-01 00:00:00", core.DataType.DOUBLE), + ], + ) + + df = pd.DataFrame( + [[1, 2.0], [2, 4.0]], columns=pd.TimedeltaIndex(data=["1 days 02:00:00", "1 days 06:05:01.000030"]) + ) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("1 days 02:00:00", core.DataType.INT64), + core.FeatureSpec("1 days 06:05:01.000030", core.DataType.DOUBLE), + ], + ) + + df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.interval_range(start=0, end=2)) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("(0, 1]", core.DataType.INT64), + core.FeatureSpec("(1, 2]", core.DataType.DOUBLE), + ], + ) + + arrays = [[1, 2], ["red", "blue"]] + df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.MultiIndex.from_arrays(arrays, names=("number", "color"))) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("(1, 'red')", core.DataType.INT64), + core.FeatureSpec("(2, 'blue')", core.DataType.DOUBLE), + ], + ) + + df = pd.DataFrame([1, 2, 3, 4]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="output"), + [core.FeatureSpec("output_feature_0", core.DataType.INT64)], + ) + + df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="output"), + [core.FeatureSpec("a", core.DataType.INT64)], + ) + + df = pd.DataFrame(["a", "b", "c", "d"], columns=["a"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="output"), + [core.FeatureSpec("a", core.DataType.STRING)], + ) + + df = pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="output"), + [core.FeatureSpec("a", core.DataType.BYTES)], + ) + + df = pd.DataFrame([[1, 2.0], [2, 4.0]]) + self.assertListEqual( + pandas_handler.PandasDataFrameHandler.infer_signature(df, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64), + core.FeatureSpec("output_feature_1", core.DataType.DOUBLE), + ], + ) + + def test_convert_to_df_pd_DataFrame(self) -> None: + a = np.array([[2, 5], [6, 8]]) + li = [[2, 5], [6, 8]] + df1 = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) + df2 = pd.DataFrame([[1, li], [2, li]], columns=["a", "b"]) + pd.testing.assert_frame_equal(pandas_handler.PandasDataFrameHandler.convert_to_df(df1), df2) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_signatures/pytorch_handler.py b/snowflake/ml/model/_signatures/pytorch_handler.py new file mode 100644 index 00000000..cebd4af1 --- /dev/null +++ 
b/snowflake/ml/model/_signatures/pytorch_handler.py @@ -0,0 +1,93 @@ +from typing import TYPE_CHECKING, List, Literal, Optional, Sequence + +import numpy as np +import pandas as pd +from typing_extensions import TypeGuard + +from snowflake.ml._internal import type_utils +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import base_handler, core + +if TYPE_CHECKING: + import torch + + +class SeqOfPyTorchTensorHandler(base_handler.BaseDataHandler[Sequence["torch.Tensor"]]): + @staticmethod + def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Sequence["torch.Tensor"]]: + if not isinstance(data, list): + return False + if len(data) == 0: + return False + if type_utils.LazyType("torch.Tensor").isinstance(data[0]): + return all(type_utils.LazyType("torch.Tensor").isinstance(data_col) for data_col in data) + return False + + @staticmethod + def count(data: Sequence["torch.Tensor"]) -> int: + return min(data_col.shape[0] for data_col in data) + + @staticmethod + def truncate(data: Sequence["torch.Tensor"]) -> Sequence["torch.Tensor"]: + return [ + data_col[: min(SeqOfPyTorchTensorHandler.count(data), SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)] + for data_col in data + ] + + @staticmethod + def validate(data: Sequence["torch.Tensor"]) -> None: + import torch + + for data_col in data: + if data_col.shape == torch.Size([0]): + # Empty array + raise ValueError("Data Validation Error: Empty data is found.") + + if data_col.shape == torch.Size([1]): + # scalar + raise ValueError("Data Validation Error: Scalar data is found.") + + @staticmethod + def infer_signature( + data: Sequence["torch.Tensor"], role: Literal["input", "output"] + ) -> Sequence[core.BaseFeatureSpec]: + feature_prefix = f"{SeqOfPyTorchTensorHandler.FEATURE_PREFIX}_" + features: List[core.BaseFeatureSpec] = [] + role_prefix = ( + SeqOfPyTorchTensorHandler.INPUT_PREFIX if role == "input" else SeqOfPyTorchTensorHandler.OUTPUT_PREFIX + ) + "_" + + for i, data_col in enumerate(data): + dtype = core.DataType.from_torch_type(data_col.dtype) + ft_name = f"{role_prefix}{feature_prefix}{i}" + if len(data_col.shape) == 1: + features.append(core.FeatureSpec(dtype=dtype, name=ft_name)) + else: + ft_shape = tuple(data_col.shape[1:]) + features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) + return features + + @staticmethod + def convert_to_df(data: Sequence["torch.Tensor"], ensure_serializable: bool = True) -> pd.DataFrame: + # Use list(...) instead of .tolist() to ensure that + # the content is still numpy array so that the type could be preserved. + # But that would not serializable and cannot use as UDF input and output. + if ensure_serializable: + return pd.DataFrame({i: data_col.detach().to("cpu").numpy().tolist() for i, data_col in enumerate(data)}) + return pd.DataFrame({i: list(data_col.detach().to("cpu").numpy()) for i, data_col in enumerate(data)}) + + @staticmethod + def convert_from_df( + df: pd.DataFrame, features: Optional[Sequence[core.BaseFeatureSpec]] = None + ) -> Sequence["torch.Tensor"]: + import torch + + res = [] + if features: + for feature in features: + if isinstance(feature, core.FeatureGroupSpec): + raise NotImplementedError("FeatureGroupSpec is not supported.") + assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." 
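+                # Each cell of df[feature.name] holds one row's value (a scalar, list, or numpy array);
+                # np.stack below rebuilds the batch dimension and astype applies the dtype recorded in the
+                # feature spec before the array is handed to torch.from_numpy.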
+ res.append(torch.from_numpy(np.stack(df[feature.name].to_numpy()).astype(feature._dtype._numpy_type))) + return res + return [torch.from_numpy(np.stack(df[col].to_numpy())) for col in df] diff --git a/snowflake/ml/model/_signatures/pytorch_test.py b/snowflake/ml/model/_signatures/pytorch_test.py new file mode 100644 index 00000000..f144390b --- /dev/null +++ b/snowflake/ml/model/_signatures/pytorch_test.py @@ -0,0 +1,367 @@ +import numpy as np +import pandas as pd +import torch +from absl.testing import absltest + +from snowflake.ml.model._signatures import core, pytorch_handler, utils + + +class SeqOfPyTorchTensorHandlerTest(absltest.TestCase): + def test_validate_list_of_pytorch_tensor(self) -> None: + lt1 = [np.array([1, 4]), np.array([2, 3])] + self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt1)) + + lt2 = [np.array([1, 4]), torch.Tensor([2, 3])] + self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt2)) + + lt3 = [torch.Tensor([1, 4]), torch.Tensor([2, 3])] + self.assertTrue(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt3)) + + def test_validate_torch_tensor(self) -> None: + t = [torch.Tensor([])] + with self.assertRaisesRegex(ValueError, "Empty data is found."): + pytorch_handler.SeqOfPyTorchTensorHandler.validate(t) + + t = [torch.Tensor(1)] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + pytorch_handler.SeqOfPyTorchTensorHandler.validate(t) + + t = [torch.Tensor([1, 2]), torch.Tensor(1)] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + pytorch_handler.SeqOfPyTorchTensorHandler.validate(t) + + def test_trunc_torch_tensor(self) -> None: + t = [torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] + + for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): + torch.testing.assert_close( # type:ignore[attr-defined] + torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts + ) + + t = [torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1))] + + for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): + torch.testing.assert_close( # type:ignore[attr-defined] + torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), ts + ) + + t = [torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 + + for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): + torch.testing.assert_close( # type:ignore[attr-defined] + torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts + ) + + t = [ + torch.Tensor([1]), + torch.Tensor([1] * (pytorch_handler.SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), + ] + + for ts in pytorch_handler.SeqOfPyTorchTensorHandler.truncate(t): + torch.testing.assert_close( # type:ignore[attr-defined] + torch.Tensor([1]), ts + ) + + def test_infer_schema_torch_tensor(self) -> None: + t1 = [torch.IntTensor([1, 2, 3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t1, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT32)], + ) + + t2 = [torch.LongTensor([1, 2, 3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t2, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64)], + ) + + t3 = [torch.ShortTensor([1, 2, 3, 4])] + self.assertListEqual( + 
pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t3, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT16)], + ) + + t4 = [torch.CharTensor([1, 2, 3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t4, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT8)], + ) + + t5 = [torch.ByteTensor([1, 2, 3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t5, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT8)], + ) + + t6 = [torch.BoolTensor([False, True])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t6, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.BOOL)], + ) + + t7 = [torch.FloatTensor([1.2, 3.4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t7, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.FLOAT)], + ) + + t8 = [torch.DoubleTensor([1.2, 3.4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t8, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.DOUBLE)], + ) + + t9 = [torch.LongTensor([[1, 2], [3, 4]])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t9, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT64, shape=(2,)), + ], + ) + + t10 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t10, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64, shape=(2, 2))], + ) + + t11 = [torch.LongTensor([1, 2, 3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t11, role="output"), + [core.FeatureSpec("output_feature_0", core.DataType.INT64)], + ) + + t12 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t12, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64), + core.FeatureSpec("output_feature_1", core.DataType.INT64), + ], + ) + + t13 = [torch.FloatTensor([1.2, 2.4]), torch.LongTensor([3, 4])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t13, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.FLOAT), + core.FeatureSpec("output_feature_1", core.DataType.INT64), + ], + ) + + t14 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t14, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64, shape=(2,)), + core.FeatureSpec("output_feature_1", core.DataType.INT64, shape=(2,)), + ], + ) + + t15 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] + self.assertListEqual( + pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t15, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT64, shape=(2,)), + core.FeatureSpec("output_feature_1", core.DataType.DOUBLE, shape=(2,)), + ], + ) + + def test_convert_to_df_torch_tensor(self) -> None: + t1 = [torch.LongTensor([1, 2, 3, 4])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t1), + pd.DataFrame([1, 2, 3, 4]), + ) + + t2 = [torch.DoubleTensor([1, 2, 3, 4])] + t2[0].requires_grad = True + pd.testing.assert_frame_equal( + 
pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t2), + pd.DataFrame([1, 2, 3, 4], dtype=np.double), + ) + + t3 = [torch.LongTensor([[1, 1], [2, 2], [3, 3], [4, 4]])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t3), + pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2]), np.array([3, 3]), np.array([4, 4])]}), + ) + + t4 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t4), + pd.DataFrame(data={0: [np.array([[1, 1], [2, 2]]), np.array([[3, 3], [4, 4]])]}), + ) + + t5 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t5), + pd.DataFrame([[1, 3], [2, 4]]), + ) + + t6 = [torch.DoubleTensor([1.2, 2.4]), torch.LongTensor([3, 4])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t6), + pd.DataFrame([[1.2, 3], [2.4, 4]]), + ) + + t7 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t7), + pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([3, 3]), np.array([4, 4])]}), + ) + + t8 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] + pd.testing.assert_frame_equal( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t8), + pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([1.5, 6.8]), np.array([2.9, 9.2])]}), + ) + + def test_convert_from_df_torch_tensor(self) -> None: + t1 = [torch.LongTensor([1, 2, 3, 4])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t1) + ) + ): + torch.testing.assert_close(t, t1[idx]) # type:ignore[attr-defined] + + t2 = [torch.DoubleTensor([1, 2, 3, 4])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t2) + ) + ): + torch.testing.assert_close(t, t2[idx]) # type:ignore[attr-defined] + + t3 = [torch.LongTensor([[1, 1], [2, 2], [3, 3], [4, 4]])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t3) + ) + ): + torch.testing.assert_close(t, t3[idx]) # type:ignore[attr-defined] + + t4 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t4) + ) + ): + torch.testing.assert_close(t, t4[idx]) # type:ignore[attr-defined] + + t5 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t5) + ) + ): + torch.testing.assert_close(t, t5[idx]) # type:ignore[attr-defined] + + t6 = [torch.DoubleTensor([1.2, 2.4]), torch.LongTensor([3, 4])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t6) + ) + ): + torch.testing.assert_close(t, t6[idx]) # type:ignore[attr-defined] + + t7 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + 
pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t7) + ) + ): + torch.testing.assert_close(t, t7[idx]) # type:ignore[attr-defined] + + t8 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t8) + ) + ): + torch.testing.assert_close(t, t8[idx]) # type:ignore[attr-defined] + + t9 = [torch.IntTensor([1, 2, 3, 4])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t9, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t9), fts), + fts, + ) + ): + torch.testing.assert_close(t, t9[idx]) # type:ignore[attr-defined] + + t10 = [torch.tensor([1.2, 3.4])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t10, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t10), fts), + fts, + ) + ): + torch.testing.assert_close(t, t10[idx]) # type:ignore[attr-defined] + + t11 = [torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4]])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t11, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t11), fts), + fts, + ) + ): + torch.testing.assert_close(t, t11[idx]) # type:ignore[attr-defined] + + t12 = [torch.tensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t12, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t12), fts), + fts, + ) + ): + torch.testing.assert_close(t, t12[idx]) # type:ignore[attr-defined] + + t13 = [torch.tensor([1, 2]), torch.tensor([3, 4])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t13, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t13), fts), + fts, + ) + ): + torch.testing.assert_close(t, t13[idx]) # type:ignore[attr-defined] + + t14 = [torch.tensor([1.2, 2.4]), torch.tensor([3, 4])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t14, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t14), fts), + fts, + ) + ): + torch.testing.assert_close(t, t14[idx]) # type:ignore[attr-defined] + + t15 = [torch.tensor([[1, 1], [2, 2]]), torch.tensor([[3, 3], [4, 4]])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t15, role="input") + for idx, t in enumerate( + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t15), fts), + fts, + ) + ): + torch.testing.assert_close(t, t15[idx]) # type:ignore[attr-defined] + + t16 = [torch.tensor([[1, 1], [2, 2]]), torch.tensor([[1.5, 6.8], [2.9, 9.2]])] + fts = pytorch_handler.SeqOfPyTorchTensorHandler.infer_signature(t16, role="input") + for idx, t in enumerate( + 
pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(
+                utils.rename_pandas_df(pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(t16), fts),
+                fts,
+            )
+        ):
+            torch.testing.assert_close(t, t16[idx])  # type:ignore[attr-defined]
+
+
+if __name__ == "__main__":
+    absltest.main()
diff --git a/snowflake/ml/model/_signatures/snowpark_handler.py b/snowflake/ml/model/_signatures/snowpark_handler.py
new file mode 100644
index 00000000..4c37a5ab
--- /dev/null
+++ b/snowflake/ml/model/_signatures/snowpark_handler.py
@@ -0,0 +1,126 @@
+import json
+from typing import List, Literal, Optional, Sequence, cast
+
+import numpy as np
+import pandas as pd
+from typing_extensions import TypeGuard
+
+import snowflake.snowpark
+import snowflake.snowpark.types as spt
+from snowflake.ml._internal.utils import identifier
+from snowflake.ml.model import type_hints as model_types
+from snowflake.ml.model._deploy_client.warehouse import infer_template
+from snowflake.ml.model._signatures import base_handler, core, pandas_handler
+
+
+class SnowparkDataFrameHandler(base_handler.BaseDataHandler[snowflake.snowpark.DataFrame]):
+    @staticmethod
+    def can_handle(data: model_types.SupportedDataType) -> TypeGuard[snowflake.snowpark.DataFrame]:
+        return isinstance(data, snowflake.snowpark.DataFrame)
+
+    @staticmethod
+    def count(data: snowflake.snowpark.DataFrame) -> int:
+        return data.count()
+
+    @staticmethod
+    def truncate(data: snowflake.snowpark.DataFrame) -> snowflake.snowpark.DataFrame:
+        return cast(snowflake.snowpark.DataFrame, data.limit(SnowparkDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT))
+
+    @staticmethod
+    def validate(data: snowflake.snowpark.DataFrame) -> None:
+        schema = data.schema
+        for field in schema.fields:
+            data_type = field.datatype
+            if isinstance(data_type, spt.ArrayType):
+                actual_data_type = data_type.element_type
+            else:
+                actual_data_type = data_type
+            if not any(type.is_same_snowpark_type(actual_data_type) for type in core.DataType):
+                raise ValueError(
+                    f"Data Validation Error: Unsupported data type {field.datatype} in column {field.name}."
+                )
+
+    @staticmethod
+    def infer_signature(
+        data: snowflake.snowpark.DataFrame, role: Literal["input", "output"]
+    ) -> Sequence[core.BaseFeatureSpec]:
+        features: List[core.BaseFeatureSpec] = []
+        schema = data.schema
+        for field in schema.fields:
+            name = identifier.get_unescaped_names(field.name)
+            if isinstance(field.datatype, spt.ArrayType):
+                raise NotImplementedError("Cannot infer model signature from Snowpark DataFrame with Array Type.")
+            else:
+                features.append(core.FeatureSpec(name=name, dtype=core.DataType.from_snowpark_type(field.datatype)))
+        return features
+
+    @staticmethod
+    def convert_to_df(
+        data: snowflake.snowpark.DataFrame,
+        ensure_serializable: bool = True,
+        features: Optional[Sequence[core.BaseFeatureSpec]] = None,
+    ) -> pd.DataFrame:
+        # This method does extra work on top of to_pandas to make sure the local DataFrame comes back in the
+        # correct shape.
+        dtype_map = {}
+        if features:
+            for feature in features:
+                if isinstance(feature, core.FeatureGroupSpec):
+                    raise NotImplementedError("FeatureGroupSpec is not supported.")
+                assert isinstance(feature, core.FeatureSpec), "Invalid feature kind."
+                dtype_map[feature.name] = feature.as_dtype()
+        df_local = data.to_pandas()
+        # This is needed because an ARRAY column comes back as a string (even though the correct schema is set)
+        # and an OBJECT column comes back as a VARIANT that requires an additional json.loads
+        # to recover the data; otherwise it would remain a string.
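+        # For example, an ARRAY cell written as [1, 2] typically comes back from to_pandas() as the JSON
+        # string "[1, 2]" (possibly with extra whitespace); the json.loads in the loop below restores the
+        # Python list. (Illustrative note; the exact string formatting is not guaranteed.)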
+        for field in data.schema.fields:
+            if isinstance(field.datatype, spt.ArrayType):
+                df_local[identifier.get_unescaped_names(field.name)] = df_local[
+                    identifier.get_unescaped_names(field.name)
+                ].map(json.loads)
+        # Only when the features are provided (rather than inferred) are we confident enough to cast the types;
+        # otherwise dtype_map is empty and astype is a no-op.
+        df_local = df_local.astype(dtype=dtype_map)
+        return df_local
+
+    @staticmethod
+    def convert_from_df(
+        session: snowflake.snowpark.Session, df: pd.DataFrame, keep_order: bool = True
+    ) -> snowflake.snowpark.DataFrame:
+        # This method is necessary to create the Snowpark DataFrame with the correct schema.
+        # Snowpark ignores the schema argument when a pandas DataFrame is provided directly, and in that case
+        # any cell of the original DataFrame holding an array-like value would be inferred as VARIANT.
+        # To make sure Snowpark gets the correct schema, we therefore provide the data as a list of records.
+        # That, however, does not preserve row order, so a _ID column has to be added when keep_order is True.
+        # Even so, while an array-typed column does get the correct ARRAY type, the element type is not
+        # preserved and comes back as strings; this affects the implementation of convert_from_df.
+        df = pandas_handler.PandasDataFrameHandler.convert_to_df(df)
+        df_cols = df.columns
+        if df_cols.dtype != np.object_:
+            raise ValueError("Cannot convert a Pandas DataFrame whose column index is not a string")
+        features = pandas_handler.PandasDataFrameHandler.infer_signature(df, role="input")
+        # The role has no effect on the column index; that is, the feature name is the actual column name.
+        schema_list = []
+        for feature in features:
+            if isinstance(feature, core.FeatureGroupSpec):
+                raise NotImplementedError("FeatureGroupSpec is not supported.")
+            assert isinstance(feature, core.FeatureSpec), "Invalid feature kind."
+            schema_list.append(
+                spt.StructField(
+                    identifier.get_inferred_name(feature.name),
+                    feature.as_snowpark_type(),
+                    nullable=df[feature.name].isnull().any(),
+                )
+            )
+
+        data = df.rename(columns=identifier.get_inferred_name).to_dict("records")
+        if keep_order:
+            for idx, data_item in enumerate(data):
+                data_item[infer_template._KEEP_ORDER_COL_NAME] = idx
+            schema_list.append(spt.StructField(infer_template._KEEP_ORDER_COL_NAME, spt.LongType(), nullable=False))
+        sp_df = session.create_dataframe(
+            data,  # Passed as a list of records so the explicit schema is honored; otherwise arrays would become VARIANT.
+ spt.StructType(schema_list), + ) + return sp_df diff --git a/snowflake/ml/model/_signatures/snowpark_test.py b/snowflake/ml/model/_signatures/snowpark_test.py new file mode 100644 index 00000000..9a843e92 --- /dev/null +++ b/snowflake/ml/model/_signatures/snowpark_test.py @@ -0,0 +1,126 @@ +import numpy as np +import pandas as pd +from absl.testing import absltest + +import snowflake.snowpark.types as spt +from snowflake.ml.model import model_signature +from snowflake.ml.model._signatures import core, snowpark_handler +from snowflake.ml.utils import connection_params +from snowflake.snowpark import Session + + +class SnowParkDataFrameHandlerTest(absltest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + @classmethod + def tearDownClass(cls) -> None: + cls._session.close() + + def test_validate_snowpark_df(self) -> None: + schema = spt.StructType([spt.StructField('"a"', spt.VariantType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + with self.assertRaisesRegex(ValueError, "Unsupported data type"): + snowpark_handler.SnowparkDataFrameHandler.validate(df) + + def test_infer_schema_snowpark_df(self) -> None: + schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + self.assertListEqual( + snowpark_handler.SnowparkDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.STRING), + ], + ) + + schema = spt.StructType([spt.StructField('"""a"""', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + self.assertListEqual( + snowpark_handler.SnowparkDataFrameHandler.infer_signature(df, role="input"), + [ + core.FeatureSpec('"a"', core.DataType.INT64), + core.FeatureSpec("b", core.DataType.STRING), + ], + ) + + schema = spt.StructType([spt.StructField('"""a"""', spt.ArrayType(spt.LongType()))]) + df = self._session.create_dataframe([[[1, 3]]], schema) + with self.assertRaises(NotImplementedError): + snowpark_handler.SnowparkDataFrameHandler.infer_signature(df, role="input"), + + def test_validate_data_with_features(self) -> None: + fts = [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.INT64), + ] + df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) + with self.assertWarnsRegex(RuntimeWarning, "Nullable column [^\\s]* provided"): + model_signature._validate_snowpark_data(df, fts) + + fts = [ + core.FeatureSpec("a", core.DataType.INT64), + core.FeatureSpec("b", core.DataType.STRING), + ] + schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + model_signature._validate_snowpark_data(df, fts) + + schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.IntegerType())]) + df = self._session.create_dataframe([[1, 3], [3, 9]], schema) + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): + model_signature._validate_snowpark_data(df, fts) + + schema = spt.StructType([spt.StructField('"a1"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, 
"snow"], [3, "flake"]], schema) + with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + model_signature._validate_snowpark_data(df, fts) + + df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): + model_signature._validate_snowpark_data(df, fts) + + fts = [ + core.FeatureSpec("a", core.DataType.INT64, shape=(-1,)), + ] + schema = spt.StructType([spt.StructField('"a"', spt.ArrayType(spt.LongType()))]) + df = self._session.create_dataframe([[[1, 3]]], schema) + with self.assertWarns(RuntimeWarning): + model_signature._validate_snowpark_data(df, fts) + + def test_convert_to_and_from_df(self) -> None: + pd_df = pd.DataFrame([1, 2, 3, 4], columns=["col_0"]) + sp_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) + pd.testing.assert_frame_equal( + pd_df, snowpark_handler.SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False + ) + + pd_df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_0", "col_1"]) + sp_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) + pd.testing.assert_frame_equal( + pd_df, snowpark_handler.SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False + ) + + pd_df = pd.DataFrame([[1.2, 2.4], [3, 4]], columns=["col_0", "col_1"]) + sp_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) + pd.testing.assert_frame_equal( + pd_df, snowpark_handler.SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False + ) + + pd_df = pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]) + sp_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) + pd.testing.assert_frame_equal( + pd_df, snowpark_handler.SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False + ) + + a = np.array([2.5, 6.8]) + pd_df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) + sp_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) + pd.testing.assert_frame_equal( + pd_df, snowpark_handler.SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_signatures/tensorflow_handler.py b/snowflake/ml/model/_signatures/tensorflow_handler.py new file mode 100644 index 00000000..58518379 --- /dev/null +++ b/snowflake/ml/model/_signatures/tensorflow_handler.py @@ -0,0 +1,125 @@ +from typing import TYPE_CHECKING, List, Literal, Optional, Sequence, Union + +import numpy as np +import pandas as pd +from typing_extensions import TypeGuard + +from snowflake.ml._internal import type_utils +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import base_handler, core + +if TYPE_CHECKING: + import tensorflow + + +class SeqOfTensorflowTensorHandler( + base_handler.BaseDataHandler[Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]] +): + @staticmethod + def can_handle( + data: model_types.SupportedDataType, + ) -> TypeGuard[Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]]: + if not isinstance(data, list): + return False + if len(data) == 0: + return False + if type_utils.LazyType("tensorflow.Tensor").isinstance(data[0]) or type_utils.LazyType( + "tensorflow.Variable" + ).isinstance(data[0]): + return all( + 
type_utils.LazyType("tensorflow.Tensor").isinstance(data_col) + or type_utils.LazyType("tensorflow.Variable").isinstance(data_col) + for data_col in data + ) + return False + + @staticmethod + def count(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) -> int: + import tensorflow as tf + + rows = [] + for data_col in data: + shapes = data_col.shape.as_list() + if data_col.shape == tf.TensorShape(None) or (not shapes) or (shapes[0] is None): + # Unknown shape array + raise ValueError("Data Validation Error: Unknown shape data is found.") + # Make mypy happy + assert isinstance(shapes[0], int) + + rows.append(shapes[0]) + + return min(rows) + + @staticmethod + def truncate( + data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]] + ) -> Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]: + return [ + data_col[ + : min(SeqOfTensorflowTensorHandler.count(data), SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT) + ] + for data_col in data + ] + + @staticmethod + def validate(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) -> None: + import tensorflow as tf + + for data_col in data: + if data_col.shape == tf.TensorShape(None) or any(dim is None for dim in data_col.shape.as_list()): + # Unknown shape array + raise ValueError("Data Validation Error: Unknown shape data is found.") + + if data_col.shape == tf.TensorShape([0]): + # Empty array + raise ValueError("Data Validation Error: Empty data is found.") + + if data_col.shape == tf.TensorShape([1]) or data_col.shape == tf.TensorShape([]): + # scalar + raise ValueError("Data Validation Error: Scalar data is found.") + + @staticmethod + def infer_signature( + data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]], role: Literal["input", "output"] + ) -> Sequence[core.BaseFeatureSpec]: + feature_prefix = f"{SeqOfTensorflowTensorHandler.FEATURE_PREFIX}_" + features: List[core.BaseFeatureSpec] = [] + role_prefix = ( + SeqOfTensorflowTensorHandler.INPUT_PREFIX if role == "input" else SeqOfTensorflowTensorHandler.OUTPUT_PREFIX + ) + "_" + + for i, data_col in enumerate(data): + dtype = core.DataType.from_numpy_type(data_col.dtype.as_numpy_dtype) + ft_name = f"{role_prefix}{feature_prefix}{i}" + if len(data_col.shape) == 1: + features.append(core.FeatureSpec(dtype=dtype, name=ft_name)) + else: + ft_shape = tuple(data_col.shape[1:]) + features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) + return features + + @staticmethod + def convert_to_df( + data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]], ensure_serializable: bool = True + ) -> pd.DataFrame: + if ensure_serializable: + return pd.DataFrame({i: data_col.numpy().tolist() for i, data_col in enumerate(iterable=data)}) + return pd.DataFrame({i: list(data_col.numpy()) for i, data_col in enumerate(iterable=data)}) + + @staticmethod + def convert_from_df( + df: pd.DataFrame, features: Optional[Sequence[core.BaseFeatureSpec]] = None + ) -> Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]: + import tensorflow as tf + + res = [] + if features: + for feature in features: + if isinstance(feature, core.FeatureGroupSpec): + raise NotImplementedError("FeatureGroupSpec is not supported.") + assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." 
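+                # As in the PyTorch handler above, each cell of df[feature.name] holds one row's value;
+                # np.stack below restores the batch dimension and astype applies the dtype from the feature spec
+                # before the array is wrapped with tf.convert_to_tensor.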
+ res.append( + tf.convert_to_tensor(np.stack(df[feature.name].to_numpy()).astype(feature._dtype._numpy_type)) + ) + return res + return [tf.convert_to_tensor(np.stack(df[col].to_numpy())) for col in df] diff --git a/snowflake/ml/model/_signatures/tensorflow_test.py b/snowflake/ml/model/_signatures/tensorflow_test.py new file mode 100644 index 00000000..d626b65c --- /dev/null +++ b/snowflake/ml/model/_signatures/tensorflow_test.py @@ -0,0 +1,555 @@ +import numpy as np +import pandas as pd +import tensorflow as tf +from absl.testing import absltest + +from snowflake.ml.model._signatures import core, tensorflow_handler, utils + + +class SeqOfTensorflowTensorHandlerTest(absltest.TestCase): + def test_validate_list_of_tf_tensor(self) -> None: + lt1 = [np.array([1, 4]), np.array([2, 3])] + self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt1)) + + lt2 = [np.array([1, 4]), tf.constant([2, 3])] + self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt2)) + + lt3 = [tf.constant([1, 4]), tf.constant([2, 3])] + self.assertTrue(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt3)) + + lt4 = [tf.constant([1, 4]), tf.Variable([2, 3])] + self.assertTrue(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt4)) + + lt5 = [tf.Variable([1, 4]), tf.Variable([2, 3])] + self.assertTrue(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt5)) + + def test_validate_tf_tensor(self) -> None: + t = [tf.constant([])] + with self.assertRaisesRegex(ValueError, "Empty data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable([1, 2], shape=tf.TensorShape(None))] + with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable([[1, 2]], shape=tf.TensorShape([None, 2]))] + with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable([[1, 2]], shape=tf.TensorShape([1, None]))] + with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.constant(1)] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.constant([1])] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable(1)] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable([1])] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.constant([1, 2]), tf.constant(1)] + with self.assertRaisesRegex(ValueError, "Scalar data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + def test_count_tf_tensor(self) -> None: + t = [tf.constant([1, 2])] + self.assertEqual(tensorflow_handler.SeqOfTensorflowTensorHandler.count(t), 2) + + t = [tf.constant([[1, 2]])] + self.assertEqual(tensorflow_handler.SeqOfTensorflowTensorHandler.count(t), 1) + + t = [tf.Variable([1, 2])] + self.assertEqual(tensorflow_handler.SeqOfTensorflowTensorHandler.count(t), 2) + + t = [tf.Variable([1, 2], shape=tf.TensorShape(None))] + with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + 
tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable([[1, 2]], shape=tf.TensorShape([None, 2]))] + with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): + tensorflow_handler.SeqOfTensorflowTensorHandler.validate(t) + + t = [tf.Variable([[1, 2]], shape=tf.TensorShape([1, None]))] + self.assertEqual(tensorflow_handler.SeqOfTensorflowTensorHandler.count(t), 1) + + def test_trunc_tf_tensor(self) -> None: + t = [tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] + + for ts in tensorflow_handler.SeqOfTensorflowTensorHandler.truncate(t): + tf.assert_equal( + tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts + ) + + t = [tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1))] + + for ts in tensorflow_handler.SeqOfTensorflowTensorHandler.truncate(t): + tf.assert_equal( + tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), ts + ) + + t = [tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 + + for ts in tensorflow_handler.SeqOfTensorflowTensorHandler.truncate(t): + tf.assert_equal( + tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts + ) + + t = [ + tf.constant([1]), + tf.constant([1] * (tensorflow_handler.SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), + ] + + for ts in tensorflow_handler.SeqOfTensorflowTensorHandler.truncate(t): + tf.assert_equal(tf.constant([1]), ts) + + def test_infer_schema_tf_tensor(self) -> None: + t1 = [tf.constant([1, 2, 3, 4], dtype=tf.int32)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t1, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT32)], + ) + + t2 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t2, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64)], + ) + + t3 = [tf.constant([1, 2, 3, 4], dtype=tf.int16)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t3, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT16)], + ) + + t4 = [tf.constant([1, 2, 3, 4], dtype=tf.int8)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t4, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT8)], + ) + + t5 = [tf.constant([1, 2, 3, 4], dtype=tf.uint32)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t5, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT32)], + ) + + t6 = [tf.constant([1, 2, 3, 4], dtype=tf.uint64)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t6, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT64)], + ) + + t7 = [tf.constant([1, 2, 3, 4], dtype=tf.uint16)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t7, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT16)], + ) + + t8 = [tf.constant([1, 2, 3, 4], dtype=tf.uint8)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t8, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT8)], + ) + + t9 = [tf.constant([False, True])] 
+ self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t9, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.BOOL)], + ) + + t10 = [tf.constant([1.2, 3.4], dtype=tf.float32)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t10, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.FLOAT)], + ) + + t11 = [tf.constant([1.2, 3.4], dtype=tf.float64)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t11, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.DOUBLE)], + ) + + t12 = [tf.constant([[1, 2], [3, 4]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t12, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT32, shape=(2,)), + ], + ) + + t13 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t13, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT32, shape=(2, 2))], + ) + + t14 = [tf.constant([1, 2, 3, 4])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t14, role="output"), + [core.FeatureSpec("output_feature_0", core.DataType.INT32)], + ) + + t15 = [tf.constant([1, 2]), tf.constant([3, 4])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t15, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT32), + core.FeatureSpec("output_feature_1", core.DataType.INT32), + ], + ) + + t16 = [tf.constant([1.2, 2.4]), tf.constant([3, 4])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t16, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.FLOAT), + core.FeatureSpec("output_feature_1", core.DataType.INT32), + ], + ) + + t17 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[3, 3], [4, 4]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t17, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT32, shape=(2,)), + core.FeatureSpec("output_feature_1", core.DataType.INT32, shape=(2,)), + ], + ) + + t18 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[1.5, 6.8], [2.9, 9.2]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t18, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT32, shape=(2,)), + core.FeatureSpec("output_feature_1", core.DataType.FLOAT, shape=(2,)), + ], + ) + + t21 = [tf.constant([1, 2, 3, 4], dtype=tf.int32)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t21, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT32)], + ) + + t22 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t22, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT64)], + ) + + t23 = [tf.constant([1, 2, 3, 4], dtype=tf.int16)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t23, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT16)], + ) + + t24 = [tf.constant([1, 2, 3, 4], dtype=tf.int8)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t24, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT8)], + ) 
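For readers skimming this hunk, the dtype assertions above and below reduce to one rule: the handler maps each tensor's TensorFlow dtype to the matching core.DataType and names features positionally by role. A minimal sketch of that behaviour, reusing the same modules this test file imports (tf, core, tensorflow_handler); the expected specs follow directly from the assertions in this test:

    import tensorflow as tf
    from snowflake.ml.model._signatures import core, tensorflow_handler

    # Two 1-d tensors of different dtypes: per the cases in this test,
    # tf.int16 maps to DataType.INT16, tf.float64 to DataType.DOUBLE, and
    # inputs are named input_feature_0, input_feature_1, ...
    tensors = [
        tf.constant([1, 2, 3, 4], dtype=tf.int16),
        tf.constant([1.2, 3.4], dtype=tf.float64),
    ]
    specs = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(tensors, role="input")
    assert list(specs) == [
        core.FeatureSpec("input_feature_0", core.DataType.INT16),
        core.FeatureSpec("input_feature_1", core.DataType.DOUBLE),
    ]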
+ + t25 = [tf.constant([1, 2, 3, 4], dtype=tf.uint32)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t25, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT32)], + ) + + t26 = [tf.constant([1, 2, 3, 4], dtype=tf.uint64)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t26, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT64)], + ) + + t27 = [tf.constant([1, 2, 3, 4], dtype=tf.uint16)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t27, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT16)], + ) + + t28 = [tf.constant([1, 2, 3, 4], dtype=tf.uint8)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t28, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.UINT8)], + ) + + t29 = [tf.constant([False, True])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t29, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.BOOL)], + ) + + t30 = [tf.constant([1.2, 3.4], dtype=tf.float32)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t30, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.FLOAT)], + ) + + t31 = [tf.constant([1.2, 3.4], dtype=tf.float64)] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t31, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.DOUBLE)], + ) + + t32 = [tf.constant([[1, 2], [3, 4]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t32, role="input"), + [ + core.FeatureSpec("input_feature_0", core.DataType.INT32, shape=(2,)), + ], + ) + + t33 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t33, role="input"), + [core.FeatureSpec("input_feature_0", core.DataType.INT32, shape=(2, 2))], + ) + + t34 = [tf.constant([1, 2, 3, 4])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t34, role="output"), + [core.FeatureSpec("output_feature_0", core.DataType.INT32)], + ) + + t35 = [tf.constant([1, 2]), tf.constant([3, 4])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t35, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT32), + core.FeatureSpec("output_feature_1", core.DataType.INT32), + ], + ) + + t36 = [tf.constant([1.2, 2.4]), tf.constant([3, 4])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t36, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.FLOAT), + core.FeatureSpec("output_feature_1", core.DataType.INT32), + ], + ) + + t37 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[3, 3], [4, 4]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t37, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT32, shape=(2,)), + core.FeatureSpec("output_feature_1", core.DataType.INT32, shape=(2,)), + ], + ) + + t38 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[1.5, 6.8], [2.9, 9.2]])] + self.assertListEqual( + tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t38, role="output"), + [ + core.FeatureSpec("output_feature_0", core.DataType.INT32, shape=(2,)), + 
core.FeatureSpec("output_feature_1", core.DataType.FLOAT, shape=(2,)), + ], + ) + + def test_convert_to_df_tf_tensor(self) -> None: + t1 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t1), + pd.DataFrame([1, 2, 3, 4]), + ) + + t2 = [tf.Variable([1, 2, 3, 4], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t2), + pd.DataFrame([1, 2, 3, 4]), + ) + + t3 = [tf.constant([[1, 1], [2, 2], [3, 3], [4, 4]], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t3), + pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2]), np.array([3, 3]), np.array([4, 4])]}), + ) + + t4 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t4), + pd.DataFrame(data={0: [np.array([[1, 1], [2, 2]]), np.array([[3, 3], [4, 4]])]}), + ) + + t5 = [tf.constant([1, 2], dtype=tf.int64), tf.constant([3, 4], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t5), + pd.DataFrame([[1, 3], [2, 4]]), + ) + + t6 = [tf.constant([1.2, 2.4], dtype=tf.float64), tf.constant([3, 4], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t6), + pd.DataFrame([[1.2, 3], [2.4, 4]]), + ) + + t7 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[3, 3], [4, 4]], dtype=tf.int64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t7), + pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([3, 3]), np.array([4, 4])]}), + ) + + t8 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[1.5, 6.8], [2.9, 9.2]], dtype=tf.float64)] + pd.testing.assert_frame_equal( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t8), + pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([1.5, 6.8]), np.array([2.9, 9.2])]}), + ) + + def test_convert_from_df_tf_tensor(self) -> None: + t1 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t1) + ) + ): + tf.assert_equal(t, t1[idx]) + + t2 = [tf.Variable([1, 2, 3, 4], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t2) + ) + ): + tf.assert_equal(t, t2[idx]) + + t3 = [tf.constant([[1, 1], [2, 2], [3, 3], [4, 4]], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t3) + ) + ): + tf.assert_equal(t, t3[idx]) + + t4 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t4) + ) + ): + tf.assert_equal(t, t4[idx]) + + t5 = [tf.constant([1, 2], dtype=tf.int64), tf.constant([3, 4], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t5) + ) + ): + tf.assert_equal(t, t5[idx]) + + t6 = 
[tf.constant([1.2, 2.4], dtype=tf.float64), tf.constant([3, 4], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t6) + ) + ): + tf.assert_equal(t, t6[idx]) + + t7 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[3, 3], [4, 4]], dtype=tf.int64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t7) + ) + ): + tf.assert_equal(t, t7[idx]) + + t8 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[1.5, 6.8], [2.9, 9.2]], dtype=tf.float64)] + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t8) + ) + ): + tf.assert_equal(t, t8[idx]) + + t9 = [tf.constant([1, 2, 3, 4])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t9, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t9), fts), + fts, + ) + ): + tf.assert_equal(t, t9[idx]) + + t10 = [tf.constant([1.2, 3.4])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t10, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t10), fts), + fts, + ) + ): + tf.assert_equal(t, t10[idx]) + + t11 = [tf.constant([[1, 1], [2, 2], [3, 3], [4, 4]])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t11, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t11), fts), + fts, + ) + ): + tf.assert_equal(t, t11[idx]) + + t12 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t12, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t12), fts), + fts, + ) + ): + tf.assert_equal(t, t12[idx]) + + t13 = [tf.constant([1, 2]), tf.constant([3, 4])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t13, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t13), fts), + fts, + ) + ): + tf.assert_equal(t, t13[idx]) + + t14 = [tf.constant([1.2, 2.4]), tf.constant([3, 4])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t14, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t14), fts), + fts, + ) + ): + tf.assert_equal(t, t14[idx]) + + t15 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[3, 3], [4, 4]])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t15, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t15), fts), + fts, + 
) + ): + tf.assert_equal(t, t15[idx]) + + t16 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[1.5, 6.8], [2.9, 9.2]])] + fts = tensorflow_handler.SeqOfTensorflowTensorHandler.infer_signature(t16, role="input") + for idx, t in enumerate( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + utils.rename_pandas_df(tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(t16), fts), + fts, + ) + ): + tf.assert_equal(t, t16[idx]) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_signatures/utils.py b/snowflake/ml/model/_signatures/utils.py new file mode 100644 index 00000000..9be3a8a9 --- /dev/null +++ b/snowflake/ml/model/_signatures/utils.py @@ -0,0 +1,88 @@ +import warnings +from typing import Any, List, Optional, Sequence + +import numpy as np +import numpy.typing as npt +import pandas as pd + +from snowflake.ml.model._signatures import core + + +def convert_list_to_ndarray(data: List[Any]) -> npt.NDArray[Any]: + """Create a numpy array from list or nested list. Avoid ragged list and unaligned types. + + Args: + data: List or nested list. + + Raises: + ValueError: Raised when ragged nested list or list containing non-basic type confronted. + ValueError: Raised when ragged nested list or list containing non-basic type confronted. + + Returns: + The converted numpy array. + """ + warnings.filterwarnings("error", category=np.VisibleDeprecationWarning) + try: + arr = np.array(data) + except np.VisibleDeprecationWarning: + # In recent version of numpy, this warning should be raised when bad list provided. + raise ValueError( + f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." + ) + warnings.filterwarnings("default", category=np.VisibleDeprecationWarning) + if arr.dtype == object: + # If not raised, then a array of object would be created. + raise ValueError( + f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." + ) + return arr + + +def rename_features( + features: Sequence[core.BaseFeatureSpec], feature_names: Optional[List[str]] = None +) -> Sequence[core.BaseFeatureSpec]: + """It renames the feature in features provided optional feature names. + + Args: + features: A sequence of feature specifications and feature group specifications. + feature_names: A list of names to assign to features and feature groups. Defaults to None. + + Raises: + ValueError: Raised when provided feature_names does not match the data shape. + + Returns: + A sequence of feature specifications and feature group specifications being renamed if names provided. + """ + if feature_names: + if len(feature_names) == len(features): + for ft, ft_name in zip(features, feature_names): + ft._name = ft_name + else: + raise ValueError( + f"{len(feature_names)} feature names are provided, while there are {len(features)} features." + ) + return features + + +def rename_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec]) -> pd.DataFrame: + """It renames pandas dataframe that has non-object column index with provided features. + + Args: + data: A pandas dataframe to be renamed. + features: A sequence of feature specifications and feature group specifications to rename the dataframe. + + Raises: + ValueError: Raised when the data does not have the same number of features as signature. + + Returns: + A pandas dataframe with columns renamed. 
+ """ + df_cols = data.columns + if df_cols.dtype in [np.int64, np.uint64, np.float64]: + if len(features) != len(data.columns): + raise ValueError( + "Data does not have the same number of features as signature. " + + f"Signature requires {len(features)} features, but have {len(data.columns)} in input data." + ) + data.columns = pd.Index([feature.name for feature in features]) + return data diff --git a/snowflake/ml/model/_signatures/utils_test.py b/snowflake/ml/model/_signatures/utils_test.py new file mode 100644 index 00000000..70e6c34a --- /dev/null +++ b/snowflake/ml/model/_signatures/utils_test.py @@ -0,0 +1,49 @@ +import pandas as pd +from absl.testing import absltest + +from snowflake.ml.model._signatures import core, utils + + +class ModelSignatureMiscTest(absltest.TestCase): + def testrename_features(self) -> None: + utils.rename_features([]) + + fts = [core.FeatureSpec("a", core.DataType.INT64)] + self.assertListEqual( + utils.rename_features(fts, ["b"]), + [core.FeatureSpec("b", core.DataType.INT64)], + ) + + fts = [core.FeatureSpec("a", core.DataType.INT64, shape=(2,))] + self.assertListEqual( + utils.rename_features(fts, ["b"]), + [core.FeatureSpec("b", core.DataType.INT64, shape=(2,))], + ) + + fts = [core.FeatureSpec("a", core.DataType.INT64, shape=(2,))] + utils.rename_features(fts) + + with self.assertRaises(ValueError): + fts = [core.FeatureSpec("a", core.DataType.INT64, shape=(2,))] + utils.rename_features(fts, ["b", "c"]) + + def testrename_pandas_df(self) -> None: + fts = [ + core.FeatureSpec("input_feature_0", core.DataType.INT64), + core.FeatureSpec("input_feature_1", core.DataType.INT64), + ] + + df = pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]) + + pd.testing.assert_frame_equal(df, utils.rename_pandas_df(df, fts)) + + df = pd.DataFrame([[2, 5], [6, 8]]) + + pd.testing.assert_frame_equal(df, utils.rename_pandas_df(df, fts), check_names=False) + pd.testing.assert_index_equal( + pd.Index(["input_feature_0", "input_feature_1"]), right=utils.rename_pandas_df(df, fts).columns + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/model_signature.py b/snowflake/ml/model/model_signature.py index 17f0460e..47fa74d1 100644 --- a/snowflake/ml/model/model_signature.py +++ b/snowflake/ml/model/model_signature.py @@ -1,1088 +1,41 @@ -import json -import textwrap import warnings -from abc import ABC, abstractmethod -from enum import Enum -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Final, - Generic, - List, - Literal, - Optional, - Sequence, - Tuple, - Type, - Union, - cast, - final, -) +from typing import Any, List, Literal, Optional, Sequence, Type import numpy as np -import numpy.typing as npt import pandas as pd -from typing_extensions import TypeGuard import snowflake.snowpark import snowflake.snowpark.types as spt -from snowflake.ml._internal import type_utils from snowflake.ml._internal.utils import formatting, identifier from snowflake.ml.model import type_hints as model_types -from snowflake.ml.model._deploy_client.warehouse import infer_template - -if TYPE_CHECKING: - import tensorflow - import torch - - -class DataType(Enum): - def __init__(self, value: str, snowpark_type: Type[spt.DataType], numpy_type: npt.DTypeLike) -> None: - self._value = value - self._snowpark_type = snowpark_type - self._numpy_type = numpy_type - - INT8 = ("int8", spt.ByteType, np.int8) - INT16 = ("int16", spt.ShortType, np.int16) - INT32 = ("int32", spt.IntegerType, np.int32) - INT64 = ("int64", spt.LongType, np.int64) - - FLOAT = 
("float", spt.FloatType, np.float32) - DOUBLE = ("double", spt.DoubleType, np.float64) - - UINT8 = ("uint8", spt.ByteType, np.uint8) - UINT16 = ("uint16", spt.ShortType, np.uint16) - UINT32 = ("uint32", spt.IntegerType, np.uint32) - UINT64 = ("uint64", spt.LongType, np.uint64) - - BOOL = ("bool", spt.BooleanType, np.bool_) - STRING = ("string", spt.StringType, np.str_) - BYTES = ("bytes", spt.BinaryType, np.bytes_) - - def as_snowpark_type(self) -> spt.DataType: - """Convert to corresponding Snowpark Type. - - Returns: - A Snowpark type. - """ - return self._snowpark_type() - - def __repr__(self) -> str: - return f"DataType.{self.name}" - - @classmethod - def from_numpy_type(cls, np_type: npt.DTypeLike) -> "DataType": - """Translate numpy dtype to DataType for signature definition. - - Args: - np_type: The numpy dtype. - - Raises: - NotImplementedError: Raised when the given numpy type is not supported. - - Returns: - Corresponding DataType. - """ - np_to_snowml_type_mapping = {i._numpy_type: i for i in DataType} - for potential_type in np_to_snowml_type_mapping.keys(): - if np.can_cast(np_type, potential_type, casting="no"): - # This is used since the same dtype might represented in different ways. - return np_to_snowml_type_mapping[potential_type] - raise NotImplementedError(f"Type {np_type} is not supported as a DataType.") - - @classmethod - def from_torch_type(cls, torch_type: "torch.dtype") -> "DataType": - import torch - - """Translate torch dtype to DataType for signature definition. - - Args: - torch_type: The torch dtype. - - Returns: - Corresponding DataType. - """ - torch_dtype_to_numpy_dtype_mapping = { - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.int64, - torch.float32: np.float32, - torch.float64: np.float64, - torch.bool: np.bool_, - } - return cls.from_numpy_type(torch_dtype_to_numpy_dtype_mapping[torch_type]) - - @classmethod - def from_snowpark_type(cls, snowpark_type: spt.DataType) -> "DataType": - """Translate snowpark type to DataType for signature definition. - - Args: - snowpark_type: The snowpark type. - - Raises: - NotImplementedError: Raised when the given numpy type is not supported. - - Returns: - Corresponding DataType. - """ - if isinstance(snowpark_type, spt.ArrayType): - actual_sp_type = snowpark_type.element_type - else: - actual_sp_type = snowpark_type - - snowpark_to_snowml_type_mapping: Dict[Type[spt.DataType], DataType] = { - i._snowpark_type: i - for i in DataType - # We by default infer as signed integer. - if i not in [DataType.UINT8, DataType.UINT16, DataType.UINT32, DataType.UINT64] - } - for potential_type in snowpark_to_snowml_type_mapping.keys(): - if isinstance(actual_sp_type, potential_type): - return snowpark_to_snowml_type_mapping[potential_type] - # Fallback for decimal type. - if isinstance(snowpark_type, spt.DecimalType): - if snowpark_type.scale == 0: - return DataType.INT64 - raise NotImplementedError(f"Type {snowpark_type} is not supported as a DataType.") - - def is_same_snowpark_type(self, incoming_snowpark_type: spt.DataType) -> bool: - """Check if provided snowpark type is the same as Data Type. - - Args: - incoming_snowpark_type: The snowpark type. - - Raises: - NotImplementedError: Raised when the given numpy type is not supported. - - Returns: - If the provided snowpark type is the same as the DataType. - """ - # Special handle for Decimal Type. 
- if isinstance(incoming_snowpark_type, spt.DecimalType): - if incoming_snowpark_type.scale == 0: - return self == DataType.INT64 or self == DataType.UINT64 - raise NotImplementedError(f"Type {incoming_snowpark_type} is not supported as a DataType.") - - return isinstance(incoming_snowpark_type, self._snowpark_type) - - -class BaseFeatureSpec(ABC): - """Abstract Class for specification of a feature.""" - - def __init__(self, name: str) -> None: - self._name = name - - @final - @property - def name(self) -> str: - """Name of the feature.""" - return self._name - - @abstractmethod - def as_snowpark_type(self) -> spt.DataType: - """Convert to corresponding Snowpark Type.""" - pass - - @abstractmethod - def to_dict(self) -> Dict[str, Any]: - """Serialization""" - pass - - @classmethod - @abstractmethod - def from_dict(self, input_dict: Dict[str, Any]) -> "BaseFeatureSpec": - """Deserialization""" - pass - - -class FeatureSpec(BaseFeatureSpec): - """Specification of a feature in Snowflake native model packaging.""" - - def __init__( - self, - name: str, - dtype: DataType, - shape: Optional[Tuple[int, ...]] = None, - ) -> None: - """Initialize a feature. - - Args: - name: Name of the feature. - dtype: Type of the elements in the feature. - shape: Used to represent scalar feature, 1-d feature list or n-d tensor. - -1 is used to represent variable length.Defaults to None. - - E.g. - None: scalar - (2,): 1d list with fixed len of 2. - (-1,): 1d list with variable length. Used for ragged tensor representation. - (d1, d2, d3): 3d tensor. - - Raises: - TypeError: Raised when the dtype input type is incorrect. - TypeError: Raised when the shape input type is incorrect. - """ - super().__init__(name=name) - - if not isinstance(dtype, DataType): - raise TypeError("dtype should be a model signature datatype.") - self._dtype = dtype - - if shape and not isinstance(shape, tuple): - raise TypeError("Shape should be a tuple if presented.") - self._shape = shape - - def as_snowpark_type(self) -> spt.DataType: - result_type = self._dtype.as_snowpark_type() - if not self._shape: - return result_type - for _ in range(len(self._shape)): - result_type = spt.ArrayType(result_type) - return result_type - - def as_dtype(self) -> npt.DTypeLike: - """Convert to corresponding local Type.""" - if not self._shape: - return self._dtype._numpy_type - return np.object_ - - def __eq__(self, other: object) -> bool: - if isinstance(other, FeatureSpec): - return self._name == other._name and self._dtype == other._dtype and self._shape == other._shape - else: - return False - - def __repr__(self) -> str: - shape_str = f", shape={repr(self._shape)}" if self._shape else "" - return f"FeatureSpec(dtype={repr(self._dtype)}, name={repr(self._name)}{shape_str})" - - def to_dict(self) -> Dict[str, Any]: - """Serialize the feature group into a dict. - - Returns: - A dict that serializes the feature group. - """ - base_dict: Dict[str, Any] = { - "type": self._dtype.name, - "name": self._name, - } - if self._shape is not None: - base_dict["shape"] = self._shape - return base_dict - - @classmethod - def from_dict(cls, input_dict: Dict[str, Any]) -> "FeatureSpec": - """Deserialize the feature specification from a dict. - - Args: - input_dict: The dict containing information of the feature specification. - - Returns: - A feature specification instance deserialized and created from the dict. 
- """ - name = input_dict["name"] - shape = input_dict.get("shape", None) - if shape: - shape = tuple(shape) - type = DataType[input_dict["type"]] - return FeatureSpec(name=name, dtype=type, shape=shape) - - -class FeatureGroupSpec(BaseFeatureSpec): - """Specification of a group of features in Snowflake native model packaging.""" - - def __init__(self, name: str, specs: List[FeatureSpec]) -> None: - """Initialize a feature group. - - Args: - name: Name of the feature group. - specs: A list of feature specifications that composes the group. All children feature specs have to have - name. And all of them should have the same type. - """ - super().__init__(name=name) - self._specs = specs - self._validate() - - def _validate(self) -> None: - if len(self._specs) == 0: - raise ValueError("No children feature specs.") - # each has to have name, and same type - if not all(s._name is not None for s in self._specs): - raise ValueError("All children feature specs have to have name.") - if not (all(s._shape is None for s in self._specs) or all(s._shape is not None for s in self._specs)): - raise ValueError("All children feature specs have to have same shape.") - first_type = self._specs[0]._dtype - if not all(s._dtype == first_type for s in self._specs): - raise ValueError("All children feature specs have to have same type.") - - def as_snowpark_type(self) -> spt.DataType: - first_type = self._specs[0].as_snowpark_type() - return spt.MapType(spt.StringType(), first_type) - - def __eq__(self, other: object) -> bool: - if isinstance(other, FeatureGroupSpec): - return self._specs == other._specs - else: - return False - - def __repr__(self) -> str: - spec_strs = ",\n\t\t".join(repr(spec) for spec in self._specs) - return textwrap.dedent( - f"""FeatureGroupSpec( - name={repr(self._name)}, - specs=[ - {spec_strs} - ] - ) - """ - ) - - def to_dict(self) -> Dict[str, Any]: - """Serialize the feature group into a dict. - - Returns: - A dict that serializes the feature group. - """ - return {"feature_group": {"name": self._name, "specs": [s.to_dict() for s in self._specs]}} - - @classmethod - def from_dict(cls, input_dict: Dict[str, Any]) -> "FeatureGroupSpec": - """Deserialize the feature group from a dict. - - Args: - input_dict: The dict containing information of the feature group. - - Returns: - A feature group instance deserialized and created from the dict. - """ - specs = [] - for e in input_dict["feature_group"]["specs"]: - spec = FeatureSpec.from_dict(e) - specs.append(spec) - return FeatureGroupSpec(name=input_dict["feature_group"]["name"], specs=specs) - - -class ModelSignature: - """Signature of a model that specifies the input and output of a model.""" - - def __init__(self, inputs: Sequence[BaseFeatureSpec], outputs: Sequence[BaseFeatureSpec]) -> None: - """Initialize a model signature - - Args: - inputs: A sequence of feature specifications and feature group specifications that will compose the - input of the model. - outputs: A sequence of feature specifications and feature group specifications that will compose the - output of the model. 
- """ - self._inputs = inputs - self._outputs = outputs - - @property - def inputs(self) -> Sequence[BaseFeatureSpec]: - """Inputs of the model, containing a sequence of feature specifications and feature group specifications.""" - return self._inputs - - @property - def outputs(self) -> Sequence[BaseFeatureSpec]: - """Outputs of the model, containing a sequence of feature specifications and feature group specifications.""" - return self._outputs - - def __eq__(self, other: object) -> bool: - if isinstance(other, ModelSignature): - return self._inputs == other._inputs and self._outputs == other._outputs - else: - return False - - def to_dict(self) -> Dict[str, Any]: - """Generate a dict to represent the whole signature. - - Returns: - A dict that serializes the signature. - """ - - return { - "inputs": [spec.to_dict() for spec in self._inputs], - "outputs": [spec.to_dict() for spec in self._outputs], - } - - @classmethod - def from_dict(cls, loaded: Dict[str, Any]) -> "ModelSignature": - """Create a signature given the dict containing specifications of children features and feature groups. - - Args: - loaded: The dict to be deserialized. - - Returns: - A signature deserialized and created from the dict. - """ - sig_outs = loaded["outputs"] - sig_inputs = loaded["inputs"] - - deserialize_spec: Callable[[Dict[str, Any]], BaseFeatureSpec] = ( - lambda sig_spec: FeatureGroupSpec.from_dict(sig_spec) - if "feature_group" in sig_spec - else FeatureSpec.from_dict(sig_spec) - ) - - return ModelSignature( - inputs=[deserialize_spec(s) for s in sig_inputs], outputs=[deserialize_spec(s) for s in sig_outs] - ) - - def __repr__(self) -> str: - inputs_spec_strs = ",\n\t\t".join(repr(spec) for spec in self._inputs) - outputs_spec_strs = ",\n\t\t".join(repr(spec) for spec in self._outputs) - return textwrap.dedent( - f"""ModelSignature( - inputs=[ - {inputs_spec_strs} - ], - outputs=[ - {outputs_spec_strs} - ] - )""" - ) - - -class _BaseDataHandler(ABC, Generic[model_types._DataType]): - FEATURE_PREFIX: Final[str] = "feature" - INPUT_PREFIX: Final[str] = "input" - OUTPUT_PREFIX: Final[str] = "output" - SIG_INFER_ROWS_COUNT_LIMIT: Final[int] = 10 - - @staticmethod - @abstractmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._DataType]: - ... - - @staticmethod - @abstractmethod - def count(data: model_types._DataType) -> int: - ... - - @staticmethod - @abstractmethod - def truncate(data: model_types._DataType) -> model_types._DataType: - ... - - @staticmethod - @abstractmethod - def validate(data: model_types._DataType) -> None: - ... - - @staticmethod - @abstractmethod - def infer_signature(data: model_types._DataType, role: Literal["input", "output"]) -> Sequence[BaseFeatureSpec]: - ... - - @staticmethod - @abstractmethod - def convert_to_df(data: model_types._DataType, ensure_serializable: bool = True) -> pd.DataFrame: - ... 
- - -class _PandasDataFrameHandler(_BaseDataHandler[pd.DataFrame]): - @staticmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[pd.DataFrame]: - return isinstance(data, pd.DataFrame) - - @staticmethod - def count(data: pd.DataFrame) -> int: - return len(data.index) - - @staticmethod - def truncate(data: pd.DataFrame) -> pd.DataFrame: - return data.head(min(_PandasDataFrameHandler.count(data), _PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT)) - - @staticmethod - def validate(data: pd.DataFrame) -> None: - df_cols = data.columns - - if df_cols.has_duplicates: # Rule out categorical index with duplicates - raise ValueError("Data Validation Error: Duplicate column index is found.") - - assert all(hasattr(data[col], "dtype") for col in data.columns), f"Unknown column confronted in {data}" - - if len(df_cols) == 0: - raise ValueError("Data Validation Error: Empty data is found.") - - if df_cols.dtype not in [ - np.int64, - np.uint64, - np.float64, - np.object_, - ]: # To keep compatibility with Pandas 2.x and 1.x - raise ValueError("Data Validation Error: Unsupported column index type is found.") - - df_col_dtypes = [data[col].dtype for col in data.columns] - for df_col, df_col_dtype in zip(df_cols, df_col_dtypes): - if df_col_dtype == np.dtype("O"): - # Check if all objects have the same type - if not all(isinstance(data_row, type(data[df_col][0])) for data_row in data[df_col]): - raise ValueError( - f"Data Validation Error: Inconsistent type of object found in column data {data[df_col]}." - ) - - if isinstance(data[df_col][0], list): - arr = _convert_list_to_ndarray(data[df_col][0]) - arr_dtype = DataType.from_numpy_type(arr.dtype) - - converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data[df_col]] - - if not all( - DataType.from_numpy_type(converted_data.dtype) == arr_dtype - for converted_data in converted_data_list - ): - raise ValueError( - "Data Validation Error: " - + f"Inconsistent type of element in object found in column data {data[df_col]}." - ) - - elif isinstance(data[df_col][0], np.ndarray): - arr_dtype = DataType.from_numpy_type(data[df_col][0].dtype) - - if not all(DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]): - raise ValueError( - "Data Validation Error: " - + f"Inconsistent type of element in object found in column data {data[df_col]}." 
- ) - elif not isinstance(data[df_col][0], (str, bytes)): - raise ValueError(f"Data Validation Error: Unsupported type confronted in {data[df_col]}") - - @staticmethod - def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[BaseFeatureSpec]: - feature_prefix = f"{_PandasDataFrameHandler.FEATURE_PREFIX}_" - df_cols = data.columns - role_prefix = ( - _PandasDataFrameHandler.INPUT_PREFIX if role == "input" else _PandasDataFrameHandler.OUTPUT_PREFIX - ) + "_" - if df_cols.dtype in [np.int64, np.uint64, np.float64]: - ft_names = [f"{role_prefix}{feature_prefix}{i}" for i in df_cols] - else: - ft_names = list(map(str, data.columns.to_list())) - - df_col_dtypes = [data[col].dtype for col in data.columns] - - specs = [] - for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names): - if df_col_dtype == np.dtype("O"): - if isinstance(data[df_col][0], list): - arr = _convert_list_to_ndarray(data[df_col][0]) - arr_dtype = DataType.from_numpy_type(arr.dtype) - ft_shape = np.shape(data[df_col][0]) - - converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data[df_col]] - - if not all(np.shape(converted_data) == ft_shape for converted_data in converted_data_list): - ft_shape = (-1,) - - specs.append(FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) - elif isinstance(data[df_col][0], np.ndarray): - arr_dtype = DataType.from_numpy_type(data[df_col][0].dtype) - ft_shape = np.shape(data[df_col][0]) - - if not all(np.shape(data_row) == ft_shape for data_row in data[df_col]): - ft_shape = (-1,) - - specs.append(FeatureSpec(dtype=arr_dtype, name=ft_name, shape=ft_shape)) - elif isinstance(data[df_col][0], str): - specs.append(FeatureSpec(dtype=DataType.STRING, name=ft_name)) - elif isinstance(data[df_col][0], bytes): - specs.append(FeatureSpec(dtype=DataType.BYTES, name=ft_name)) - else: - specs.append(FeatureSpec(dtype=DataType.from_numpy_type(df_col_dtype), name=ft_name)) - return specs - - @staticmethod - def convert_to_df(data: pd.DataFrame, ensure_serializable: bool = True) -> pd.DataFrame: - if not ensure_serializable: - return data - # This convert is necessary since numpy dataframe cannot be correctly handled when provided as an element of - # a list when creating Snowpark Dataframe. 
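One behaviour of this handler's infer_signature above is worth highlighting: string column labels are kept as feature names, while an int64/uint64/float64 column index is replaced with role-prefixed positional names. A small sketch against the relocated pandas_handler.PandasDataFrameHandler (name taken from the new _LOCAL_DATA_HANDLERS registry; the relocated class is assumed to keep the behaviour of the removed code shown here):

    import pandas as pd
    from snowflake.ml.model._signatures import pandas_handler

    named = pd.DataFrame([[1, 2.5]], columns=["id", "score"])
    print(pandas_handler.PandasDataFrameHandler.infer_signature(named, role="input"))
    # String labels survive: FeatureSpec(name="id", INT64), FeatureSpec(name="score", DOUBLE)

    unnamed = pd.DataFrame([[1, 2.5]])  # default RangeIndex columns 0, 1
    print(pandas_handler.PandasDataFrameHandler.infer_signature(unnamed, role="input"))
    # Positional names are generated: input_feature_0 (INT64), input_feature_1 (DOUBLE)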
- df_cols = data.columns - df_col_dtypes = [data[col].dtype for col in data.columns] - for df_col, df_col_dtype in zip(df_cols, df_col_dtypes): - if df_col_dtype == np.dtype("O"): - if isinstance(data[df_col][0], np.ndarray): - data[df_col] = data[df_col].map(np.ndarray.tolist) - return data - - -class _NumpyArrayHandler(_BaseDataHandler[model_types._SupportedNumpyArray]): - @staticmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedNumpyArray]: - return isinstance(data, np.ndarray) - - @staticmethod - def count(data: model_types._SupportedNumpyArray) -> int: - return data.shape[0] - - @staticmethod - def truncate(data: model_types._SupportedNumpyArray) -> model_types._SupportedNumpyArray: - return data[: min(_NumpyArrayHandler.count(data), _NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)] - - @staticmethod - def validate(data: model_types._SupportedNumpyArray) -> None: - if data.shape == (0,): - # Empty array - raise ValueError("Data Validation Error: Empty data is found.") - - if data.shape == (): - # scalar - raise ValueError("Data Validation Error: Scalar data is found.") - - @staticmethod - def infer_signature( - data: model_types._SupportedNumpyArray, role: Literal["input", "output"] - ) -> Sequence[BaseFeatureSpec]: - feature_prefix = f"{_NumpyArrayHandler.FEATURE_PREFIX}_" - dtype = DataType.from_numpy_type(data.dtype) - role_prefix = (_NumpyArrayHandler.INPUT_PREFIX if role == "input" else _NumpyArrayHandler.OUTPUT_PREFIX) + "_" - if len(data.shape) == 1: - return [FeatureSpec(dtype=dtype, name=f"{role_prefix}{feature_prefix}0")] - else: - # For high-dimension array, 0-axis is for batch, 1-axis is for column, further more is details of columns. - features = [] - n_cols = data.shape[1] - ft_names = [f"{role_prefix}{feature_prefix}{i}" for i in range(n_cols)] - for col_data, ft_name in zip(data[0], ft_names): - if isinstance(col_data, np.ndarray): - ft_shape = np.shape(col_data) - features.append(FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) - else: - features.append(FeatureSpec(dtype=dtype, name=ft_name)) - return features - - @staticmethod - def convert_to_df(data: model_types._SupportedNumpyArray, ensure_serializable: bool = True) -> pd.DataFrame: - if len(data.shape) == 1: - data = np.expand_dims(data, axis=1) - n_cols = data.shape[1] - if len(data.shape) == 2: - return pd.DataFrame(data) - else: - n_rows = data.shape[0] - if ensure_serializable: - return pd.DataFrame(data={i: [data[k, i].tolist() for k in range(n_rows)] for i in range(n_cols)}) - return pd.DataFrame(data={i: [list(data[k, i]) for k in range(n_rows)] for i in range(n_cols)}) - - -class _SeqOfNumpyArrayHandler(_BaseDataHandler[Sequence[model_types._SupportedNumpyArray]]): - @staticmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Sequence[model_types._SupportedNumpyArray]]: - if not isinstance(data, list): - return False - if len(data) == 0: - return False - if isinstance(data[0], np.ndarray): - return all(isinstance(data_col, np.ndarray) for data_col in data) - return False - - @staticmethod - def count(data: Sequence[model_types._SupportedNumpyArray]) -> int: - return min(_NumpyArrayHandler.count(data_col) for data_col in data) - - @staticmethod - def truncate(data: Sequence[model_types._SupportedNumpyArray]) -> Sequence[model_types._SupportedNumpyArray]: - return [ - data_col[: min(_SeqOfNumpyArrayHandler.count(data), _SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)] - for data_col in data - ] - - @staticmethod - def validate(data: 
Sequence[model_types._SupportedNumpyArray]) -> None: - for data_col in data: - _NumpyArrayHandler.validate(data_col) - - @staticmethod - def infer_signature( - data: Sequence[model_types._SupportedNumpyArray], role: Literal["input", "output"] - ) -> Sequence[BaseFeatureSpec]: - feature_prefix = f"{_SeqOfNumpyArrayHandler.FEATURE_PREFIX}_" - features: List[BaseFeatureSpec] = [] - role_prefix = ( - _SeqOfNumpyArrayHandler.INPUT_PREFIX if role == "input" else _SeqOfNumpyArrayHandler.OUTPUT_PREFIX - ) + "_" - - for i, data_col in enumerate(data): - dtype = DataType.from_numpy_type(data_col.dtype) - ft_name = f"{role_prefix}{feature_prefix}{i}" - if len(data_col.shape) == 1: - features.append(FeatureSpec(dtype=dtype, name=ft_name)) - else: - ft_shape = tuple(data_col.shape[1:]) - features.append(FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) - return features - - @staticmethod - def convert_to_df( - data: Sequence[model_types._SupportedNumpyArray], ensure_serializable: bool = True - ) -> pd.DataFrame: - if ensure_serializable: - return pd.DataFrame(data={i: data_col.tolist() for i, data_col in enumerate(data)}) - return pd.DataFrame(data={i: list(data_col) for i, data_col in enumerate(data)}) - - -class _SeqOfPyTorchTensorHandler(_BaseDataHandler[Sequence["torch.Tensor"]]): - @staticmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Sequence["torch.Tensor"]]: - if not isinstance(data, list): - return False - if len(data) == 0: - return False - if type_utils.LazyType("torch.Tensor").isinstance(data[0]): - return all(type_utils.LazyType("torch.Tensor").isinstance(data_col) for data_col in data) - return False - - @staticmethod - def count(data: Sequence["torch.Tensor"]) -> int: - return min(data_col.shape[0] for data_col in data) - - @staticmethod - def truncate(data: Sequence["torch.Tensor"]) -> Sequence["torch.Tensor"]: - return [ - data_col[ - : min(_SeqOfPyTorchTensorHandler.count(data), _SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT) - ] - for data_col in data - ] - - @staticmethod - def validate(data: Sequence["torch.Tensor"]) -> None: - import torch - - for data_col in data: - if data_col.shape == torch.Size([0]): - # Empty array - raise ValueError("Data Validation Error: Empty data is found.") - - if data_col.shape == torch.Size([1]): - # scalar - raise ValueError("Data Validation Error: Scalar data is found.") - - @staticmethod - def infer_signature(data: Sequence["torch.Tensor"], role: Literal["input", "output"]) -> Sequence[BaseFeatureSpec]: - feature_prefix = f"{_SeqOfPyTorchTensorHandler.FEATURE_PREFIX}_" - features: List[BaseFeatureSpec] = [] - role_prefix = ( - _SeqOfPyTorchTensorHandler.INPUT_PREFIX if role == "input" else _SeqOfPyTorchTensorHandler.OUTPUT_PREFIX - ) + "_" - - for i, data_col in enumerate(data): - dtype = DataType.from_torch_type(data_col.dtype) - ft_name = f"{role_prefix}{feature_prefix}{i}" - if len(data_col.shape) == 1: - features.append(FeatureSpec(dtype=dtype, name=ft_name)) - else: - ft_shape = tuple(data_col.shape[1:]) - features.append(FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) - return features - - @staticmethod - def convert_to_df(data: Sequence["torch.Tensor"], ensure_serializable: bool = True) -> pd.DataFrame: - # Use list(...) instead of .tolist() to ensure that - # the content is still numpy array so that the type could be preserved. - # But that would not serializable and cannot use as UDF input and output. 
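The comment above captures a subtle trade-off worth a concrete illustration: ndarray.tolist() recursively converts to plain Python scalars (JSON-serializable, numpy dtypes lost), while list(arr) keeps numpy rows and scalars (dtype preserved, but not serializable for UDF I/O). A tiny standalone example of the difference:

    import numpy as np

    arr = np.array([[1, 2], [3, 4]], dtype=np.int64)
    print(type(arr.tolist()[0][0]))  # <class 'int'>: plain Python, serializable
    print(type(list(arr)[0][0]))     # <class 'numpy.int64'>: dtype preserved, not JSON-serializable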
- if ensure_serializable: - return pd.DataFrame({i: data_col.detach().to("cpu").numpy().tolist() for i, data_col in enumerate(data)}) - return pd.DataFrame({i: list(data_col.detach().to("cpu").numpy()) for i, data_col in enumerate(data)}) - - @staticmethod - def convert_from_df( - df: pd.DataFrame, features: Optional[Sequence[BaseFeatureSpec]] = None - ) -> Sequence["torch.Tensor"]: - import torch - - res = [] - if features: - for feature in features: - if isinstance(feature, FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, FeatureSpec), "Invalid feature kind." - res.append(torch.from_numpy(np.stack(df[feature.name].to_numpy()).astype(feature._dtype._numpy_type))) - return res - return [torch.from_numpy(np.stack(df[col].to_numpy())) for col in df] - - -class _SeqOfTensorflowTensorHandler(_BaseDataHandler[Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]]): - @staticmethod - def can_handle( - data: model_types.SupportedDataType, - ) -> TypeGuard[Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]]: - if not isinstance(data, list): - return False - if len(data) == 0: - return False - if type_utils.LazyType("tensorflow.Tensor").isinstance(data[0]) or type_utils.LazyType( - "tensorflow.Variable" - ).isinstance(data[0]): - return all( - type_utils.LazyType("tensorflow.Tensor").isinstance(data_col) - or type_utils.LazyType("tensorflow.Variable").isinstance(data_col) - for data_col in data - ) - return False - - @staticmethod - def count(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) -> int: - import tensorflow as tf - - rows = [] - for data_col in data: - shapes = data_col.shape.as_list() - if data_col.shape == tf.TensorShape(None) or (not shapes) or (shapes[0] is None): - # Unknown shape array - raise ValueError("Data Validation Error: Unknown shape data is found.") - # Make mypy happy - assert isinstance(shapes[0], int) - - rows.append(shapes[0]) - - return min(rows) - - @staticmethod - def truncate( - data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]] - ) -> Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]: - return [ - data_col[ - : min( - _SeqOfTensorflowTensorHandler.count(data), _SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - ) - ] - for data_col in data - ] - - @staticmethod - def validate(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) -> None: - import tensorflow as tf - - for data_col in data: - if data_col.shape == tf.TensorShape(None) or any(dim is None for dim in data_col.shape.as_list()): - # Unknown shape array - raise ValueError("Data Validation Error: Unknown shape data is found.") - - if data_col.shape == tf.TensorShape([0]): - # Empty array - raise ValueError("Data Validation Error: Empty data is found.") - - if data_col.shape == tf.TensorShape([1]) or data_col.shape == tf.TensorShape([]): - # scalar - raise ValueError("Data Validation Error: Scalar data is found.") - - @staticmethod - def infer_signature( - data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]], role: Literal["input", "output"] - ) -> Sequence[BaseFeatureSpec]: - feature_prefix = f"{_SeqOfTensorflowTensorHandler.FEATURE_PREFIX}_" - features: List[BaseFeatureSpec] = [] - role_prefix = ( - _SeqOfTensorflowTensorHandler.INPUT_PREFIX - if role == "input" - else _SeqOfTensorflowTensorHandler.OUTPUT_PREFIX - ) + "_" - - for i, data_col in enumerate(data): - dtype = DataType.from_numpy_type(data_col.dtype.as_numpy_dtype) - ft_name = 
f"{role_prefix}{feature_prefix}{i}" - if len(data_col.shape) == 1: - features.append(FeatureSpec(dtype=dtype, name=ft_name)) - else: - ft_shape = tuple(data_col.shape[1:]) - features.append(FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape)) - return features - - @staticmethod - def convert_to_df( - data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]], ensure_serializable: bool = True - ) -> pd.DataFrame: - if ensure_serializable: - return pd.DataFrame({i: data_col.numpy().tolist() for i, data_col in enumerate(iterable=data)}) - return pd.DataFrame({i: list(data_col.numpy()) for i, data_col in enumerate(iterable=data)}) - - @staticmethod - def convert_from_df( - df: pd.DataFrame, features: Optional[Sequence[BaseFeatureSpec]] = None - ) -> Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]: - import tensorflow as tf - - res = [] - if features: - for feature in features: - if isinstance(feature, FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, FeatureSpec), "Invalid feature kind." - res.append( - tf.convert_to_tensor(np.stack(df[feature.name].to_numpy()).astype(feature._dtype._numpy_type)) - ) - return res - return [tf.convert_to_tensor(np.stack(df[col].to_numpy())) for col in df] - - -class _ListOfBuiltinHandler(_BaseDataHandler[model_types._SupportedBuiltinsList]): - @staticmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedBuiltinsList]: - return ( - isinstance(data, list) - and len(data) > 0 - and all(isinstance(data_col, (int, float, bool, str, bytes, list)) for data_col in data) - ) - - @staticmethod - def count(data: model_types._SupportedBuiltinsList) -> int: - return len(data) - - @staticmethod - def truncate(data: model_types._SupportedBuiltinsList) -> model_types._SupportedBuiltinsList: - return data[: min(_ListOfBuiltinHandler.count(data), _ListOfBuiltinHandler.SIG_INFER_ROWS_COUNT_LIMIT)] - - @staticmethod - def validate(data: model_types._SupportedBuiltinsList) -> None: - if not all(isinstance(data_row, type(data[0])) for data_row in data): - raise ValueError(f"Data Validation Error: Inconsistent type of object found in data {data}.") - df = pd.DataFrame(data) - if df.isnull().values.any(): - raise ValueError(f"Data Validation Error: Ill-shaped list data {data} confronted.") - - @staticmethod - def infer_signature( - data: model_types._SupportedBuiltinsList, role: Literal["input", "output"] - ) -> Sequence[BaseFeatureSpec]: - return _PandasDataFrameHandler.infer_signature(pd.DataFrame(data), role) - - @staticmethod - def convert_to_df( - data: model_types._SupportedBuiltinsList, - ensure_serializable: bool = True, - ) -> pd.DataFrame: - return pd.DataFrame(data) - - -class _SnowparkDataFrameHandler(_BaseDataHandler[snowflake.snowpark.DataFrame]): - @staticmethod - def can_handle(data: model_types.SupportedDataType) -> TypeGuard[snowflake.snowpark.DataFrame]: - return isinstance(data, snowflake.snowpark.DataFrame) - - @staticmethod - def count(data: snowflake.snowpark.DataFrame) -> int: - return data.count() - - @staticmethod - def truncate(data: snowflake.snowpark.DataFrame) -> snowflake.snowpark.DataFrame: - return cast(snowflake.snowpark.DataFrame, data.limit(_SnowparkDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT)) - - @staticmethod - def validate(data: snowflake.snowpark.DataFrame) -> None: - schema = data.schema - for field in schema.fields: - data_type = field.datatype - if isinstance(data_type, spt.ArrayType): - actual_data_type = 
data_type.element_type - else: - actual_data_type = data_type - if not any(type.is_same_snowpark_type(actual_data_type) for type in DataType): - raise ValueError( - f"Data Validation Error: Unsupported data type {field.datatype} in column {field.name}." - ) - - @staticmethod - def infer_signature( - data: snowflake.snowpark.DataFrame, role: Literal["input", "output"] - ) -> Sequence[BaseFeatureSpec]: - features: List[BaseFeatureSpec] = [] - schema = data.schema - for field in schema.fields: - name = identifier.get_unescaped_names(field.name) - if isinstance(field.datatype, spt.ArrayType): - raise NotImplementedError("Cannot infer model signature from Snowpark DataFrame with Array Type.") - else: - features.append(FeatureSpec(name=name, dtype=DataType.from_snowpark_type(field.datatype))) - return features - - @staticmethod - def convert_to_df( - data: snowflake.snowpark.DataFrame, - ensure_serializable: bool = True, - features: Optional[Sequence[BaseFeatureSpec]] = None, - ) -> pd.DataFrame: - # This method do things on top of to_pandas, to make sure the local dataframe got is in correct shape. - dtype_map = {} - if features: - for feature in features: - if isinstance(feature, FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, FeatureSpec), "Invalid feature kind." - dtype_map[feature.name] = feature.as_dtype() - df_local = data.to_pandas() - # This is because Array will become string (Even though the correct schema is set) - # and object will become variant type and requires an additional loads - # to get correct data otherwise it would be string. - for field in data.schema.fields: - if isinstance(field.datatype, spt.ArrayType): - df_local[identifier.get_unescaped_names(field.name)] = df_local[ - identifier.get_unescaped_names(field.name) - ].map(json.loads) - # Only when the feature is not from inference, we are confident to do the type casting. - # Otherwise, dtype_map will be empty - df_local = df_local.astype(dtype=dtype_map) - return df_local - - @staticmethod - def convert_from_df( - session: snowflake.snowpark.Session, df: pd.DataFrame, keep_order: bool = True - ) -> snowflake.snowpark.DataFrame: - # This method is necessary to create the Snowpark Dataframe in correct schema. - # Snowpark ignore the schema argument when providing a pandas DataFrame. - # However, in this case, if a cell of the original Dataframe is some array type, - # they will be inferred as VARIANT. - # To make sure Snowpark get the correct schema, we have to provide in a list of records. - # However, in this case, the order could not be preserved. Thus, a _ID column has to be added, - # if keep_order is True. - # Although in this case, the column with array type can get correct ARRAY type, however, the element - # type is not preserved, and will become string type. This affect the implementation of convert_from_df. - df = _PandasDataFrameHandler.convert_to_df(df) - df_cols = df.columns - if df_cols.dtype != np.object_: - raise ValueError("Cannot convert a Pandas DataFrame whose column index is not a string") - features = _PandasDataFrameHandler.infer_signature(df, role="input") - # Role will be no effect on the column index. That is to say, the feature name is the actual column name. - schema_list = [] - for feature in features: - if isinstance(feature, FeatureGroupSpec): - raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, FeatureSpec), "Invalid feature kind." 
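The long comment above is the design note that drives this method: Snowpark ignores an explicit schema when handed a pandas DataFrame, so the handler converts to a list of records plus an explicit StructType, adding an ordering column when keep_order is requested. A minimal usage sketch, assuming an existing Snowpark session and that the relocated snowpark_handler.SnowparkDataFrameHandler (per the new _ALL_DATA_HANDLERS registry) mirrors the removed implementation here:

    import pandas as pd
    import snowflake.snowpark
    from snowflake.ml.model._signatures import snowpark_handler

    def to_snowpark(session: snowflake.snowpark.Session, df: pd.DataFrame) -> snowflake.snowpark.DataFrame:
        # keep_order=True appends an ordering column so row order can be restored later;
        # the pandas DataFrame must have string column labels.
        return snowpark_handler.SnowparkDataFrameHandler.convert_from_df(session, df, keep_order=True)

    def to_local(sp_df: snowflake.snowpark.DataFrame) -> pd.DataFrame:
        # ARRAY columns come back from to_pandas() as JSON strings, which the handler
        # loads back into Python lists.
        return snowpark_handler.SnowparkDataFrameHandler.convert_to_df(sp_df)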
- schema_list.append( - spt.StructField( - identifier.get_inferred_name(feature.name), - feature.as_snowpark_type(), - nullable=df[feature.name].isnull().any(), - ) - ) +from snowflake.ml.model._signatures import ( + base_handler, + builtins_handler as builtins_handler, + core, + numpy_handler, + pandas_handler, + pytorch_handler, + snowpark_handler, + tensorflow_handler, + utils, +) - data = df.rename(columns=identifier.get_inferred_name).to_dict("records") - if keep_order: - for idx, data_item in enumerate(data): - data_item[infer_template._KEEP_ORDER_COL_NAME] = idx - schema_list.append(spt.StructField(infer_template._KEEP_ORDER_COL_NAME, spt.LongType(), nullable=False)) - sp_df = session.create_dataframe( - data, # To make sure the schema can be used, otherwise, array will become variant. - spt.StructType(schema_list), - ) - return sp_df +DataType = core.DataType +BaseFeatureSpec = core.BaseFeatureSpec +FeatureSpec = core.FeatureSpec +FeatureGroupSpec = core.FeatureGroupSpec +ModelSignature = core.ModelSignature -_LOCAL_DATA_HANDLERS: List[Type[_BaseDataHandler[Any]]] = [ - _PandasDataFrameHandler, - _NumpyArrayHandler, - _ListOfBuiltinHandler, - _SeqOfNumpyArrayHandler, - _SeqOfPyTorchTensorHandler, - _SeqOfTensorflowTensorHandler, +_LOCAL_DATA_HANDLERS: List[Type[base_handler.BaseDataHandler[Any]]] = [ + pandas_handler.PandasDataFrameHandler, + numpy_handler.NumpyArrayHandler, + builtins_handler.ListOfBuiltinHandler, + numpy_handler.SeqOfNumpyArrayHandler, + pytorch_handler.SeqOfPyTorchTensorHandler, + tensorflow_handler.SeqOfTensorflowTensorHandler, ] -_ALL_DATA_HANDLERS = _LOCAL_DATA_HANDLERS + [_SnowparkDataFrameHandler] +_ALL_DATA_HANDLERS = _LOCAL_DATA_HANDLERS + [snowpark_handler.SnowparkDataFrameHandler] def _truncate_data(data: model_types.SupportedDataType) -> model_types.SupportedDataType: @@ -1110,7 +63,7 @@ def _truncate_data(data: model_types.SupportedDataType) -> model_types.Supported def _infer_signature( data: model_types.SupportedLocalDataType, role: Literal["input", "output"] -) -> Sequence[BaseFeatureSpec]: +) -> Sequence[core.BaseFeatureSpec]: """Infer the inputs/outputs signature given a data that could be dataframe, numpy array or list. Dispatching is used to separate logic for different types. (Not using Python's singledispatch for unsupported feature of union dispatching in 3.8) @@ -1134,87 +87,7 @@ def _infer_signature( ) -def _convert_list_to_ndarray(data: List[Any]) -> npt.NDArray[Any]: - """Create a numpy array from list or nested list. Avoid ragged list and unaligned types. - - Args: - data: List or nested list. - - Raises: - ValueError: Raised when ragged nested list or list containing non-basic type confronted. - ValueError: Raised when ragged nested list or list containing non-basic type confronted. - - Returns: - The converted numpy array. - """ - warnings.filterwarnings("error", category=np.VisibleDeprecationWarning) - try: - arr = np.array(data) - except np.VisibleDeprecationWarning: - # In recent version of numpy, this warning should be raised when bad list provided. - raise ValueError( - f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." - ) - warnings.filterwarnings("default", category=np.VisibleDeprecationWarning) - if arr.dtype == object: - # If not raised, then a array of object would be created. - raise ValueError( - f"Unable to construct signature: Ragged nested or Unsupported list-like data {data} confronted." 
- ) - return arr - - -def _rename_features( - features: Sequence[BaseFeatureSpec], feature_names: Optional[List[str]] = None -) -> Sequence[BaseFeatureSpec]: - """It renames the feature in features provided optional feature names. - - Args: - features: A sequence of feature specifications and feature group specifications. - feature_names: A list of names to assign to features and feature groups. Defaults to None. - - Raises: - ValueError: Raised when provided feature_names does not match the data shape. - - Returns: - A sequence of feature specifications and feature group specifications being renamed if names provided. - """ - if feature_names: - if len(feature_names) == len(features): - for ft, ft_name in zip(features, feature_names): - ft._name = ft_name - else: - raise ValueError( - f"{len(feature_names)} feature names are provided, while there are {len(features)} features." - ) - return features - - -def _rename_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) -> pd.DataFrame: - """It renames pandas dataframe that has non-object column index with provided features. - - Args: - data: A pandas dataframe to be renamed. - features: A sequence of feature specifications and feature group specifications to rename the dataframe. - - Raises: - ValueError: Raised when the data does not have the same number of features as signature. - - Returns: - A pandas dataframe with columns renamed. - """ - df_cols = data.columns - if df_cols.dtype in [np.int64, np.uint64, np.float64]: - if len(features) != len(data.columns): - raise ValueError( - "Data does not have the same number of features as signature. " - + f"Signature requires {len(features)} features, but have {len(data.columns)} in input data." - ) - data.columns = pd.Index([feature.name for feature in features]) - return data - - -def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) -> None: +def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec]) -> None: """It validates pandas dataframe with provided features. Args: @@ -1241,14 +114,14 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") df_col_dtype = data_col.dtype - if isinstance(feature, FeatureGroupSpec): + if isinstance(feature, core.FeatureGroupSpec): raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, FeatureSpec), "Invalid feature kind." + assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." ft_type = feature._dtype ft_shape = feature._shape if df_col_dtype != np.dtype("O"): - if ft_type != DataType.from_numpy_type(df_col_dtype): + if ft_type != core.DataType.from_numpy_type(df_col_dtype): raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by all elements in {data_col}." @@ -1266,10 +139,11 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) + "Feature is a scalar feature while list data is provided." 
) - converted_data_list = [_convert_list_to_ndarray(data_row) for data_row in data_col] + converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data_col] if not all( - DataType.from_numpy_type(converted_data.dtype) == ft_type for converted_data in converted_data_list + core.DataType.from_numpy_type(converted_data.dtype) == ft_type + for converted_data in converted_data_list ): raise ValueError( f"Data Validation Error in feature {ft_name}: " @@ -1289,7 +163,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) + "Feature is a scalar feature while array data is provided." ) - if not all(DataType.from_numpy_type(data_row.dtype) == ft_type for data_row in data_col): + if not all(core.DataType.from_numpy_type(data_row.dtype) == ft_type for data_row in data_col): raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by all elements in {data_col}." @@ -1309,7 +183,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) f"Data Validation Error in feature {ft_name}: " + "Feature is a array type feature while scalar data is provided." ) - if ft_type != DataType.STRING: + if ft_type != core.DataType.STRING: raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by all elements in {data_col}." @@ -1320,14 +194,14 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) f"Data Validation Error in feature {ft_name}: " + "Feature is a array type feature while scalar data is provided." ) - if ft_type != DataType.BYTES: + if ft_type != core.DataType.BYTES: raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by all elements in {data_col}." ) -def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequence[BaseFeatureSpec]) -> None: +def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequence[core.BaseFeatureSpec]) -> None: """Validate Snowpark DataFrame as input Args: @@ -1353,9 +227,9 @@ def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequen + " inference might fail if there is null value.", category=RuntimeWarning, ) - if isinstance(feature, FeatureGroupSpec): + if isinstance(feature, core.FeatureGroupSpec): raise NotImplementedError("FeatureGroupSpec is not supported.") - assert isinstance(feature, FeatureSpec), "Invalid feature kind." + assert isinstance(feature, core.FeatureSpec), "Invalid feature kind." ft_type = feature._dtype field_data_type = field.datatype if isinstance(field_data_type, spt.ArrayType): @@ -1407,7 +281,7 @@ def _convert_local_data_to_df(data: model_types.SupportedLocalDataType) -> pd.Da def _convert_and_validate_local_data( - data: model_types.SupportedLocalDataType, features: Sequence[BaseFeatureSpec] + data: model_types.SupportedLocalDataType, features: Sequence[core.BaseFeatureSpec] ) -> pd.DataFrame: """Validate the data with features in model signature and convert to DataFrame @@ -1419,9 +293,9 @@ def _convert_and_validate_local_data( The converted dataframe with renamed column index. 
""" df = _convert_local_data_to_df(data) - df = _rename_pandas_df(df, features) + df = utils.rename_pandas_df(df, features) _validate_pandas_df(df, features) - df = _PandasDataFrameHandler.convert_to_df(df, ensure_serializable=True) + df = pandas_handler.PandasDataFrameHandler.convert_to_df(df, ensure_serializable=True) return df @@ -1431,7 +305,7 @@ def infer_signature( output_data: model_types.SupportedLocalDataType, input_feature_names: Optional[List[str]] = None, output_feature_names: Optional[List[str]] = None, -) -> ModelSignature: +) -> core.ModelSignature: """Infer model signature from given input and output sample data. Currently, we support infer the model signature from example input/output data in the following cases: @@ -1460,7 +334,7 @@ def infer_signature( A model signature. """ inputs = _infer_signature(input_data, role="input") - inputs = _rename_features(inputs, input_feature_names) + inputs = utils.rename_features(inputs, input_feature_names) outputs = _infer_signature(output_data, role="output") - outputs = _rename_features(outputs, output_feature_names) - return ModelSignature(inputs, outputs) + outputs = utils.rename_features(outputs, output_feature_names) + return core.ModelSignature(inputs, outputs) diff --git a/snowflake/ml/model/model_signature_test.py b/snowflake/ml/model/model_signature_test.py index c9562c18..7a9580b2 100644 --- a/snowflake/ml/model/model_signature_test.py +++ b/snowflake/ml/model/model_signature_test.py @@ -4,1708 +4,10 @@ import torch from absl.testing import absltest -import snowflake.snowpark.types as spt from snowflake.ml.model import model_signature -from snowflake.ml.utils import connection_params -from snowflake.snowpark import Session - - -class DataTypeTest(absltest.TestCase): - def test_numpy_type(self) -> None: - data = np.array([1, 2, 3, 4]) - self.assertEqual(model_signature.DataType.INT64, model_signature.DataType.from_numpy_type(data.dtype)) - - data = np.array(["a", "b", "c", "d"]) - self.assertEqual(model_signature.DataType.STRING, model_signature.DataType.from_numpy_type(data.dtype)) - - def test_snowpark_type(self) -> None: - self.assertEqual(model_signature.DataType.INT8, model_signature.DataType.from_snowpark_type(spt.ByteType())) - self.assertEqual(model_signature.DataType.INT16, model_signature.DataType.from_snowpark_type(spt.ShortType())) - self.assertEqual(model_signature.DataType.INT32, model_signature.DataType.from_snowpark_type(spt.IntegerType())) - self.assertEqual(model_signature.DataType.INT64, model_signature.DataType.from_snowpark_type(spt.LongType())) - - self.assertEqual( - model_signature.DataType.INT64, model_signature.DataType.from_snowpark_type(spt.DecimalType(38, 0)) - ) - - self.assertEqual(model_signature.DataType.FLOAT, model_signature.DataType.from_snowpark_type(spt.FloatType())) - self.assertEqual(model_signature.DataType.DOUBLE, model_signature.DataType.from_snowpark_type(spt.DoubleType())) - - with self.assertRaises(NotImplementedError): - model_signature.DataType.from_snowpark_type(spt.DecimalType(38, 6)) - - self.assertEqual(model_signature.DataType.BOOL, model_signature.DataType.from_snowpark_type(spt.BooleanType())) - self.assertEqual(model_signature.DataType.STRING, model_signature.DataType.from_snowpark_type(spt.StringType())) - self.assertEqual(model_signature.DataType.BYTES, model_signature.DataType.from_snowpark_type(spt.BinaryType())) - - self.assertTrue(model_signature.DataType.INT64.is_same_snowpark_type(spt.LongType())) - 
self.assertTrue(model_signature.DataType.INT32.is_same_snowpark_type(spt.IntegerType())) - self.assertTrue(model_signature.DataType.INT16.is_same_snowpark_type(spt.ShortType())) - self.assertTrue(model_signature.DataType.INT8.is_same_snowpark_type(spt.ByteType())) - self.assertTrue(model_signature.DataType.UINT64.is_same_snowpark_type(spt.LongType())) - self.assertTrue(model_signature.DataType.UINT32.is_same_snowpark_type(spt.IntegerType())) - self.assertTrue(model_signature.DataType.UINT16.is_same_snowpark_type(spt.ShortType())) - self.assertTrue(model_signature.DataType.UINT8.is_same_snowpark_type(spt.ByteType())) - - self.assertTrue(model_signature.DataType.FLOAT.is_same_snowpark_type(spt.FloatType())) - self.assertTrue(model_signature.DataType.DOUBLE.is_same_snowpark_type(spt.DoubleType())) - - self.assertTrue( - model_signature.DataType.INT64.is_same_snowpark_type(incoming_snowpark_type=spt.DecimalType(38, 0)) - ) - self.assertTrue( - model_signature.DataType.UINT64.is_same_snowpark_type(incoming_snowpark_type=spt.DecimalType(38, 0)) - ) - - -class FeatureSpecTest(absltest.TestCase): - def test_feature_spec(self) -> None: - ft = model_signature.FeatureSpec(name="feature", dtype=model_signature.DataType.INT64) - self.assertEqual(ft, eval(repr(ft), model_signature.__dict__)) - self.assertEqual(ft, model_signature.FeatureSpec.from_dict(ft.to_dict())) - self.assertEqual(ft.as_snowpark_type(), spt.LongType()) - - ft = model_signature.FeatureSpec(name="feature", dtype=model_signature.DataType.INT64, shape=(2,)) - self.assertEqual(ft, eval(repr(ft), model_signature.__dict__)) - self.assertEqual(ft, model_signature.FeatureSpec.from_dict(input_dict=ft.to_dict())) - self.assertEqual(ft.as_snowpark_type(), spt.ArrayType(spt.LongType())) - - -class FeatureGroupSpecTest(absltest.TestCase): - def test_feature_group_spec(self) -> None: - with self.assertRaisesRegex(ValueError, "No children feature specs."): - _ = model_signature.FeatureGroupSpec(name="features", specs=[]) - - with self.assertRaisesRegex(ValueError, "All children feature specs have to have name."): - ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) - ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.INT64) - ft2._name = None # type: ignore[assignment] - _ = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - - with self.assertRaisesRegex(ValueError, "All children feature specs have to have same type."): - ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) - ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.FLOAT) - _ = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - - with self.assertRaisesRegex(ValueError, "All children feature specs have to have same shape."): - ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) - ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.INT64, shape=(2,)) - fts = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - - ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) - ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.INT64) - fts = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - self.assertEqual(fts, eval(repr(fts), model_signature.__dict__)) - self.assertEqual(fts, model_signature.FeatureGroupSpec.from_dict(fts.to_dict())) - 
self.assertEqual(fts.as_snowpark_type(), spt.MapType(spt.StringType(), spt.LongType())) - - ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64, shape=(3,)) - ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.INT64, shape=(2,)) - fts = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - self.assertEqual(fts, eval(repr(fts), model_signature.__dict__)) - self.assertEqual(fts, model_signature.FeatureGroupSpec.from_dict(fts.to_dict())) - self.assertEqual(fts.as_snowpark_type(), spt.MapType(spt.StringType(), spt.ArrayType(spt.LongType()))) - - -class ModelSignatureTest(absltest.TestCase): - def test_1(self) -> None: - s = model_signature.ModelSignature( - inputs=[ - model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="c1"), - model_signature.FeatureGroupSpec( - name="cg1", - specs=[ - model_signature.FeatureSpec( - dtype=model_signature.DataType.FLOAT, - name="cc1", - ), - model_signature.FeatureSpec( - dtype=model_signature.DataType.FLOAT, - name="cc2", - ), - ], - ), - model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="c2", shape=(-1,)), - ], - outputs=[model_signature.FeatureSpec(name="output", dtype=model_signature.DataType.FLOAT)], - ) - target = { - "inputs": [ - {"type": "FLOAT", "name": "c1"}, - { - "feature_group": { - "name": "cg1", - "specs": [{"type": "FLOAT", "name": "cc1"}, {"type": "FLOAT", "name": "cc2"}], - } - }, - {"type": "FLOAT", "name": "c2", "shape": (-1,)}, - ], - "outputs": [{"type": "FLOAT", "name": "output"}], - } - self.assertDictEqual(s.to_dict(), target) - self.assertEqual(s, eval(repr(s), model_signature.__dict__)) - self.assertEqual(s, model_signature.ModelSignature.from_dict(s.to_dict())) - - def test_2(self) -> None: - s = model_signature.ModelSignature( - inputs=[ - model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="c1"), - model_signature.FeatureGroupSpec( - name="cg1", - specs=[ - model_signature.FeatureSpec( - dtype=model_signature.DataType.FLOAT, - name="cc1", - ), - model_signature.FeatureSpec( - dtype=model_signature.DataType.FLOAT, - name="cc2", - ), - ], - ), - model_signature.FeatureSpec(dtype=model_signature.DataType.FLOAT, name="c2", shape=(-1,)), - ], - outputs=[model_signature.FeatureSpec(name="output", dtype=model_signature.DataType.FLOAT)], - ) - self.assertEqual(s, eval(repr(s), model_signature.__dict__)) - self.assertEqual(s, model_signature.ModelSignature.from_dict(s.to_dict())) - - -class PandasDataFrameHandlerTest(absltest.TestCase): - def test_validate_pd_DataFrame(self) -> None: - df = pd.DataFrame([]) - with self.assertRaisesRegex(ValueError, "Empty data is found."): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) - with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): - model_signature._PandasDataFrameHandler.validate(df) - - sub_df = pd.DataFrame([2.5, 6.8]) - df = pd.DataFrame([[1, sub_df], [2, sub_df]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Unsupported type confronted in"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame( - [[1, 2.0, 1, 2.0, 1, 2.0], [2, 4.0, 2, 4.0, 2, 4.0]], - columns=pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), - ) - with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) - with 
self.assertRaisesRegex(ValueError, "Duplicate column index is found"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, "Hello"], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, 2], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, [2, [6]]], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, [2, 6]], [2, [2, [6]]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2, 6])]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): - model_signature._PandasDataFrameHandler.validate(df) - - df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, 6]], columns=["a", "b"]) - with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in column data"): - model_signature._PandasDataFrameHandler.validate(df) - - def test_trunc_pd_DataFrame(self) -> None: - df = pd.DataFrame([1] * (model_signature._PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1)) - - pd.testing.assert_frame_equal( - pd.DataFrame([1] * (model_signature._PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT)), - model_signature._PandasDataFrameHandler.truncate(df), - ) - - df = pd.DataFrame([1] * (model_signature._PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)) - - pd.testing.assert_frame_equal( - df, - model_signature._PandasDataFrameHandler.truncate(df), - ) - - def test_infer_signature_pd_DataFrame(self) -> None: - df = pd.DataFrame([1, 2, 3, 4]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], - ) - - df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [model_signature.FeatureSpec("a", model_signature.DataType.INT64)], - ) - - df = pd.DataFrame(["a", "b", "c", "d"], columns=["a"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [model_signature.FeatureSpec("a", model_signature.DataType.STRING)], - ) - - df = pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [model_signature.FeatureSpec("a", model_signature.DataType.BYTES)], - ) - - df = pd.DataFrame([[1, 2.0], [2, 4.0]]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.DOUBLE), - ], - ) - - df = 
pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2,)), - ], - ) - - df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5]]], columns=["a", "b"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(-1,)), - ], - ) - - df = pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2, 1)), - ], - ) - - a = np.array([2.5, 6.8]) - df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2,)), - ], - ) - - df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5])]], columns=["a", "b"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(-1,)), - ], - ) - - a = np.array([[2, 5], [6, 8]]) - df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.INT64, shape=(2, 2)), - ], - ) - - df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3])) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("2000Q1", model_signature.DataType.INT64), - model_signature.FeatureSpec("2002Q3", model_signature.DataType.DOUBLE), - ], - ) - - df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.date_range("2020-01-06", "2020-03-03", freq="MS")) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("2020-02-01 00:00:00", model_signature.DataType.INT64), - model_signature.FeatureSpec("2020-03-01 00:00:00", model_signature.DataType.DOUBLE), - ], - ) - - df = pd.DataFrame( - [[1, 2.0], [2, 4.0]], columns=pd.TimedeltaIndex(data=["1 days 02:00:00", "1 days 06:05:01.000030"]) - ) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("1 days 02:00:00", model_signature.DataType.INT64), - model_signature.FeatureSpec("1 days 06:05:01.000030", model_signature.DataType.DOUBLE), - ], - ) - - df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.interval_range(start=0, end=2)) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("(0, 1]", model_signature.DataType.INT64), - 
model_signature.FeatureSpec("(1, 2]", model_signature.DataType.DOUBLE), - ], - ) - - arrays = [[1, 2], ["red", "blue"]] - df = pd.DataFrame([[1, 2.0], [2, 4.0]], columns=pd.MultiIndex.from_arrays(arrays, names=("number", "color"))) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("(1, 'red')", model_signature.DataType.INT64), - model_signature.FeatureSpec("(2, 'blue')", model_signature.DataType.DOUBLE), - ], - ) - - df = pd.DataFrame([1, 2, 3, 4]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), - [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64)], - ) - - df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), - [model_signature.FeatureSpec("a", model_signature.DataType.INT64)], - ) - - df = pd.DataFrame(["a", "b", "c", "d"], columns=["a"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), - [model_signature.FeatureSpec("a", model_signature.DataType.STRING)], - ) - - df = pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), - [model_signature.FeatureSpec("a", model_signature.DataType.BYTES)], - ) - - df = pd.DataFrame([[1, 2.0], [2, 4.0]]) - self.assertListEqual( - model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.DOUBLE), - ], - ) - - def test_convert_to_df_pd_DataFrame(self) -> None: - a = np.array([[2, 5], [6, 8]]) - li = [[2, 5], [6, 8]] - df1 = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) - df2 = pd.DataFrame([[1, li], [2, li]], columns=["a", "b"]) - pd.testing.assert_frame_equal(model_signature._PandasDataFrameHandler.convert_to_df(df1), df2) - - -class NumpyArrayHandlerTest(absltest.TestCase): - def test_validate_np_ndarray(self) -> None: - arr = np.array([]) - with self.assertRaisesRegex(ValueError, "Empty data is found."): - model_signature._NumpyArrayHandler.validate(arr) - - arr = np.array(1) - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._NumpyArrayHandler.validate(arr) - - def test_trunc_np_ndarray(self) -> None: - arr = np.array([1] * (model_signature._NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1)) - - np.testing.assert_equal( - np.array([1] * (model_signature._NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)), - model_signature._NumpyArrayHandler.truncate(arr), - ) - - arr = np.array([1] * (model_signature._NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)) - - np.testing.assert_equal( - arr, - model_signature._NumpyArrayHandler.truncate(arr), - ) - - def test_infer_schema_np_ndarray(self) -> None: - arr = np.array([1, 2, 3, 4]) - self.assertListEqual( - model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], - ) - - arr = np.array([[1, 2], [3, 4]]) - self.assertListEqual( - model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), - ], - 
) - - arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) - self.assertListEqual( - model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64, shape=(2,)), - ], - ) - - arr = np.array([1, 2, 3, 4]) - self.assertListEqual( - model_signature._NumpyArrayHandler.infer_signature(arr, role="output"), - [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64)], - ) - - arr = np.array([[1, 2], [3, 4]]) - self.assertListEqual( - model_signature._NumpyArrayHandler.infer_signature(arr, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64), - ], - ) - - arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) - self.assertListEqual( - model_signature._NumpyArrayHandler.infer_signature(arr, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64, shape=(2,)), - ], - ) - - def test_convert_to_df_numpy_array(self) -> None: - arr1 = np.array([1, 2, 3, 4]) - pd.testing.assert_frame_equal( - model_signature._NumpyArrayHandler.convert_to_df(arr1), - pd.DataFrame([1, 2, 3, 4]), - ) - - arr2 = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) - pd.testing.assert_frame_equal( - model_signature._NumpyArrayHandler.convert_to_df(arr2), - pd.DataFrame([[1, 1], [2, 2], [3, 3], [4, 4]]), - ) - - arr3 = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) - pd.testing.assert_frame_equal( - model_signature._NumpyArrayHandler.convert_to_df(arr3), - pd.DataFrame(data={0: [np.array([1, 1]), np.array([3, 3])], 1: [np.array([2, 2]), np.array([4, 4])]}), - ) - - -class SeqOfNumpyArrayHandlerTest(absltest.TestCase): - def test_validate_list_of_numpy_array(self) -> None: - lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] - self.assertFalse(model_signature._SeqOfNumpyArrayHandler.can_handle(lt8)) - - def test_trunc_np_ndarray(self) -> None: - arrs = [np.array([1] * (model_signature._SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 - - for arr in model_signature._SeqOfNumpyArrayHandler.truncate(arrs): - np.testing.assert_equal( - np.array([1] * (model_signature._SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)), arr - ) - - arrs = [ - np.array([1]), - np.array([1] * (model_signature._SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), - ] - - for arr in model_signature._SeqOfNumpyArrayHandler.truncate(arrs): - np.testing.assert_equal(np.array([1]), arr) - - def test_infer_signature_list_of_numpy_array(self) -> None: - arr = np.array([1, 2, 3, 4]) - lt = [arr, arr] - self.assertListEqual( - model_signature._SeqOfNumpyArrayHandler.infer_signature(lt, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), - ], - ) - - arr = np.array([[1, 2], [3, 4]]) - lt = [arr, arr] - self.assertListEqual( - model_signature._SeqOfNumpyArrayHandler.infer_signature(lt, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64, shape=(2,)), - ], - ) - - arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) - lt = 
[arr, arr] - self.assertListEqual( - model_signature._SeqOfNumpyArrayHandler.infer_signature(lt, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64, shape=(2, 2)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64, shape=(2, 2)), - ], - ) - - def test_convert_to_df_list_of_numpy_array(self) -> None: - arr1 = np.array([1, 2, 3, 4]) - lt = [arr1, arr1] - pd.testing.assert_frame_equal( - model_signature._SeqOfNumpyArrayHandler.convert_to_df(lt), - pd.DataFrame([[1, 1], [2, 2], [3, 3], [4, 4]]), - check_names=False, - ) - - arr2 = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) - lt = [arr1, arr2] - pd.testing.assert_frame_equal( - model_signature._SeqOfNumpyArrayHandler.convert_to_df(lt), - pd.DataFrame([[1, [1, 1]], [2, [2, 2]], [3, [3, 3]], [4, [4, 4]]]), - ) - - arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) - lt = [arr, arr] - pd.testing.assert_frame_equal( - model_signature._SeqOfNumpyArrayHandler.convert_to_df(lt), - pd.DataFrame( - data={ - 0: [[[1, 1], [2, 2]], [[3, 3], [4, 4]]], - 1: [[[1, 1], [2, 2]], [[3, 3], [4, 4]]], - } - ), - ) - - -class ListOfBuiltinsHandlerTest(absltest.TestCase): - def test_validate_list_builtins(self) -> None: - lt6 = ["Hello", [2, 3]] - with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in data"): - model_signature._ListOfBuiltinHandler.validate(lt6) # type:ignore[arg-type] - - lt7 = [[1], [2, 3]] - with self.assertRaisesRegex(ValueError, "Ill-shaped list data"): - model_signature._ListOfBuiltinHandler.validate(lt7) - - lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] - self.assertFalse(model_signature._ListOfBuiltinHandler.can_handle(lt8)) - - def test_infer_signature_list_builtins(self) -> None: - lt1 = [1, 2, 3, 4] - self.assertListEqual( - model_signature._ListOfBuiltinHandler.infer_signature(lt1, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], - ) - - lt2 = ["a", "b", "c", "d"] - self.assertListEqual( - model_signature._ListOfBuiltinHandler.infer_signature(lt2, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.STRING)], - ) - - lt3 = [ele.encode() for ele in lt2] - self.assertListEqual( - model_signature._ListOfBuiltinHandler.infer_signature(lt3, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.BYTES)], - ) - - lt4 = [[1, 2], [3, 4]] - self.assertListEqual( - model_signature._ListOfBuiltinHandler.infer_signature(lt4, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), - ], - ) - - lt5 = [[1, 2.0], [3, 4]] # This is not encouraged and will have type error, but we support it. 
- self.assertListEqual( - model_signature._ListOfBuiltinHandler.infer_signature(lt5, role="input"), # type:ignore[arg-type] - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.DOUBLE), - ], - ) - - lt6 = [[[1, 1], [2, 2]], [[3, 3], [4, 4]]] - self.assertListEqual( - model_signature._ListOfBuiltinHandler.infer_signature(lt6, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64, shape=(2,)), - ], - ) - - -class SeqOfPyTorchTensorHandlerTest(absltest.TestCase): - def test_validate_list_of_pytorch_tensor(self) -> None: - lt1 = [np.array([1, 4]), np.array([2, 3])] - self.assertFalse(model_signature._SeqOfPyTorchTensorHandler.can_handle(lt1)) - - lt2 = [np.array([1, 4]), torch.Tensor([2, 3])] - self.assertFalse(model_signature._SeqOfPyTorchTensorHandler.can_handle(lt2)) - - lt3 = [torch.Tensor([1, 4]), torch.Tensor([2, 3])] - self.assertTrue(model_signature._SeqOfPyTorchTensorHandler.can_handle(lt3)) - - def test_validate_torch_tensor(self) -> None: - t = [torch.Tensor([])] - with self.assertRaisesRegex(ValueError, "Empty data is found."): - model_signature._SeqOfPyTorchTensorHandler.validate(t) - - t = [torch.Tensor(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfPyTorchTensorHandler.validate(t) - - t = [torch.Tensor([1, 2]), torch.Tensor(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfPyTorchTensorHandler.validate(t) - - def test_trunc_torch_tensor(self) -> None: - t = [torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] - - for ts in model_signature._SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] - torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts - ) - - t = [torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1))] - - for ts in model_signature._SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] - torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), ts - ) - - t = [torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 - - for ts in model_signature._SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] - torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts - ) - - t = [ - torch.Tensor([1]), - torch.Tensor([1] * (model_signature._SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), - ] - - for ts in model_signature._SeqOfPyTorchTensorHandler.truncate(t): - torch.testing.assert_close( # type:ignore[attr-defined] - torch.Tensor([1]), ts - ) - - def test_infer_schema_torch_tensor(self) -> None: - t1 = [torch.IntTensor([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t1, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32)], - ) - - t2 = [torch.LongTensor([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t2, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], - ) - - 
t3 = [torch.ShortTensor([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t3, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT16)], - ) - - t4 = [torch.CharTensor([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t4, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT8)], - ) - - t5 = [torch.ByteTensor([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t5, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT8)], - ) - - t6 = [torch.BoolTensor([False, True])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t6, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.BOOL)], - ) - - t7 = [torch.FloatTensor([1.2, 3.4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t7, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.FLOAT)], - ) - - t8 = [torch.DoubleTensor([1.2, 3.4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t8, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.DOUBLE)], - ) - - t9 = [torch.LongTensor([[1, 2], [3, 4]])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t9, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2,)), - ], - ) - - t10 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t10, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2, 2))], - ) - - t11 = [torch.LongTensor([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t11, role="output"), - [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64)], - ) - - t12 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t12, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64), - ], - ) - - t13 = [torch.FloatTensor([1.2, 2.4]), torch.LongTensor([3, 4])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t13, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.FLOAT), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64), - ], - ) - - t14 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t14, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64, shape=(2,)), - ], - ) - - t15 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] - self.assertListEqual( - model_signature._SeqOfPyTorchTensorHandler.infer_signature(t15, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64, 
shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.DOUBLE, shape=(2,)), - ], - ) - - def test_convert_to_df_torch_tensor(self) -> None: - t1 = [torch.LongTensor([1, 2, 3, 4])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t1), - pd.DataFrame([1, 2, 3, 4]), - ) - - t2 = [torch.DoubleTensor([1, 2, 3, 4])] - t2[0].requires_grad = True - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t2), - pd.DataFrame([1, 2, 3, 4], dtype=np.double), - ) - - t3 = [torch.LongTensor([[1, 1], [2, 2], [3, 3], [4, 4]])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t3), - pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2]), np.array([3, 3]), np.array([4, 4])]}), - ) - - t4 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t4), - pd.DataFrame(data={0: [np.array([[1, 1], [2, 2]]), np.array([[3, 3], [4, 4]])]}), - ) - - t5 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t5), - pd.DataFrame([[1, 3], [2, 4]]), - ) - - t6 = [torch.DoubleTensor([1.2, 2.4]), torch.LongTensor([3, 4])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t6), - pd.DataFrame([[1.2, 3], [2.4, 4]]), - ) - - t7 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t7), - pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([3, 3]), np.array([4, 4])]}), - ) - - t8 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] - pd.testing.assert_frame_equal( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t8), - pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([1.5, 6.8]), np.array([2.9, 9.2])]}), - ) - - def test_convert_from_df_torch_tensor(self) -> None: - t1 = [torch.LongTensor([1, 2, 3, 4])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t1) - ) - ): - torch.testing.assert_close(t, t1[idx]) # type:ignore[attr-defined] - - t2 = [torch.DoubleTensor([1, 2, 3, 4])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t2) - ) - ): - torch.testing.assert_close(t, t2[idx]) # type:ignore[attr-defined] - - t3 = [torch.LongTensor([[1, 1], [2, 2], [3, 3], [4, 4]])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t3) - ) - ): - torch.testing.assert_close(t, t3[idx]) # type:ignore[attr-defined] - - t4 = [torch.LongTensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t4) - ) - ): - torch.testing.assert_close(t, t4[idx]) # type:ignore[attr-defined] - - t5 = [torch.LongTensor([1, 2]), torch.LongTensor([3, 4])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t5) - ) - ): - torch.testing.assert_close(t, t5[idx]) # type:ignore[attr-defined] - - t6 = 
[torch.DoubleTensor([1.2, 2.4]), torch.LongTensor([3, 4])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t6) - ) - ): - torch.testing.assert_close(t, t6[idx]) # type:ignore[attr-defined] - - t7 = [torch.LongTensor([[1, 1], [2, 2]]), torch.LongTensor([[3, 3], [4, 4]])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t7) - ) - ): - torch.testing.assert_close(t, t7[idx]) # type:ignore[attr-defined] - - t8 = [torch.LongTensor([[1, 1], [2, 2]]), torch.DoubleTensor([[1.5, 6.8], [2.9, 9.2]])] - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t8) - ) - ): - torch.testing.assert_close(t, t8[idx]) # type:ignore[attr-defined] - - t9 = [torch.IntTensor([1, 2, 3, 4])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t9, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t9), fts), - fts, - ) - ): - torch.testing.assert_close(t, t9[idx]) # type:ignore[attr-defined] - - t10 = [torch.tensor([1.2, 3.4])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t10, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t10), fts), - fts, - ) - ): - torch.testing.assert_close(t, t10[idx]) # type:ignore[attr-defined] - - t11 = [torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4]])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t11, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t11), fts), - fts, - ) - ): - torch.testing.assert_close(t, t11[idx]) # type:ignore[attr-defined] - - t12 = [torch.tensor([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t12, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t12), fts), - fts, - ) - ): - torch.testing.assert_close(t, t12[idx]) # type:ignore[attr-defined] - - t13 = [torch.tensor([1, 2]), torch.tensor([3, 4])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t13, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t13), fts), - fts, - ) - ): - torch.testing.assert_close(t, t13[idx]) # type:ignore[attr-defined] - - t14 = [torch.tensor([1.2, 2.4]), torch.tensor([3, 4])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t14, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t14), fts), - fts, - ) - ): - torch.testing.assert_close(t, t14[idx]) # type:ignore[attr-defined] - - t15 = [torch.tensor([[1, 1], [2, 2]]), torch.tensor([[3, 3], [4, 4]])] - fts = 
model_signature._SeqOfPyTorchTensorHandler.infer_signature(t15, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t15), fts), - fts, - ) - ): - torch.testing.assert_close(t, t15[idx]) # type:ignore[attr-defined] - - t16 = [torch.tensor([[1, 1], [2, 2]]), torch.tensor([[1.5, 6.8], [2.9, 9.2]])] - fts = model_signature._SeqOfPyTorchTensorHandler.infer_signature(t16, role="input") - for idx, t in enumerate( - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfPyTorchTensorHandler.convert_to_df(t16), fts), - fts, - ) - ): - torch.testing.assert_close(t, t16[idx]) # type:ignore[attr-defined] - - -class SeqOfTensorflowTensorHandlerTest(absltest.TestCase): - def test_validate_list_of_tf_tensor(self) -> None: - lt1 = [np.array([1, 4]), np.array([2, 3])] - self.assertFalse(model_signature._SeqOfTensorflowTensorHandler.can_handle(lt1)) - - lt2 = [np.array([1, 4]), tf.constant([2, 3])] - self.assertFalse(model_signature._SeqOfTensorflowTensorHandler.can_handle(lt2)) - - lt3 = [tf.constant([1, 4]), tf.constant([2, 3])] - self.assertTrue(model_signature._SeqOfTensorflowTensorHandler.can_handle(lt3)) - - lt4 = [tf.constant([1, 4]), tf.Variable([2, 3])] - self.assertTrue(model_signature._SeqOfTensorflowTensorHandler.can_handle(lt4)) - - lt5 = [tf.Variable([1, 4]), tf.Variable([2, 3])] - self.assertTrue(model_signature._SeqOfTensorflowTensorHandler.can_handle(lt5)) - - def test_validate_tf_tensor(self) -> None: - t = [tf.constant([])] - with self.assertRaisesRegex(ValueError, "Empty data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable([1, 2], shape=tf.TensorShape(None))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable([[1, 2]], shape=tf.TensorShape([None, 2]))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable([[1, 2]], shape=tf.TensorShape([1, None]))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.constant(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.constant([1])] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable([1])] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.constant([1, 2]), tf.constant(1)] - with self.assertRaisesRegex(ValueError, "Scalar data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - def test_count_tf_tensor(self) -> None: - t = [tf.constant([1, 2])] - self.assertEqual(model_signature._SeqOfTensorflowTensorHandler.count(t), 2) - - t = [tf.constant([[1, 2]])] - self.assertEqual(model_signature._SeqOfTensorflowTensorHandler.count(t), 1) - - t = [tf.Variable([1, 2])] - self.assertEqual(model_signature._SeqOfTensorflowTensorHandler.count(t), 2) - - t = 
[tf.Variable([1, 2], shape=tf.TensorShape(None))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable([[1, 2]], shape=tf.TensorShape([None, 2]))] - with self.assertRaisesRegex(ValueError, "Unknown shape data is found."): - model_signature._SeqOfTensorflowTensorHandler.validate(t) - - t = [tf.Variable([[1, 2]], shape=tf.TensorShape([1, None]))] - self.assertEqual(model_signature._SeqOfTensorflowTensorHandler.count(t), 1) - - def test_trunc_tf_tensor(self) -> None: - t = [tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] - - for ts in model_signature._SeqOfTensorflowTensorHandler.truncate(t): - tf.assert_equal( - tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts - ) - - t = [tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1))] - - for ts in model_signature._SeqOfTensorflowTensorHandler.truncate(t): - tf.assert_equal( - tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), ts - ) - - t = [tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 - - for ts in model_signature._SeqOfTensorflowTensorHandler.truncate(t): - tf.assert_equal( - tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)), ts - ) - - t = [ - tf.constant([1]), - tf.constant([1] * (model_signature._SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), - ] - - for ts in model_signature._SeqOfTensorflowTensorHandler.truncate(t): - tf.assert_equal(tf.constant([1]), ts) - - def test_infer_schema_tf_tensor(self) -> None: - t1 = [tf.constant([1, 2, 3, 4], dtype=tf.int32)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t1, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32)], - ) - - t2 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t2, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], - ) - - t3 = [tf.constant([1, 2, 3, 4], dtype=tf.int16)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t3, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT16)], - ) - - t4 = [tf.constant([1, 2, 3, 4], dtype=tf.int8)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t4, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT8)], - ) - - t5 = [tf.constant([1, 2, 3, 4], dtype=tf.uint32)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t5, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT32)], - ) - - t6 = [tf.constant([1, 2, 3, 4], dtype=tf.uint64)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t6, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT64)], - ) - - t7 = [tf.constant([1, 2, 3, 4], dtype=tf.uint16)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t7, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT16)], - ) - - t8 = [tf.constant([1, 
2, 3, 4], dtype=tf.uint8)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t8, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT8)], - ) - - t9 = [tf.constant([False, True])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t9, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.BOOL)], - ) - - t10 = [tf.constant([1.2, 3.4], dtype=tf.float32)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t10, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.FLOAT)], - ) - - t11 = [tf.constant([1.2, 3.4], dtype=tf.float64)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t11, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.DOUBLE)], - ) - - t12 = [tf.constant([[1, 2], [3, 4]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t12, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32, shape=(2,)), - ], - ) - - t13 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t13, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32, shape=(2, 2))], - ) - - t14 = [tf.constant([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t14, role="output"), - [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32)], - ) - - t15 = [tf.constant([1, 2]), tf.constant([3, 4])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t15, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT32), - ], - ) - - t16 = [tf.constant([1.2, 2.4]), tf.constant([3, 4])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t16, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.FLOAT), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT32), - ], - ) - - t17 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[3, 3], [4, 4]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t17, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32, shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT32, shape=(2,)), - ], - ) - - t18 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[1.5, 6.8], [2.9, 9.2]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t18, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32, shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.FLOAT, shape=(2,)), - ], - ) - - t21 = [tf.constant([1, 2, 3, 4], dtype=tf.int32)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t21, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32)], - ) - - t22 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] - self.assertListEqual( - 
model_signature._SeqOfTensorflowTensorHandler.infer_signature(t22, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], - ) - - t23 = [tf.constant([1, 2, 3, 4], dtype=tf.int16)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t23, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT16)], - ) - - t24 = [tf.constant([1, 2, 3, 4], dtype=tf.int8)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t24, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT8)], - ) - - t25 = [tf.constant([1, 2, 3, 4], dtype=tf.uint32)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t25, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT32)], - ) - - t26 = [tf.constant([1, 2, 3, 4], dtype=tf.uint64)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t26, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT64)], - ) - - t27 = [tf.constant([1, 2, 3, 4], dtype=tf.uint16)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t27, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT16)], - ) - - t28 = [tf.constant([1, 2, 3, 4], dtype=tf.uint8)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t28, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.UINT8)], - ) - - t29 = [tf.constant([False, True])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t29, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.BOOL)], - ) - - t30 = [tf.constant([1.2, 3.4], dtype=tf.float32)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t30, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.FLOAT)], - ) - - t31 = [tf.constant([1.2, 3.4], dtype=tf.float64)] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t31, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.DOUBLE)], - ) - - t32 = [tf.constant([[1, 2], [3, 4]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t32, role="input"), - [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32, shape=(2,)), - ], - ) - - t33 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t33, role="input"), - [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT32, shape=(2, 2))], - ) - - t34 = [tf.constant([1, 2, 3, 4])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t34, role="output"), - [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32)], - ) - - t35 = [tf.constant([1, 2]), tf.constant([3, 4])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t35, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT32), - ], - ) - - t36 = [tf.constant([1.2, 2.4]), 
tf.constant([3, 4])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t36, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.FLOAT), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT32), - ], - ) - - t37 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[3, 3], [4, 4]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t37, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32, shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT32, shape=(2,)), - ], - ) - - t38 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[1.5, 6.8], [2.9, 9.2]])] - self.assertListEqual( - model_signature._SeqOfTensorflowTensorHandler.infer_signature(t38, role="output"), - [ - model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT32, shape=(2,)), - model_signature.FeatureSpec("output_feature_1", model_signature.DataType.FLOAT, shape=(2,)), - ], - ) - - def test_convert_to_df_tf_tensor(self) -> None: - t1 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t1), - pd.DataFrame([1, 2, 3, 4]), - ) - - t2 = [tf.Variable([1, 2, 3, 4], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t2), - pd.DataFrame([1, 2, 3, 4]), - ) - - t3 = [tf.constant([[1, 1], [2, 2], [3, 3], [4, 4]], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t3), - pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2]), np.array([3, 3]), np.array([4, 4])]}), - ) - - t4 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t4), - pd.DataFrame(data={0: [np.array([[1, 1], [2, 2]]), np.array([[3, 3], [4, 4]])]}), - ) - - t5 = [tf.constant([1, 2], dtype=tf.int64), tf.constant([3, 4], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t5), - pd.DataFrame([[1, 3], [2, 4]]), - ) - - t6 = [tf.constant([1.2, 2.4], dtype=tf.float64), tf.constant([3, 4], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t6), - pd.DataFrame([[1.2, 3], [2.4, 4]]), - ) - - t7 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[3, 3], [4, 4]], dtype=tf.int64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t7), - pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([3, 3]), np.array([4, 4])]}), - ) - - t8 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[1.5, 6.8], [2.9, 9.2]], dtype=tf.float64)] - pd.testing.assert_frame_equal( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t8), - pd.DataFrame({0: [np.array([1, 1]), np.array([2, 2])], 1: [np.array([1.5, 6.8]), np.array([2.9, 9.2])]}), - ) - - def test_convert_from_df_tf_tensor(self) -> None: - t1 = [tf.constant([1, 2, 3, 4], dtype=tf.int64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t1) - ) - ): - tf.assert_equal(t, t1[idx]) - - t2 = [tf.Variable([1, 2, 3, 4], dtype=tf.int64)] - for idx, t in enumerate( - 
model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t2) - ) - ): - tf.assert_equal(t, t2[idx]) - - t3 = [tf.constant([[1, 1], [2, 2], [3, 3], [4, 4]], dtype=tf.int64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t3) - ) - ): - tf.assert_equal(t, t3[idx]) - - t4 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]], dtype=tf.int64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t4) - ) - ): - tf.assert_equal(t, t4[idx]) - - t5 = [tf.constant([1, 2], dtype=tf.int64), tf.constant([3, 4], dtype=tf.int64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t5) - ) - ): - tf.assert_equal(t, t5[idx]) - - t6 = [tf.constant([1.2, 2.4], dtype=tf.float64), tf.constant([3, 4], dtype=tf.int64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t6) - ) - ): - tf.assert_equal(t, t6[idx]) - - t7 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[3, 3], [4, 4]], dtype=tf.int64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t7) - ) - ): - tf.assert_equal(t, t7[idx]) - - t8 = [tf.constant([[1, 1], [2, 2]], dtype=tf.int64), tf.constant([[1.5, 6.8], [2.9, 9.2]], dtype=tf.float64)] - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t8) - ) - ): - tf.assert_equal(t, t8[idx]) - - t9 = [tf.constant([1, 2, 3, 4])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t9, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df(model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t9), fts), - fts, - ) - ): - tf.assert_equal(t, t9[idx]) - - t10 = [tf.constant([1.2, 3.4])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t10, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t10), fts - ), - fts, - ) - ): - tf.assert_equal(t, t10[idx]) - - t11 = [tf.constant([[1, 1], [2, 2], [3, 3], [4, 4]])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t11, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t11), fts - ), - fts, - ) - ): - tf.assert_equal(t, t11[idx]) - - t12 = [tf.constant([[[1, 1], [2, 2]], [[3, 3], [4, 4]]])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t12, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t12), fts - ), - fts, - ) - ): - tf.assert_equal(t, t12[idx]) - - t13 = [tf.constant([1, 2]), tf.constant([3, 4])] - fts = 
model_signature._SeqOfTensorflowTensorHandler.infer_signature(t13, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t13), fts - ), - fts, - ) - ): - tf.assert_equal(t, t13[idx]) - - t14 = [tf.constant([1.2, 2.4]), tf.constant([3, 4])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t14, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t14), fts - ), - fts, - ) - ): - tf.assert_equal(t, t14[idx]) - - t15 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[3, 3], [4, 4]])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t15, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t15), fts - ), - fts, - ) - ): - tf.assert_equal(t, t15[idx]) - - t16 = [tf.constant([[1, 1], [2, 2]]), tf.constant([[1.5, 6.8], [2.9, 9.2]])] - fts = model_signature._SeqOfTensorflowTensorHandler.infer_signature(t16, role="input") - for idx, t in enumerate( - model_signature._SeqOfTensorflowTensorHandler.convert_from_df( - model_signature._rename_pandas_df( - model_signature._SeqOfTensorflowTensorHandler.convert_to_df(t16), fts - ), - fts, - ) - ): - tf.assert_equal(t, t16[idx]) - - -class SnowParkDataFrameHandlerTest(absltest.TestCase): - @classmethod - def setUpClass(cls) -> None: - cls._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() - - @classmethod - def tearDownClass(cls) -> None: - cls._session.close() - - def test_validate_snowpark_df(self) -> None: - schema = spt.StructType([spt.StructField('"a"', spt.VariantType()), spt.StructField('"b"', spt.StringType())]) - df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - with self.assertRaisesRegex(ValueError, "Unsupported data type"): - model_signature._SnowparkDataFrameHandler.validate(df) - - def test_infer_schema_snowpark_df(self) -> None: - schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) - df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - self.assertListEqual( - model_signature._SnowparkDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.STRING), - ], - ) - - schema = spt.StructType([spt.StructField('"""a"""', spt.LongType()), spt.StructField('"b"', spt.StringType())]) - df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - self.assertListEqual( - model_signature._SnowparkDataFrameHandler.infer_signature(df, role="input"), - [ - model_signature.FeatureSpec('"a"', model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.STRING), - ], - ) - - schema = spt.StructType([spt.StructField('"""a"""', spt.ArrayType(spt.LongType()))]) - df = self._session.create_dataframe([[[1, 3]]], schema) - with self.assertRaises(NotImplementedError): - model_signature._SnowparkDataFrameHandler.infer_signature(df, role="input"), - - def test_validate_data_with_features(self) -> None: - fts = [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - 
model_signature.FeatureSpec("b", model_signature.DataType.INT64), - ] - df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) - with self.assertWarnsRegex(RuntimeWarning, "Nullable column [^\\s]* provided"): - model_signature._validate_snowpark_data(df, fts) - - fts = [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64), - model_signature.FeatureSpec("b", model_signature.DataType.STRING), - ] - schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) - df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - model_signature._validate_snowpark_data(df, fts) - - schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.IntegerType())]) - df = self._session.create_dataframe([[1, 3], [3, 9]], schema) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): - model_signature._validate_snowpark_data(df, fts) - - schema = spt.StructType([spt.StructField('"a1"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) - df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): - model_signature._validate_snowpark_data(df, fts) - - df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) - with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): - model_signature._validate_snowpark_data(df, fts) - - fts = [ - model_signature.FeatureSpec("a", model_signature.DataType.INT64, shape=(-1,)), - ] - schema = spt.StructType([spt.StructField('"a"', spt.ArrayType(spt.LongType()))]) - df = self._session.create_dataframe([[[1, 3]]], schema) - with self.assertWarns(RuntimeWarning): - model_signature._validate_snowpark_data(df, fts) - - def test_convert_to_and_from_df(self) -> None: - pd_df = pd.DataFrame([1, 2, 3, 4], columns=["col_0"]) - sp_df = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) - pd.testing.assert_frame_equal( - pd_df, model_signature._SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False - ) - - pd_df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_0", "col_1"]) - sp_df = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) - pd.testing.assert_frame_equal( - pd_df, model_signature._SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False - ) - - pd_df = pd.DataFrame([[1.2, 2.4], [3, 4]], columns=["col_0", "col_1"]) - sp_df = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) - pd.testing.assert_frame_equal( - pd_df, model_signature._SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False - ) - - pd_df = pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]) - sp_df = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) - pd.testing.assert_frame_equal( - pd_df, model_signature._SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False - ) - - a = np.array([2.5, 6.8]) - pd_df = pd.DataFrame([[1, a], [2, a]], columns=["a", "b"]) - sp_df = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, pd_df, keep_order=False) - pd.testing.assert_frame_equal( - pd_df, model_signature._SnowparkDataFrameHandler.convert_to_df(sp_df), check_dtype=False - ) class ModelSignatureMiscTest(absltest.TestCase): - def test_rename_features(self) -> None: - 
model_signature._rename_features([]) - - fts = [model_signature.FeatureSpec("a", model_signature.DataType.INT64)] - self.assertListEqual( - model_signature._rename_features(fts, ["b"]), - [model_signature.FeatureSpec("b", model_signature.DataType.INT64)], - ) - - fts = [model_signature.FeatureSpec("a", model_signature.DataType.INT64, shape=(2,))] - self.assertListEqual( - model_signature._rename_features(fts, ["b"]), - [model_signature.FeatureSpec("b", model_signature.DataType.INT64, shape=(2,))], - ) - - fts = [model_signature.FeatureSpec("a", model_signature.DataType.INT64, shape=(2,))] - model_signature._rename_features(fts) - - with self.assertRaises(ValueError): - fts = [model_signature.FeatureSpec("a", model_signature.DataType.INT64, shape=(2,))] - model_signature._rename_features(fts, ["b", "c"]) - def test_infer_signature(self) -> None: df = pd.DataFrame([1, 2, 3, 4]) self.assertListEqual( @@ -1982,23 +284,6 @@ def test_validate_pandas_df(self) -> None: with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while array data is provided."): model_signature._validate_pandas_df(pd.DataFrame(data={"a": [np.array([1, 2])]}), fts) - def test_rename_pandas_df(self) -> None: - fts = [ - model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), - ] - - df = pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]) - - pd.testing.assert_frame_equal(df, model_signature._rename_pandas_df(df, fts)) - - df = pd.DataFrame([[2, 5], [6, 8]]) - - pd.testing.assert_frame_equal(df, model_signature._rename_pandas_df(df, fts), check_names=False) - pd.testing.assert_index_equal( - pd.Index(["input_feature_0", "input_feature_1"]), model_signature._rename_pandas_df(df, fts).columns - ) - def test_validate_data_with_features(self) -> None: fts = [ model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index 3375d61f..c40fc6e9 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -5,6 +5,7 @@ from typing_extensions import NotRequired, TypeAlias if TYPE_CHECKING: + import mlflow import numpy as np import pandas as pd import sklearn.base @@ -48,7 +49,7 @@ CustomModelType = TypeVar("CustomModelType", bound="snowflake.ml.model.custom_model.CustomModel") -SupportedLocalModelType = Union[ +SupportedRequireSignatureModelType = Union[ "snowflake.ml.model.custom_model.CustomModel", "sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline", @@ -56,13 +57,14 @@ "xgboost.Booster", "torch.nn.Module", "torch.jit.ScriptModule", # type:ignore[name-defined] + "tensorflow.Module", ] -SupportedSnowMLModelType: TypeAlias = "base.BaseEstimator" +SupportedNoSignatureRequirementsModelType: TypeAlias = Union["base.BaseEstimator", "mlflow.pyfunc.PyFuncModel"] SupportedModelType = Union[ - SupportedLocalModelType, - SupportedSnowMLModelType, + SupportedRequireSignatureModelType, + SupportedNoSignatureRequirementsModelType, ] """This is defined as the type that Snowflake native model packaging could accept. Here is all acceptable types of Snowflake native model packaging and its handler file in _handlers/ folder. 
@@ -76,7 +78,8 @@ | xgboost.Booster | xgboost.py | _XGBModelHandler | | snowflake.ml.framework.base.BaseEstimator | snowmlmodel.py | _SnowMLModelHandler | | torch.nn.Module | pytroch.py | _PyTorchHandler | -| torch.jit.ScriptModule | torchscript.py | _TorchScripthHandler | +| torch.jit.ScriptModule | torchscript.py | _TorchScriptHandler | +| tensorflow.Module | tensorflow.py | _TensorFlowHandler | """ @@ -86,15 +89,12 @@ class DeployOptions(TypedDict): """Common Options for deploying to Snowflake. - disable_local_conda_resolver: Set to disable use local conda resolver to do pre-check on environment and rely on - the information schema only. Defaults to False. keep_order: Whether or not preserve the row order when predicting. Only available for dataframe has fewer than 2**64 rows. Defaults to True. output_with_input_features: Whether or not preserve the input columns in the output when predicting. Defaults to False. """ - disable_local_conda_resolver: NotRequired[bool] keep_order: NotRequired[bool] output_with_input_features: NotRequired[bool] @@ -115,7 +115,37 @@ class WarehouseDeployOptions(DeployOptions): replace_udf: NotRequired[bool] -class ModelSaveOption(TypedDict): +class SnowparkContainerServiceDeployOptions(DeployOptions): + """Deployment options for deploying to SnowService. + When type hint is updated, please ensure the concrete class is updated accordingly at: + //snowflake/ml/model/_deploy_client/snowservice/_deploy_options + + compute_pool[REQUIRED]: SnowService compute pool name. Please refer to official doc for how to create a + compute pool: https://docs.snowflake.com/LIMITEDACCESS/snowpark-containers/reference/compute-pool + image_repo: SnowService image repo path. e.g. "///". Default to auto + inferred based on session information. + min_instances: Minimum number of service replicas. Default to 1. + max_instances: Maximum number of service replicas. Default to 1. + endpoint: The specific name of the endpoint that the service function will communicate with. This option is + useful when the service has multiple endpoints. Default to “predict”. + prebuilt_snowflake_image: When provided, the image-building step is skipped, and the pre-built image from + Snowflake is used as is. This option is for users who consistently use the same image for multiple use + cases, allowing faster deployment. The snowflake image used for deployment is logged to the console for + future use. Default to None. + use_gpu: When set to True, a CUDA-enabled Docker image will be used to provide a runtime CUDA environment. + Default to False. + """ + + compute_pool: str + image_repo: NotRequired[str] + min_instances: NotRequired[int] + max_instances: NotRequired[int] + endpoint: NotRequired[str] + prebuilt_snowflake_image: NotRequired[str] + use_gpu: NotRequired[bool] + + +class BaseModelSaveOption(TypedDict): """Options for saving the model. embed_local_ml_library: Embedding local SnowML into the code directory of the folder. @@ -127,25 +157,48 @@ class ModelSaveOption(TypedDict): allow_overwritten_stage_file: NotRequired[bool] -class CustomModelSaveOption(ModelSaveOption): +class CustomModelSaveOption(BaseModelSaveOption): ... 
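The SnowparkContainerServiceDeployOptions TypedDict introduced above is the user-facing surface for SnowService deployments. A minimal sketch of how such an options mapping might be populated, assuming only the keys and defaults stated in its docstring; the underscore-prefixed class name, the compute pool name, and the instance count below are illustrative placeholders, not values defined by this patch:

# Illustrative sketch only: restates the documented option keys and their defaults.
from typing import TypedDict

from typing_extensions import NotRequired


class _SnowparkContainerServiceDeployOptionsSketch(TypedDict):
    compute_pool: str  # required: SnowService compute pool name
    image_repo: NotRequired[str]  # auto-inferred from session information when omitted
    min_instances: NotRequired[int]  # defaults to 1
    max_instances: NotRequired[int]  # defaults to 1
    endpoint: NotRequired[str]  # defaults to "predict"
    prebuilt_snowflake_image: NotRequired[str]  # skips the image-building step when set
    use_gpu: NotRequired[bool]  # defaults to False


sketch_options: _SnowparkContainerServiceDeployOptionsSketch = {
    "compute_pool": "MY_COMPUTE_POOL",  # placeholder name
    "max_instances": 2,  # placeholder value
    "use_gpu": False,
}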
-class SKLModelSaveOptions(ModelSaveOption): +class SKLModelSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] -class XGBModelSaveOptions(ModelSaveOption): +class XGBModelSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] -class SNOWModelSaveOptions(ModelSaveOption): +class SNOWModelSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] -class PyTorchSaveOptions(ModelSaveOption): +class PyTorchSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] -class TorchScriptSaveOptions(ModelSaveOption): +class TorchScriptSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] + + +class TensorflowSaveOptions(BaseModelSaveOption): + target_methods: NotRequired[Sequence[str]] + + +class MLFlowSaveOptions(BaseModelSaveOption): + model_uri: NotRequired[str] + ignore_mlflow_metadata: NotRequired[bool] + ignore_mlflow_dependencies: NotRequired[bool] + + +ModelSaveOption = Union[ + BaseModelSaveOption, + CustomModelSaveOption, + SKLModelSaveOptions, + XGBModelSaveOptions, + SNOWModelSaveOptions, + PyTorchSaveOptions, + TorchScriptSaveOptions, + TensorflowSaveOptions, + MLFlowSaveOptions, +] diff --git a/snowflake/ml/modeling/framework/BUILD.bazel b/snowflake/ml/modeling/framework/BUILD.bazel index 8fcbd293..39bf575b 100644 --- a/snowflake/ml/modeling/framework/BUILD.bazel +++ b/snowflake/ml/modeling/framework/BUILD.bazel @@ -11,6 +11,9 @@ py_library( deps = [ "//snowflake/ml:version", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:error_messages", + "//snowflake/ml/_internal/exceptions:exceptions", + "//snowflake/ml/_internal/exceptions:modeling_error_messages", "//snowflake/ml/_internal/utils:parallelize", ], ) diff --git a/snowflake/ml/modeling/framework/_utils.py b/snowflake/ml/modeling/framework/_utils.py index 01bfdbc6..8e7776e2 100644 --- a/snowflake/ml/modeling/framework/_utils.py +++ b/snowflake/ml/modeling/framework/_utils.py @@ -13,7 +13,14 @@ from packaging import version from snowflake import snowpark -from snowflake.snowpark import exceptions, functions as F +from snowflake.ml._internal.exceptions import ( + error_codes, + error_messages, + exceptions, + exceptions as snowml_exceptions, + modeling_error_messages, +) +from snowflake.snowpark import exceptions as snowpark_exceptions, functions as F from snowflake.snowpark._internal import utils DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" @@ -103,8 +110,8 @@ def get_filtered_valid_sklearn_args( Sklearn keyword arguments. Raises: - TypeError: If the input args contains an invalid key. - ImportError: If the scikit-learn package version does not meet the requirements + SnowflakeMLException: If the input args contains an invalid key. + SnowflakeMLException: If the scikit-learn package version does not meet the requirements for the keyword arguments. """ # get args to be passed to sklearn @@ -142,8 +149,10 @@ def get_filtered_valid_sklearn_args( continue # unknown keyword else: - msg = f"Unexpected keyword: {key}." 
- raise TypeError(msg) + raise snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError(error_messages.UNEXPECTED_KEYWORD.format(key)), + ) # validate sklearn version sklearn_version = sklearn.__version__ @@ -155,11 +164,14 @@ def get_filtered_valid_sklearn_args( and (version.parse(sklearn_version) < version.parse(sklearn_added_keyword_to_version_dict[key])) ): required_version = sklearn_added_keyword_to_version_dict[key] - msg = ( - f"scikit-learn version mismatch: parameter '{key}' requires scikit-learn>={required_version}, " - f"but got an incompatible version: {sklearn_version}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.DEPENDENCY_VERSION_ERROR, + original_exception=ImportError( + modeling_error_messages.INCOMPATIBLE_NEW_SKLEARN_PARAM.format( + key, required_version, sklearn_version + ) + ), ) - raise ImportError(msg) # added keyword argument value if ( @@ -169,11 +181,14 @@ def get_filtered_valid_sklearn_args( and (version.parse(sklearn_version) < version.parse(sklearn_added_kwarg_value_to_version_dict[key][val])) ): required_version = sklearn_added_kwarg_value_to_version_dict[key][val] - msg = ( - f"scikit-learn version mismatch: parameter '{key}={val}' requires " - f"scikit-learn>={required_version}, but got an incompatible version: {sklearn_version}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.DEPENDENCY_VERSION_ERROR, + original_exception=ImportError( + modeling_error_messages.INCOMPATIBLE_NEW_SKLEARN_PARAM.format( + f"{key}={val}", required_version, sklearn_version + ) + ), ) - raise ImportError(msg) # deprecated sklearn keyword if ( @@ -182,7 +197,7 @@ def get_filtered_valid_sklearn_args( and (version.parse(sklearn_version) >= version.parse(sklearn_deprecated_keyword_to_version_dict[key])) ): deprecated_version = sklearn_deprecated_keyword_to_version_dict[key] - msg = f"Parameter '{key}' deprecated since scikit-learn version {deprecated_version}." + msg = f"Incompatible scikit-learn version: '{key}' deprecated since scikit-learn={deprecated_version}.." warnings.warn(msg, DeprecationWarning) # removed sklearn keyword @@ -192,8 +207,12 @@ def get_filtered_valid_sklearn_args( and (version.parse(sklearn_version) >= version.parse(sklearn_removed_keyword_to_version_dict[key])) ): removed_version = sklearn_removed_keyword_to_version_dict[key] - msg = f"Parameter '{key}' removed since scikit-learn version {removed_version}." 
- raise ImportError(msg) + raise exceptions.SnowflakeMLException( + error_code=error_codes.DEPENDENCY_VERSION_ERROR, + original_exception=ImportError( + modeling_error_messages.REMOVED_SKLEARN_PARAM.format(key, removed_version, sklearn_version) + ), + ) return sklearn_args @@ -224,5 +243,5 @@ def table_exists(session: snowpark.Session, table_name: str, statement_params: D try: session.table(table_name).limit(0).collect(statement_params=statement_params) return True - except exceptions.SnowparkSQLException: + except snowpark_exceptions.SnowparkSQLException: return False diff --git a/snowflake/ml/modeling/framework/base.py b/snowflake/ml/modeling/framework/base.py index e37036fb..c54f1adf 100644 --- a/snowflake/ml/modeling/framework/base.py +++ b/snowflake/ml/modeling/framework/base.py @@ -14,6 +14,11 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import ( + error_codes, + exceptions, + modeling_error_messages, +) from snowflake.ml._internal.utils import parallelize from snowflake.ml.modeling.framework import _utils from snowflake.snowpark import functions as F @@ -29,12 +34,10 @@ def _process_cols(cols: Optional[Union[str, Iterable[str]]]) -> List[str]: return col_list elif type(cols) is list: col_list = cols - elif type(cols) in [range, set, tuple]: - col_list = list(cols) elif type(cols) is str: col_list = [cols] else: - raise TypeError(f"Could not convert {cols} to list") + col_list = list(cols) return col_list @@ -132,24 +135,55 @@ def _check_input_cols(self) -> None: Check if `self.input_cols` is set. Raises: - RuntimeError: If `self.input_cols` is not set. + SnowflakeMLException: `self.input_cols` is not set. """ if not self.input_cols: - raise RuntimeError("input_cols is not set.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=RuntimeError(modeling_error_messages.ATTRIBUTE_NOT_SET.format("input_cols")), + ) def _check_output_cols(self) -> None: """ Check if `self.output_cols` is set. Raises: - RuntimeError: If `self.output_cols` is not set or if the size of `self.output_cols` - does not match that of `self.input_cols`. + SnowflakeMLException: `self.output_cols` is not set. + SnowflakeMLException: `self.output_cols` and `self.input_cols` are of different lengths. """ if not self.output_cols: - raise RuntimeError("output_cols is not set.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=RuntimeError(modeling_error_messages.ATTRIBUTE_NOT_SET.format("output_cols")), + ) if len(self.output_cols) != len(self.input_cols): - raise RuntimeError( - f"Size mismatch: input_cols: {len(self.input_cols)}, output_cols: {len(self.output_cols)}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError( + modeling_error_messages.SIZE_MISMATCH.format( + "input_cols", len(self.input_cols), "output_cols", len(self.output_cols) + ) + ), + ) + + @staticmethod + def _check_dataset_type(dataset: Any) -> None: + """ + Check if the type of input dataset is supported. + + Args: + dataset: Input dataset passed to an API. + + Raises: + SnowflakeMLException: `self.input_cols` is not set. + """ + if not (isinstance(dataset, snowpark.DataFrame) or isinstance(dataset, pd.DataFrame)): + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=TypeError( + f"Unexpected dataset type: {type(dataset)}." 
+ f"Supported dataset types: {type(snowpark.DataFrame)}, {type(pd.DataFrame)}." + ), ) @classmethod @@ -169,12 +203,15 @@ def _get_param_names(cls) -> List[str]: parameters = [p for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD] for p in parameters: if p.kind == p.VAR_POSITIONAL: - raise RuntimeError( - "Transformers should always specify" - " their parameters in the signature" - " of their __init__ (no varargs)." - " %s with constructor %s doesn't " - " follow this convention." % (cls, init_signature) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_PYTHON_ERROR, + original_exception=RuntimeError( + "Models should always specify" + " their parameters in the signature" + " of their __init__ (no varargs)." + f" {cls} with constructor {init_signature} doesn't " + " follow this convention." + ), ) # Extract and sort argument names excluding 'self' return sorted(p.name for p in parameters) @@ -212,7 +249,7 @@ def set_params(self, **params: Dict[str, Any]) -> None: **params: Transformer parameter names mapped to their values. Raises: - ValueError: For invalid parameter keys. + SnowflakeMLException: Invalid parameter keys. """ if not params: # simple optimization to gain speed (inspect is slow) @@ -224,8 +261,13 @@ def set_params(self, **params: Dict[str, Any]) -> None: key, delim, sub_key = key.partition("__") if key not in valid_params: local_valid_params = self._get_param_names() - raise ValueError( - f"Invalid parameter {key} for transformer {self}. Valid parameters are: {local_valid_params}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + modeling_error_messages.INVALID_MODEL_PARAM.format( + key, self.__class__.__name__, local_valid_params + ) + ), ) if delim: @@ -359,9 +401,12 @@ def _use_input_cols_only(self, dataset: pd.DataFrame) -> pd.DataFrame: input_cols = set(self.input_cols) dataset_cols = set(dataset.columns.to_list()) if not input_cols.issubset(dataset_cols): - raise KeyError( - "The `input_cols` contains columns that do not match any of the columns in " - f"the dataframe: {input_cols - dataset_cols}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=KeyError( + "input_cols contains columns that do not match any of the columns in " + f"the pandas dataframe: {input_cols - dataset_cols}." + ), ) return dataset[self.input_cols] @@ -450,9 +495,14 @@ def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[snowpark.DataFrame, pd.DataFrame]: raise NotImplementedError() - def enforce_fit(self) -> None: + def _enforce_fit(self) -> None: if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=RuntimeError( + f"Model {self.__class__.__name__} not fitted before calling predict/transform." + ), + ) def set_drop_input_cols(self, drop_input_cols: Optional[bool] = False) -> None: self._drop_input_cols = drop_input_cols @@ -465,10 +515,20 @@ def to_sklearn(self) -> Any: # to_xgboost would be only used in XGB estimators # to_lightgbm would be only used in LightGBM estimators, but they function the same def to_xgboost(self) -> Any: - raise AttributeError("Object doesn't support to_xgboost. 
Please use to_sklearn()") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=AttributeError( + modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format("to_xgboost", "to_sklearn") + ), + ) def to_lightgbm(self) -> Any: - raise AttributeError("Object doesn't support to_lightgbm. Please use to_sklearn()") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=AttributeError( + modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format("to_lightgbm", "to_sklearn") + ), + ) def _reset(self) -> None: self._sklearn_object = None @@ -520,17 +580,21 @@ def _transform_sklearn(self, dataset: pd.DataFrame) -> pd.DataFrame: Transformed dataset Raises: - TypeError: If the supplied output columns don't match that of the transformed array. + SnowflakeMLException: If the supplied output columns don't match that of the transformed array. """ - self.enforce_fit() + self._enforce_fit() dataset = dataset.copy() sklearn_transform = self.to_sklearn() transformed_data = sklearn_transform.transform(dataset[self.input_cols]) shape = transformed_data.shape if (len(shape) == 1 and len(self.output_cols) != 1) or (len(shape) > 1 and shape[1] != len(self.output_cols)): - raise TypeError( - f"output_cols must be same length as transformed array. Got output_cols len: {len(self.output_cols)}," - f" transformed array shape: {shape}" + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError( + modeling_error_messages.SIZE_MISMATCH.format( + "output_cols", len(self.output_cols), "transformed array shape", shape + ) + ), ) if len(shape) == 1: @@ -546,13 +610,13 @@ def _validate_data_has_no_nulls(self, dataset: snowpark.DataFrame) -> None: dataset: DataFrame to validate. Raises: - ValueError: If the dataset contains nulls in the input_cols. + SnowflakeMLException: If the dataset contains nulls in the input_cols. """ self._check_input_cols() null_count_columns = [] for input_col in self.input_cols: - col = F.count(F.lit("*")) - F.count(dataset[input_col]) # type:ignore[arg-type, operator] + col = F.count(F.lit("*")) - F.count(dataset[input_col]) null_count_columns.append(col) null_counts = dataset.agg(*null_count_columns).collect( @@ -563,9 +627,12 @@ def _validate_data_has_no_nulls(self, dataset: snowpark.DataFrame) -> None: invalid_columns = {col: n for (col, n) in zip(self.input_cols, null_counts[0].as_dict().values()) if n > 0} if any(invalid_columns): - raise ValueError( - "Dataset may not contain nulls, but " - f"the following columns have a non-zero number of nulls: {invalid_columns}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA, + original_exception=ValueError( + "Dataset may not contain nulls." + f"The following columns have non-zero numbers of nulls: {invalid_columns}." + ), ) def _drop_input_columns( @@ -581,21 +648,19 @@ def _drop_input_columns( Return a dataset with input columns dropped. Raises: - TypeError: If the dataset is neither DataFrame or Pandas DataFrame. - RuntimeError: drop_input_cols flag must be true before calling this function. + SnowflakeMLException: drop_input_cols flag must be true before calling this function. 
""" if not self._drop_input_cols: - raise RuntimeError("drop_input_cols must set true before calling.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_PYTHON_ERROR, + original_exception=RuntimeError("drop_input_cols must set true."), + ) # In case of input_cols and output_cols are the same, compare with get_output_cols() input_subset = list(set(self.input_cols) - set(self.get_output_cols())) + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): return dataset.drop(input_subset) - elif isinstance(dataset, pd.DataFrame): - return dataset.drop(columns=input_subset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + return dataset.drop(columns=input_subset) diff --git a/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl b/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl index e9caa22f..8f62df2b 100644 --- a/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl +++ b/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl @@ -20,6 +20,7 @@ def get_build_rules_for_native_impl(): deps = [ ":init", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/modeling/framework", ], ) diff --git a/snowflake/ml/modeling/impute/simple_imputer.py b/snowflake/ml/modeling/impute/simple_imputer.py index 6f5c6af0..5ce07e9e 100644 --- a/snowflake/ml/modeling/impute/simple_imputer.py +++ b/snowflake/ml/modeling/impute/simple_imputer.py @@ -12,6 +12,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils @@ -124,17 +125,26 @@ def __init__( `snowflake.ml.impute.MissingIndicator` to be implemented. Raises: - ValueError: If strategy is invalid, or if fill value is specified for strategy that isn't "constant". + SnowflakeMLException: If strategy is invalid, or if fill value is specified for strategy that isn't + "constant". """ super().__init__(drop_input_cols=drop_input_cols) if strategy in STRATEGY_TO_STATE_DICT: self.strategy = strategy else: - raise ValueError(f"Invalid strategy {strategy}. Strategy must be one of {STRATEGY_TO_STATE_DICT.keys()}") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError( + f"Invalid strategy {strategy}. Strategy must be one of {STRATEGY_TO_STATE_DICT.keys()}" + ), + ) # Check that the strategy is "constant" if `fill_value` is specified. if fill_value is not None and strategy != "constant": - raise ValueError("fill_value may only be specified if the strategy is 'constant'.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=ValueError("fill_value may only be specified if the strategy is 'constant'."), + ) self.fill_value = fill_value self.missing_values = missing_values @@ -171,7 +181,8 @@ def _get_dataset_input_col_datatypes(self, dataset: snowpark.DataFrame) -> Dict[ The datatype of the input columns. Raises: - TypeError: If the input columns are not all the same datatype category or if the datatype is not supported. + SnowflakeMLException: If the input columns are not all the same datatype category or if the datatype is not + supported. 
""" def check_type_consistency(col_types: Dict[str, T.DataType]) -> None: @@ -180,9 +191,12 @@ def check_type_consistency(col_types: Dict[str, T.DataType]) -> None: if is_numeric_type is None: is_numeric_type = True if col_type in _NUMERIC_TYPES else False if (col_type in _NUMERIC_TYPES) ^ is_numeric_type: - raise TypeError( - f"Inconsistent input column types. Column {col_name} type {col_type} does not match previous" - " type category." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA_TYPE, + original_exception=TypeError( + f"Inconsistent input column types. Column {col_name} type {col_type} does not match" + " previous type category." + ), ) input_col_datatypes = {} @@ -192,7 +206,12 @@ def check_type_consistency(col_types: Dict[str, T.DataType]) -> None: isinstance(field.datatype, potential_type) for potential_type in SNOWFLAKE_DATATYPE_TO_NUMPY_DTYPE_MAP.keys() ): - raise TypeError(f"Input column type {field.datatype} is not supported by the simple imputer.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_DATA_TYPE, + original_exception=TypeError( + f"Input column type {field.datatype} is not supported by the SimpleImputer." + ), + ) input_col_datatypes[field.name] = field.datatype if self.strategy != "most_frequent": check_type_consistency(input_col_datatypes) @@ -249,7 +268,7 @@ def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer": dataset_copy = copy.copy(dataset) if not pd.isna(self.missing_values): # Replace `self.missing_values` with null to avoid including it when computing states. - dataset_copy = dataset_copy.na.replace(self.missing_values, None) # type: ignore[arg-type] + dataset_copy = dataset_copy.na.replace(self.missing_values, None) _computed_states = self._compute(dataset_copy, self.input_cols, states=[state]) for input_col in self.input_cols: statistic = _computed_states[input_col][state] @@ -287,25 +306,16 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - RuntimeError: If transformer is not fitted first. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -358,7 +368,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame transformed_dataset = transformed_dataset.na.fill({output_col: statistic}) else: transformed_dataset = transformed_dataset.na.replace( - self.missing_values, # type: ignore[arg-type] + self.missing_values, fill_value, subset=[output_col], ) diff --git a/snowflake/ml/modeling/metrics/classification.py b/snowflake/ml/modeling/metrics/classification.py index 66db9604..e21dae7f 100644 --- a/snowflake/ml/modeling/metrics/classification.py +++ b/snowflake/ml/modeling/metrics/classification.py @@ -59,11 +59,11 @@ def accuracy_score( if isinstance(y_true_col_names, str) else (y_true_col_names[0], y_pred_col_names[0]) ) - score_column = F.iff(df[y_true] == df[y_pred], 1, 0) # type: ignore[arg-type] + score_column = F.iff(df[y_true] == df[y_pred], 1, 0) # multilabel else: expr = " and ".join([f"({y_true_col_names[i]} = {y_pred_col_names[i]})" for i in range(len(y_true_col_names))]) - score_column = F.iff(expr, 1, 0) # type: ignore[arg-type] + score_column = F.iff(expr, 1, 0) return metrics_utils.weighted_sum( df=df, sample_score_column=score_column, @@ -143,7 +143,7 @@ def confusion_matrix( rand = snowpark_utils.generate_random_alphanumeric() if sample_weight_col_name is None: sample_weight_col_name = f'"_SAMPLE_WEIGHT_{rand}"' - df = df.with_column(sample_weight_col_name, F.lit(1)) # type: ignore[arg-type] + df = df.with_column(sample_weight_col_name, F.lit(1)) if normalize not in ["true", "pred", "all", None]: raise ValueError("normalize must be one of {'true', 'pred', 'all', None}") @@ -175,13 +175,11 @@ def confusion_matrix( # Compute the confusion matrix. 
temp_df1 = ind_df.select( - F.array_construct(sample_weight_col_name, y_true_index_col, y_pred_index_col).alias( # type: ignore[arg-type] - "ARR_COL" - ) + F.array_construct(sample_weight_col_name, y_true_index_col, y_pred_index_col).alias("ARR_COL") + ) + temp_df2 = temp_df1.select(confusion_matrix_computer_udtf(F.col("ARR_COL"), F.lit(n_labels))).with_column_renamed( + "RESULT", "RES" ) - temp_df2 = temp_df1.select( - confusion_matrix_computer_udtf(F.col("ARR_COL"), F.lit(n_labels)) # type: ignore[arg-type] - ).with_column_renamed("RESULT", "RES") res_df = temp_df2.select(accumulator_udtf(F.col("RES")).over(partition_by="PART"), F.col("PART")) results = res_df.collect(statement_params=statement_params) @@ -514,7 +512,7 @@ def log_loss( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -528,7 +526,9 @@ def log_loss( statement_params=statement_params, ) def log_loss_sproc(session: snowpark.Session) -> float: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -652,7 +652,7 @@ def precision_recall_fscore_support( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -666,7 +666,9 @@ def precision_recall_fscore_support( statement_params=statement_params, ) def precision_recall_fscore_support_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None diff --git a/snowflake/ml/modeling/metrics/correlation.py b/snowflake/ml/modeling/metrics/correlation.py index 808501f3..6d1a83f0 100644 --- a/snowflake/ml/modeling/metrics/correlation.py +++ b/snowflake/ml/modeling/metrics/correlation.py @@ -62,9 +62,9 @@ def correlation(*, df: snowpark.DataFrame, columns: Optional[Collection[str]] = accumulator_udtf = F.table_function(accumulator) # Compute the confusion matrix. 
- temp_df1 = input_df.select(F.array_construct(*input_df.columns).alias("ARR_COL")) # type: ignore[arg-type] + temp_df1 = input_df.select(F.array_construct(*input_df.columns).alias("ARR_COL")) temp_df2 = temp_df1.select( - sharded_dot_and_sum_computer_udtf(F.col("ARR_COL"), F.lit(count), F.lit(0)) # type: ignore[arg-type] + sharded_dot_and_sum_computer_udtf(F.col("ARR_COL"), F.lit(count), F.lit(0)) ).with_column_renamed("RESULT", "RES") res_df = temp_df2.select(accumulator_udtf(F.col("RES")).over(partition_by="PART"), F.col("PART")) results = res_df.collect(statement_params=statement_params) diff --git a/snowflake/ml/modeling/metrics/covariance.py b/snowflake/ml/modeling/metrics/covariance.py index 2569f8e4..f48ac7c9 100644 --- a/snowflake/ml/modeling/metrics/covariance.py +++ b/snowflake/ml/modeling/metrics/covariance.py @@ -64,9 +64,9 @@ def covariance(*, df: DataFrame, columns: Optional[Collection[str]] = None, ddof accumulator_udtf = F.table_function(accumulator) # Compute the confusion matrix. - temp_df1 = input_df.select(F.array_construct(*input_df.columns).alias("ARR_COL")) # type: ignore[arg-type] + temp_df1 = input_df.select(F.array_construct(*input_df.columns).alias("ARR_COL")) temp_df2 = temp_df1.select( - sharded_dot_and_sum_computer_udtf(F.col("ARR_COL"), F.lit(count), F.lit(ddof)) # type: ignore[arg-type] + sharded_dot_and_sum_computer_udtf(F.col("ARR_COL"), F.lit(count), F.lit(ddof)) ).with_column_renamed("RESULT", "RES") res_df = temp_df2.select(accumulator_udtf(F.col("RES")).over(partition_by="PART"), F.col("PART")) results = res_df.collect(statement_params=statement_params) diff --git a/snowflake/ml/modeling/metrics/metrics_utils.py b/snowflake/ml/modeling/metrics/metrics_utils.py index 270e8d84..76d5dfa6 100644 --- a/snowflake/ml/modeling/metrics/metrics_utils.py +++ b/snowflake/ml/modeling/metrics/metrics_utils.py @@ -267,15 +267,13 @@ def weighted_sum( """ if normalize: if sample_weight_column is not None: - res = F.sum(sample_score_column * sample_weight_column) / F.sum( # type: ignore[arg-type, operator] - sample_weight_column # type: ignore[arg-type] - ) + res = F.sum(sample_score_column * sample_weight_column) / F.sum(sample_weight_column) else: - res = F.avg(sample_score_column) # type: ignore[arg-type] + res = F.avg(sample_score_column) elif sample_weight_column is not None: - res = F.sum(sample_score_column * sample_weight_column) # type: ignore[arg-type, operator] + res = F.sum(sample_score_column * sample_weight_column) else: - res = F.sum(sample_score_column) # type: ignore[arg-type] + res = F.sum(sample_score_column) return float(df.select(res).collect(statement_params=statement_params)[0][0]) @@ -306,7 +304,5 @@ def unique_labels( # append an index column dense ranking labels assert union_df is not None - res: snowpark.DataFrame = union_df.with_column( - INDEX, F.dense_rank().over(snowpark.Window.order_by(LABEL)) - 1 # type: ignore[arg-type, operator] - ) + res: snowpark.DataFrame = union_df.with_column(INDEX, F.dense_rank().over(snowpark.Window.order_by(LABEL)) - 1) return res diff --git a/snowflake/ml/modeling/metrics/ranking.py b/snowflake/ml/modeling/metrics/ranking.py index 85988698..e3c7fb2f 100644 --- a/snowflake/ml/modeling/metrics/ranking.py +++ b/snowflake/ml/modeling/metrics/ranking.py @@ -78,7 +78,7 @@ def precision_recall_curve( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_name, probas_pred_col_name, 
sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -92,7 +92,9 @@ def precision_recall_curve( statement_params=statement_params, ) def precision_recall_curve_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_name] probas_pred = df[probas_pred_col_name] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -209,7 +211,7 @@ class scores must correspond to the order of ``labels``, sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_score_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -223,7 +225,9 @@ class scores must correspond to the order of ``labels``, statement_params=statement_params, ) def roc_auc_score_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_score = df[y_score_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -295,7 +299,7 @@ def roc_curve( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_name, y_score_col_name, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -309,7 +313,9 @@ def roc_curve( statement_params=statement_params, ) def roc_curve_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_name] y_score = df[y_score_col_name] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None diff --git a/snowflake/ml/modeling/metrics/regression.py b/snowflake/ml/modeling/metrics/regression.py index 81ea0582..8a45757a 100644 --- a/snowflake/ml/modeling/metrics/regression.py +++ b/snowflake/ml/modeling/metrics/regression.py @@ -67,7 +67,7 @@ def d2_absolute_error_score( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -81,7 +81,9 @@ def d2_absolute_error_score( statement_params=statement_params, ) def d2_absolute_error_score_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = 
session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -149,7 +151,7 @@ def d2_pinball_score( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -163,7 +165,9 @@ def d2_pinball_score( statement_params=statement_params, ) def d2_pinball_score_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -248,7 +252,7 @@ def explained_variance_score( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -262,7 +266,9 @@ def explained_variance_score( statement_params=statement_params, ) def explained_variance_score_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -326,7 +332,7 @@ def mean_absolute_error( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -340,7 +346,9 @@ def mean_absolute_error( statement_params=statement_params, ) def mean_absolute_error_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -412,7 +420,7 @@ def mean_absolute_percentage_error( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -426,7 +434,9 @@ def 
mean_absolute_percentage_error( statement_params=statement_params, ) def mean_absolute_percentage_error_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -487,7 +497,7 @@ def mean_squared_error( sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) - query = df[cols].queries["queries"][-1] + queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] session=session, @@ -501,7 +511,9 @@ def mean_squared_error( statement_params=statement_params, ) def mean_squared_error_sproc(session: snowpark.Session) -> bytes: - df = session.sql(query).to_pandas(statement_params=statement_params) + for query in queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) y_true = df[y_true_col_names] y_pred = df[y_pred_col_names] sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None @@ -543,11 +555,11 @@ def r2_score(*, df: snowpark.DataFrame, y_true_col_name: str, y_pred_col_name: s R squared metric. """ - df_avg = df.select(F.avg(y_true_col_name).as_("avg_y_true")) # type: ignore[arg-type] + df_avg = df.select(F.avg(y_true_col_name).as_("avg_y_true")) df_r_square = df.join(df_avg).select( - F.lit(1) # type: ignore[arg-type] - - F.sum((df[y_true_col_name] - df[y_pred_col_name]) ** 2) # type: ignore[operator] - / F.sum((df[y_true_col_name] - df_avg["avg_y_true"]) ** 2) # type: ignore[operator] + F.lit(1) + - F.sum((df[y_true_col_name] - df[y_pred_col_name]) ** 2) + / F.sum((df[y_true_col_name] - df_avg["avg_y_true"]) ** 2) ) statement_params = telemetry.get_function_usage_statement_params( diff --git a/snowflake/ml/modeling/pipeline/BUILD.bazel b/snowflake/ml/modeling/pipeline/BUILD.bazel index 8ccf8c36..3bc9d6a2 100644 --- a/snowflake/ml/modeling/pipeline/BUILD.bazel +++ b/snowflake/ml/modeling/pipeline/BUILD.bazel @@ -21,6 +21,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", ], ) diff --git a/snowflake/ml/modeling/pipeline/pipeline.py b/snowflake/ml/modeling/pipeline/pipeline.py index 3896136b..2a05ed07 100644 --- a/snowflake/ml/modeling/pipeline/pipeline.py +++ b/snowflake/ml/modeling/pipeline/pipeline.py @@ -14,6 +14,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.model.model_signature import ModelSignature, _infer_signature from snowflake.ml.modeling.framework import _utils, base @@ -58,7 +59,7 @@ def _get_column_indices(all_columns: List[str], target_columns: List[str]) -> Li Return the list of indices of target column in the original column array. Raises: - ValueError: If the target column is not present in the original column array. + SnowflakeMLException: If the target column is not present in the original column array. 
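The ranking.py and regression.py hunks above all follow one pattern: instead of capturing only the last SQL statement behind the Snowpark DataFrame, the stored procedure now receives the full query list, replays the setup statements, and only then fetches the final query as pandas. A minimal sketch of that pattern, assuming a Snowpark session and a DataFrame df whose queries["queries"] holds the ordered statements (compute_metric is a hypothetical stand-in for the sklearn metric call):

    from snowflake import snowpark
    from snowflake.snowpark import functions as F

    queries = df[cols].queries["queries"]

    @F.sproc(session=session, packages=["snowflake-snowpark-python", "scikit-learn"], replace=True)
    def metric_sproc(session: snowpark.Session) -> float:
        # Replay any setup statements (e.g. temporary objects created by earlier operations) first ...
        for query in queries[:-1]:
            _ = session.sql(query).collect()
        # ... then materialize only the final query as pandas inside the sproc.
        pdf = session.sql(queries[-1]).to_pandas()
        return compute_metric(pdf)  # hypothetical helper standing in for the sklearn call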
""" column_indices = [] for col in target_columns: @@ -69,8 +70,11 @@ def _get_column_indices(all_columns: List[str], target_columns: List[str]) -> Li found = True break if not found: - raise ValueError( - f"Selected column {col} is not found in the input dataframe. Columns in the input df : {all_columns}" + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=ValueError( + f"Selected column {col} is not found in the input dataframe. Columns in the input df: {all_columns}" + ), ) return column_indices @@ -130,10 +134,13 @@ def _get_estimator(self) -> Optional[Tuple[str, Any]]: def _validate_steps(self) -> None: for name, t in self._get_transformers(): if not Pipeline._is_transformer(t): - raise TypeError( - "All intermediate steps should be " - "transformers and implement both fit() and transform() methods, but" - f"{name} (type {type(t)}) doesn't." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=TypeError( + "All intermediate steps should be " + "transformers and implement both fit() and transform() methods, but" + f"{name} (type {type(t)}) doesn't." + ), ) def _reset(self) -> None: @@ -262,13 +269,8 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Transformed data. Output datatype will be same as input datatype. - - Raises: - RuntimeError: If the pipeline is not fitted first. """ - - if not self._is_fitted: - raise RuntimeError("Pipeline is not fitted before calling transform().") + self._enforce_fit() transformed_dataset = self._transform_dataset(dataset=dataset) estimator = self._get_estimator() @@ -439,13 +441,16 @@ def _invoke_estimator_func( dataset: Input dataset. Raises: - RuntimeError: If the pipeline is not fitted first. + SnowflakeMLException: If the pipeline is not fitted first. Returns: Output dataset. """ if not self._is_fitted: - raise RuntimeError(f"Pipeline is not fitted before calling {func_name}().") + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=RuntimeError(f"Pipeline is not fitted before calling {func_name}()."), + ) transformed_dataset = self._transform_dataset(dataset=dataset) estimator = self._get_estimator() @@ -525,9 +530,12 @@ def _create_sklearn_object(self) -> pipeline.Pipeline: return self._create_unfitted_sklearn_object() if not self._is_convertable_to_sklearn: - raise ValueError( - "The pipeline can't be converted to SKLearn equivalent because it processing label or sample_weight " - "columns as part of pipeline preprocessing steps which is not allowed in SKLearn." + raise exceptions.SnowflakeMLException( + error_code=error_codes.METHOD_NOT_ALLOWED, + original_exception=ValueError( + "The pipeline can't be converted to SKLearn equivalent because it processing label or " + "sample_weight columns as part of pipeline preprocessing steps which is not allowed in SKLearn." + ), ) # Create a fitted sklearn pipeline object by translating each non-estimator step in pipeline with with @@ -581,5 +589,8 @@ def _get_model_signatures(self, dataset: Union[snowpark.DataFrame, pd.DataFrame] @property def model_signatures(self) -> Dict[str, ModelSignature]: if self._model_signature_dict is None: - raise RuntimeError("Estimator not fitted before accessing property model_signatures! 
") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"), + ) return self._model_signature_dict diff --git a/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl b/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl index 66c3c093..24d16931 100644 --- a/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl +++ b/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl @@ -20,6 +20,7 @@ def get_build_rules_for_native_impl(): deps = [ ":init", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/modeling/framework", ], ) @@ -32,6 +33,7 @@ def get_build_rules_for_native_impl(): deps = [ ":init", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/modeling/framework", ], ) @@ -46,6 +48,7 @@ def get_build_rules_for_native_impl(): ":ordinal_encoder", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/modeling/framework", ], ) @@ -82,6 +85,7 @@ def get_build_rules_for_native_impl(): deps = [ ":init", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/modeling/framework", ], ) @@ -123,6 +127,7 @@ def get_build_rules_for_native_impl(): deps = [ ":init", "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", "//snowflake/ml/modeling/framework", ], ) diff --git a/snowflake/ml/modeling/preprocessing/binarizer.py b/snowflake/ml/modeling/preprocessing/binarizer.py index e1546759..4131378e 100644 --- a/snowflake/ml/modeling/preprocessing/binarizer.py +++ b/snowflake/ml/modeling/preprocessing/binarizer.py @@ -9,6 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T @@ -83,10 +84,13 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "Binarizer": self Raises: - TypeError: If the threshold is not a float. + SnowflakeMLException: If the threshold is not a float. """ if not isinstance(self.threshold, float): - raise TypeError(f"Binarizer threshold must be a float, but got {type(self.threshold)}.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=TypeError(f"Binarizer threshold must be a float, but got {type(self.threshold)}."), + ) self._is_fitted = True return self @@ -108,22 +112,15 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -132,7 +129,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame self._validate_data_has_no_nulls(dataset) output_columns = [] for input_col in self.input_cols: - col = F.iff(dataset[input_col] > self.threshold, 1.0, 0.0).cast(T.FloatType()) # type: ignore[arg-type] + col = F.iff(dataset[input_col] > self.threshold, 1.0, 0.0).cast(T.FloatType()) output_columns.append(col) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) diff --git a/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py index 87fa662a..f48f194f 100644 --- a/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +++ b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py @@ -15,6 +15,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils @@ -108,14 +109,28 @@ def __init__( def _enforce_params(self) -> None: self.n_bins = self.n_bins if isinstance(self.n_bins, Iterable) else [self.n_bins] * len(self.input_cols) if len(self.n_bins) != len(self.input_cols): - raise ValueError(f"n_bins must have same size as input_cols, got: {self.n_bins} vs {self.input_cols}") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError( + f"n_bins must have same size as input_cols, got: {self.n_bins} vs {self.input_cols}" + ), + ) for idx, b in enumerate(self.n_bins): if b < 2: - raise ValueError(f"n_bins cannot be less than 2, got: {b} at index {idx}") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"n_bins cannot be less than 2, got: {b} at index {idx}"), + ) if self.encode not in _VALID_ENCODING_SCHEME: - raise ValueError(f"encode must be one of f{_VALID_ENCODING_SCHEME}, got: {self.encode}") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"encode must be one of f{_VALID_ENCODING_SCHEME}, got: {self.encode}"), + ) if self.strategy not in _VALID_STRATEGY: - raise ValueError(f"strategy must be one of f{_VALID_STRATEGY}, got: {self.strategy}") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"strategy must be one of f{_VALID_STRATEGY}, got: {self.strategy}"), + ) def _reset(self) -> None: super()._reset() @@ -135,23 +150,16 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> KBinsDiscreti Returns: Fitted self instance. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ self._reset() self._enforce_params() super()._check_input_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + self._fit_snowpark(dataset) self._is_fitted = True return self @@ -178,23 +186,16 @@ def transform( - If input is snowpark DataFrame, returns snowpark DataFrame - If input is a pd.DataFrame and 'self.encdoe=onehot', returns 'csr_matrix' - If input is a pd.DataFrame and 'self.encode in ['ordinal', 'onehot-dense']', returns 'pd.DataFrame' - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ - self.enforce_fit() + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -204,7 +205,10 @@ def _fit_snowpark(self, dataset: snowpark.DataFrame) -> None: elif self.strategy == "uniform": self._handle_uniform(dataset) elif self.strategy == "kmeans": - raise NotImplementedError("kmeans not supported yet") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=NotImplementedError("kmeans not supported yet"), + ) def _handle_quantile(self, dataset: snowpark.DataFrame) -> None: """ @@ -221,9 +225,7 @@ def _handle_quantile(self, dataset: snowpark.DataFrame) -> None: for idx, col_name in enumerate(self.input_cols): percentiles = np.linspace(0, 1, cast(List[int], self.n_bins)[idx] + 1) for i, pct in enumerate(percentiles.tolist()): - agg_queries.append( - F.percentile_cont(pct).within_group(col_name).alias(f"{col_name}_pct_{i}") # type: ignore[arg-type] - ) + agg_queries.append(F.percentile_cont(pct).within_group(col_name).alias(f"{col_name}_pct_{i}")) state_df = dataset.agg(agg_queries) state = ( state_df.to_pandas( @@ -253,10 +255,7 @@ def _handle_uniform(self, dataset: snowpark.DataFrame) -> None: """ # 1. Collect min and max for each feature column agg_queries = list( - chain.from_iterable( - (F.min(x).alias(f"{x}_min"), F.max(x).alias(f"{x}_max")) # type: ignore[arg-type] - for x in self.input_cols - ) + chain.from_iterable((F.min(x).alias(f"{x}_min"), F.max(x).alias(f"{x}_max")) for x in self.input_cols) ) state_df = dataset.select(*agg_queries) state = ( @@ -310,7 +309,10 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame elif self.encode == "onehot-dense": return self._handle_onehot_dense(dataset) else: - raise ValueError(f"{self.encode} is not a valid encoding scheme.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"{self.encode} is not a valid encoding scheme."), + ) def _handle_ordinal(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: """ @@ -345,12 +347,11 @@ def vec_bucketize(x: T.PandasSeries[float], boarders: T.PandasSeries[List[float] # 2. 
compute bucket per feature column for idx, input_col in enumerate(self.input_cols): output_col = self.output_cols[idx] - boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] # type: ignore[arg-type, index] + assert self.bin_edges_ is not None + boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] dataset = dataset.select( *dataset.columns, - F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias( # type: ignore[arg-type] - output_col - ), + F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), ) # Reorder columns. Passthrough columns are added at the right to the output of the transformers. dataset = dataset[self.output_cols + passthrough_columns] @@ -390,11 +391,12 @@ def vec_bucketize_sparse_output( return pd.Series(res) for idx, input_col in enumerate(self.input_cols): + assert self.bin_edges_ is not None output_col = self.output_cols[idx] - boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] # type: ignore[arg-type, index] + boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] dataset = dataset.select( *dataset.columns, - F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), # type: ignore + F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), ) # Reorder columns. Passthrough columns are added at the right to the output of the transformers. dataset = dataset[self.output_cols + passthrough_columns] @@ -437,11 +439,12 @@ def vec_bucketize_dense_output( return pd.Series(res) for idx, input_col in enumerate(self.input_cols): + assert self.bin_edges_ is not None output_col = self.output_cols[idx] - boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] # type: ignore[arg-type, index] + boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] dataset = dataset.select( *dataset.columns, - F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), # type: ignore + F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), ) dataset = dataset.with_columns( [f"{output_col}_{i}" for i in range(len(boarders) - 1)], @@ -464,7 +467,7 @@ def _transform_sklearn(self, dataset: pd.DataFrame) -> Union[pd.DataFrame, spars Returns: Output dataset. """ - self.enforce_fit() + self._enforce_fit() encoder_sklearn = self.to_sklearn() transformed_dataset = encoder_sklearn.transform(dataset[self.input_cols]) diff --git a/snowflake/ml/modeling/preprocessing/label_encoder.py b/snowflake/ml/modeling/preprocessing/label_encoder.py index fe41087e..14ec5315 100644 --- a/snowflake/ml/modeling/preprocessing/label_encoder.py +++ b/snowflake/ml/modeling/preprocessing/label_encoder.py @@ -9,6 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import base from snowflake.ml.modeling.preprocessing import ordinal_encoder @@ -81,14 +82,20 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "LabelEncoder self Raises: - ValueError: If length of input_cols is not 1 or length of output_cols is greater than 1. + SnowflakeMLException: If length of input_cols is not 1 or length of output_cols is greater than 1. 
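Throughout the pipeline and preprocessing modules, bare ValueError/TypeError raises are replaced by SnowflakeMLException carrying an error code plus the original exception. A minimal sketch of that convention as it is applied in these hunks (the validation check itself is only illustrative):

    from snowflake.ml._internal.exceptions import error_codes, exceptions

    def _validate_norm(norm: str) -> None:
        # Hypothetical validator showing the wrapping convention used across this patch.
        if norm not in ("l1", "l2", "max"):
            raise exceptions.SnowflakeMLException(
                error_code=error_codes.INVALID_ATTRIBUTE,
                original_exception=ValueError(f"'{norm}' is not a supported norm."),
            )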
""" if len(self.input_cols) != 1: - raise ValueError("Label encoder must specify one input column.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError("Label encoder must specify one input column."), + ) input_col = self.input_cols[0] if len(self.output_cols) != 1: - raise ValueError("Label encoder must specify one output column.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError("Label encoder must specify one output column."), + ) self._reset() @@ -121,28 +128,20 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - RuntimeError: If transformer is not fitted first. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ - if not self._is_fitted or self._ordinal_encoder is None or self.classes_ is None: - raise RuntimeError("Label encoder must be fitted before calling transform().") + self._enforce_fit() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): # [SNOW-802691] Support for mypy type checking + assert self._ordinal_encoder is not None output_df = self._ordinal_encoder.transform(dataset).na.replace( - float("nan"), # type: ignore[arg-type] + float("nan"), len(self.classes_) - 1, # type: ignore[arg-type] subset=self.output_cols, ) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df diff --git a/snowflake/ml/modeling/preprocessing/max_abs_scaler.py b/snowflake/ml/modeling/preprocessing/max_abs_scaler.py index 59815a6b..36eedaec 100644 --- a/snowflake/ml/modeling/preprocessing/max_abs_scaler.py +++ b/snowflake/ml/modeling/preprocessing/max_abs_scaler.py @@ -105,22 +105,15 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "MaxAbsScaler Returns: Return self as fitted scaler. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ super()._check_input_cols() + super()._check_dataset_type(dataset) self._reset() if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + self._fit_snowpark(dataset) self._is_fitted = True return self @@ -161,25 +154,16 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - RuntimeError: If transformer is not fitted first. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
- ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df diff --git a/snowflake/ml/modeling/preprocessing/min_max_scaler.py b/snowflake/ml/modeling/preprocessing/min_max_scaler.py index a7869212..4fdd7741 100644 --- a/snowflake/ml/modeling/preprocessing/min_max_scaler.py +++ b/snowflake/ml/modeling/preprocessing/min_max_scaler.py @@ -114,22 +114,15 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "MinMaxScaler Returns: Fitted scaler. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ super()._check_input_cols() + super()._check_dataset_type(dataset) self._reset() if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + self._fit_snowpark(dataset) self._is_fitted = True return self @@ -181,25 +174,16 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - RuntimeError: If transformer is not fitted first. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -222,11 +206,11 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame if self.clip: output_column = F.greatest( output_column, - F.lit(self.feature_range[0]), # type: ignore[arg-type] + F.lit(self.feature_range[0]), ) output_column = F.least( output_column, - F.lit(self.feature_range[1]), # type: ignore[arg-type] + F.lit(self.feature_range[1]), ) output_columns.append(output_column) diff --git a/snowflake/ml/modeling/preprocessing/normalizer.py b/snowflake/ml/modeling/preprocessing/normalizer.py index b3bf8b0a..8e1e255d 100644 --- a/snowflake/ml/modeling/preprocessing/normalizer.py +++ b/snowflake/ml/modeling/preprocessing/normalizer.py @@ -9,6 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T @@ -90,24 +91,22 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s transformed_dataset: Output dataset. Raises: - ValueError: If the dataset contains nulls, or if the supplied norm is invalid. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. + SnowflakeMLException: If the dataset contains nulls, or if the supplied norm is invalid. 
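The scaler, encoder, and discretizer methods above drop their repeated RuntimeError/TypeError blocks in favour of shared base-class helpers. The resulting method shape, sketched from the post-patch code (the helpers are assumed to raise SnowflakeMLException, matching the framework base class):

    def transform(self, dataset):
        self._enforce_fit()                   # fails if fit() has not been called
        super()._check_input_cols()
        super()._check_output_cols()
        super()._check_dataset_type(dataset)  # fails on anything but a Snowpark or pandas DataFrame
        if isinstance(dataset, snowpark.DataFrame):
            output_df = self._transform_snowpark(dataset)
        else:
            output_df = self._transform_sklearn(dataset)
        return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df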
""" if self.norm not in _VALID_NORMS: - raise ValueError(f"'{self.norm}' is not a supported norm.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"'{self.norm}' is not a supported norm."), + ) super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -115,31 +114,37 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] self._validate_data_has_no_nulls(dataset) if len(self.input_cols) == 0: - raise ValueError("Found array with 0 columns, but a minimum of 1 is required.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError("Found array with 0 columns, but a minimum of 1 is required."), + ) if self.norm == "l1": - norm = F.lit("0") # type: ignore[arg-type] + norm = F.lit("0") for input_col in self.input_cols: - norm += F.abs(dataset[input_col]) # type: ignore[operator] + norm += F.abs(dataset[input_col]) elif self.norm == "l2": - norm = F.lit("0") # type: ignore[arg-type] + norm = F.lit("0") for input_col in self.input_cols: norm += dataset[input_col] * dataset[input_col] - norm = F.sqrt(norm) # type: ignore[arg-type] + norm = F.sqrt(norm) elif self.norm == "max": - norm = F.greatest(*[F.abs(dataset[input_col]) for input_col in self.input_cols]) # type: ignore[arg-type] + norm = F.greatest(*[F.abs(dataset[input_col]) for input_col in self.input_cols]) else: - raise ValueError(f"'{self.norm}' is not a supported norm.") + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(f"'{self.norm}' is not a supported norm."), + ) output_columns = [] for input_col in self.input_cols: # Set the entry to 0 if the norm is 0, because the norm is 0 only when all entries are 0. output_column = F.div0( dataset[input_col].cast(T.FloatType()), - norm, # type: ignore[arg-type] + norm, ) output_columns.append(output_column) diff --git a/snowflake/ml/modeling/preprocessing/one_hot_encoder.py b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py index 325a14f4..7fea93c1 100644 --- a/snowflake/ml/modeling/preprocessing/one_hot_encoder.py +++ b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py @@ -3,6 +3,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# import numbers +import uuid from typing import Any, Dict, Iterable, List, Optional, Union import numpy as np @@ -207,9 +208,10 @@ def __init__( self.drop_idx_: Optional[npt.NDArray[np.int_]] = None self._drop_idx_after_grouping: Optional[npt.NDArray[np.int_]] = None self._n_features_outs: List[int] = [] - self._dense_output_cols_mappings: Dict[ - str, List[str] - ] = {} # transform state when output columns are unset before fitting + + # Fit state if output columns are set before fitting + self._dense_output_cols_mappings: Dict[str, List[str]] = {} + self._inferred_output_cols: List[str] = [] self.set_input_cols(input_cols) self.set_output_cols(output_cols) @@ -279,10 +281,13 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "OneHotEncode f"Unexpected dataset type: {type(dataset)}." "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." ) + self._is_fitted = True + if not self.sparse and self.output_cols: - self._get_dense_output_cols_mappings() + self._handle_dense_output_cols() + if self.output_cols: + self._handle_inferred_output_cols(dataset) - self._is_fitted = True return self def _fit_sklearn(self, dataset: pd.DataFrame) -> None: @@ -393,25 +398,21 @@ def _get_category_count_state_df(self, dataset: snowpark.DataFrame) -> snowpark. found_state_df: Optional[snowpark.DataFrame] = None for input_col in self.input_cols: state_columns = [ - F.lit(input_col).alias(_COLUMN_NAME), # type: ignore[arg-type] + F.lit(input_col).alias(_COLUMN_NAME), F.col(input_col).cast(T.StringType()).alias(_CATEGORY), F.iff( # null or nan values - F.col(input_col).is_null() # type: ignore[arg-type] - | (F.col(input_col).cast(T.StringType()).equal_nan()), + F.col(input_col).is_null() | (F.col(input_col).cast(T.StringType()).equal_nan()), # count null and nan values - F.sum( # type: ignore[arg-type] - F.iff( # type: ignore[arg-type] - F.col(input_col).is_null() # type: ignore[arg-type] - | (F.col(input_col).cast(T.StringType()).equal_nan()), - 1, # type: ignore[arg-type] - 0, # type: ignore[arg-type] + F.sum( + F.iff( + F.col(input_col).is_null() | (F.col(input_col).cast(T.StringType()).equal_nan()), + 1, + 0, ) - ).over( - snowpark.Window.partition_by(input_col) # type: ignore[arg-type] - ), + ).over(snowpark.Window.partition_by(input_col)), # count values that are not null or nan - F.count(input_col).over(snowpark.Window.partition_by(input_col)), # type: ignore[arg-type] + F.count(input_col).over(snowpark.Window.partition_by(input_col)), ).alias(_COUNT), ] temp_df = dataset.select(state_columns).distinct() @@ -660,17 +661,15 @@ def transform( - If input is a pd.DataFrame and `self.sparse=False`, returns `pd.DataFrame` Raises: - RuntimeError: If transformer is not fitted first. TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. 
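The category-count state dataframe above distinguishes, per value of each input column, the number of null/NaN rows from the number of populated rows using window functions partitioned by that column. A compact sketch of the per-column expressions (the "_COLUMN_NAME"/"_CATEGORY"/"_COUNT" literals stand in for the module-level constants):

    from snowflake.snowpark import Window, functions as F, types as T

    def _category_count_exprs(input_col: str):
        is_missing = F.col(input_col).is_null() | F.col(input_col).cast(T.StringType()).equal_nan()
        count_expr = F.iff(
            is_missing,
            # for null/NaN rows: count how many missing rows share this value
            F.sum(F.iff(is_missing, 1, 0)).over(Window.partition_by(input_col)),
            # for populated rows: count the non-missing rows with this value
            F.count(input_col).over(Window.partition_by(input_col)),
        )
        return [
            F.lit(input_col).alias("_COLUMN_NAME"),
            F.col(input_col).cast(T.StringType()).alias("_CATEGORY"),
            count_expr.alias("_COUNT"),
        ]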
""" - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() # output columns are unset before fitting if not self.sparse and not self._dense_output_cols_mappings: - self._get_dense_output_cols_mappings() + self._handle_dense_output_cols() if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) @@ -734,9 +733,12 @@ def map_encoded_value(row: pd.Series) -> Dict[str, Any]: assert dataset._session is not None, "dataset._session cannot be None" state_df = dataset._session.create_dataframe(state_pandas) + suffix = "_" + uuid.uuid4().hex.upper() transformed_dataset = dataset - origional_dataset_columns = transformed_dataset.columns[:] + original_dataset_cols = transformed_dataset.columns[:] all_output_cols = [] + suffixed_input_cols = [] + joined_input_cols = [] for idx, input_col in enumerate(self.input_cols): output_col = self.output_cols[idx] all_output_cols += [output_col] @@ -749,11 +751,24 @@ def map_encoded_value(row: pd.Series) -> Dict[str, Any]: input_col_state_df, on=transformed_dataset[input_col].equal_null(input_col_state_df[_CATEGORY]), how="left", - )[transformed_dataset.columns + [output_col]] + lsuffix=suffix, + ).drop(_CATEGORY) + + # handle identical input & output cols + if input_col == output_col: + col = identifier.concat_names([input_col, suffix]) + suffixed_input_cols.append(col) + joined_input_cols.append(col) + else: + joined_input_cols.append(input_col) - transformed_dataset = self._handle_unknown_in_transform(transformed_dataset) + if not self._inferred_output_cols: + self._inferred_output_cols = transformed_dataset[all_output_cols].columns + + transformed_dataset = self._handle_unknown_in_transform(transformed_dataset, joined_input_cols) # Reorder columns. Passthrough columns are added at the right to the output of the transformers. - transformed_dataset = transformed_dataset[all_output_cols + origional_dataset_columns] + passthrough_cols = list(set(original_dataset_cols) - set(all_output_cols)) + transformed_dataset = transformed_dataset.drop(suffixed_input_cols)[all_output_cols + passthrough_cols] return transformed_dataset def _transform_snowpark_dense(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: @@ -816,6 +831,9 @@ def map_encoded_value(row: pd.Series) -> List[int]: how="left", )[transformed_dataset.columns + output_cols] + if not self._inferred_output_cols: + self._inferred_output_cols = transformed_dataset[all_output_cols].columns + transformed_dataset = self._handle_unknown_in_transform(transformed_dataset) # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
transformed_dataset = transformed_dataset[all_output_cols + original_dataset_columns] @@ -900,6 +918,9 @@ def _transform_sklearn(self, dataset: pd.DataFrame) -> Union[pd.DataFrame, spars if self.sparse: return transformed_dataset + if not self._inferred_output_cols: + self._inferred_output_cols = self._get_inferred_output_cols() + dataset = dataset.copy() dataset[self.get_output_cols()] = transformed_dataset return dataset @@ -979,12 +1000,17 @@ def _validate_keywords(self) -> None: msg = "`min_frequency` must be an integer at least 1, a float in (0.0, 1.0), or None, " "got float {}" raise ValueError(msg.format(self.min_frequency)) - def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) -> snowpark.DataFrame: + def _handle_unknown_in_transform( + self, + transformed_dataset: snowpark.DataFrame, + input_cols: Optional[List[str]] = None, + ) -> snowpark.DataFrame: """ Handle unknown values in the transformed dataset. Args: transformed_dataset: Transformed dataset without unknown values handled. + input_cols: Input columns (may be suffixed). Returns: Transformed dataset with unknown values handled. @@ -997,17 +1023,21 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) # dataframe with unknown values # columns: COLUMN_NAME, UNKNOWN_VALUE unknown_df: Optional[snowpark.DataFrame] = None - for idx, input_col in enumerate(self.input_cols): + cols = input_cols or self.input_cols + for idx, input_col in enumerate(cols): output_col = self.output_cols[idx] check_col = output_col if not self.sparse: - output_cat_cols = [x for x in transformed_dataset.columns if f"{output_col}_" in x] + output_cat_cols = [ + identifier.quote_name_without_upper_casing(col) + for col in self._dense_output_cols_mappings[input_col] + ] if not output_cat_cols: continue check_col = output_cat_cols[0] unknown_columns = [ - F.lit(input_col), # type: ignore[arg-type] + F.lit(self.input_cols[idx]), F.col(input_col), ] temp_df = ( @@ -1028,14 +1058,8 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) if not unknown_pandas.empty: msg = f"Found unknown categories during transform:\n{unknown_pandas.to_string()}" raise ValueError(msg) - if self.handle_unknown == "ignore" and not self.sparse: - all_output_cat_cols = [] - for idx, _ in enumerate(self.input_cols): - output_col = self.output_cols[idx] - output_cat_cols = [x for x in transformed_dataset.columns if f"{output_col}_" in x] - all_output_cat_cols.extend(output_cat_cols) - transformed_dataset = transformed_dataset.na.fill(0, all_output_cat_cols) # type: ignore[arg-type] + transformed_dataset = transformed_dataset.na.fill(0, self._inferred_output_cols) # TODO(hayu): [SNOW-752263] Support OneHotEncoder handle_unknown="infrequent_if_exist". # Add back when `handle_unknown="infrequent_if_exist"` is supported. @@ -1329,25 +1353,25 @@ def get_output_cols(self) -> List[str]: Returns: Output columns. """ - if self.sparse: - return self.output_cols - - output_cols = ( - [ - identifier.get_inferred_name(col) - for input_col in self.input_cols - for col in self._dense_output_cols_mappings[input_col] - ] - if self._dense_output_cols_mappings - else [] - ) - return output_cols + return self._inferred_output_cols - def _get_dense_output_cols_mappings(self) -> None: + def _get_inferred_output_cols(self) -> List[str]: """ - Get input column to dense output columns mappings and assign them to - `self._dense_output_cols_mappings`. + Get output column names meeting Snowflake requirements. 
+ Only useful when fitting a pandas dataframe. + + Returns: + Inferred output columns. """ + cols = ( + self.output_cols + if self.sparse + else [col for input_col in self.input_cols for col in self._dense_output_cols_mappings[input_col]] + ) + return [identifier.get_inferred_name(c) for c in cols] + + def _handle_dense_output_cols(self) -> None: + """Assign input column to dense output columns mappings to `self._dense_output_cols_mappings`.""" for idx, input_col in enumerate(self.input_cols): output_col = self.output_cols[idx] n_features_out = self._n_features_outs[idx] @@ -1393,6 +1417,22 @@ def _get_dense_output_cols_mappings(self) -> None: cat = cat.replace('"', "'") self._dense_output_cols_mappings[input_col].append(f"{output_col}_{cat}") + def _handle_inferred_output_cols(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> None: + """ + Assign output column names used to transform pandas dataframes to `self._inferred_output_cols`. + This ensures consistent (double quoted) column names in Snowpark and pandas transformed dataframes. + + Args: + dataset: Input dataset. + """ + if isinstance(dataset, snowpark.DataFrame): + temp = self.handle_unknown + self.handle_unknown = "ignore" + self.transform(dataset[self.input_cols].limit(0)) + self.handle_unknown = temp + else: + self._inferred_output_cols = self._get_inferred_output_cols() + def get_sklearn_args( self, default_sklearn_obj: Optional[object] = None, diff --git a/snowflake/ml/modeling/preprocessing/ordinal_encoder.py b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py index 4285159f..307c70ee 100644 --- a/snowflake/ml/modeling/preprocessing/ordinal_encoder.py +++ b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py @@ -267,12 +267,9 @@ def _get_category_index_state_df(self, dataset: snowpark.DataFrame) -> snowpark. # encode non-missing categories encoded_value_columns = [ - F.lit(input_col).alias(_COLUMN_NAME), # type: ignore[arg-type] + F.lit(input_col).alias(_COLUMN_NAME), F.col(input_col).alias(_CATEGORY), - ( - F.dense_rank().over(snowpark.Window.order_by(input_col)) # type: ignore[arg-type] - - 1 # type: ignore[operator] - ) + (F.dense_rank().over(snowpark.Window.order_by(input_col)) - 1) .cast(T.FloatType()) .alias(_INDEX), # index categories ] @@ -284,10 +281,10 @@ def _get_category_index_state_df(self, dataset: snowpark.DataFrame) -> snowpark. # encode missing categories encoded_missing_value_columns = [ - F.lit(input_col).alias(_COLUMN_NAME), # type: ignore[arg-type] + F.lit(input_col).alias(_COLUMN_NAME), F.col(input_col).alias(_CATEGORY), # index missing categories - F.lit(self.encoded_missing_value).alias(_INDEX), # type: ignore[arg-type] + F.lit(self.encoded_missing_value).alias(_INDEX), ] encoded_missing_value_df = distinct_dataset.filter(F.col(input_col).is_null()).select( encoded_missing_value_columns @@ -443,11 +440,9 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Output dataset. Raises: - RuntimeError: If transformer is not fitted first. TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. 
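When an output column name collides with its input column, the sparse OneHotEncoder transform above joins against the per-column state dataframe with a generated suffix on the left side and drops the suffixed input columns afterwards. A fragment-level sketch of that collision handling, assuming Snowpark's lsuffix join support and the identifier helpers already used in this file:

    import uuid

    suffix = "_" + uuid.uuid4().hex.upper()
    transformed_dataset = transformed_dataset.join(
        input_col_state_df,
        on=transformed_dataset[input_col].equal_null(input_col_state_df[_CATEGORY]),
        how="left",
        lsuffix=suffix,   # disambiguates colliding column names coming from the left side
    ).drop(_CATEGORY)
    if input_col == output_col:
        # the original input column now carries the suffix and is dropped once all joins are done
        suffixed_input_cols.append(identifier.concat_names([input_col, suffix]))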
""" - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() @@ -487,7 +482,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame # replace NULL with nan null_category_state_df = state_df.filter(F.col(_CATEGORY).is_null()).with_column( - _INDEX, F.lit(self.encoded_missing_value) # type: ignore[arg-type] + _INDEX, F.lit(self.encoded_missing_value) ) state_df = state_df.filter(F.col(_CATEGORY).is_not_null()).union_by_name(null_category_state_df) @@ -602,7 +597,7 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) for idx, input_col in enumerate(self.input_cols): output_col = self.output_cols[idx] unknown_columns = [ - F.lit(input_col), # type: ignore[arg-type] + F.lit(input_col), F.col(input_col), ] temp_df = ( @@ -627,8 +622,6 @@ def _handle_unknown_in_transform(self, transformed_dataset: snowpark.DataFrame) if self.handle_unknown == "use_encoded_value": # left outer join has already filled unknown values with null if not (self.unknown_value is None or sklearn_utils.is_scalar_nan(self.unknown_value)): - transformed_dataset = transformed_dataset.na.fill( - self.unknown_value, self.output_cols # type: ignore[arg-type] - ) + transformed_dataset = transformed_dataset.na.fill(self.unknown_value, self.output_cols) return transformed_dataset diff --git a/snowflake/ml/modeling/preprocessing/robust_scaler.py b/snowflake/ml/modeling/preprocessing/robust_scaler.py index 120c2b06..f3946df9 100644 --- a/snowflake/ml/modeling/preprocessing/robust_scaler.py +++ b/snowflake/ml/modeling/preprocessing/robust_scaler.py @@ -12,6 +12,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import _utils, base @@ -129,22 +130,15 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "RobustScaler Returns: Return self as fitted scaler. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ super()._check_input_cols() + super()._check_dataset_type(dataset) self._reset() if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + self._fit_snowpark(dataset) self._is_fitted = True self._state_is_set = True @@ -166,7 +160,10 @@ def _fit_snowpark(self, dataset: snowpark.DataFrame) -> None: q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: - raise ValueError("Invalid quantile range: %s" % str(self.quantile_range)) + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError("Invalid quantile range: %s" % str(self.quantile_range)), + ) pcont_left = self.custom_states[1] pcont_right = self.custom_states[2] @@ -206,25 +203,16 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: Output dataset. - - Raises: - RuntimeError: If transformer is not fitted first. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. 
""" - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -268,13 +256,7 @@ def _create_sklearn_object(self) -> preprocessing.RobustScaler: Returns: Sklearn RobustScaler. - - Raises: - RuntimeError: If transformer is not fitted first. """ - if self.scale_ is None or self.center_ is None: - raise RuntimeError("Transformer not fitted before calling transform().") - scaler = self._create_unfitted_sklearn_object() if self._is_fitted: scaler.scale_ = self._convert_attribute_dict_to_ndarray(self.scale_, np.float64) diff --git a/snowflake/ml/modeling/preprocessing/standard_scaler.py b/snowflake/ml/modeling/preprocessing/standard_scaler.py index e5f20f6e..af2dc756 100644 --- a/snowflake/ml/modeling/preprocessing/standard_scaler.py +++ b/snowflake/ml/modeling/preprocessing/standard_scaler.py @@ -120,22 +120,15 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "StandardScal Returns: Fitted scaler. - - Raises: - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ super()._check_input_cols() + super()._check_dataset_type(dataset) self._reset() if isinstance(dataset, pd.DataFrame): self._fit_sklearn(dataset) - elif isinstance(dataset, snowpark.DataFrame): - self._fit_snowpark(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + self._fit_snowpark(dataset) self._is_fitted = True return self @@ -188,25 +181,16 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s Returns: transformed_dataset: Output dataset. - - Raises: - RuntimeError: If transformer is not fitted first. - TypeError: If the input dataset is neither a pandas nor Snowpark DataFrame. """ - if not self._is_fitted: - raise RuntimeError("Transformer not fitted before calling transform().") + self._enforce_fit() super()._check_input_cols() super()._check_output_cols() + super()._check_dataset_type(dataset) if isinstance(dataset, snowpark.DataFrame): output_df = self._transform_snowpark(dataset) - elif isinstance(dataset, pd.DataFrame): - output_df = self._transform_sklearn(dataset) else: - raise TypeError( - f"Unexpected dataset type: {type(dataset)}." - "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." - ) + output_df = self._transform_sklearn(dataset) return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df @@ -248,7 +232,6 @@ def _create_sklearn_object(self) -> preprocessing.StandardScaler: Returns: The Sklearn StandardScaler. 
""" - scaler = self._create_unfitted_sklearn_object() if self._is_fitted: scaler.scale_ = self._convert_attribute_dict_to_ndarray(self.scale_, np.float64) diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index 786fa2b2..2e1bb7f5 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -6,7 +6,7 @@ import tempfile import types import zipfile -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast from uuid import uuid1 from absl import logging @@ -614,7 +614,9 @@ def _insert_deployment_entry( stage_path: str, signature: Dict[str, Any], target_method: str, - options: Optional[model_types.WarehouseDeployOptions] = None, + options: Optional[ + Union[model_types.WarehouseDeployOptions, model_types.SnowparkContainerServiceDeployOptions] + ] = None, ) -> List[snowpark.Row]: """Insert a new row into the model deployment table. @@ -660,9 +662,10 @@ def _prepare_deployment_stage(self) -> str: schema = self._fully_qualified_schema_name() fully_qualified_deployment_stage_name = f"{schema}.{self._permanent_deployment_stage}" statement_params = self._get_statement_params(inspect.currentframe()) - self._session.sql(f"CREATE STAGE IF NOT EXISTS {fully_qualified_deployment_stage_name}").collect( - statement_params=statement_params - ) + self._session.sql( + f"CREATE STAGE IF NOT EXISTS {fully_qualified_deployment_stage_name} " + f"ENCRYPTION = (TYPE= 'SNOWFLAKE_SSE')" + ).collect(statement_params=statement_params) return f"@{fully_qualified_deployment_stage_name}" def _prepare_model_stage(self, model_id: str) -> str: @@ -691,9 +694,9 @@ def _prepare_model_stage(self, model_id: str) -> str: fully_qualified_model_stage_name = f"{schema}.{model_stage_name}" statement_params = self._get_statement_params(inspect.currentframe()) - create_stage_result = self._session.sql(f"CREATE OR REPLACE STAGE {fully_qualified_model_stage_name}").collect( - statement_params=statement_params - ) + create_stage_result = self._session.sql( + f"CREATE OR REPLACE STAGE {fully_qualified_model_stage_name} ENCRYPTION = (TYPE= 'SNOWFLAKE_SSE')" + ).collect(statement_params=statement_params) if not create_stage_result: raise connector.DatabaseError("Unable to create stage for model. Operation returned not result.") if len(create_stage_result) != 1: @@ -702,12 +705,6 @@ def _prepare_model_stage(self, model_id: str) -> str: str(create_stage_result) ) ) - if create_stage_result[0]["status"] != f"Stage area {model_stage_name} successfully created.": - raise connector.DatabaseError( - "Unable to create stage for model. Return status of operation was: {}".format( - create_stage_result[0]["status"] - ) - ) return fully_qualified_model_stage_name @@ -1471,7 +1468,7 @@ def log_model( signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, sample_input_data: Optional[Any] = None, code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, + options: Optional[model_types.BaseModelSaveOption] = None, ) -> str: """Uploads and register a model to the Model Registry. 
@@ -1601,9 +1598,12 @@ def deploy( deployment_name: str, target_method: str, permanent: bool = False, - options: Optional[model_types.WarehouseDeployOptions] = None, + platform: _deployer.TargetPlatform = _deployer.TargetPlatform.WAREHOUSE, + options: Optional[ + Union[model_types.WarehouseDeployOptions, model_types.SnowparkContainerServiceDeployOptions] + ] = None, ) -> None: - """Deploy the model with the the given deployment name. + """Deploy the model with the given deployment name. Args: model_name: Model Name string. @@ -1611,41 +1611,67 @@ def deploy( deployment_name: name of the generated UDF. target_method: The method name to use in deployment. permanent: Whether the deployment is permanent or not. Permanent deployment will generate a permanent UDF. + (Only applicable for Warehouse deployment) + platform: Target platform to deploy the model to. Currently supported platforms are + ['warehouse', 'snowpark_container_service'] options: Optional options for model deployment. Defaults to None. - The following keys are acceptable: - - "output_with_input_features": Whether or not preserve the input columns in the output when predicting. - Defaults to False. - - "keep_order": Whether or not preserve the row order when predicting. Only available for dataframe has - fewer than 2**64 rows. Defaults to True. - - "permanent_udf_stage_location": Customized Snowflake stage option where the UDF should be persisted. - - "relax_version": Whether or not relax the version constraints of the dependencies if unresolvable. - Defaults to False. + + Raises: + RuntimeError: Raised when parameters are not properly enabled when deploying to Warehouse with temporary UDF """ if options is None: options = {} deployment_stage_path = "" - if permanent: - # Every deployment-generated UDF should reside in its own unique directory. As long as each deployment - # is allocated a distinct directory, multiple deployments can coexist within the same stage. - # Given that each permanent deployment possesses a unique deployment_name, sharing the same stage doesn't - # present any issues - deployment_stage_path = ( - options.get("permanent_udf_stage_location") or f"{self._prepare_deployment_stage()}/{deployment_name}/" - ) - options["permanent_udf_stage_location"] = deployment_stage_path + if platform == _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE: + permanent = True + options = cast(model_types.SnowparkContainerServiceDeployOptions, options) + deployment_stage_path = f"{self._prepare_deployment_stage()}/{deployment_name}/" + elif platform == _deployer.TargetPlatform.WAREHOUSE: + options = cast(model_types.WarehouseDeployOptions, options) + if permanent: + # Every deployment-generated UDF should reside in its own unique directory. As long as each deployment + # is allocated a distinct directory, multiple deployments can coexist within the same stage. 
+ # Given that each permanent deployment possesses a unique deployment_name, sharing the same stage does + # not present any issues + deployment_stage_path = ( + options.get("permanent_udf_stage_location") + or f"{self._prepare_deployment_stage()}/{deployment_name}/" + ) + options["permanent_udf_stage_location"] = deployment_stage_path remote_model_path = "@" + self._get_model_path(model_name=model_name, model_version=model_version) model_id = self._get_model_id(model_name, model_version) + # https://snowflakecomputing.atlassian.net/browse/SNOW-858376 + # During temporary deployment on the Warehouse, Snowpark creates an unencrypted temporary stage for UDF-related + # artifacts. However, UDF generation fails when importing from a mix of encrypted and unencrypted stages. + # The following workaround copies model between stages (PrPr as of July 7th, 2023) to transfer the SSE + # encrypted model zip from model stage to the temporary unencrypted stage. + if not permanent and platform == _deployer.TargetPlatform.WAREHOUSE: + schema = self._fully_qualified_schema_name() + unencrypted_stage = f"@{schema}.TEMP_UNENCRYPTED_{self._get_new_unique_identifier()}" + self._session.sql(f"CREATE TEMPORARY STAGE {unencrypted_stage[1:]}").collect() + try: + self._session.sql(f"COPY FILES INTO {unencrypted_stage} from {remote_model_path}").collect() + except Exception: + raise RuntimeError( + "Please ensure parameters are enabled in your Snowflake account by running " + "'ALTER ACCOUNT SET ENABLE_COPY_FILES=TRUE, " + "ENABLE_COPY_FILES_API_IN_STORAGE=TRUE'" + ) + remote_model_path = f"{unencrypted_stage}/{os.path.basename(remote_model_path)}" + # Step 1: Deploy to get the UDF deployment_info = _deployer.deploy( session=self._session, name=self._fully_qualified_deployment_name(deployment_name), - platform=_deployer.TargetPlatform.WAREHOUSE, + platform=platform, target_method=target_method, model_stage_file_path=remote_model_path, + deployment_stage_path=deployment_stage_path, + model_id=model_id, options=options, ) @@ -1839,8 +1865,8 @@ def delete_model( if delete_artifact: if uri.is_snowflake_stage_uri(model_uri): stage_path = self._get_fully_qualified_stage_name_from_uri(model_uri) - query_result_checker.SqlResultValidator(self._session, f"DROP STAGE {stage_path}").has_value_match( - row_idx=0, col_idx=0, expected_value="successfully dropped." + query_result_checker.SqlResultValidator(self._session, f"DROP STAGE {stage_path}").has_dimensions( + expected_rows=1, expected_cols=1 ).validate() # Step 3/3: Record the deletion event. 
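For orientation, here is a minimal, hypothetical sketch of how a caller might drive the updated `deploy()` signature shown in this hunk for both target platforms. The registry, model, deployment, and compute-pool names are placeholders, `session` is assumed to be an existing Snowpark session, and the option keys shown are illustrative rather than exhaustive.

```python
from snowflake.ml.model import _deployer, type_hints as model_types
from snowflake.ml.registry import model_registry

# Hypothetical registry handle; database and schema names are placeholders,
# and `session` is assumed to be an already-created snowpark.Session.
registry = model_registry.ModelRegistry(
    session=session, database_name="ML_DB", schema_name="ML_SCHEMA"
)

# Warehouse target: permanent=True persists the generated UDF in a
# registry-managed stage (or in "permanent_udf_stage_location" if provided).
registry.deploy(
    model_name="my_model",
    model_version="v1",
    deployment_name="MY_MODEL_PREDICT_WH",
    target_method="predict",
    permanent=True,
    platform=_deployer.TargetPlatform.WAREHOUSE,
    options=model_types.WarehouseDeployOptions({"relax_version": True}),
)

# Snowpark Container Services target: treated as permanent regardless of the flag,
# mirroring the branch added in the hunk above.
registry.deploy(
    model_name="my_model",
    model_version="v1",
    deployment_name="MY_MODEL_PREDICT_SPCS",
    target_method="predict",
    platform=_deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE,
    options=model_types.SnowparkContainerServiceDeployOptions({"compute_pool": "MY_COMPUTE_POOL"}),
)
```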
@@ -2029,7 +2055,13 @@ def predict(self, deployment_name: str, data: Any) -> "pd.DataFrame": platform = _deployer.TargetPlatform(deployment["TARGET_PLATFORM"]) signature = model_signature.ModelSignature.from_dict(json.loads(deployment["SIGNATURE"])) options_dict = cast(Dict[str, Any], json.loads(deployment["OPTIONS"])) - options = model_types.WarehouseDeployOptions(options_dict) # type: ignore + platform_options = { + _deployer.TargetPlatform.WAREHOUSE: model_types.WarehouseDeployOptions, + _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE: model_types.SnowparkContainerServiceDeployOptions, + } + if platform not in platform_options: + raise ValueError(f"Unsupported target Platform: {platform}") + options = platform_options[platform](options_dict) di = _deployer.Deployment( name=self._registry._fully_qualified_deployment_name(deployment_name), platform=platform, diff --git a/snowflake/ml/registry/model_registry_test.py b/snowflake/ml/registry/model_registry_test.py index e8a0d2f8..4e6c3e6d 100644 --- a/snowflake/ml/registry/model_registry_test.py +++ b/snowflake/ml/registry/model_registry_test.py @@ -758,7 +758,8 @@ def test_log_model_path_file(self) -> None: expected_stage_postfix = f"{self.model_id}".upper() self.add_session_mock_sql( - query=f"CREATE OR REPLACE STAGE {_DATABASE_NAME}.{_SCHEMA_NAME}.SNOWML_MODEL_{expected_stage_postfix}", + query=f"CREATE OR REPLACE STAGE {_DATABASE_NAME}.{_SCHEMA_NAME}.SNOWML_MODEL_{expected_stage_postfix} " + f"ENCRYPTION = (TYPE= 'SNOWFLAKE_SSE')", result=mock_data_frame.MockDataFrame( [snowpark.Row(**{"status": f"Stage area SNOWML_MODEL_{expected_stage_postfix} successfully created."})] ), diff --git a/snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb b/snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb new file mode 100644 index 00000000..d83ecc27 --- /dev/null +++ b/snowflake/ml/registry/notebooks/Snowpark ML - Deployment to Snowpark Container Service Demo.ipynb @@ -0,0 +1,644 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a45960e1", + "metadata": {}, + "source": [ + "# Snowpark ML - Deployment to Snowpark Container Service Demo" + ] + }, + { + "cell_type": "markdown", + "id": "aa7a329a", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "- Install and have a running Docker Client (required only for PrPr for client-side image build)" + ] + }, + { + "cell_type": "markdown", + "id": "3b50d774", + "metadata": {}, + "source": [ + "## Train a model with Snowpark ML API " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "18a75d71", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "from snowflake.ml.modeling.linear_model import LogisticRegression\n", + "from sklearn import datasets\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "def prepare_logistic_model() -> Tuple[LogisticRegression, pd.DataFrame]:\n", + " iris = datasets.load_iris()\n", + " df = pd.DataFrame(data=np.c_[iris[\"data\"], iris[\"target\"]], columns=iris[\"feature_names\"] + [\"target\"])\n", + " df.columns = [s.replace(\" (CM)\", \"\").replace(\" \", \"\") for s in df.columns.str.upper()]\n", + "\n", + " input_cols = [\"SEPALLENGTH\", \"SEPALWIDTH\", \"PETALLENGTH\", \"PETALWIDTH\"]\n", + " label_cols = \"TARGET\"\n", + " output_cols = \"PREDICTED_TARGET\"\n", + "\n", + " estimator = LogisticRegression(\n", + " input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0, 
max_iter=1000\n", + " ).fit(df)\n", + "\n", + " return estimator, df.drop(columns=label_cols).head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "db25f95b", + "metadata": {}, + "source": [ + "## Train a HuggingFace Model (cross-encoder/nli-MiniLM2-L6-H768)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e319bd2", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification\n", + "from snowflake.ml.model import custom_model\n", + "import torch\n", + "\n", + "def prepare_cross_encoder_model() -> Tuple[custom_model.CustomModel, pd.DataFrame]:\n", + " \"\"\"\n", + " Pretrained cross encoder model from huggingface.\n", + " \"\"\"\n", + " classifier = pipeline(\"zero-shot-classification\", model='cross-encoder/nli-MiniLM2-L6-H768') \n", + " candidate_labels = ['customer support', 'product experience', 'account issues']\n", + "\n", + " class HuggingFaceModel(custom_model.CustomModel):\n", + " def __init__(self, context: custom_model.ModelContext) -> None:\n", + " super().__init__(context)\n", + " \n", + " @custom_model.inference_api\n", + " def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: \n", + " sequences_to_classify = input_df.values.flatten().tolist()\n", + " data = [classifier(sequence, candidate_labels) for sequence in sequences_to_classify]\n", + " max_score_labels = []\n", + " for record in data:\n", + " max_score_label = max(zip(record['labels'], record['scores']), key=lambda x: x[1])[0]\n", + " max_score_labels.append(max_score_label) \n", + " return pd.DataFrame({\"output\": max_score_labels})\n", + "\n", + " cross_encoder_model = HuggingFaceModel(custom_model.ModelContext())\n", + " test_data = pd.DataFrame([\"The interface gets frozen very often\"])\n", + "\n", + " return cross_encoder_model, test_data" + ] + }, + { + "cell_type": "markdown", + "id": "db6734fa", + "metadata": {}, + "source": [ + "## Start Snowpark Session\n", + "\n", + "To avoid exposing credentials in Github, we use a small utility `SnowflakeLoginOptions`. It allows you to score your default credentials in `~/.snowsql/config` in the following format:\n", + "```\n", + "[connections]\n", + "accountname = # Account identifier to connect to Snowflake.\n", + "username = # User name in the account.\n", + "password = # User password.\n", + "dbname = # Default database.\n", + "schemaname = # Default schema.\n", + "warehousename = # Default warehouse.\n", + "rolename = # Default role.\n", + "```\n", + "Please follow [this](https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "58dd3604", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. \n" + ] + } + ], + "source": [ + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", + "from snowflake.snowpark import Session, Column, functions\n", + "\n", + "session = Session.builder.configs(SnowflakeLoginOptions()).create()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "27dfbc42", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. 
\n", + "WARNING:absl:The database INFERENCE_CONTAINER_DB already exists. Skipping creation.\n", + "WARNING:absl:The schema INFERENCE_CONTAINER_DB.INFERENCE_CONTAINER_SCHEMAalready exists. Skipping creation.\n" + ] + } + ], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "\n", + "conn = session._conn._conn\n", + "# will be a no-op if registry already exists\n", + "model_registry.create_model_registry(session=session, database_name=conn._database, schema_name=conn._schema) \n", + "registry = model_registry.ModelRegistry(session=session, database_name=conn._database, schema_name=conn._schema)" + ] + }, + { + "cell_type": "markdown", + "id": "38e0a975", + "metadata": {}, + "source": [ + "## Register SnowML Model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "574e7a43", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:absl:The database INFERENCE_CONTAINER_DB already exists. Skipping creation.\n", + "WARNING:absl:The schema INFERENCE_CONTAINER_DB.INFERENCE_CONTAINER_SCHEMAalready exists. Skipping creation.\n", + "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. \n" + ] + }, + { + "data": { + "text/plain": [ + "'42374efe274011eea4ff5ac3f3b698e1'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logistic_model, test_features = prepare_logistic_model()\n", + "model_name = \"snowpark_ml_logistic\"\n", + "model_version = \"v2\"\n", + "\n", + "registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=logistic_model,\n", + " sample_input_data=test_features,\n", + " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "054a3862", + "metadata": {}, + "source": [ + "## Model Deployment to Snowpark Container Service" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72ff114f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n", + "WARNING:root:Image successfully built! 
To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/42374efe274011eea4ff5ac3f3b698e1:latest' in the options field of the deploy() function\n" + ] + } + ], + "source": [ + "from snowflake.ml.model import _deployer\n", + "from snowflake import snowpark\n", + "\n", + "model_ref = model_registry.ModelReference(\n", + " registry=registry, model_name=model_name, model_version=model_version\n", + ")\n", + "\n", + "compute_pool = \"SHULIN_GPU_POOL\" # Pre-created\n", + "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", + "\n", + "model_ref.deploy(\n", + " deployment_name=deployment_name, \n", + " platform=_deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE,\n", + " target_method=\"predict\",\n", + " options={\n", + " \"compute_pool\": compute_pool\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1c754e72", + "metadata": {}, + "source": [ + "## Batch Prediction on Snowpark Container Service" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a5c02328", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SEPALLENGTHSEPALWIDTHPETALLENGTHPETALWIDTHPREDICTED_TARGET
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
55.43.91.70.40.0
64.63.41.40.30.0
75.03.41.50.20.0
84.42.91.40.20.0
94.93.11.50.10.0
\n", + "
" + ], + "text/plain": [ + " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_ref.predict(deployment_name, test_features)" + ] + }, + { + "cell_type": "markdown", + "id": "67d6a7d2", + "metadata": {}, + "source": [ + "## Register Cross Encoder Model" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9dd84f88", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bafae568275d11ee95175ac3f3b698e1'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "\n", + "model, test_features = prepare_cross_encoder_model()\n", + "model_name = \"cross_encoder_model\"\n", + "model_version = \"v2\"\n", + "\n", + "registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=model,\n", + " conda_dependencies=[\"pytorch::pytorch==2.0.1\", \"conda-forge::transformers==4.18.0\"],\n", + " sample_input_data=test_features,\n", + " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c6db686e", + "metadata": {}, + "source": [ + "## Model Deployment to Snowpark Container Service (GPU)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "701152f7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/bafae568275d11ee95175ac3f3b698e1:latest' in the options field of the deploy() function\n" + ] + } + ], + "source": [ + "from snowflake.ml.model import _deployer\n", + "from snowflake import snowpark\n", + "\n", + "model_ref = model_registry.ModelReference(\n", + " registry=registry, model_name=model_name, model_version=model_version\n", + ")\n", + "\n", + "compute_pool = \"SHULIN_GPU_POOL\" # Pre-created\n", + "deployment_name = \"CROSS_ENCODER\" # Name of the resulting UDF\n", + "\n", + "model_ref.deploy(\n", + " deployment_name=deployment_name, \n", + " platform=_deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE,\n", + " target_method=\"predict\",\n", + " options={\n", + " \"compute_pool\": compute_pool,\n", + " \"use_gpu\": True\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b0fba61", + "metadata": {}, + "source": [ + "## Zero-Shot Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "936840df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " input_feature_0\n", + "0 The interface gets frozen very often\n" + ] + } + ], + "source": [ + "print(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "302daaf9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
output
0product experience
\n", + "
" + ], + "text/plain": [ + " output\n", + "0 product experience" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_ref.predict(deployment_name, test_features)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:local_snowml] *", + "language": "python", + "name": "conda-env-local_snowml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl index 53224d1b..17a32df5 100755 --- a/snowflake/ml/requirements.bzl +++ b/snowflake/ml/requirements.bzl @@ -1,6 +1,6 @@ # DO NOT EDIT! # Generate by running 'bazel run //bazel/requirements:sync_requirements' -EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'all': ['lightgbm==3.3.5', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1']} +EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'mlflow': ['mlflow>=2.1.0,<3'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'all': ['lightgbm==3.3.5', 'mlflow>=2.1.0,<3', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1']} -REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<1.3', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.4.0,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2'] +REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<1.3', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2'] diff --git a/snowflake/ml/utils/BUILD.bazel b/snowflake/ml/utils/BUILD.bazel index 6f434ad7..70e72eda 100644 --- a/snowflake/ml/utils/BUILD.bazel +++ b/snowflake/ml/utils/BUILD.bazel @@ -34,6 +34,6 @@ py_package( packages = ["snowflake.ml"], deps = [ ":connection_params", - ":sparse", + ":sparse" ], ) diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 2c0bdaa0..8c131c53 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. 
-VERSION = "1.0.3" +VERSION = "1.0.4" diff --git a/tests/integ/snowflake/ml/fileset/BUILD.bazel b/tests/integ/snowflake/ml/fileset/BUILD.bazel index 188eddbd..a7511cb9 100644 --- a/tests/integ/snowflake/ml/fileset/BUILD.bazel +++ b/tests/integ/snowflake/ml/fileset/BUILD.bazel @@ -24,6 +24,7 @@ py_test( shard_count = 5, deps = [ ":fileset_integ_utils", + "//snowflake/ml/_internal/exceptions:fileset_errors", "//snowflake/ml/fileset", "//snowflake/ml/utils:connection_params", ], diff --git a/tests/integ/snowflake/ml/fileset/fileset_integ_test.py b/tests/integ/snowflake/ml/fileset/fileset_integ_test.py index d7f144b3..b7d9b3a6 100644 --- a/tests/integ/snowflake/ml/fileset/fileset_integ_test.py +++ b/tests/integ/snowflake/ml/fileset/fileset_integ_test.py @@ -14,7 +14,8 @@ from torch.utils import data from snowflake import connector, snowpark -from snowflake.ml.fileset import fileset, fileset_errors +from snowflake.ml._internal.exceptions import fileset_errors +from snowflake.ml.fileset import fileset from snowflake.ml.utils import connection_params from snowflake.snowpark import functions from tests.integ.snowflake.ml.fileset import fileset_integ_utils @@ -122,17 +123,17 @@ def _validate_snowpark_dataframe(self, df: snowpark.DataFrame) -> None: for key in ["NUMBER_INT_COL", "NUMBER_FIXED_POINT_COL"]: self.assertAlmostEqual( fileset_integ_utils.get_column_min(key), - df.select(functions.min(key)).collect()[0][0], # type:ignore[arg-type] + df.select(functions.min(key)).collect()[0][0], 1, ) self.assertAlmostEqual( fileset_integ_utils.get_column_max(key, self.num_rows), - df.select(functions.max(key)).collect()[0][0], # type:ignore[arg-type] + df.select(functions.max(key)).collect()[0][0], 1, ) self.assertAlmostEqual( fileset_integ_utils.get_column_avg(key, self.num_rows), - df.select(functions.avg(key)).collect()[0][0], # type:ignore[arg-type] + df.select(functions.avg(key)).collect()[0][0], 1, ) diff --git a/tests/integ/snowflake/ml/fileset/sfcfs_integ_test.py b/tests/integ/snowflake/ml/fileset/sfcfs_integ_test.py index 42a0901f..d7c1b891 100644 --- a/tests/integ/snowflake/ml/fileset/sfcfs_integ_test.py +++ b/tests/integ/snowflake/ml/fileset/sfcfs_integ_test.py @@ -5,7 +5,8 @@ import fsspec from absl.testing import absltest -from snowflake.ml.fileset import fileset_errors, sfcfs +from snowflake.ml._internal.exceptions import fileset_errors +from snowflake.ml.fileset import sfcfs from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.fileset import fileset_integ_utils diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index b3db9f33..967fb054 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -5,11 +5,11 @@ py_library( testonly = True, srcs = ["warehouse_model_integ_test_utils.py"], deps = [ - "//tests/integ/snowflake/ml/test_utils:db_manager", "//snowflake/ml/model:_deployer", "//snowflake/ml/model:_model", - "//snowflake/ml/model:type_hints" - ] + "//snowflake/ml/model:type_hints", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], ) py_test( @@ -19,10 +19,10 @@ py_test( shard_count = 5, deps = [ ":warehouse_model_integ_test_utils", - "//tests/integ/snowflake/ml/test_utils:db_manager", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", ], ) @@ -33,13 +33,28 @@ py_test( shard_count = 4, deps = [ 
":warehouse_model_integ_test_utils", - "//tests/integ/snowflake/ml/test_utils:db_manager", - "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:pytorch_handler", + "//snowflake/ml/model/_signatures:snowpark_handler", "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", ], ) +py_test( + name = "warehouse_tensorflow_model_integ_test", + timeout = "long", + srcs = ["warehouse_tensorflow_model_integ_test.py"], + shard_count = 4, + deps = [ + ":warehouse_model_integ_test_utils", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:snowpark_handler", + "//snowflake/ml/model/_signatures:tensorflow_handler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], +) py_test( name = "warehouse_sklearn_xgboost_model_integ_test", @@ -48,9 +63,9 @@ py_test( shard_count = 3, deps = [ ":warehouse_model_integ_test_utils", - "//tests/integ/snowflake/ml/test_utils:db_manager", "//snowflake/ml/model:type_hints", "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", ], ) @@ -61,12 +76,12 @@ py_test( shard_count = 2, deps = [ ":warehouse_model_integ_test_utils", - "//tests/integ/snowflake/ml/test_utils:db_manager", "//snowflake/ml/model:type_hints", "//snowflake/ml/modeling/lightgbm:lgbm_regressor", "//snowflake/ml/modeling/linear_model:logistic_regression", "//snowflake/ml/modeling/xgboost:xgb_regressor", "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", ], ) @@ -76,25 +91,40 @@ py_test( srcs = ["model_badcase_integ_test.py"], deps = [ ":warehouse_model_integ_test_utils", - "//tests/integ/snowflake/ml/test_utils:db_manager", "//snowflake/ml/model:_deployer", "//snowflake/ml/model:_model", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", "//snowflake/ml/utils:connection_params", - ] + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], ) +py_test( + name = "warehouse_mlflow_model_integ_test", + timeout = "long", + srcs = ["warehouse_mlflow_model_integ_test.py"], + shard_count = 2, + deps = [ + ":warehouse_model_integ_test_utils", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_signatures:numpy_handler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], +) -#py_test( -#name = "deployment_to_snowservice_integ_test", -# timeout = "long", -# srcs = ["deployment_to_snowservice_integ_test.py"], -# deps = [ -# "//tests/integ/snowflake/ml/test_utils:db_manager", -# "//snowflake/ml/model:_model", -# "//snowflake/ml/model:custom_model", -# "//snowflake/ml/model/_deploy_client/snowservice:deploy", -# "//snowflake/ml/utils:connection_params", -# ] -#) +py_test( + name = "deployment_to_snowservice_integ_test", + timeout = "long", + srcs = ["deployment_to_snowservice_integ_test.py"], + deps = [ + "//snowflake/ml/model:_model", + "//snowflake/ml/model:custom_model", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/model/_deploy_client/snowservice:deploy", + "//snowflake/ml/model/_deploy_client/utils:constants", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], +) diff --git a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py index 2f956ef6..1b00bef1 100644 --- 
a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py +++ b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py @@ -11,11 +11,13 @@ from absl.testing import absltest from sklearn import neighbors -from snowflake.ml.model import _model as model_api, custom_model -from snowflake.ml.model._deploy_client.snowservice import ( - deploy as snowservice_api, - deploy_options, +from snowflake.ml.model import ( + _model as model_api, + custom_model, + type_hints as model_types, ) +from snowflake.ml.model._deploy_client.snowservice import deploy as snowservice_api +from snowflake.ml.model._deploy_client.utils import constants from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.test_utils import db_manager @@ -50,12 +52,12 @@ def setUpClass(cls) -> None: except KeyError: raise SkipTest("SnowService connection parameters not present: skipping SnowServicesIntegTest.") - deployment_name = login_options["host"].split(".")[1] - registry_host = f"{deployment_name}-{login_options['account']}.registry-dev.snowflakecomputing.com" - - cls.FULL_IMAGE_REPO_PATH = f"{registry_host}/{cls.TEST_DB}/{cls.TEST_SCHEMA}/{cls.TEST_IMAGE_REPO}/".lower() - - cls._session = Session.builder.configs({**login_options}).create() + cls._session = Session.builder.configs( + { + **login_options, + **{"database": cls.TEST_DB, "schema": cls.TEST_SCHEMA}, + } + ).create() cls._db_manager = db_manager.DBManager(cls._session) cls._db_manager.set_role(cls.TEST_ROLE) cls._db_manager.create_stage(cls.TEST_STAGE, cls.TEST_SCHEMA, cls.TEST_DB, sse_encrypted=True) @@ -75,12 +77,13 @@ def setUp(self) -> None: def _save_model_to_stage(self, model: custom_model.CustomModel, sample_input: pd.DataFrame) -> str: stage_path = f"@{self.TEST_STAGE}/{self.uid}/model.zip" - model_api.save_model( + model_api.save_model( # type: ignore[call-overload] name="model", session=self._session, model_stage_file_path=stage_path, model=model, sample_input=sample_input, + options={"embed_local_ml_library": True}, ) return stage_path @@ -89,16 +92,19 @@ def test_deployment_workflow(self) -> None: service_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( self._RUN_ID, f"func_{self.uid}" ) - deployment_options: deploy_options.SnowServiceDeployOptionsTypedHint = { - "stage": self.TEST_STAGE, + deployment_options: model_types.SnowparkContainerServiceDeployOptions = { "compute_pool": self.TEST_COMPUTE_POOL, - "image_repo": self.FULL_IMAGE_REPO_PATH, + # image_repo is optional for user, pass in full image repo for test purposes only + "image_repo": self._db_manager.get_snowservice_image_repo( + subdomain=constants.DEV_IMAGE_REGISTRY_SUBDOMAIN, repo=self.TEST_IMAGE_REPO + ), } snowservice_api._deploy( self._session, model_id=uuid.uuid4().hex, service_func_name=service_func_name, model_zip_stage_path=model_stage_file_path, + deployment_stage_path=model_stage_file_path, # use the same stage for testing **deployment_options, ) diff --git a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py index 4a9babd0..4077dfbd 100644 --- a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py +++ b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py @@ -75,7 +75,7 @@ def test_bad_model_deploy(self) -> None: sample_input=pd_df, metadata={"author": "halu", "version": "1"}, conda_dependencies=["invalidnumpy==1.22.4"], - options={"embed_local_ml_library": True}, + 
options=model_types.CustomModelSaveOption({"embed_local_ml_library": True}), ) function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( self.run_id, "custom_bad_model" @@ -101,7 +101,7 @@ def test_custom_demo_model(self) -> None: model=lm, sample_input=pd_df, metadata={"author": "halu", "version": "1"}, - options={"embed_local_ml_library": True}, + options=model_types.CustomModelSaveOption({"embed_local_ml_library": True}), ) function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( self.run_id, "custom_demo_model" diff --git a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py new file mode 100644 index 00000000..81920f1a --- /dev/null +++ b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py @@ -0,0 +1,201 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import uuid +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import mlflow +import numpy as np +import pandas as pd +from absl.testing import absltest, parameterized +from sklearn import datasets, ensemble, model_selection + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import numpy_handler +from snowflake.ml.utils import connection_params +from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session +from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils +from tests.integ.snowflake.ml.test_utils import db_manager + + +class TestWarehouseMLFlowModelInteg(parameterized.TestCase): + @classmethod + def setUpClass(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + self._db_manager = db_manager.DBManager(self._session) + self._db_manager.cleanup_schemas() + self._db_manager.cleanup_stages() + self._db_manager.cleanup_user_functions() + + # To create different UDF names among different runs + self.run_id = uuid.uuid4().hex + self._test_schema_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self.run_id, "model_deployment_mlflow_model_test_schema" + ) + self._db_manager.create_schema(self._test_schema_name) + self._db_manager.use_schema(self._test_schema_name) + + self.deploy_stage_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self.run_id, "deployment_stage" + ) + self.full_qual_stage = self._db_manager.create_stage( + self.deploy_stage_name, schema_name=self._test_schema_name, sse_encrypted=False + ) + + @classmethod + def tearDownClass(self) -> None: + self._db_manager.drop_stage(self.deploy_stage_name, schema_name=self._test_schema_name) + self._db_manager.drop_schema(self._test_schema_name) + self._session.close() + + def base_test_case( + self, + name: str, + model: model_types.SupportedModelType, + sample_input: model_types.SupportedDataType, + test_input: model_types.SupportedDataType, + deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + warehouse_model_integ_test_utils.base_test_case( + self._db_manager, + run_id=self.run_id, + full_qual_stage=self.full_qual_stage, + name=name, + model=model, + sample_input=sample_input, + test_input=test_input, + deploy_params=deploy_params, + 
model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_mlflow_model_deploy_sklearn_df( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + db = datasets.load_diabetes(as_frame=True) + X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) + with mlflow.start_run() as run: + rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3) + rf.fit(X_train, y_train) + + # Use the model to make predictions on the test dataset. + predictions = rf.predict(X_test) + signature = mlflow.models.signature.infer_signature(X_test, predictions) + mlflow.sklearn.log_model( + rf, + "model", + signature=signature, + metadata={"author": "halu", "version": "1"}, + conda_env={ + "dependencies": [ + "python=3.8.13", + "mlflow==2.3.1", + "cloudpickle==2.0.0", + "numpy==1.23.4", + "psutil==5.9.0", + "scikit-learn==1.2.2", + "scipy==1.9.3", + "typing-extensions==4.5.0", + ], + "name": "mlflow-env", + }, + ) + + run_id = run.info.run_id + + self.base_test_case( + name="mlflow_model_sklearn_df", + model=mlflow.pyfunc.load_model(f"runs:/{run_id}/model"), + sample_input=None, + test_input=X_test, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose(np.expand_dims(predictions, axis=1), res.to_numpy()), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_mlflow_model_deploy_sklearn( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + db = datasets.load_diabetes() + X_train, X_test, y_train, y_test = model_selection.train_test_split(db.data, db.target) + with mlflow.start_run() as run: + rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3) + rf.fit(X_train, y_train) + + # Use the model to make predictions on the test dataset. 
+ predictions = rf.predict(X_test) + signature = mlflow.models.signature.infer_signature(X_test, predictions) + mlflow.sklearn.log_model( + rf, + "model", + signature=signature, + metadata={"author": "halu", "version": "1"}, + conda_env={ + "dependencies": [ + "python=3.8.13", + "mlflow==2.3.1", + "cloudpickle==2.0.0", + "numpy==1.23.4", + "psutil==5.9.0", + "scikit-learn==1.2.2", + "scipy==1.9.3", + "typing-extensions==4.5.0", + ], + "name": "mlflow-env", + }, + ) + + run_id = run.info.run_id + + X_test_df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df([X_test]) + + self.base_test_case( + name="mlflow_model_sklearn", + model=mlflow.pyfunc.load_model(f"runs:/{run_id}/model"), + sample_input=None, + test_input=X_test_df, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose(np.expand_dims(predictions, axis=1), res.to_numpy()), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py index 9491d269..b9cebf24 100644 --- a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py @@ -10,7 +10,8 @@ import torch from absl.testing import absltest, parameterized -from snowflake.ml.model import model_signature, type_hints as model_types +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import pytorch_handler, snowpark_handler from snowflake.ml.utils import connection_params from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils @@ -121,7 +122,7 @@ def test_pytorch_tensor_as_sample( test_released_library: Optional[bool] = False, ) -> None: model, data_x, data_y = _prepare_torch_model() - x_df = model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) y_pred = model.forward(data_x)[0].detach() self.base_test_case( @@ -133,7 +134,7 @@ def test_pytorch_tensor_as_sample( "forward": ( {}, lambda res: torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False ), ), }, @@ -155,7 +156,7 @@ def test_pytorch_df_as_sample( test_released_library: Optional[bool] = False, ) -> None: model, data_x, data_y = _prepare_torch_model(torch.float64) - x_df = model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) y_pred = model.forward(data_x)[0].detach() self.base_test_case( @@ -167,7 +168,7 @@ def test_pytorch_df_as_sample( "forward": ( {}, lambda res: torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred ), ), }, @@ -189,10 +190,10 @@ def test_pytorch_sp( test_released_library: Optional[bool] = False, ) -> None: model, data_x, data_y = _prepare_torch_model(torch.float64) - x_df = 
model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) x_df.columns = ["col_0"] y_pred = model.forward(data_x)[0].detach() - x_df_sp = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, x_df, keep_order=True) + x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, x_df, keep_order=True) self.base_test_case( name="pytorch_model_sp", @@ -203,8 +204,8 @@ def test_pytorch_sp( "forward": ( {}, lambda res: torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - model_signature._SnowparkDataFrameHandler.convert_to_df(res) + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) )[0], y_pred, ), @@ -228,7 +229,7 @@ def test_torchscript_tensor_as_sample( test_released_library: Optional[bool] = False, ) -> None: model, data_x, data_y = _prepare_torch_model() - x_df = model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) model_script = torch.jit.script(model) # type:ignore[attr-defined] y_pred = model_script.forward(data_x)[0].detach() @@ -241,7 +242,7 @@ def test_torchscript_tensor_as_sample( "forward": ( {}, lambda res: torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False ), ), }, @@ -263,7 +264,7 @@ def test_torchscript_df_as_sample( test_released_library: Optional[bool] = False, ) -> None: model, data_x, data_y = _prepare_torch_model(torch.float64) - x_df = model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) model_script = torch.jit.script(model) # type:ignore[attr-defined] y_pred = model_script.forward(data_x)[0].detach() @@ -276,7 +277,7 @@ def test_torchscript_df_as_sample( "forward": ( {}, lambda res: torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred ), ), }, @@ -298,11 +299,11 @@ def test_torchscript_sp( test_released_library: Optional[bool] = False, ) -> None: model, data_x, data_y = _prepare_torch_model(torch.float64) - x_df = model_signature._SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) x_df.columns = ["col_0"] model_script = torch.jit.script(model) # type:ignore[attr-defined] y_pred = model_script.forward(data_x)[0].detach() - x_df_sp = model_signature._SnowparkDataFrameHandler.convert_from_df(self._session, x_df, keep_order=True) + x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, x_df, keep_order=True) self.base_test_case( name="torch_script_model_sp", @@ -313,8 +314,8 @@ def test_torchscript_sp( "forward": ( {}, lambda res: torch.testing.assert_close( # type:ignore[attr-defined] - model_signature._SeqOfPyTorchTensorHandler.convert_from_df( - 
model_signature._SnowparkDataFrameHandler.convert_to_df(res) + pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( + snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) )[0], y_pred, ), diff --git a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py new file mode 100644 index 00000000..08699931 --- /dev/null +++ b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py @@ -0,0 +1,355 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import uuid +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +import tensorflow as tf +from absl.testing import absltest, parameterized + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.model._signatures import snowpark_handler, tensorflow_handler +from snowflake.ml.utils import connection_params +from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session +from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils +from tests.integ.snowflake.ml.test_utils import db_manager + + +class SimpleModule(tf.Module): + def __init__(self, name: str = None) -> None: + super().__init__(name=name) + self.a_variable = tf.Variable(5.0, name="train_me") + self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me") + + @tf.function(input_signature=[[tf.TensorSpec(shape=(None, 1), dtype=tf.float32)]]) # type: ignore[misc] + def __call__(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: + return [self.a_variable * tensors[0] + self.non_trainable_variable] + + +class KerasModel(tf.keras.Model): + def __init__(self, n_hidden: int, n_out: int) -> None: + super().__init__() + self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") + self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") + + def call(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: + input = tensors[0] + x = self.fc_1(input) + x = self.fc_2(x) + return [x] + + +def _prepare_keras_model( + dtype: tf.dtypes.DType = tf.float32, +) -> Tuple[tf.keras.Model, List[tf.Tensor], List[tf.Tensor]]: + n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 + x = np.random.rand(batch_size, n_input) + data_x = [tf.convert_to_tensor(x, dtype=dtype)] + raw_data_y = tf.random.uniform((batch_size, 1)) + raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) + data_y = [tf.cast(raw_data_y, dtype=dtype)] + + def loss_fn(y_true: List[tf.Tensor], y_pred: List[tf.Tensor]) -> tf.Tensor: + return tf.keras.losses.mse(y_true[0], y_pred[0]) + + model = KerasModel(n_hidden, n_out) + model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=loss_fn) + model.fit(data_x, data_y, batch_size=batch_size, epochs=100) + return model, data_x, data_y + + +class TestWarehouseTensorflowModelInteg(parameterized.TestCase): + @classmethod + def setUpClass(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + self._db_manager = db_manager.DBManager(self._session) + self._db_manager.cleanup_schemas() + self._db_manager.cleanup_stages() + self._db_manager.cleanup_user_functions() + + # To create different UDF names among different runs + self.run_id = uuid.uuid4().hex + self._test_schema_name = 
db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self.run_id, "model_deployment_tensorflow_model_test_schema" + ) + self._db_manager.create_schema(self._test_schema_name) + self._db_manager.use_schema(self._test_schema_name) + + self.deploy_stage_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self.run_id, "deployment_stage" + ) + self.full_qual_stage = self._db_manager.create_stage( + self.deploy_stage_name, schema_name=self._test_schema_name, sse_encrypted=False + ) + + @classmethod + def tearDownClass(self) -> None: + self._db_manager.drop_stage(self.deploy_stage_name, schema_name=self._test_schema_name) + self._db_manager.drop_schema(self._test_schema_name) + self._session.close() + + def base_test_case( + self, + name: str, + model: model_types.SupportedModelType, + sample_input: model_types.SupportedDataType, + test_input: model_types.SupportedDataType, + deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + warehouse_model_integ_test_utils.base_test_case( + self._db_manager, + run_id=self.run_id, + full_qual_stage=self.full_qual_stage, + name=name, + model=model, + sample_input=sample_input, + test_input=test_input, + deploy_params=deploy_params, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_tf_tensor_as_sample( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + model = SimpleModule(name="simple") + data_x = [tf.constant([[5.0], [10.0]])] + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + y_pred = model(data_x) + + self.base_test_case( + name="tf_model_tensor_as_sample", + model=model, + sample_input=data_x, + test_input=x_df, + deploy_params={ + "__call__": ( + {}, + lambda res: np.testing.assert_allclose( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), + y_pred[0].numpy(), + ), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_tf_df_as_sample( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + model = SimpleModule(name="simple") + data_x = [tf.constant([[5.0], [10.0]])] + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, 
ensure_serializable=False) + y_pred = model(data_x) + + self.base_test_case( + name="tf_model_df_as_sample", + model=model, + sample_input=x_df, + test_input=x_df, + deploy_params={ + "__call__": ( + {}, + lambda res: np.testing.assert_allclose( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), + y_pred[0].numpy(), + ), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_tf_sp( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + model = SimpleModule(name="simple") + data_x = [tf.constant([[5.0], [10.0]])] + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df.columns = ["col_0"] + y_pred = model(data_x) + x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df( + self._session, + x_df, + keep_order=True, + ) + + self.base_test_case( + name="tf_model_sp", + model=model, + sample_input=x_df, + test_input=x_df_sp, + deploy_params={ + "__call__": ( + {}, + lambda res: np.testing.assert_allclose( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) + )[0].numpy(), + y_pred[0].numpy(), + ), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_keras_tensor_as_sample( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + model, data_x, data_y = _prepare_keras_model() + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + y_pred = model.predict(data_x)[0] + + self.base_test_case( + name="keras_model_tensor_as_sample", + model=model, + sample_input=data_x, + test_input=x_df, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), + y_pred, + atol=1e-6, + ), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + 
def test_keras_df_as_sample( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + model, data_x, data_y = _prepare_keras_model() + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + y_pred = model.predict(data_x)[0] + + self.base_test_case( + name="keras_model_df_as_sample", + model=model, + sample_input=x_df, + test_input=x_df, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), + y_pred, + atol=1e-6, + ), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + @parameterized.parameters( # type: ignore[misc] + {"model_in_stage": True, "permanent_deploy": True, "test_released_library": False}, + {"model_in_stage": False, "permanent_deploy": False, "test_released_library": False}, + # {"model_in_stage": True, "permanent_deploy": False, "test_released_library": True}, + # {"model_in_stage": False, "permanent_deploy": True, "test_released_library": True}, + ) + def test_keras_sp( + self, + model_in_stage: Optional[bool] = False, + permanent_deploy: Optional[bool] = False, + test_released_library: Optional[bool] = False, + ) -> None: + model, data_x, data_y = _prepare_keras_model() + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df.columns = ["col_0"] + y_pred = model.predict(data_x)[0] + x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df( + self._session, + x_df, + keep_order=True, + ) + + self.base_test_case( + name="keras_model_sp", + model=model, + sample_input=x_df, + test_input=x_df_sp, + deploy_params={ + "predict": ( + {}, + lambda res: np.testing.assert_allclose( + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( + snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) + )[0].numpy(), + y_pred, + atol=1e-6, + ), + ), + }, + model_in_stage=model_in_stage, + permanent_deploy=permanent_deploy, + test_released_library=test_released_library, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel b/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel index c0257bfa..dd23b8b1 100644 --- a/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel @@ -7,6 +7,7 @@ py_test( srcs = ["test_base.py"], deps = [ ":utils", + "//snowflake/ml/_internal/exceptions:modeling_error_messages", "//snowflake/ml/modeling/preprocessing:min_max_scaler", "//snowflake/ml/modeling/preprocessing:standard_scaler", "//snowflake/ml/utils:connection_params", diff --git a/tests/integ/snowflake/ml/modeling/framework/test_base.py b/tests/integ/snowflake/ml/modeling/framework/test_base.py index 7f86b9b7..9bd39a09 100644 --- a/tests/integ/snowflake/ml/modeling/framework/test_base.py +++ b/tests/integ/snowflake/ml/modeling/framework/test_base.py @@ -7,6 +7,7 @@ import pytest from absl.testing.absltest import TestCase, main +from snowflake.ml._internal.exceptions.exceptions import SnowflakeMLException from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] MinMaxScaler, @@ -136,15 +137,13 @@ class TestTransformer(BaseTransformer): # Assert that numeric 
data with null data raises transformer = TestTransformer().set_input_cols(input_cols) # type: ignore[abstract] - with pytest.raises(ValueError) as excinfo: + with self.assertRaises(SnowflakeMLException): transformer._validate_data_has_no_nulls(df_with_nulls) # type: ignore[attr-defined] - assert "Dataset may not contain nulls" in excinfo.value.args[0] # Assert that extra input columns raises transformer = TestTransformer().set_input_cols(input_cols + ["nonexistent_column"]) # type: ignore[abstract] - with pytest.raises(SnowparkColumnException) as excinfo: # type: ignore[assignment] + with self.assertRaises(SnowparkColumnException): transformer._validate_data_has_no_nulls(df) # type: ignore[attr-defined] - assert "The DataFrame does not contain the column" in excinfo.value.args[0] def test_base_double_quoted_identifiers(self) -> None: """ diff --git a/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py b/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py index 2c96785e..e56efaf3 100644 --- a/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py +++ b/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py @@ -61,9 +61,8 @@ def test_inconsistent_input_col_type(self) -> None: for strategy in ["mean", "constant", "median"]: simple_imputer = SimpleImputer(strategy=strategy, input_cols=input_cols, output_cols=output_cols) - with self.assertRaises(TypeError) as ex: + with self.assertRaisesRegex(TypeError, "Inconsistent input column types."): simple_imputer.fit(df) - self.assertTrue(str(ex.exception).startswith("Inconsistent input column types.")) def test_fit(self) -> None: """ diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py b/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py index 1652b7d7..ab399ef0 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py @@ -1,6 +1,8 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # +import os +import tempfile from typing import Any, Dict import numpy as np @@ -116,6 +118,43 @@ def test_drop_intermediate(self, params: Dict[str, Any]) -> None: np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), ) + def test_multi_query_df(self) -> None: + """Test ROC curve for DataFrames that require multiple queries to reconstruct.""" + stage = "temp" + self._session.sql(f"create temp stage {stage}").collect() + + # Load data into the stage. + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + with tempfile.TemporaryDirectory() as temp_dir: + filename = "data.parquet" + local_path = os.path.join(temp_dir, filename) + pandas_df.to_parquet(local_path) + # TODO: Do I need to clean this up? + _ = self._session.file.put(local_path, f"@{stage}", auto_compress=False) + + # Retrieve data from the stage, and join it against data from an existing DataFrame. 
+ df_lhs = self._session.read.parquet(f"@{stage}/{filename}") + pandas_df = pd.DataFrame(_BINARY_DATA, columns=["ID", "A", "B", "C"]) + df_rhs = self._session.create_dataframe(pandas_df) + + input_df = df_lhs.join(df_rhs, ["ID"]) + pd_df = input_df.to_pandas() + + actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_score_col_name=_Y_SCORE_COL, + ) + + sklearn_fpr, sklearn_tpr, sklearn_thresholds = sklearn_metrics.roc_curve( + pd_df[_Y_TRUE_COL], + pd_df[_Y_SCORE_COL], + ) + np.testing.assert_allclose( + np.array((actual_fpr, actual_tpr, actual_thresholds)), + np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), + ) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py index f20f1743..4be19f54 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py @@ -68,43 +68,39 @@ def test_invalid_inputs(self) -> None: _, snowpark_df = utils.get_df(self._session, utils.DATA, utils.SCHEMA) # 1. Invalid n_bins - with self.assertRaises(ValueError) as ex: + with self.assertRaisesRegex(ValueError, "n_bins must have same size as input_cols"): discretizer = KBinsDiscretizer( n_bins=[3], encode="ordinal", input_cols=INPUT_COLS, ) discretizer.fit(snowpark_df) - self.assertTrue(str(ex.exception).startswith("n_bins must have same size as input_cols")) - with self.assertRaises(ValueError) as ex: + with self.assertRaisesRegex(ValueError, "n_bins cannot be less than 2"): discretizer = KBinsDiscretizer( n_bins=[1, 3], encode="ordinal", input_cols=INPUT_COLS, ) discretizer.fit(snowpark_df) - self.assertTrue(str(ex.exception).startswith("n_bins cannot be less than 2")) # 2. Invalid encode - with self.assertRaises(ValueError) as ex: + with self.assertRaisesRegex(ValueError, "encode must be one of"): discretizer = KBinsDiscretizer( n_bins=[2, 3], encode="foo", input_cols=INPUT_COLS, ) discretizer.fit(snowpark_df) - self.assertTrue(str(ex.exception).startswith("encode must be one of")) # 3. 
Invalid strategy - with self.assertRaises(ValueError) as ex: + with self.assertRaisesRegex(ValueError, "strategy must be one of"): discretizer = KBinsDiscretizer( n_bins=[2, 3], strategy="foo", input_cols=INPUT_COLS, ) discretizer.fit(snowpark_df) - self.assertTrue(str(ex.exception).startswith("strategy must be one of")) def test_fit(self) -> None: N_BINS = [3, 2] diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py index 37a1bffd..fa90a2c3 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py @@ -56,6 +56,8 @@ ["5", "''b''", "'g1ehQl''L80t", 2.5, -1.0], ["6", "'b\"", "''g1ehQlL80t", 4.0, 457946.23462], ["7", "'b", "\"'g1ehQl'L80t'", -10.0, -12.564], + ["8", "C", "zOyDvcyZ2s", 0.0, 0.0], + ["9", '"c"', "g1ehQlL80t", 0.0, 0.0], ] @@ -627,7 +629,7 @@ def test_transform_quotes_dense(self) -> None: id_col = ID_COL input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - df_pandas, df = framework_utils.get_df(self._session, _DATA_QUOTES, SCHEMA_BOOLEAN, np.nan) + df_pandas, df = framework_utils.get_df(self._session, _DATA_QUOTES, SCHEMA, np.nan) sparse = False encoder = OneHotEncoder(sparse=sparse).set_input_cols(input_cols).set_output_cols(output_cols) @@ -1639,13 +1641,13 @@ def test_fit_snowpark_transform_everydtypes(self) -> None: ("Y", np.float64), ("Z", np.str_), ("A", np.bool8), - ("B", np.bytes0), + # ("B", np.bytes0), ("C", np.object0), ], ) pd_df = pd.DataFrame(x) df = self._session.create_dataframe(pd_df) - input_cols = ["A", "B", "C", "X", "Y", "Z"] + input_cols = ["A", "C", "X", "Y", "Z"] output_cols = [f"OHE_{c}" for c in input_cols] ohe = OneHotEncoder(input_cols=input_cols, output_cols=output_cols) @@ -1660,6 +1662,7 @@ def test_fit_snowpark_transform_everydtypes(self) -> None: np.testing.assert_allclose(actual_arr, sklearn_arr.toarray()) def test_identical_snowpark_vs_pandas_output_column_names(self) -> None: + # UCI_BANK_MARKETING_20COLUMNS snow_df = self._session.sql( """SELECT *, IFF(Y = 'yes', 1.0, 0.0) as LABEL FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS @@ -1677,12 +1680,48 @@ def test_identical_snowpark_vs_pandas_output_column_names(self) -> None: "DURATION", ] - ohe = OneHotEncoder(input_cols=cols, output_cols=cols).fit(snow_df) + ohe = OneHotEncoder(input_cols=cols, output_cols=cols, sparse=False).fit(snow_df) snow_cols = ohe.transform(snow_df).columns pd_cols = ohe.transform(pd_df).columns.tolist() - self.assertCountEqual(snow_cols, pd_cols) + def test_select_partial_cols(self) -> None: + snow_df = self._session.sql( + """SELECT AGE as AGE_1, * + FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS + LIMIT 1000""" + ).drop("Y") + cols = [ + "AGE", + "CAMPAIGN", + "CONTACT", + "DAY_OF_WEEK", + "EDUCATION", + ] + + ohe = OneHotEncoder(input_cols=cols, output_cols=cols).fit(snow_df) + ohe.transform(snow_df) + # custom data + input_cols, output_cols = CATEGORICAL_COLS, OUTPUT_COLS + pd_df2, snow_df2 = framework_utils.get_df(self._session, _DATA_QUOTES, SCHEMA, np.nan) + + ohe2 = OneHotEncoder(input_cols=input_cols, output_cols=output_cols, sparse=False).fit(snow_df2) + snow_cols2 = ohe2.transform(snow_df2).columns + pd_cols2 = ohe2.transform(pd_df2).columns.tolist() + self.assertCountEqual(snow_cols2, pd_cols2) + + def test_get_output_cols_sparse(self) -> None: + output_cols = ["OUT1", '"Out2"', "Out3", "OUT 4", "Out5"] + input_cols 
= [f"COL_{i}" for i in range(len(output_cols))] + data = [[0 for _ in range(len(input_cols))]] + _, snow_df = framework_utils.get_df(self._session, data, input_cols, np.nan) + + ohe = OneHotEncoder(input_cols=input_cols, output_cols=output_cols, sparse=True, drop_input_cols=True).fit( + snow_df + ) + out_cols = ohe.transform(snow_df).columns + self.assertCountEqual(ohe.get_output_cols(), out_cols) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/registry/BUILD.bazel b/tests/integ/snowflake/ml/registry/BUILD.bazel index 633231e4..8f7eb64e 100644 --- a/tests/integ/snowflake/ml/registry/BUILD.bazel +++ b/tests/integ/snowflake/ml/registry/BUILD.bazel @@ -16,11 +16,19 @@ py_test( srcs = ["model_registry_integ_test.py"], deps = [ "//tests/integ/snowflake/ml/test_utils:db_manager", - "//snowflake/ml/modeling/linear_model:logistic_regression", - "//snowflake/ml/modeling/pipeline", - "//snowflake/ml/modeling/preprocessing:min_max_scaler", - "//snowflake/ml/modeling/preprocessing:one_hot_encoder", - "//snowflake/ml/modeling/xgboost:xgb_classifier", + "//tests/integ/snowflake/ml/test_utils:model_factory", + "//snowflake/ml/registry:model_registry", + "//snowflake/ml/utils:connection_params", + ], +) + +py_test( + name = "model_registry_integ_test_with_snowservice", + timeout = "eternal", # 3600s + srcs = ["model_registry_integ_test_with_snowservice.py"], + deps = [ + "//tests/integ/snowflake/ml/test_utils:db_manager", + "//tests/integ/snowflake/ml/test_utils:model_factory", "//snowflake/ml/registry:model_registry", "//snowflake/ml/utils:connection_params", ], diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py index 4d6ca72b..e1bd5e1d 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py @@ -3,104 +3,19 @@ # import uuid -from typing import Dict, List, Tuple +from typing import Dict import numpy as np -import numpy.typing as npt import pandas as pd import pytest from absl.testing import absltest -from sklearn import datasets, metrics, svm +from sklearn import metrics from snowflake import connector -from snowflake.ml.modeling.linear_model import LogisticRegression -from snowflake.ml.modeling.pipeline import Pipeline -from snowflake.ml.modeling.preprocessing import MinMaxScaler, OneHotEncoder -from snowflake.ml.modeling.xgboost import XGBClassifier from snowflake.ml.registry import model_registry from snowflake.ml.utils import connection_params -from snowflake.snowpark import DataFrame, Session -from tests.integ.snowflake.ml.test_utils import db_manager - - -def _prepare_sklearn_model() -> Tuple[svm.SVC, npt.ArrayLike, npt.ArrayLike]: - digits = datasets.load_digits() - target_digit = 6 - num_training_examples = 10 - svc_gamma = 0.001 - svc_C = 10.0 - - clf = svm.SVC(gamma=svc_gamma, C=svc_C, probability=True) - - def one_vs_all(dataset: npt.NDArray[np.float64], digit: int) -> List[bool]: - return [x == digit for x in dataset] - - # Train a classifier using num_training_examples and use the last 100 examples for test. 
- train_features = digits.data[:num_training_examples] - train_labels = one_vs_all(digits.target[:num_training_examples], target_digit) - clf.fit(train_features, train_labels) - - test_features = digits.data[-100:] - test_labels = one_vs_all(digits.target[-100:], target_digit) - - return clf, test_features, test_labels - - -def _prepare_snowml_model() -> Tuple[XGBClassifier, pd.DataFrame]: - iris = datasets.load_iris() - df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) - df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] - - input_cols = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] - label_cols = "TARGET" - output_cols = "PREDICTED_TARGET" - - clf_xgb = XGBClassifier(input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, drop_input_cols=True) - - clf_xgb.fit(df) - - return clf_xgb, df.drop(columns=label_cols).head(10) - - -def _prepare_snowml_pipeline(session: Session) -> Tuple[Pipeline, DataFrame]: - iris = datasets.load_iris() - df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) - df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] - - def add_simple_category(df: pd.DataFrame) -> pd.DataFrame: - bins = (-1, 4, 5, 6, 10) - group_names = ["Unknown", "1_quartile", "2_quartile", "3_quartile"] - categories = pd.cut(df.SEPALLENGTH, bins, labels=group_names) - df["SIMPLE"] = categories - return df - - df_cat = add_simple_category(df) - iris_df = session.create_dataframe(df_cat) - - numeric_features = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] - categorical_features = ["SIMPLE"] - numeric_features_output = [x + "_O" for x in numeric_features] - label_cols = "TARGET" - - pipeline = Pipeline( - steps=[ - ( - "OHEHOT", - OneHotEncoder(input_cols=categorical_features, output_cols="cat_output", drop_input_cols=True), - ), - ( - "SCALER", - MinMaxScaler( - clip=True, input_cols=numeric_features, output_cols=numeric_features_output, drop_input_cols=True - ), - ), - # TODO: Remove drop_input_cols=True after SNOW-853632 gets fixed. 
- ("CLASSIFIER", LogisticRegression(label_cols=label_cols, drop_input_cols=True)), - ] - ) - pipeline.fit(iris_df) - - return pipeline, iris_df.drop(label_cols).limit(10) +from snowflake.snowpark import Session +from tests.integ.snowflake.ml.test_utils import db_manager, model_factory class TestModelRegistryInteg(absltest.TestCase): @@ -113,9 +28,7 @@ def setUpClass(cls) -> None: cls.registry_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(cls.run_id, "registry_db") model_registry.create_model_registry(session=cls._session, database_name=cls.registry_name) cls.perm_stage = "@" + cls._db_manager.create_stage( - "model_registry_test_stage", - "PUBLIC", - cls.registry_name, + "model_registry_test_stage", "PUBLIC", cls.registry_name, sse_encrypted=True ) @classmethod @@ -129,7 +42,7 @@ def test_basic_workflow(self) -> None: # Prepare the model model_name = "basic_model" model_version = self.run_id - model, test_features, test_labels = _prepare_sklearn_model() + model, test_features, test_labels = model_factory.ModelFactory.prepare_sklearn_model() local_prediction = model.predict(test_features) local_prediction_proba = model.predict_proba(test_features) @@ -331,7 +244,7 @@ def test_snowml_model(self) -> None: model_name = "snowml_xgb_classifier" model_version = self.run_id - model, test_features = _prepare_snowml_model() + model, test_features = model_factory.ModelFactory.prepare_snowml_model() local_prediction = model.predict(test_features) local_prediction_proba = model.predict_proba(test_features) @@ -378,7 +291,7 @@ def test_snowml_pipeline(self) -> None: model_name = "snowml_pipeline" model_version = self.run_id - model, test_features = _prepare_snowml_pipeline(self._session) + model, test_features = model_factory.ModelFactory.prepare_snowml_pipeline(self._session) local_prediction = model.predict(test_features) diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py new file mode 100644 index 00000000..9f00b38a --- /dev/null +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_with_snowservice.py @@ -0,0 +1,224 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
+# +import functools +import tempfile +import uuid +from typing import Any, Callable, Dict, List, Optional, Tuple +from unittest import SkipTest + +import numpy as np +import pandas as pd +import pytest +from absl.testing import absltest, parameterized + +from snowflake.ml.model import _deployer +from snowflake.ml.registry import model_registry +from snowflake.ml.utils import connection_params +from snowflake.snowpark import Session +from tests.integ.snowflake.ml.test_utils import db_manager, model_factory + + +class TestModelRegistryIntegWithSnowServiceDeployment(parameterized.TestCase): + _SNOWSERVICE_CONNECTION_NAME = "snowservice" + _TEST_CPU_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL" + _TEST_GPU_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL_GPU_3" + _RUN_ID = uuid.uuid4().hex[:2] + _TEST_DB = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "db").upper() + _TEST_SCHEMA = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "schema").upper() + _TEST_IMAGE_REPO = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(_RUN_ID, "repo").upper() + _TEST_ROLE = "SYSADMIN" + _TEST_WAREHOUSE = "SNOW_ML_XSMALL" + + @classmethod + def setUpClass(cls) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + try: + login_options = connection_params.SnowflakeLoginOptions(connection_name=cls._SNOWSERVICE_CONNECTION_NAME) + except KeyError: + raise SkipTest( + "SnowService connection parameters not present: skipping " + "TestModelRegistryIntegWithSnowServiceDeployment." + ) + + cls._session = Session.builder.configs( + { + **login_options, + **{"database": cls._TEST_DB, "schema": cls._TEST_SCHEMA}, + } + ).create() + + cls._db_manager = db_manager.DBManager(cls._session) + cls._db_manager.set_role(cls._TEST_ROLE) + cls._db_manager.set_warehouse(cls._TEST_WAREHOUSE) + model_registry.create_model_registry( + session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA + ) + cls.registry = model_registry.ModelRegistry( + session=cls._session, database_name=cls._TEST_DB, schema_name=cls._TEST_SCHEMA + ) + cls._db_manager.create_image_repo(cls._TEST_IMAGE_REPO) + + @classmethod + def tearDownClass(cls) -> None: + cls._db_manager.drop_image_repo(cls._TEST_IMAGE_REPO) + cls._db_manager.drop_database(cls._TEST_DB) + cls._session.close() + + def _test_snowservice_deployment( + self, + model_name: str, + model_version: str, + prepare_model_and_feature_fn: Callable[[], Tuple[Any, Any]], + deployment_options: Dict[str, Any], + conda_dependencies: Optional[List[str]] = None, + embed_local_ml_library: Optional[bool] = True, + ): + + model, test_features, *_ = prepare_model_and_feature_fn() + + self.registry.log_model( + model_name=model_name, + model_version=model_version, + model=model, + conda_dependencies=conda_dependencies, + sample_input_data=test_features, + options={"embed_local_ml_library": embed_local_ml_library}, + ) + + model_ref = model_registry.ModelReference( + registry=self.registry, model_name=model_name, model_version=model_version + ) + + deployment_name = f"{model_name}_{model_version}_deployment" + deployment_options["deployment_name"] = deployment_name + model_ref.deploy(**deployment_options) + target_method = deployment_options["target_method"] + local_prediction = getattr(model, target_method)(test_features) + remote_prediction = model_ref.predict(deployment_name, test_features) + + if isinstance(local_prediction, np.ndarray): + np.testing.assert_allclose(remote_prediction.to_numpy(), 
np.expand_dims(local_prediction, axis=1)) + else: + pd.testing.assert_frame_equal(remote_prediction, local_prediction, check_dtype=False) + + model_deployment_list = model_ref.list_deployments().to_pandas() # type: ignore[attr-defined] + self.assertEqual(model_deployment_list.shape[0], 1) + self.assertEqual(model_deployment_list["MODEL_NAME"][0], model_name) + self.assertEqual(model_deployment_list["MODEL_VERSION"][0], model_version) + self.assertEqual(model_deployment_list["DEPLOYMENT_NAME"][0], deployment_name) + + model_ref.delete_deployment(deployment_name=deployment_name) # type: ignore[attr-defined] + self.assertEqual(model_ref.list_deployments().to_pandas().shape[0], 0) # type: ignore[attr-defined] + + self.assertEqual(self.registry.list_models().to_pandas().shape[0], 1) + self.registry.delete_model(model_name=model_name, model_version=model_version, delete_artifact=True) + self.assertEqual(self.registry.list_models().to_pandas().shape[0], 0) + + # TODO: doesnt work, Mismatched elements: 10 / 100 (10%). could be due to version mismatch? + @pytest.mark.pip_incompatible + def test_sklearn_deployment_with_snowml_conda(self) -> None: + self._test_snowservice_deployment( + model_name="test_sklearn_model", + model_version=uuid.uuid4().hex, + prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, + embed_local_ml_library=False, + conda_dependencies=["snowflake-ml-python==1.0.2"], + deployment_options={ + "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, + "target_method": "predict", + "options": { + "compute_pool": self._TEST_CPU_COMPUTE_POOL, + "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + }, + }, + ) + + @pytest.mark.pip_incompatible + def test_sklearn_deployment_with_local_source_code(self) -> None: + self._test_snowservice_deployment( + model_name="test_sklearn_model", + model_version=uuid.uuid4().hex, + prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, + deployment_options={ + "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, + "target_method": "predict", + "options": { + "compute_pool": self._TEST_CPU_COMPUTE_POOL, + "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + }, + }, + ) + + @pytest.mark.pip_incompatible + def test_sklearn_deployment(self) -> None: + self._test_snowservice_deployment( + model_name="test_sklearn_model", + model_version=uuid.uuid4().hex, + prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_sklearn_model, + deployment_options={ + "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, + "target_method": "predict", + "options": { + "compute_pool": self._TEST_CPU_COMPUTE_POOL, + "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + }, + }, + ) + + @pytest.mark.pip_incompatible + def test_huggingface_deployment(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + self._test_snowservice_deployment( + model_name="gpt2_model_gpu", + model_version=uuid.uuid4().hex, + conda_dependencies=["pytorch", "transformers"], + prepare_model_and_feature_fn=functools.partial( + model_factory.ModelFactory.prepare_gpt2_model, local_cache_dir=tmpdir + ), + deployment_options={ + "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, + "target_method": "predict", + "options": { + "compute_pool": self._TEST_GPU_COMPUTE_POOL, + "use_gpu": True, + "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + }, + }, + ) + + 
@pytest.mark.pip_incompatible + def test_snowml_model_deployment_logistic_with_sourcecode_embedded_in_model(self) -> None: + self._test_snowservice_deployment( + model_name="snowml", + model_version=uuid.uuid4().hex, + prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_logistic, + deployment_options={ + "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, + "target_method": "predict", + "options": { + "compute_pool": self._TEST_GPU_COMPUTE_POOL, + "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + }, + }, + ) + + # + # TODO[schen], SNOW-861613, investigate xgboost model prediction hanging issue when run with Gunicorn --preload + # def test_snowml_model_deployment_xgboost(self) -> None: + # self._test_snowservice_deployment( + # model_name="snowml", + # model_version=uuid.uuid4().hex, + # prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model, + # deployment_options={ + # "platform": _deployer.TargetPlatform.SNOWPARK_CONTAINER_SERVICE, + # "target_method": "predict", + # "options": { + # "compute_pool": self._TEST_GPU_COMPUTE_POOL, + # } + # }, + # ) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/test_utils/BUILD.bazel b/tests/integ/snowflake/ml/test_utils/BUILD.bazel index d1649786..117ac691 100644 --- a/tests/integ/snowflake/ml/test_utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/test_utils/BUILD.bazel @@ -6,5 +6,21 @@ py_library( name = "db_manager", testonly = True, srcs = ["db_manager.py"], - deps = ["//snowflake/ml/_internal/utils:identifier"], + deps = [ + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/model/_deploy_client/utils:constants" + ], +) + +py_library( + name = "model_factory", + testonly = True, + srcs = ["model_factory.py"], + deps = [ + "//snowflake/ml/modeling/linear_model:logistic_regression", + "//snowflake/ml/modeling/pipeline", + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/preprocessing:one_hot_encoder", + "//snowflake/ml/modeling/xgboost:xgb_classifier", + ] ) diff --git a/tests/integ/snowflake/ml/test_utils/db_manager.py b/tests/integ/snowflake/ml/test_utils/db_manager.py index 1fa50010..29054192 100644 --- a/tests/integ/snowflake/ml/test_utils/db_manager.py +++ b/tests/integ/snowflake/ml/test_utils/db_manager.py @@ -7,6 +7,7 @@ from snowflake import snowpark from snowflake.ml._internal.utils import identifier +from snowflake.ml.model._deploy_client.utils import constants _COMMON_PREFIX = "snowml_test_" @@ -19,6 +20,9 @@ def __init__(self, session: snowpark.Session) -> None: def set_role(self, role: str) -> None: self._session.sql(f"USE ROLE {role}").collect() + def set_warehouse(self, warehouse: str) -> None: + self._session.sql(f"USE WAREHOUSE {warehouse}").collect() + def create_database( self, db_name: str, @@ -262,6 +266,18 @@ def cleanup_user_functions( func_def = func_argments.partition("RETURN")[0].strip() self.drop_function(function_def=func_def, schema_name=schema_name, db_name=db_name, if_exists=True) + def get_snowservice_image_repo( + self, + repo: str, + subdomain: str = constants.DEV_IMAGE_REGISTRY_SUBDOMAIN, + ) -> str: + conn = self._session._conn._conn + org = conn.host.split(".")[1] + account = conn.account + db = conn._database + schema = conn._schema + return f"{org}-{account}.{subdomain}.{constants.PROD_IMAGE_REGISTRY_DOMAIN}/{db}/{schema}/{repo}".lower() + class TestObjectNameGenerator: @staticmethod diff --git 
a/tests/integ/snowflake/ml/test_utils/model_factory.py b/tests/integ/snowflake/ml/test_utils/model_factory.py new file mode 100644 index 00000000..632f5a90 --- /dev/null +++ b/tests/integ/snowflake/ml/test_utils/model_factory.py @@ -0,0 +1,160 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +from typing import List, Tuple + +import numpy as np +import numpy.typing as npt +import pandas as pd +from sklearn import datasets, svm + +from snowflake.ml.model import custom_model +from snowflake.ml.modeling.linear_model import LogisticRegression +from snowflake.ml.modeling.pipeline import Pipeline +from snowflake.ml.modeling.preprocessing import MinMaxScaler, OneHotEncoder +from snowflake.ml.modeling.xgboost import XGBClassifier +from snowflake.snowpark import DataFrame, Session + + +class ModelFactory: + @staticmethod + def prepare_sklearn_model() -> Tuple[svm.SVC, npt.ArrayLike, npt.ArrayLike]: + digits = datasets.load_digits() + target_digit = 6 + num_training_examples = 10 + svc_gamma = 0.001 + svc_C = 10.0 + + clf = svm.SVC(gamma=svc_gamma, C=svc_C, probability=True) + + def one_vs_all(dataset: npt.NDArray[np.float64], digit: int) -> List[bool]: + return [x == digit for x in dataset] + + # Train a classifier using num_training_examples and use the last 100 examples for test. + train_features = digits.data[:num_training_examples] + train_labels = one_vs_all(digits.target[:num_training_examples], target_digit) + clf.fit(train_features, train_labels) + + test_features = digits.data[-100:] + test_labels = one_vs_all(digits.target[-100:], target_digit) + + return clf, test_features, test_labels + + @staticmethod + def prepare_snowml_model() -> Tuple[XGBClassifier, pd.DataFrame]: + iris = datasets.load_iris() + df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) + df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + + input_cols = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + label_cols = "TARGET" + output_cols = "PREDICTED_TARGET" + + clf_xgb = XGBClassifier( + input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, drop_input_cols=True + ) + + clf_xgb.fit(df) + + return clf_xgb, df.drop(columns=label_cols).head(10) + + @staticmethod + def prepare_snowml_pipeline(session: Session) -> Tuple[Pipeline, DataFrame]: + iris = datasets.load_iris() + df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) + df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + + def add_simple_category(df: pd.DataFrame) -> pd.DataFrame: + bins = (-1, 4, 5, 6, 10) + group_names = ["Unknown", "1_quartile", "2_quartile", "3_quartile"] + categories = pd.cut(df.SEPALLENGTH, bins, labels=group_names) + df["SIMPLE"] = categories + return df + + df_cat = add_simple_category(df) + iris_df = session.create_dataframe(df_cat) + + numeric_features = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + categorical_features = ["SIMPLE"] + numeric_features_output = [x + "_O" for x in numeric_features] + label_cols = "TARGET" + + pipeline = Pipeline( + steps=[ + ( + "OHEHOT", + OneHotEncoder(input_cols=categorical_features, output_cols="cat_output", drop_input_cols=True), + ), + ( + "SCALER", + MinMaxScaler( + clip=True, + input_cols=numeric_features, + output_cols=numeric_features_output, + drop_input_cols=True, + ), + ), + # TODO: Remove drop_input_cols=True after SNOW-853632 gets fixed. 
+ ("CLASSIFIER", LogisticRegression(label_cols=label_cols, drop_input_cols=True)), + ] + ) + pipeline.fit(iris_df) + + return pipeline, iris_df.drop(label_cols).limit(10) + + @staticmethod + def prepare_snowml_model_logistic() -> Tuple[LogisticRegression, pd.DataFrame]: + iris = datasets.load_iris() + df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) + df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + + input_cols = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + label_cols = "TARGET" + output_cols = "PREDICTED_TARGET" + + estimator = LogisticRegression( + input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0, max_iter=100 + ).fit(df) + + return estimator, df.drop(columns=label_cols).head(10) + + @staticmethod + def prepare_gpt2_model(local_cache_dir: str = None) -> Tuple[custom_model.CustomModel, pd.DataFrame]: + """ + Pretrained GPT2 model from huggingface. + """ + import torch + from transformers import GPT2LMHeadModel, GPT2Tokenizer + + model = GPT2LMHeadModel.from_pretrained("gpt2", cache_dir=local_cache_dir) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir=local_cache_dir) + + class HuggingFaceModel(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: + torch_device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(torch_device) + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + prompts = input_df.values.flatten().tolist() + inputs = tokenizer(prompts, return_tensors="pt", padding=True) + torch.manual_seed(0) + outputs = model.generate( + input_ids=inputs["input_ids"].to(torch_device), + attention_mask=inputs["attention_mask"].to(torch_device), + ) + generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) + return pd.DataFrame({"output": generated_texts}) + + gpt2_model = HuggingFaceModel(custom_model.ModelContext()) + test_data = pd.DataFrame(["Hello, how are you?", "Once upon a time"]) + + return gpt2_model, test_data