Release snowflake-ml-python: 1.0.11 (#61)
Co-authored-by: Snowflake Authors <[email protected]>
sfc-gh-kdama and Snowflake Authors authored Oct 26, 2023
1 parent 9130a0b commit 73c2cf0
Showing 114 changed files with 3,918 additions and 2,006 deletions.
5 changes: 5 additions & 0 deletions .bazelrc
@@ -14,6 +14,7 @@ coverage --instrumentation_filter="-//tests[/:]"
build:_build --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=build
build:_sf_only --platforms //bazel/platforms:snowflake_conda_env --host_platform //bazel/platforms:snowflake_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=sf_only
build:_extended --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=extended
build:_extended_oss --platforms //bazel/platforms:extended_conda_env --host_platform //bazel/platforms:extended_conda_env --repo_env=BAZEL_CONDA_ENV_NAME=extended_oss

# Public definitions

@@ -35,6 +36,7 @@ run:pre_build --config=_build --config=py3.8

# Config to run type check
build:typecheck --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended --config=py3.8
build:typecheck_oss --aspects @rules_mypy//:mypy.bzl%mypy_aspect --output_groups=mypy --config=_extended_oss --config=py3.8

# Config to build the doc
build:docs --config=_sf_only --config=py3.8
@@ -44,3 +46,6 @@ build:docs --config=_sf_only --config=py3.8
test:extended --config=_extended
run:extended --config=_extended
cquery:extended --config=_extended
test:extended_oss --config=_extended_oss
run:extended_oss --config=_extended_oss
cquery:extended_oss --config=_extended_oss
18 changes: 18 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,23 @@
# Release History

## 1.0.11

### New Features

- Model Registry: Add log_artifact() public method.
- Model Development: Add support for `kneighbors`.

### Behavior Changes

- Model Registry: Change log_model() argument from TrainingDataset to List of Artifact.
- Model Registry: Change get_training_dataset() to get_artifact().

### Bug Fixes

- Model Development: Fix support for XGBoost and LightGBM models using SKLearn Grid Search and Randomized Search model selectors.
- Model Development: DecimalType is now supported as a DataType.
- Model Development: Fix metrics compatibility with Snowpark DataFrames that use Snowflake identifiers.

## 1.0.10

### Behavior Changes
51 changes: 51 additions & 0 deletions CONTRIBUTING.md
@@ -347,6 +347,57 @@ When you add a new test file, you should always ensure the existence of a `if __
the test file will not be run by bazel. We have a test wrapper [here](./bazel/test_wrapper.sh) to ensure that the
test will fail if you forget that part.

## Integration test

### Test in Stored Procedure

To test whether your code works inside a stored procedure, you can base your test on `CommonTestBase` in
`tests/integ/snowflake/ml/test_utils/common_test_base.py`. An example of such a test can be found in
`tests/integ/snowflake/ml/_internal/file_utils_integ_test.py`.

To write such a test, you need to:

1. Let your test case inherit from `common_test_base.CommonTestBase`.
1. Remove all Snowpark Session creation in your test, and use `self.session` to access the session if needed.
1. If you write your own `setUp` and `tearDown` methods, remember to call `super().setUp()` or `super().tearDown()`.
1. Decorate your test method with `common_test_base.CommonTestBase.sproc_test()`. If you want your test to run only
in a stored procedure rather than both locally and in a stored procedure, set `local=False`. If you don't want to test
with caller's rights, set `test_callers_rights=False`. (The owner's rights stored procedure is always tested.)
A minimal sketch is shown after the note below.

**Attention**: Depending on your configuration, one to three sub-tests will run inside your test method. Because they
are sub-tests, `setUp` and `tearDown` won't run around every sub-test; they run only once before and after the whole
test method, so it is important to make your test case self-contained.
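
A minimal sketch, assuming `absltest` as the test runner and the import path of `common_test_base` shown above (both
reflect this repository's test layout rather than requirements of the decorator):

```python
from absl.testing import absltest  # assumed test runner

from tests.integ.snowflake.ml.test_utils import common_test_base


class MyIntegTest(common_test_base.CommonTestBase):
    def setUp(self) -> None:
        super().setUp()  # lets CommonTestBase set up the Snowpark session exposed as self.session

    @common_test_base.CommonTestBase.sproc_test()  # e.g. sproc_test(local=False, test_callers_rights=False)
    def test_simple_query(self) -> None:
        # Use the session provided by the base class instead of creating your own.
        result = self.session.sql("SELECT 1").collect()
        self.assertEqual(result[0][0], 1)


if __name__ == "__main__":
    absltest.main()
```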

### Compatibility Test

To test whether your code is compatible with previous versions of the library, you can base your test on
`CommonTestBase` in `tests/integ/snowflake/ml/test_utils/common_test_base.py`. An example of such a test can be found
in `tests/integ/snowflake/ml/registry/model_registry_compat_test.py`.

To write such a test, you need to:

1. Let your test case inherit from `common_test_base.CommonTestBase`.
1. Remove all Snowpark Session creation in your test, and use `self.session` to access the session if needed.
1. If you write your own `setUp` and `tearDown` methods, remember to call `super().setUp()` or `super().tearDown()`.
1. Write a factory method in your test class that returns a tuple containing a function and a tuple of its arguments.
The function will be run as a stored procedure in an environment with a previous version of the library.

**Note**: Since the function will be created as a stored procedure, its first argument must be a Snowpark Session.
The arguments tuple you provide via the factory method does not need to include the session object.

**Note**: To prevent objects from the current environment from affecting the result, the function is not pickled with
`cloudpickle`; instead, it is written out as a Python file and registered as a stored procedure. This means you cannot
use any object defined outside of the function, and anything you want to import must be imported inside the function
definition. Keep your prepare function as simple as possible.

1. Decorate your test method with `common_test_base.CommonTestBase.compatibility_test`, providing the factory method
you created in the step above, an optional version range to test against, and any additional package requirements.
A sketch is shown after the note below.

**Attention**: For every version that is available on the server and falls within the version range, a sub-test will
run consisting of one run of the prepare function in the stored procedure and one run of the test method. Because they
are sub-tests, `setUp` and `tearDown` won't run around every sub-test; they run only once before and after the whole
test method, so it is important to make your test case self-contained.
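
A sketch under the same assumptions; the decorator's keyword arguments and the registry call inside the prepare
function are illustrative, so check `common_test_base.py` for the actual signature:

```python
from typing import Callable, Tuple

from absl.testing import absltest  # assumed test runner

from snowflake.snowpark import Session
from tests.integ.snowflake.ml.test_utils import common_test_base


class RegistryCompatTest(common_test_base.CommonTestBase):
    def _prepare_registry_factory(self) -> Tuple[Callable[[Session, str], None], Tuple[str]]:
        def prepare_registry(session: Session, registry_db: str) -> None:
            # Imports must live inside the function: it is written out as a Python file and
            # registered as a stored procedure in an environment running a previous version
            # of the library, so nothing from the enclosing module is available.
            from snowflake.ml.registry import model_registry  # illustrative import

            model_registry.create_model_registry(session=session, database_name=registry_db)

        # The Snowpark session is injected by the harness; only the remaining arguments go here.
        return prepare_registry, ("COMPAT_TEST_DB",)

    @common_test_base.CommonTestBase.compatibility_test(
        prepare_fn_factory=_prepare_registry_factory,  # illustrative keyword name
        version_range=">=1.0.6,<1.0.11",  # illustrative version range
    )
    def test_open_registry_created_by_old_version(self) -> None:
        # Runs once per previous version available on the server within the range, after
        # prepare_registry has executed in a stored procedure built against that version.
        ...


if __name__ == "__main__":
    absltest.main()
```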

## `pre-commit`

Pull requests against the main branch are subject to `pre-commit` checks. Those checks enforce the code style.
67 changes: 45 additions & 22 deletions README.md
@@ -2,31 +2,32 @@

Snowpark ML is a set of tools including SDKs and underlying infrastructure to build and deploy machine learning models.
With Snowpark ML, you can pre-process data, train, manage and deploy ML models all within Snowflake, using a single SDK,
and benefit from Snowflake’s proven performance, scalability, stability and governance at every stage of the Machine
Learning workflow.

## Key Components of Snowpark ML

The Snowpark ML Python SDK provides a number of APIs to support each stage of an end-to-end Machine Learning development
and deployment process, and includes two key components.

### Snowpark ML Development [Public Preview]

A collection of python APIs to enable efficient model development directly in Snowflake:
[Snowpark ML Development](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#snowpark-ml-development)
provides a collection of python APIs enabling efficient ML model development directly in Snowflake:

1. Modeling API (snowflake.ml.modeling) for data preprocessing, feature engineering and model training in Snowflake.
This includes snowflake.ml.modeling.preprocessing for scalable data transformations on large data sets utilizing the
compute resources of underlying Snowpark Optimized High Memory Warehouses, and a large collection of ML model
development classes based on sklearn, xgboost, and lightgbm. See the private preview limited access docs (Preprocessing,
Modeling for more details on these.
1. Modeling API (`snowflake.ml.modeling`) for data preprocessing, feature engineering and model training in Snowflake.
This includes the `snowflake.ml.modeling.preprocessing` module for scalable data transformations on large data sets
utilizing the compute resources of underlying Snowpark Optimized High Memory Warehouses, and a large collection of ML
model development classes based on sklearn, xgboost, and lightgbm.

1. Framework Connectors: Optimized, secure and performant data provisioning for Pytorch and Tensorflow frameworks in
their native data loader formats.
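
As a hedged illustration of the Modeling API described above (a sketch only; the class, column names, and connection
parameters are assumptions based on the public documentation rather than code from this commit):

```python
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.snowpark import Session

# Assumed connection parameters and a training table with FEATURE_1, FEATURE_2 and LABEL columns.
connection_parameters = {
    "account": "<account>",
    "user": "<user>",
    "password": "<password>",
    "warehouse": "<warehouse>",
    "database": "<database>",
    "schema": "<schema>",
}
session = Session.builder.configs(connection_parameters).create()
train_df = session.table("MY_TRAINING_TABLE")

clf = XGBClassifier(
    input_cols=["FEATURE_1", "FEATURE_2"],
    label_cols=["LABEL"],
    output_cols=["PREDICTION"],
)
clf.fit(train_df)  # training runs on Snowflake warehouse compute
predictions = clf.predict(train_df)  # returns a Snowpark DataFrame with a PREDICTION column
```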

### Snowpark ML Ops [Private Preview]

Snowpark MLOps complements the Snowpark ML Development API, and provides model management capabilities along with
integrated deployment into Snowflake. Currently, the API consists of
[Snowpark MLOps](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#snowpark-ml-ops) complements the
Snowpark ML Development API, and provides model management capabilities along with integrated deployment into Snowflake.
Currently, the API consists of:

1. FileSet API: FileSet provides a Python fsspec-compliant API for materializing data into a Snowflake internal stage
from a query or Snowpark Dataframe along with a number of convenience APIs.
@@ -37,26 +38,48 @@ Snowflake Warehouses as vectorized UDFs.
During PrPr, we are iterating on the API without backward compatibility guarantees. It is better to recreate your
registry every time you update the package. This means that, at this time, you cannot use the registry in production.

- [Documentation](https://docs.snowflake.com/developer-guide/snowpark-ml)

## Getting started

### Have your Snowflake account ready

If you don't have a Snowflake account yet, you can [sign up for a 30-day free trial account](https://signup.snowflake.com/).

### Create a Python virtual environment
### Installation

Follow the [installation instructions](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#installing-snowpark-ml)
in the Snowflake documentation.

Python version 3.8, 3.9 & 3.10 are supported. You can use [miniconda](https://docs.conda.io/en/latest/miniconda.html),
[anaconda](https://www.anaconda.com/), or [virtualenv](https://docs.python.org/3/tutorial/venv.html) to create a virtual
environment.
Python versions 3.8, 3.9 & 3.10 are supported. You can use [miniconda](https://docs.conda.io/en/latest/miniconda.html) or
[anaconda](https://www.anaconda.com/) to create a Conda environment (recommended),
or [virtualenv](https://docs.python.org/3/tutorial/venv.html) to create a virtual environment.

To have the best experience when using this library, [creating a local conda environment with the Snowflake channel](
https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#local-development-and-testing)
is recommended.
### Conda channels

### Install the library to the Python virtual environment
The [Snowflake Conda Channel](https://repo.anaconda.com/pkgs/snowflake/) contains the official Snowpark ML package releases.
The recommended approach is to install `snowflake-ml-python` from this conda channel:

```sh
pip install snowflake-ml-python
conda install \
-c https://repo.anaconda.com/pkgs/snowflake \
--override-channels \
snowflake-ml-python
```

See [the developer guide](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index) for installation instructions.

The latest version of the `snowflake-ml-python` package is also published in a conda channel in this repository. Package versions
in this channel may not yet be present in the official Snowflake conda channel.

Install `snowflake-ml-python` from this channel with the following (being sure to replace `<version_specifier>` with the
desired version, e.g. `1.0.10`):

```bash
conda install \
-c https://raw.githubusercontent.com/snowflakedb/snowflake-ml-python/conda/releases/ \
-c https://repo.anaconda.com/pkgs/snowflake \
--override-channels \
snowflake-ml-python==<version_specifier>
```

Note that until a `snowflake-ml-python` package version is available in the official Snowflake conda channel, there may
be compatibility issues. Server-side functionality that `snowflake-ml-python` depends on may not yet be released.
4 changes: 4 additions & 0 deletions bazel/environments/conda-env.yml
@@ -58,3 +58,7 @@ dependencies:
- types-requests==2.30.0.0
- typing-extensions==4.5.0
- xgboost==1.7.3
- pip
- pip:
- --extra-index-url https://pypi.org/simple
- peft==0.5.0
6 changes: 6 additions & 0 deletions bazel/environments/fetch_conda_env_config.bzl
@@ -16,6 +16,12 @@ def _fetch_conda_env_config_impl(rctx):
"compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"],
"environment": "@//bazel/environments:conda-env.yml",
},
# `extended_oss` is the extended env for the OSS repo and is a strict subset of `extended`.
# It is intended for development without the dev VPN.
"extended_oss": {
"compatible_target": ["@SnowML//bazel/platforms:extended_conda_channels"],
"environment": "@//bazel/environments:conda-env.yml",
},
"sf_only": {
"compatible_target": ["@SnowML//bazel/platforms:snowflake_conda_channel"],
"environment": "@//bazel/environments:conda-env-snowflake.yml",
17 changes: 15 additions & 2 deletions bazel/requirements/parse_and_generate_requirements.py
@@ -1,6 +1,7 @@
import argparse
import collections
import contextlib
import copy
import functools
import itertools
import json
@@ -146,6 +147,9 @@ def generate_dev_pinned_string(
version = req_info.get("dev_version_conda", req_info.get("dev_version", None))
if version is None:
raise ValueError("No pinned version exists.")
if env == "conda-only":
if "dev_version_conda" in req_info or "dev_version" in req_info:
return None
from_channel = req_info.get("from_channel", None)
if version == "":
version_str = ""
@@ -158,6 +162,9 @@
version = req_info.get("dev_version_pypi", req_info.get("dev_version", None))
if version is None:
raise ValueError("No pinned version exists.")
if env == "pip-only":
if "dev_version_conda" in req_info or "dev_version" in req_info:
return None
if version == "":
version_str = ""
else:
@@ -341,9 +348,15 @@ def generate_requirements(
sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "conda"), requirements)))
)

extended_env: List[Union[str, MutableMapping[str, Sequence[str]]]] = extended_env_conda # type: ignore[assignment]
extended_env: List[Union[str, MutableMapping[str, Sequence[str]]]] = copy.deepcopy(
extended_env_conda # type: ignore[arg-type]
)
# Relative order needs to be maintained here without sorting.
# For external pip-only packages, we want them to be able to access the pypi.org index,
# while for internal pip-only packages, nexus is the only viable index.
# The relative order prevents the nexus index from overriding the public index.
pip_only_reqs = list(
sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "pip-only"), requirements)))
filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "pip-only"), requirements))
)
if pip_only_reqs:
extended_env.extend(["pip", {"pip": pip_only_reqs}])
2 changes: 2 additions & 0 deletions ci/RunBazelAction.sh
@@ -158,6 +158,7 @@ elif [[ "${action}" = "coverage" ]]; then
"${cache_test_results}" \
--combined_report=lcov \
"${coverage_tag_filter}" \
--experimental_collect_code_coverage_for_generated_files \
--target_pattern_file "${sf_only_test_targets_file}"
sf_only_bazel_exit_code=$?

@@ -170,6 +171,7 @@
"${cache_test_results}" \
--combined_report=lcov \
"${coverage_tag_filter}" \
--experimental_collect_code_coverage_for_generated_files \
--target_pattern_file "${extended_test_targets_file}"
extended_bazel_exit_code=$?

4 changes: 2 additions & 2 deletions ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
noarch: python
package:
name: snowflake-ml-python
version: 1.0.10
version: 1.0.11
requirements:
build:
- python
@@ -49,7 +49,7 @@ requirements:
- mlflow>=2.1.0,<2.4
- sentencepiece>=0.1.95,<0.2
- shap==0.42.1
- tensorflow>=2.9,<3
- tensorflow>=2.9,<3,!=2.12.0
- tokenizers>=0.10,<1
- torchdata>=0.4,<1
- transformers>=4.29.2,<5