diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 00000000..ac8279b0 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,243 @@ +--- +# Example markdownlint configuration with all properties set to their default value + +# Default state for all rules +default: true + +# Path to configuration file to extend +extends: +# MD001/heading-increment/header-increment - Heading levels should only increment by one level at a time +MD001: true + +# MD002/first-heading-h1/first-header-h1 - First heading should be a top-level heading +MD002: + # Heading level + level: 1 + +# MD003/heading-style/header-style - Heading style +MD003: + # Heading style + style: consistent + +# MD004/ul-style - Unordered list style +MD004: + # List style + style: consistent +# MD005/list-indent - Inconsistent indentation for list items at the same level +MD005: true + +# MD006/ul-start-left - Consider starting bulleted lists at the beginning of the line +MD006: true + +# MD007/ul-indent - Unordered list indentation +MD007: + # Spaces for indent + indent: 2 + # Whether to indent the first level of the list + start_indented: false + # Spaces for first level indent (when start_indented is set) + start_indent: 2 + +# MD009/no-trailing-spaces - Trailing spaces +MD009: + # Spaces for line break + br_spaces: 2 + # Allow spaces for empty lines in list items + list_item_empty_lines: false + # Include unnecessary breaks + strict: false + +# MD010/no-hard-tabs - Hard tabs +MD010: + # Include code blocks + code_blocks: true + # Fenced code languages to ignore + ignore_code_languages: [] + # Number of spaces for each hard tab + spaces_per_tab: 2 +# MD011/no-reversed-links - Reversed link syntax +MD011: true + +# MD012/no-multiple-blanks - Multiple consecutive blank lines +MD012: + # Consecutive blank lines + maximum: 1 + +# MD013/line-length - Line length +MD013: + # Number of characters + line_length: 120 + # Number of characters for headings + heading_line_length: 80 + # Number of 
characters for code blocks + code_block_line_length: 120 + # Include code blocks + code_blocks: true + # Include tables + tables: true + # Include headings + headings: true + # Include headings + headers: true + # Strict length checking + strict: false + # Stern length checking + stern: false +# MD014/commands-show-output - Dollar signs used before commands without showing output +MD014: true + +# MD018/no-missing-space-atx - No space after hash on atx style heading +MD018: true + +# MD019/no-multiple-space-atx - Multiple spaces after hash on atx style heading +MD019: true + +# MD020/no-missing-space-closed-atx - No space inside hashes on closed atx style heading +MD020: true + +# MD021/no-multiple-space-closed-atx - Multiple spaces inside hashes on closed atx style heading +MD021: true + +# MD022/blanks-around-headings/blanks-around-headers - Headings should be surrounded by blank lines +MD022: + # Blank lines above heading + lines_above: 1 + # Blank lines below heading + lines_below: 1 +# MD023/heading-start-left/header-start-left - Headings must start at the beginning of the line +MD023: true + +# MD024/no-duplicate-heading/no-duplicate-header - Multiple headings with the same content +MD024: + # Only check sibling headings + allow_different_nesting: true + # Only check sibling headings + siblings_only: false + +# MD025/single-title/single-h1 - Multiple top-level headings in the same document +MD025: + # Heading level + level: 1 + # RegExp for matching title in front matter + front_matter_title: ^\s*title\s*[:=] + +# MD026/no-trailing-punctuation - Trailing punctuation in heading +MD026: + # Punctuation characters + punctuation: .,;:!。,;:! 
+# MD027/no-multiple-space-blockquote - Multiple spaces after blockquote symbol +MD027: true + +# MD028/no-blanks-blockquote - Blank line inside blockquote +MD028: true + +# MD029/ol-prefix - Ordered list item prefix +MD029: + # List style + style: one_or_ordered + +# MD030/list-marker-space - Spaces after list markers +MD030: + # Spaces for single-line unordered list items + ul_single: 1 + # Spaces for single-line ordered list items + ol_single: 1 + # Spaces for multi-line unordered list items + ul_multi: 1 + # Spaces for multi-line ordered list items + ol_multi: 1 + +# MD031/blanks-around-fences - Fenced code blocks should be surrounded by blank lines +MD031: + # Include list items + list_items: true +# MD032/blanks-around-lists - Lists should be surrounded by blank lines +MD032: true + +# MD033/no-inline-html - Inline HTML +MD033: + # Allowed elements + allowed_elements: [] +# MD034/no-bare-urls - Bare URL used +MD034: true + +# MD035/hr-style - Horizontal rule style +MD035: + # Horizontal rule style + style: consistent + +# MD036/no-emphasis-as-heading/no-emphasis-as-header - Emphasis used instead of a heading +MD036: + # Punctuation characters + punctuation: .,;:!?。,;:!? 
+# MD037/no-space-in-emphasis - Spaces inside emphasis markers +MD037: true + +# MD038/no-space-in-code - Spaces inside code span elements +MD038: true + +# MD039/no-space-in-links - Spaces inside link text +MD039: true + +# MD040/fenced-code-language - Fenced code blocks should have a language specified +MD040: + # List of languages + allowed_languages: [] + # Require language only + language_only: false + +# MD041/first-line-heading/first-line-h1 - First line in a file should be a top-level heading +MD041: + # Heading level + level: 1 + # RegExp for matching title in front matter + front_matter_title: ^\s*title\s*[:=] +# MD042/no-empty-links - No empty links +MD042: true + +# MD043/required-headings/required-headers - Required heading structure +MD043: false + +# MD044/proper-names - Proper names should have the correct capitalization +MD044: + # List of proper names + names: [] + # Include code blocks + code_blocks: true + # Include HTML elements + html_elements: true +# MD045/no-alt-text - Images should have alternate text (alt text) +MD045: true + +# MD046/code-block-style - Code block style +MD046: + # Block style + style: consistent +# MD047/single-trailing-newline - Files should end with a single newline character +MD047: true + +# MD048/code-fence-style - Code fence style +MD048: + # Code fence style + style: consistent + +# MD049/emphasis-style - Emphasis style should be consistent +MD049: + # Emphasis style + style: consistent + +# MD050/strong-style - Strong style should be consistent +MD050: + # Strong style + style: consistent +# MD051/link-fragments - Link fragments should be valid +MD051: true + +# MD052/reference-links-images - Reference links and images should use a label that is defined +MD052: true + +# MD053/link-image-reference-definitions - Link and image reference definitions should be needed +MD053: + # Ignored definitions + ignored_definitions: + - // diff --git a/BUILD.bazel b/BUILD.bazel index 8debc289..3fb7cb2f 100644 --- a/BUILD.bazel 
+++ b/BUILD.bazel @@ -1,8 +1,12 @@ load("@rules_java//java:defs.bzl", "java_binary") +load("//:packages.bzl", "PACKAGES") +load("//bazel:py_rules.bzl", "py_wheel") +load("//bazel/requirements:rules.bzl", "generate_pyproject_file") exports_files([ "CHANGELOG.md", "README.md", + "LICENSE.txt", "conda-env-extended.yml", "conda-env-snowflake.yml", "conda-env.yml", @@ -16,3 +20,19 @@ java_binary( main_class = "com.bazel_diff.Main", runtime_deps = ["@bazel_diff//jar"], ) + +generate_pyproject_file( + name = "snowml", + src_requirement_file = "//:requirements.yml", +) + +py_wheel( + name = "wheel", + data = [ + "//:CHANGELOG.md", + "//:LICENSE.txt", + "//:README.md", + ], + pyproject_toml = ":snowml_pyproject", + deps = PACKAGES, +) diff --git a/CHANGELOG.md b/CHANGELOG.md index da6550e4..2f9f8392 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Release History +## 1.2.1 + +### New Features + +- Model Development: Infers output column data type for transformers when possible. +- Registry: `relax_version` option is available in the `options` argument when logging the model. + ## 1.2.0 ### Bug Fixes diff --git a/README.md b/README.md index 1610d8b8..af37ef3a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Learning workflow. The Snowpark ML Python SDK provides a number of APIs to support each stage of an end-to-end Machine Learning development and deployment process, and includes two key components. 
-### Snowpark ML Development [Public Preview] +### Snowpark ML Development [Snowpark ML Development](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index#snowpark-ml-development) provides a collection of python APIs enabling efficient ML model development directly in Snowflake: diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 00000000..c3ab485b --- /dev/null +++ b/_typos.toml @@ -0,0 +1,16 @@ +[default] +extend-ignore-identifiers-re = [ + ".*[A-Fa-f0-9]{8,}" +] + +[default.extend-identifiers] +LiteralNDArrayType = "LiteralNDArrayType" + +[default.extend-words] +MAPE = "MAPE" +fpr = "fpr" +fwe = "fwe" + +[type.po] +extend-glob = ["*.ipynb"] +check-file = false diff --git a/bazel/BUILD.bazel b/bazel/BUILD.bazel index 5b29739a..238f81ba 100644 --- a/bazel/BUILD.bazel +++ b/bazel/BUILD.bazel @@ -2,6 +2,10 @@ load("@rules_python//python:defs.bzl", native_py_test = "py_test") package(default_visibility = ["//visibility:public"]) +exports_files([ + "wheelbuilder.py", +]) + native_py_test( name = "repo_paths_test", srcs = ["repo_paths_test.py"], @@ -19,8 +23,9 @@ sh_binary( package_group( name = "snowml_public_common", packages = [ - "//bazel/...", - "//ci/...", - "//docs/...", + "-//codegen/...", + "-//snowflake/...", + "-//tests/...", + "//...", ], ) diff --git a/bazel/environments/conda-env-build.yml b/bazel/environments/conda-env-build.yml index 1bd13e45..fcb8d511 100644 --- a/bazel/environments/conda-env-build.yml +++ b/bazel/environments/conda-env-build.yml @@ -7,6 +7,7 @@ channels: - nodefaults dependencies: - absl-py==1.3.0 + - build==0.10.0 - conda-libmamba-solver==23.7.0 - inflection==0.5.1 - jsonschema==3.2.0 @@ -16,4 +17,6 @@ dependencies: - ruamel.yaml==0.17.21 - scikit-learn==1.3.0 - sphinx==5.0.2 + - toml==0.10.2 + - types-toml==0.10.8.6 - xgboost==1.7.3 diff --git a/bazel/environments/conda-env-snowflake.yml b/bazel/environments/conda-env-snowflake.yml index 0c670d26..9b7fa3f6 100644 --- 
a/bazel/environments/conda-env-snowflake.yml +++ b/bazel/environments/conda-env-snowflake.yml @@ -10,6 +10,7 @@ dependencies: - aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 + - build==0.10.0 - cachetools==4.2.2 - cloudpickle==2.0.0 - conda-libmamba-solver==23.7.0 @@ -50,9 +51,11 @@ dependencies: - sqlparse==0.4.4 - tensorflow==2.10.0 - tokenizers==0.13.2 + - toml==0.10.2 - torchdata==0.6.1 - transformers==4.32.1 - types-protobuf==4.23.0.1 - types-requests==2.30.0.0 + - types-toml==0.10.8.6 - typing-extensions==4.5.0 - xgboost==1.7.3 diff --git a/bazel/environments/conda-env.yml b/bazel/environments/conda-env.yml index febfa4bd..ae47481a 100644 --- a/bazel/environments/conda-env.yml +++ b/bazel/environments/conda-env.yml @@ -10,6 +10,7 @@ dependencies: - aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 + - build==0.10.0 - cachetools==4.2.2 - cloudpickle==2.0.0 - conda-forge::accelerate==0.22.0 @@ -55,10 +56,12 @@ dependencies: - sqlparse==0.4.4 - tensorflow==2.10.0 - tokenizers==0.13.2 + - toml==0.10.2 - torchdata==0.6.1 - transformers==4.32.1 - types-protobuf==4.23.0.1 - types-requests==2.30.0.0 + - types-toml==0.10.8.6 - typing-extensions==4.5.0 - xgboost==1.7.3 - pip diff --git a/bazel/environments/conda-gpu-env.yml b/bazel/environments/conda-gpu-env.yml index 3385d0ff..ab2ac789 100755 --- a/bazel/environments/conda-gpu-env.yml +++ b/bazel/environments/conda-gpu-env.yml @@ -10,6 +10,7 @@ dependencies: - aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 + - build==0.10.0 - cachetools==4.2.2 - cloudpickle==2.0.0 - conda-forge::accelerate==0.22.0 @@ -57,10 +58,12 @@ dependencies: - sqlparse==0.4.4 - tensorflow==2.10.0 - tokenizers==0.13.2 + - toml==0.10.2 - torchdata==0.6.1 - transformers==4.32.1 - types-protobuf==4.23.0.1 - types-requests==2.30.0.0 + - types-toml==0.10.8.6 - typing-extensions==4.5.0 - xgboost==1.7.3 - pip diff --git a/bazel/py_rules.bzl b/bazel/py_rules.bzl index 72d6bd1a..c01e6d60 100644 --- a/bazel/py_rules.bzl +++ b/bazel/py_rules.bzl @@ -34,7 
+34,6 @@ load( native_py_library = "py_library", native_py_test = "py_test", ) -load("@rules_python//python:packaging.bzl", native_py_wheel = "py_wheel") load(":repo_paths.bzl", "check_for_experimental_dependencies", "check_for_test_name", "check_for_tests_dependencies") def py_genrule(**attrs): @@ -244,89 +243,50 @@ This rule is intended to be used as data dependency to py_wheel rule. attrs = py_package_lib.attrs, ) -def py_wheel(compatible_with_snowpark = True, **attrs): - """Modified version of py_wheel rule from rules_python. +def _py_wheel_impl(ctx): + runfiles_root = ctx.executable.wheel_builder.path + ".runfiles" + workspace_name = ctx.workspace_name + execution_root_relative_path = "%s/%s" % (runfiles_root, workspace_name) + wheel_output_dir = ctx.actions.declare_directory("dist") + ctx.actions.run( + inputs = [ctx.file.pyproject_toml], + outputs = [wheel_output_dir], + executable = ctx.executable.wheel_builder, + arguments = [ + ctx.file.pyproject_toml.path, + execution_root_relative_path, + "--wheel", + "--outdir", + wheel_output_dir.path, + ], + progress_message = "Building Wheel", + mnemonic = "WheelBuild", + ) - Args: - compatible_with_snowpark: adds a tag to the wheel to indicate that this - wheel is compatible with the snowpark running environment. - **attrs: attributes supported by the native py_wheel rules. - """ + return [DefaultInfo(files = depset([wheel_output_dir]))] - if compatible_with_snowpark: - tags = attrs.setdefault("tags", []) - tags.append(_COMPATIBLE_WITH_SNOWPARK_TAG) - native_py_wheel(**attrs) - -def snowml_wheel( - name, - requires, - extra_requires, - version, - deps, - description_file = None, - development_status = "Alpha", - compatible_with_snowpark = True): - """A SnowML customized wheel definition with lots of default values filled in. 
+_py_wheel = rule( + attrs = { + "pyproject_toml": attr.label( + mandatory = True, + allow_single_file = True, + ), + "wheel_builder": attr.label( + executable = True, + cfg = "exec", + ), + }, + implementation = _py_wheel_impl, +) - Args: - name: Name of the target - requires: List of required dependencies - extra_requires(Dict[str, List[str]]): Dict of soft dependencies - version: Version string - deps: List of dependencies of type py_package - development_status: String with PrPr, PuPr & GA - description_file: Label of readme file. - compatible_with_snowpark: adds a tag to the wheel to indicate that this - wheel is compatible with the snowpark running environment. - """ - dev_status = "Development Status :: 5 - Production/Stable" - if development_status.lower() == "prpr": - dev_status = "Development Status :: 3 - Alpha" - elif development_status.lower() == "pupr": - dev_status = "Development Status :: 3 - Beta" - homepage = "https://github.com/snowflakedb/snowflake-ml-python" - py_wheel( - name = name, - author = "Snowflake, Inc", - author_email = "support@snowflake.com", - classifiers = [dev_status] + - [ - "Environment :: Console", - "Environment :: Other Environment", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Information Technology", - "Intended Audience :: System Administrators", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Topic :: Database", - "Topic :: Software Development", - "Topic :: Software Development :: Libraries", - "Topic :: Software Development :: Libraries :: Application Frameworks", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Scientific/Engineering :: Information Analysis", - ], - description_file = description_file, - description_content_type = "text/markdown", - 
compatible_with_snowpark = compatible_with_snowpark, - distribution = "snowflake-ml-python", - extra_requires = extra_requires, - homepage = homepage, - project_urls = { - "Changelog": homepage + "/blob/main/CHANGELOG.md", - "Documentation": "https://docs.snowflake.com/developer-guide/snowpark-ml", - "Issues": homepage + "/issues", - "Source": homepage, - }, - license = "Apache License, Version 2.0", - python_requires = ">=3.8,<4", - python_tag = "py3", - requires = requires, - summary = "The machine learning client library that is used for interacting with Snowflake to build machine learning solutions.", - version = version, +def py_wheel(name, deps, data = [], **kwargs): + wheel_builder_name = name + "_wheel_builder_main" + py_binary( + name = wheel_builder_name, + srcs = ["@//bazel:wheelbuilder.py"], + visibility = ["//visibility:public"], + main = "wheelbuilder.py", deps = deps, + data = data, ) + _py_wheel(name = name, wheel_builder = wheel_builder_name, **kwargs) diff --git a/bazel/requirements/BUILD.bazel b/bazel/requirements/BUILD.bazel index 1c8acc9f..0e7daa0e 100644 --- a/bazel/requirements/BUILD.bazel +++ b/bazel/requirements/BUILD.bazel @@ -48,11 +48,6 @@ _GENERATED_REQUIREMENTS_FILES = { "generated": "meta.yaml", "target": "//ci/conda_recipe:meta.yaml", }, - "requirements_bzl": { - "cmd": "--mode version_requirements --format bzl", - "generated": "requirements.bzl", - "target": "//snowflake/ml:requirements.bzl", - }, "requirements_txt": { "cmd": "--mode dev_version --format text", "generated": "requirements.txt", diff --git a/bazel/requirements/parse_and_generate_requirements.py b/bazel/requirements/parse_and_generate_requirements.py index a70a999c..0195a0f9 100644 --- a/bazel/requirements/parse_and_generate_requirements.py +++ b/bazel/requirements/parse_and_generate_requirements.py @@ -22,6 +22,7 @@ ) import jsonschema +import toml from conda_libmamba_solver import solver from packaging import requirements as packaging_requirements from ruamel.yaml 
import YAML @@ -304,6 +305,7 @@ def fold_channel(channels: Set[str], req_info: RequirementInfo) -> Set[str]: def generate_requirements( req_file_path: str, schema_file_path: str, + pyproject_file_path: str, mode: str, format: Optional[str], snowflake_channel_only: bool, @@ -415,9 +417,24 @@ def generate_requirements( ) ) sys.stdout.writelines(results) - elif (mode, format) == ("dev_version", "python"): - sys.stdout.write(f"REQUIREMENTS = {json.dumps(snowflake_only_env, indent=4)}\n") - elif (mode, format) == ("version_requirements", "bzl"): + elif (mode, format) == ("version_requirements", "python"): + results = list( + sorted( + filter( + None, + map( + lambda req_info: generate_user_requirements_string(req_info, "conda"), + filter( + lambda req_info: req_info.get("from_channel", SNOWFLAKE_CONDA_CHANNEL) + == SNOWFLAKE_CONDA_CHANNEL, + requirements, + ), + ), + ), + ) + ) + sys.stdout.write(f"REQUIREMENTS = {json.dumps(results, indent=4)}\n") + elif (mode, format) == ("version_requirements", "toml"): extras_requirements = list(filter(lambda req_info: filter_by_extras(req_info, True, False), requirements)) extras_results: MutableMapping[str, Sequence[str]] = {} all_extras_tags: Set[str] = set() @@ -453,11 +470,11 @@ def generate_requirements( ) ) ) - sys.stdout.write( - "EXTRA_REQUIREMENTS = {extra_requirements}\n\nREQUIREMENTS = {requirements}\n".format( - extra_requirements=json.dumps(extras_results, indent=4), requirements=json.dumps(results, indent=4) - ) - ) + with open(pyproject_file_path, encoding="utf-8") as f: + pyproject_config = toml.load(f) + pyproject_config["project"]["dependencies"] = results + pyproject_config["project"]["optional-dependencies"] = extras_results + toml.dump(pyproject_config, sys.stdout) elif (mode, format) == ("version_requirements", "python"): results = list( sorted( @@ -509,6 +526,7 @@ def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("requirement_file", help="Path to the requirement.yaml file", type=str) 
parser.add_argument("--schema", type=str, help="Path to the json schema file.", required=True) + parser.add_argument("--pyproject-template", type=str, help="PyProject template file.") parser.add_argument( "--mode", type=str, @@ -519,7 +537,7 @@ def main() -> None: parser.add_argument( "--format", type=str, - choices=["text", "bzl", "python", "conda_env", "conda_meta"], + choices=["text", "toml", "python", "conda_env", "conda_meta"], help="Define the output format.", ) parser.add_argument("--filter_by_tag", type=str, default=None, help="Filter the result by tags.") @@ -535,8 +553,8 @@ def main() -> None: VALID_SETTINGS = [ ("validate", None, False), # Validate the environment ("dev_version", "text", False), # requirements.txt - ("dev_version", "python", True), # sproc test dependencies list - ("version_requirements", "bzl", False), # wheel rule requirements + ("version_requirements", "python", True), # sproc test dependencies list + ("version_requirements", "toml", False), # wheel rule requirements ("version_requirements", "python", False), # model deployment core dependencies list ("dev_version", "conda_env", False), # dev conda-env.yml file ("dev_gpu_version", "conda_env", False), # dev conda-gpu-env.yml file @@ -550,6 +568,7 @@ def main() -> None: generate_requirements( args.requirement_file, args.schema, + args.pyproject_template, args.mode, args.format, args.snowflake_channel_only, diff --git a/bazel/requirements/rules.bzl b/bazel/requirements/rules.bzl index e2cd91eb..b65e6ece 100644 --- a/bazel/requirements/rules.bzl +++ b/bazel/requirements/rules.bzl @@ -8,15 +8,32 @@ _AUTOGEN_HEADERS = """# DO NOT EDIT! 
""" _SCHEMA_FILE = "//bazel/requirements:requirements.schema.json" +_PYPROJECT_FILE = "//bazel/requirements/templates:pyproject.toml" _GENERATE_TOOL = "//bazel/requirements:parse_and_generate_requirements" -_GENERATE_COMMAND = "$(location " + _GENERATE_TOOL + ") $(location {src_requirement_file} ) --schema $(location " + _SCHEMA_FILE + ") {options} > $@" +_GENERATE_COMMAND = "$(location {}) $(location {{src_requirement_file}}) --schema $(location {}) {{options}} > $@".format(_GENERATE_TOOL, _SCHEMA_FILE) # "---" is a document start marker, which is legit but optional (https://yaml.org/spec/1.1/#c-document-start). This # is needed for conda meta.yaml to bypass some bug from conda side. _YAML_START_DOCUMENT_MARKER = "---" +def generate_pyproject_file( + name, + src_requirement_file): + _cmd = "$(location {}) $(location {{src_requirement_file}} ) --schema $(location {}) --pyproject-template $(location {}) --mode version_requirements --format toml > $@".format(_GENERATE_TOOL, _SCHEMA_FILE, _PYPROJECT_FILE) + py_genrule( + name = "{name}_pyproject".format(name = name), + srcs = [ + src_requirement_file, + _SCHEMA_FILE, + _PYPROJECT_FILE, + ], + outs = ["pyproject.toml"], + cmd = _cmd.format(src_requirement_file = src_requirement_file), + tools = [_GENERATE_TOOL], + ) + def generate_requirement_file( name, generated_file, diff --git a/bazel/requirements/templates/BUILD.bazel b/bazel/requirements/templates/BUILD.bazel index 9924b59a..ff3448d4 100644 --- a/bazel/requirements/templates/BUILD.bazel +++ b/bazel/requirements/templates/BUILD.bazel @@ -1,3 +1,4 @@ exports_files([ "meta.tpl.yaml", + "pyproject.toml", ]) diff --git a/bazel/requirements/templates/meta.tpl.yaml b/bazel/requirements/templates/meta.tpl.yaml index 7d0b22c2..2ddac5b8 100644 --- a/bazel/requirements/templates/meta.tpl.yaml +++ b/bazel/requirements/templates/meta.tpl.yaml @@ -14,6 +14,8 @@ requirements: - bazel >=6.0.0 run: - python>=3.8,<3.11 + run_constrained: + - pytorch<2.1.0 # [win] about: home: 
https://github.com/snowflakedb/snowflake-ml-python diff --git a/bazel/requirements/templates/pyproject.toml b/bazel/requirements/templates/pyproject.toml new file mode 100644 index 00000000..60ada30c --- /dev/null +++ b/bazel/requirements/templates/pyproject.toml @@ -0,0 +1,51 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "snowflake-ml-python" +authors = [ + {name = "Snowflake, Inc", email = "support@snowflake.com"} +] +description = "The machine learning client library that is used for interacting with Snowflake to build machine learning solutions." +license = {file = "LICENSE.txt"} +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Environment :: Other Environment", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Information Technology", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Database", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Application Frameworks", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering :: Information Analysis" +] +requires-python = ">=3.8, <4" +dynamic = ["version", "readme"] + +[project.urls] +Homepage = "https://github.com/snowflakedb/snowflake-ml-python" +Documentation = "https://docs.snowflake.com/developer-guide/snowpark-ml" +Repository = "https://github.com/snowflakedb/snowflake-ml-python" +Issues = "https://github.com/snowflakedb/snowflake-ml-python/issues" +Changelog = "https://github.com/snowflakedb/snowflake-ml-python/blob/master/CHANGELOG.md" + +[tool.setuptools.dynamic] +version = {attr = 
"snowflake.ml.version.VERSION"} +readme = {file = ["README.md", "CHANGELOG.md"], content-type = "text/markdown"} + +[tool.setuptools.package-data] +"*" = ["*"] + +[tool.setuptools.packages.find] +where = ["."] +include = ["snowflake*"] diff --git a/bazel/wheelbuilder.py b/bazel/wheelbuilder.py new file mode 100644 index 00000000..cc1b62c8 --- /dev/null +++ b/bazel/wheelbuilder.py @@ -0,0 +1,13 @@ +import os +import sys + +from build.__main__ import main + +if __name__ == "__main__": + assert len(sys.argv) >= 3 + pyproject_toml_path = sys.argv[1] + src_path = sys.argv[2] + with open(pyproject_toml_path, encoding="utf-8") as rf: + with open(os.path.join(src_path, "pyproject.toml"), "w", encoding="utf-8") as wf: + wf.write(rf.read()) + main(sys.argv[2:]) diff --git a/ci/build_and_run_tests.sh b/ci/build_and_run_tests.sh index 209b9e1a..f31e5428 100755 --- a/ci/build_and_run_tests.sh +++ b/ci/build_and_run_tests.sh @@ -240,8 +240,8 @@ if [ "${ENV}" = "pip" ]; then # Build SnowML pushd ${SNOWML_DIR} - "${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" build "${BAZEL_ADDITIONAL_BUILD_FLAGS[@]+"${BAZEL_ADDITIONAL_BUILD_FLAGS[@]}"}" //snowflake/ml:wheel - cp "$(${BAZEL} info bazel-bin)/snowflake/ml/snowflake_ml_python-${VERSION}-py3-none-any.whl" "${WORKSPACE}" + "${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" build "${BAZEL_ADDITIONAL_BUILD_FLAGS[@]+"${BAZEL_ADDITIONAL_BUILD_FLAGS[@]}"}" //:wheel + cp "$(${BAZEL} info bazel-bin)/dist/snowflake_ml_python-${VERSION}-py3-none-any.whl" "${WORKSPACE}" popd else # Clean conda cache diff --git a/ci/conda_recipe/bld.bat b/ci/conda_recipe/bld.bat index 6578c90f..9e8f4f01 100644 --- a/ci/conda_recipe/bld.bat +++ b/ci/conda_recipe/bld.bat @@ -1,10 +1,10 @@ @echo off setlocal EnableDelayedExpansion -bazel "--output_user_root=C:\broot" "build" "--repository_cache=" "--nobuild_python_zip" "--enable_runfiles" --action_env="USERPROFILE=%USERPROFILE%" 
--host_action_env="USERPROFILE=%USERPROFILE%" --config="py%PY_VER%" "//snowflake/ml:wheel" || exit /b %errorlevel% +bazel "--output_user_root=C:\broot" "build" "--repository_cache=" "--nobuild_python_zip" "--enable_runfiles" --action_env="USERPROFILE=%USERPROFILE%" --host_action_env="USERPROFILE=%USERPROFILE%" --config="py%PY_VER%" "//:wheel" || exit /b %errorlevel% SET BAZEL_BIN_PATH= FOR /f "delims=" %%a in ('bazel --output_user_root=C:\broot info bazel-bin') DO (SET "BAZEL_BIN_PATH=!BAZEL_BIN_PATH!%%a") -SET WHEEL_PATH_PATTERN="!BAZEL_BIN_PATH:/=\!\snowflake\ml\*.whl" +SET WHEEL_PATH_PATTERN="!BAZEL_BIN_PATH:/=\!\dist\*.whl" SET WHEEL_PATH= FOR /f "delims=" %%a in ('dir /b/s !WHEEL_PATH_PATTERN!') DO (SET "WHEEL_PATH=!WHEEL_PATH!%%a") pip "install" "--no-dependencies" "!WHEEL_PATH!" || exit /b %errorlevel% diff --git a/ci/conda_recipe/build.sh b/ci/conda_recipe/build.sh index f197fa5d..30667d5f 100644 --- a/ci/conda_recipe/build.sh +++ b/ci/conda_recipe/build.sh @@ -2,8 +2,8 @@ set -eux # --repository_cache="" disables the repository cache. The repository cache won't be # cleaned by `bazel clean --expunge`, thus would leave trace. -bazel build --repository_cache="" --config="py${PY_VER}" //snowflake/ml:wheel -pip install --no-dependencies "$(bazel info bazel-bin)"/snowflake/ml/*.whl +bazel build --repository_cache="" --config="py${PY_VER}" //:wheel +pip install --no-dependencies "$(bazel info bazel-bin)"/dist/*.whl # We need to remove all the bazel's output because `conda build` looks at the diff of # the working environment before and after running this script. And bazel's output # are included in that working environment. 
diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 3c244f9d..c2fbc9cb 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -17,7 +17,7 @@ build: noarch: python package: name: snowflake-ml-python - version: 1.2.0 + version: 1.2.1 requirements: build: - python @@ -56,5 +56,6 @@ requirements: - tokenizers>=0.10,<1 - torchdata>=0.4,<1 - transformers>=4.32.1,<5 + - pytorch<2.1.0 # [win] source: path: ../../ diff --git a/ci/get_excluded_tests.sh b/ci/get_excluded_tests.sh index 0b66c7a5..1da9df0c 100755 --- a/ci/get_excluded_tests.sh +++ b/ci/get_excluded_tests.sh @@ -70,7 +70,7 @@ if [[ $mode = "unused" || $mode = "all" ]]; then # -- Begin of Query Rules Heredoc -- cat >"${unused_test_rule_file}" <=2.0.1,<2.1.0; platform_system == "Windows"' + requirements_extra_tags: + - torch - name: pyyaml dev_version: '6.0' version_requirements: '>=6.0,<7' @@ -257,6 +264,10 @@ version_requirements: '>=0.10,<1' requirements_extra_tags: - transformers +- name: toml + dev_version: 0.10.2 + tags: + - build_essential - name: torchdata dev_version: 0.6.1 version_requirements: '>=0.4,<1' @@ -274,6 +285,10 @@ - name: types-PyYAML dev_version: 6.0.12 from_channel: conda-forge +- name: types-toml + dev_version: 0.10.8.6 + tags: + - build_essential - name: typing-extensions dev_version: 4.5.0 version_requirements: '>=4.1.0,<5' diff --git a/snowflake/ml/BUILD.bazel b/snowflake/ml/BUILD.bazel index 35879684..6a67928e 100644 --- a/snowflake/ml/BUILD.bazel +++ b/snowflake/ml/BUILD.bazel @@ -1,6 +1,4 @@ -load("//bazel:py_rules.bzl", "py_library", "snowml_wheel") -load(":packages.bzl", "PACKAGES") -load(":requirements.bzl", "EXTRA_REQUIREMENTS", "REQUIREMENTS") +load("//bazel:py_rules.bzl", "py_library") load(":version.bzl", "VERSION") package(default_visibility = ["//visibility:public"]) @@ -30,14 +28,3 @@ genrule( outs = ["description.md"], cmd = "cat $(location //:README.md) $(location //:CHANGELOG.md) > $@", ) - -snowml_wheel( - name = "wheel", - 
compatible_with_snowpark = False, - description_file = ":description.md", - development_status = "PrPr", - extra_requires = EXTRA_REQUIREMENTS, - requires = REQUIREMENTS, - version = VERSION, - deps = PACKAGES, -) diff --git a/snowflake/ml/_internal/telemetry.py b/snowflake/ml/_internal/telemetry.py index 30ceeb2b..42f30e21 100644 --- a/snowflake/ml/_internal/telemetry.py +++ b/snowflake/ml/_internal/telemetry.py @@ -584,3 +584,22 @@ def send_batch(self) -> None: """Send the telemetry data batch immediately.""" if self._telemetry: self._telemetry.send_batch() + + +def get_sproc_statement_params_kwargs(sproc: Callable[..., Any], statement_params: Dict[str, Any]) -> Dict[str, Any]: + """ + Get statement_params keyword argument for sproc call. + + Args: + sproc: sproc function + statement_params: dictionary to be passed as statement params, if possible + + Returns: + Keyword arguments dict + """ + sproc_argspec = inspect.getfullargspec(sproc) + kwargs = {} + if "statement_params" in sproc_argspec.args: + kwargs["statement_params"] = statement_params + + return kwargs diff --git a/snowflake/ml/_internal/telemetry_test.py b/snowflake/ml/_internal/telemetry_test.py index 95c7b403..671473b2 100644 --- a/snowflake/ml/_internal/telemetry_test.py +++ b/snowflake/ml/_internal/telemetry_test.py @@ -511,6 +511,20 @@ def test_send_custom_usage(self, mock_get_active_sessions: mock.MagicMock) -> No } ) + def test_get_sproc_statement_params_kwargs(self) -> None: + def test_sproc_no_statement_params() -> None: + return None + + def test_sproc_statement_params(statement_params: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]: + return statement_params + + statement_params = {"test": "test"} + kwargs = utils_telemetry.get_sproc_statement_params_kwargs(test_sproc_statement_params, statement_params) + assert "statement_params" in kwargs + + kwargs = utils_telemetry.get_sproc_statement_params_kwargs(test_sproc_no_statement_params, statement_params) + assert 
"statement_params" not in kwargs + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/feature_store/feature_store.py b/snowflake/ml/feature_store/feature_store.py index 34376081..3d4525d9 100644 --- a/snowflake/ml/feature_store/feature_store.py +++ b/snowflake/ml/feature_store/feature_store.py @@ -8,9 +8,10 @@ import warnings from dataclasses import dataclass from enum import Enum -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast +from typing import Callable, Dict, List, Optional, Tuple, TypeVar, Union, cast from pytimeparse.timeparse import timeparse +from typing_extensions import Concatenate, ParamSpec from snowflake.ml._internal import telemetry from snowflake.ml._internal.exceptions import ( @@ -42,6 +43,9 @@ from snowflake.snowpark.exceptions import SnowparkSQLException from snowflake.snowpark.types import StructField +_Args = ParamSpec("_Args") +_RT = TypeVar("_RT") + logger = logging.getLogger(__name__) _ENTITY_TAG_PREFIX = "SNOWML_FEATURE_STORE_ENTITY_" @@ -77,9 +81,11 @@ def full_schema_path(self) -> str: return f"{self.database}.{self.schema}" -def switch_warehouse(f: Callable[..., Any]) -> Callable[..., Any]: +def switch_warehouse( + f: Callable[Concatenate[FeatureStore, _Args], _RT] +) -> Callable[Concatenate[FeatureStore, _Args], _RT]: @functools.wraps(f) - def wrapper(self: FeatureStore, *args: Any, **kargs: Any) -> Any: + def wrapper(self: FeatureStore, /, *args: _Args.args, **kargs: _Args.kwargs) -> _RT: original_warehouse = self._session.get_current_warehouse() try: if self._default_warehouse is not None: @@ -91,13 +97,17 @@ def wrapper(self: FeatureStore, *args: Any, **kargs: Any) -> Any: return wrapper -def dispatch_decorator(prpr_version: str) -> Callable[..., Any]: - def decorator(f: Callable[..., Any]) -> Callable[..., Any]: +def dispatch_decorator( + prpr_version: str, +) -> Callable[[Callable[Concatenate[FeatureStore, _Args], _RT]], Callable[Concatenate[FeatureStore, _Args], _RT],]: + def 
decorator( + f: Callable[Concatenate[FeatureStore, _Args], _RT] + ) -> Callable[Concatenate[FeatureStore, _Args], _RT]: @telemetry.send_api_usage_telemetry(project=_PROJECT) @snowpark_utils.private_preview(version=prpr_version) @switch_warehouse @functools.wraps(f) - def wrap(self: FeatureStore, *args: Any, **kargs: Any) -> Any: + def wrap(self: FeatureStore, /, *args: _Args.args, **kargs: _Args.kwargs) -> _RT: return f(self, *args, **kargs) return wrap @@ -359,7 +369,7 @@ def create_col_desc(col: StructField) -> str: ) from e logger.info(f"Registered FeatureView {feature_view.name}/{version}.") - return self.get_feature_view(feature_view.name, str(version)) # type: ignore[no-any-return] + return self.get_feature_view(feature_view.name, str(version)) @dispatch_decorator(prpr_version="1.1.0") def update_feature_view(self, feature_view: FeatureView) -> None: @@ -751,7 +761,7 @@ def delete_entity(self, name: str) -> None: original_exception=ValueError(f"Entity {name} does not exist."), ) - active_feature_views = list(self.list_feature_views(entity_name=name, as_dataframe=False)) + active_feature_views = cast(List[FeatureView], self.list_feature_views(entity_name=name, as_dataframe=False)) if len(active_feature_views) > 0: raise snowml_exceptions.SnowflakeMLException( error_code=error_codes.SNOWML_DELETE_FAILED, diff --git a/snowflake/ml/feature_store/tests/feature_store_test.py b/snowflake/ml/feature_store/tests/feature_store_test.py index 65246002..6be2f7e7 100644 --- a/snowflake/ml/feature_store/tests/feature_store_test.py +++ b/snowflake/ml/feature_store/tests/feature_store_test.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import List, Optional, Union, cast from uuid import uuid4 from absl.testing import absltest @@ -15,21 +15,22 @@ from snowflake.ml._internal.utils.sql_identifier import SqlIdentifier from snowflake.ml.dataset.dataset import Dataset -from snowflake.ml.feature_store import ( # type: ignore[attr-defined] - CreationMode, - Entity, - 
FeatureStore, - FeatureView, - FeatureViewStatus, -) +from snowflake.ml.feature_store.entity import Entity from snowflake.ml.feature_store.feature_store import ( _ENTITY_TAG_PREFIX, _FEATURE_STORE_OBJECT_TAG, _FEATURE_VIEW_ENTITY_TAG, _FEATURE_VIEW_TS_COL_TAG, + CreationMode, + FeatureStore, +) +from snowflake.ml.feature_store.feature_view import ( + FeatureView, + FeatureViewSlice, + FeatureViewStatus, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions -from snowflake.snowpark import Session, exceptions as snowpark_exceptions +from snowflake.snowpark import DataFrame, Session, exceptions as snowpark_exceptions from snowflake.snowpark.functions import call_udf, col, udf @@ -230,7 +231,12 @@ def test_create_and_delete_entities(self) -> None: # test delete entity failure with active feature views # create a new feature view sql = f'SELECT name, id AS "uid" FROM {self._mock_table}' - fv = FeatureView(name="fv", entities=[entities["User"]], feature_df=self._session.sql(sql), refresh_freq="1m") + fv = FeatureView( + name="fv", + entities=[entities["User"]], + feature_df=self._session.sql(sql), + refresh_freq="1m", + ) fs.register_feature_view(feature_view=fv, version="FIRST") with self.assertRaisesRegex(ValueError, "Cannot delete Entity .* due to active FeatureViews.*"): fs.delete_entity("User") @@ -482,7 +488,7 @@ def test_create_and_delete_feature_views(self) -> None: self.assertEqual(fv.timestamp_col, None) fv = fs.get_feature_view(name="fv0", version="SECOND") - self.assertEqual(fv.timestamp_col.upper(), "TS") + self.assertEqual(str(fv.timestamp_col).upper(), "TS") def test_create_duplicated_feature_view(self) -> None: fs = self._create_feature_store() @@ -491,10 +497,20 @@ def test_create_duplicated_feature_view(self) -> None: fs.register_entity(e) sql = f"SELECT * FROM {self._mock_table}" - fv = FeatureView(name="fv", entities=[e], feature_df=self._session.sql(sql), refresh_freq="1m") + fv = FeatureView( + name="fv", + entities=[e], + 
feature_df=self._session.sql(sql), + refresh_freq="1m", + ) fv = fs.register_feature_view(feature_view=fv, version="v1") - fv = FeatureView(name="fv", entities=[e], feature_df=self._session.sql(sql), refresh_freq="1m") + fv = FeatureView( + name="fv", + entities=[e], + feature_df=self._session.sql(sql), + refresh_freq="1m", + ) with self.assertRaisesRegex(ValueError, "FeatureView .* already exists."): fv = fs.register_feature_view(feature_view=fv, version="v1") @@ -505,7 +521,10 @@ def test_resume_and_suspend_feature_view(self) -> None: fs.register_entity(e) my_fv = FeatureView( - name="my_fv", entities=[e], feature_df=self._session.table(self._mock_table), refresh_freq="DOWNSTREAM" + name="my_fv", + entities=[e], + feature_df=self._session.table(self._mock_table), + refresh_freq="DOWNSTREAM", ) my_fv = fs.register_feature_view(feature_view=my_fv, version="v1", block=True) @@ -526,7 +545,10 @@ def test_resume_and_suspend_feature_view_system_error(self) -> None: e = Entity("foo", ["id"]) fs.register_entity(e) my_fv = FeatureView( - name="my_fv", entities=[e], feature_df=self._session.table(self._mock_table), refresh_freq="DOWNSTREAM" + name="my_fv", + entities=[e], + feature_df=self._session.table(self._mock_table), + refresh_freq="DOWNSTREAM", ) my_fv = fs.register_feature_view(feature_view=my_fv, version="v1", block=True) @@ -557,7 +579,12 @@ def test_read_feature_view(self) -> None: fs.register_entity(e) sql = f"SELECT name, id, title, age, ts FROM {self._mock_table}" - my_fv = FeatureView(name="my_fv", entities=[e], feature_df=self._session.sql(sql), refresh_freq="DOWNSTREAM") + my_fv = FeatureView( + name="my_fv", + entities=[e], + feature_df=self._session.sql(sql), + refresh_freq="DOWNSTREAM", + ) my_fv = fs.register_feature_view(feature_view=my_fv, version="v1", block=True) df = fs.read_feature_view(my_fv) @@ -581,7 +608,10 @@ def test_register_with_cron_expr(self) -> None: sql = f"SELECT name, id, title, age, ts FROM {self._mock_table}" my_fv = FeatureView( 
- name="my_fv", entities=[e], feature_df=self._session.sql(sql), refresh_freq="* * * * * America/Los_Angeles" + name="my_fv", + entities=[e], + feature_df=self._session.sql(sql), + refresh_freq="* * * * * America/Los_Angeles", ).attach_feature_desc({"title": "my title"}) my_fv = fs.register_feature_view( feature_view=my_fv, @@ -636,13 +666,18 @@ def test_retrieve_time_series_feature_values(self) -> None: fv2 = fs.register_feature_view(feature_view=fv2, version="v1", block=True) sql3 = f"SELECT id, dept FROM {self._mock_table}" - fv3 = FeatureView(name="fv3", entities=[e], feature_df=self._session.sql(sql3), refresh_freq="DOWNSTREAM") + fv3 = FeatureView( + name="fv3", + entities=[e], + feature_df=self._session.sql(sql3), + refresh_freq="DOWNSTREAM", + ) fv3 = fs.register_feature_view(feature_view=fv3, version="v1", block=True) spine_df = self._session.create_dataframe([(1, 101), (2, 202), (1, 90)], schema=["id", "ts"]) df = fs.retrieve_feature_values( spine_df=spine_df, - features=[fv1, fv2, fv3], + features=cast(List[Union[FeatureView, FeatureViewSlice]], [fv1, fv2, fv3]), spine_timestamp_col="ts", ) compare_dataframe( @@ -665,15 +700,28 @@ def test_retrieve_feature_values(self) -> None: fs.register_entity(e) sql1 = f"SELECT id, name, title FROM {self._mock_table}" - fv1 = FeatureView(name="fv1", entities=[e], feature_df=self._session.sql(sql1), refresh_freq="DOWNSTREAM") + fv1 = FeatureView( + name="fv1", + entities=[e], + feature_df=self._session.sql(sql1), + refresh_freq="DOWNSTREAM", + ) fv1 = fs.register_feature_view(feature_view=fv1, version="v1", block=True) sql2 = f"SELECT id, age FROM {self._mock_table}" - fv2 = FeatureView(name="fv2", entities=[e], feature_df=self._session.sql(sql2), refresh_freq="DOWNSTREAM") + fv2 = FeatureView( + name="fv2", + entities=[e], + feature_df=self._session.sql(sql2), + refresh_freq="DOWNSTREAM", + ) fv2 = fs.register_feature_view(feature_view=fv2, version="v1", block=True) spine_df = self._session.create_dataframe([(1), 
(2)], schema=["id"]) - df = fs.retrieve_feature_values(spine_df=spine_df, features=[fv1, fv2]) + df = fs.retrieve_feature_values( + spine_df=spine_df, + features=cast(List[Union[FeatureView, FeatureViewSlice]], [fv1, fv2]), + ) compare_dataframe( actual_df=df.to_pandas(), @@ -686,7 +734,10 @@ def test_retrieve_feature_values(self) -> None: sort_cols=["ID"], ) - df = fs.retrieve_feature_values(spine_df=spine_df, features=[fv1.slice(["name"]), fv2]) + df = fs.retrieve_feature_values( + spine_df=spine_df, + features=cast(List[Union[FeatureView, FeatureViewSlice]], [fv1.slice(["name"]), fv2]), + ) compare_dataframe( actual_df=df.to_pandas(), target_data={ @@ -698,7 +749,9 @@ def test_retrieve_feature_values(self) -> None: ) df = fs.retrieve_feature_values( - spine_df=spine_df, features=[fv1.slice(["name"]), fv2], exclude_columns=["NAME"] + spine_df=spine_df, + features=cast(List[Union[FeatureView, FeatureViewSlice]], [fv1.slice(["name"]), fv2]), + exclude_columns=["NAME"], ) compare_dataframe( actual_df=df.to_pandas(), @@ -712,7 +765,7 @@ def test_retrieve_feature_values(self) -> None: # test retrieve_feature_values with serialized feature objects fv1_slice = fv1.slice(["name"]) dataset = fs.generate_dataset(spine_df, features=[fv1_slice, fv2]) - df = fs.retrieve_feature_values(spine_df=spine_df, features=dataset.load_features()) + df = fs.retrieve_feature_values(spine_df=spine_df, features=cast(List[str], dataset.load_features())) compare_dataframe( actual_df=df.to_pandas(), target_data={ @@ -850,11 +903,21 @@ def test_merge_feature_view_slice(self) -> None: # 1. 
Right side is FeatureViewSlice sql1 = f"SELECT id, name FROM {self._mock_table}" - fv1 = FeatureView(name="fv1", entities=[e], feature_df=self._session.sql(sql1), refresh_freq="DOWNSTREAM") + fv1 = FeatureView( + name="fv1", + entities=[e], + feature_df=self._session.sql(sql1), + refresh_freq="DOWNSTREAM", + ) fv1 = fs.register_feature_view(feature_view=fv1, version="v1", block=True) sql2 = f"SELECT id, title, age FROM {self._mock_table}" - fv2 = FeatureView(name="fv2", entities=[e], feature_df=self._session.sql(sql2), refresh_freq="DOWNSTREAM") + fv2 = FeatureView( + name="fv2", + entities=[e], + feature_df=self._session.sql(sql2), + refresh_freq="DOWNSTREAM", + ) fv2 = fs.register_feature_view(feature_view=fv2, version="v1", block=True) merged_fv = fs.merge_features(features=[fv1, fv2.slice(["title"])], name="merged_fv") @@ -872,10 +935,20 @@ def test_merge_feature_view_slice(self) -> None: ) # 2. Left side is FeatureViewSlice - fv3 = FeatureView(name="fv3", entities=[e], feature_df=self._session.sql(sql2), refresh_freq="DOWNSTREAM") + fv3 = FeatureView( + name="fv3", + entities=[e], + feature_df=self._session.sql(sql2), + refresh_freq="DOWNSTREAM", + ) fv3 = fs.register_feature_view(feature_view=fv3, version="v1", block=True) - fv4 = FeatureView(name="fv4", entities=[e], feature_df=self._session.sql(sql1), refresh_freq="DOWNSTREAM") + fv4 = FeatureView( + name="fv4", + entities=[e], + feature_df=self._session.sql(sql1), + refresh_freq="DOWNSTREAM", + ) fv4 = fs.register_feature_view(feature_view=fv4, version="v1", block=True) merged_fv_2 = fs.merge_features(features=[fv3.slice(["title"]), fv4], name="merged_fv_2") @@ -953,23 +1026,39 @@ def test_list_feature_views(self) -> None: # 1. 
Right side is FeatureViewSlice sql1 = f"SELECT id, name, ts FROM {self._mock_table}" fv1 = FeatureView( - name="fv1", entities=[e], feature_df=self._session.sql(sql1), timestamp_col="ts", refresh_freq="DOWNSTREAM" + name="fv1", + entities=[e], + feature_df=self._session.sql(sql1), + timestamp_col="ts", + refresh_freq="DOWNSTREAM", ) fv1.attach_feature_desc({"name": "this is my name col"}) fv1 = fs.register_feature_view(feature_view=fv1, version="v1", block=True) sql2 = f"SELECT id, title, age FROM {self._mock_table}" - fv2 = FeatureView(name="fv2", entities=[e], feature_df=self._session.sql(sql2), refresh_freq="DOWNSTREAM") + fv2 = FeatureView( + name="fv2", + entities=[e], + feature_df=self._session.sql(sql2), + refresh_freq="DOWNSTREAM", + ) fv2 = fs.register_feature_view(feature_view=fv2, version="v1", block=True) self.assertEqual( - sorted(fs.list_feature_views(entity_name="Foo", as_dataframe=False), key=lambda fv: fv.name), [fv1, fv2] + sorted( + cast( + List[FeatureView], + fs.list_feature_views(entity_name="Foo", as_dataframe=False), + ), + key=lambda fv: fv.name, + ), + [fv1, fv2], ) self.assertEqual( fs.list_feature_views(entity_name="foo", feature_view_name="fv1", as_dataframe=False), [fv1], ) - df = fs.list_feature_views() + df = cast(DataFrame, fs.list_feature_views()) self.assertListEqual( df.columns, [ @@ -999,7 +1088,12 @@ def test_list_feature_views_system_error(self) -> None: fs.register_entity(e) sql1 = f"SELECT id, name FROM {self._mock_table}" - fv1 = FeatureView(name="fv1", entities=[e], feature_df=self._session.sql(sql1), refresh_freq="DOWNSTREAM") + fv1 = FeatureView( + name="fv1", + entities=[e], + feature_df=self._session.sql(sql1), + refresh_freq="DOWNSTREAM", + ) fs.register_feature_view(feature_view=fv1, version="v1", block=True) fs._session = create_mock_session( @@ -1048,7 +1142,12 @@ def test_generate_dataset(self) -> None: fs.register_entity(e) sql1 = f"SELECT id, name, title FROM {self._mock_table}" - fv1 = FeatureView(name="fv1", 
entities=[e], feature_df=self._session.sql(sql1), refresh_freq="DOWNSTREAM") + fv1 = FeatureView( + name="fv1", + entities=[e], + feature_df=self._session.sql(sql1), + refresh_freq="DOWNSTREAM", + ) fv1 = fs.register_feature_view(feature_view=fv1, version="v1", block=True) sql2 = f"SELECT id, age, ts FROM {self._mock_table}" @@ -1237,7 +1336,10 @@ def test_clear_feature_store_in_existing_schema(self) -> None: fs.register_entity(e) sql = f"SELECT name, id FROM {self._mock_table}" fv = FeatureView( - name="fv", entities=[e], feature_df=self._session.sql(sql), refresh_freq="* * * * * America/Los_Angeles" + name="fv", + entities=[e], + feature_df=self._session.sql(sql), + refresh_freq="* * * * * America/Los_Angeles", ) fv = fs.register_feature_view(feature_view=fv, version="v1", block=True) @@ -1347,19 +1449,27 @@ def test_update_feature_view(self) -> None: sql = f"SELECT id, name, title FROM {self._mock_table}" alternative_wh = "REGTEST_ML_SMALL" - fv = FeatureView(name="fv1", entities=[e], feature_df=self._session.sql(sql), refresh_freq="DOWNSTREAM") + fv = FeatureView( + name="fv1", + entities=[e], + feature_df=self._session.sql(sql), + refresh_freq="DOWNSTREAM", + ) with self.assertRaisesRegex( - RuntimeError, "Feature view .* must be registered and non-static to update refresh_freq.*" + RuntimeError, + "Feature view .* must be registered and non-static to update refresh_freq.*", ): fv.refresh_freq = "1 minute" with self.assertRaisesRegex( - RuntimeError, "Feature view .* must be registered and non-static to update warehouse.*" + RuntimeError, + "Feature view .* must be registered and non-static to update warehouse.*", ): - fv.warehouse = alternative_wh + fv.warehouse = SqlIdentifier(alternative_wh) with self.assertRaisesRegex( - RuntimeError, "Feature view .* must be registered and non-static so that can be updated.*" + RuntimeError, + "Feature view .* must be registered and non-static so that can be updated.*", ): fs.update_feature_view(fv) @@ -1367,10 +1477,13 
@@ def test_update_feature_view(self) -> None: result = self._session.sql(f"SHOW DYNAMIC TABLES IN SCHEMA {fv.database}.{fv.schema}").collect() self.assertEqual(len(result), 1) self.assertEqual(result[0]["target_lag"], "DOWNSTREAM") - self.assertEqual(SqlIdentifier(result[0]["warehouse"]), SqlIdentifier(self._session.get_current_warehouse())) + self.assertEqual( + SqlIdentifier(result[0]["warehouse"]), + SqlIdentifier(self._session.get_current_warehouse()), + ) fv.refresh_freq = "1 minute" - fv.warehouse = alternative_wh + fv.warehouse = SqlIdentifier(alternative_wh) fs.update_feature_view(fv) result = self._session.sql(f"SHOW DYNAMIC TABLES IN SCHEMA {fv.database}.{fv.schema}").collect() self.assertEqual(len(result), 1) diff --git a/snowflake/ml/fileset/BUILD.bazel b/snowflake/ml/fileset/BUILD.bazel index 6cb4a9bb..ac5e77e5 100644 --- a/snowflake/ml/fileset/BUILD.bazel +++ b/snowflake/ml/fileset/BUILD.bazel @@ -1,4 +1,4 @@ -load("//bazel:py_rules.bzl", "py_library", "py_package", "py_test", "snowml_wheel") +load("//bazel:py_rules.bzl", "py_library", "py_package", "py_test") package(default_visibility = ["//visibility:public"]) @@ -128,34 +128,3 @@ py_library( testonly = True, srcs = ["parquet_test_util.py"], ) - -_TENSORFLOW_REQUIRES = ["tensorflow>=2.9,<3"] - -_PYTORCH_REQUIRES = ["torchdata>=0.4,<1"] - -_ALL_REQUIRES = _TENSORFLOW_REQUIRES + _PYTORCH_REQUIRES - -snowml_wheel( - name = "fileset_wheel", - compatible_with_snowpark = False, - development_status = "PrPr", - extra_requires = { - "all": _ALL_REQUIRES, - "pytorch": _PYTORCH_REQUIRES, - "tensorflow": _TENSORFLOW_REQUIRES, - }, - # TODO(zhuo): consider adding a check to make sure what's listed - # here is a subset that is compatible with what is specified in conda-env.yml. 
- requires = [ - "absl-py>=0.15,<2", - "fsspec[http]>=2022.11,<=2023.1", - "numpy>=1.23,<1.24", - "pyyaml>=6.0,<7", - "snowflake-connector-python[pandas]", - "snowflake-snowpark-python>=1.4.0,<2", - ], - version = "0.2.2", - deps = [ - "//snowflake/ml/fileset:fileset_pkg", - ], -) diff --git a/snowflake/ml/model/_client/ops/BUILD.bazel b/snowflake/ml/model/_client/ops/BUILD.bazel index 1e914e93..81fd72f4 100644 --- a/snowflake/ml/model/_client/ops/BUILD.bazel +++ b/snowflake/ml/model/_client/ops/BUILD.bazel @@ -12,7 +12,6 @@ py_library( deps = [ ":metadata_ops", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/_internal/utils:sql_identifier", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", @@ -34,7 +33,6 @@ py_test( srcs = ["model_ops_test.py"], deps = [ ":model_ops", - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/_internal/utils:sql_identifier", "//snowflake/ml/model:model_signature", "//snowflake/ml/model/_signatures:snowpark_handler", diff --git a/snowflake/ml/model/_client/ops/model_ops.py b/snowflake/ml/model/_client/ops/model_ops.py index 9794807e..d7ae1f8c 100644 --- a/snowflake/ml/model/_client/ops/model_ops.py +++ b/snowflake/ml/model/_client/ops/model_ops.py @@ -4,9 +4,8 @@ from typing import Any, Dict, List, Optional, Union, cast import yaml -from packaging import version -from snowflake.ml._internal.utils import identifier, snowflake_env, sql_identifier +from snowflake.ml._internal.utils import identifier, sql_identifier from snowflake.ml.model import model_signature, type_hints from snowflake.ml.model._client.ops import metadata_ops from snowflake.ml.model._client.sql import ( @@ -25,8 +24,6 @@ from snowflake.snowpark import dataframe, row, session from snowflake.snowpark._internal import utils as snowpark_utils -_TAG_ON_MODEL_AVAILABLE_VERSION = version.parse("8.2.0") - class ModelOperator: def __init__( @@ -296,21 +293,14 @@ def set_tag( 
tag_value: str, statement_params: Optional[Dict[str, Any]] = None, ) -> None: - sf_version = snowflake_env.get_current_snowflake_version(self._session, statement_params=statement_params) - if sf_version >= _TAG_ON_MODEL_AVAILABLE_VERSION: - self._tag_client.set_tag_on_model( - model_name=model_name, - tag_database_name=tag_database_name, - tag_schema_name=tag_schema_name, - tag_name=tag_name, - tag_value=tag_value, - statement_params=statement_params, - ) - else: - raise NotImplementedError( - f"`set_tag` won't work before Snowflake version {_TAG_ON_MODEL_AVAILABLE_VERSION}," - f" currently is {sf_version}" - ) + self._tag_client.set_tag_on_model( + model_name=model_name, + tag_database_name=tag_database_name, + tag_schema_name=tag_schema_name, + tag_name=tag_name, + tag_value=tag_value, + statement_params=statement_params, + ) def unset_tag( self, @@ -321,20 +311,13 @@ def unset_tag( tag_name: sql_identifier.SqlIdentifier, statement_params: Optional[Dict[str, Any]] = None, ) -> None: - sf_version = snowflake_env.get_current_snowflake_version(self._session, statement_params=statement_params) - if sf_version >= _TAG_ON_MODEL_AVAILABLE_VERSION: - self._tag_client.unset_tag_on_model( - model_name=model_name, - tag_database_name=tag_database_name, - tag_schema_name=tag_schema_name, - tag_name=tag_name, - statement_params=statement_params, - ) - else: - raise NotImplementedError( - f"`unset_tag` won't work before Snowflake version {_TAG_ON_MODEL_AVAILABLE_VERSION}," - f" currently is {sf_version}" - ) + self._tag_client.unset_tag_on_model( + model_name=model_name, + tag_database_name=tag_database_name, + tag_schema_name=tag_schema_name, + tag_name=tag_name, + statement_params=statement_params, + ) def get_model_version_manifest( self, @@ -382,11 +365,6 @@ def get_client_data_in_user_data( version_name: sql_identifier.SqlIdentifier, statement_params: Optional[Dict[str, Any]] = None, ) -> model_manifest_schema.SnowparkMLDataDict: - if ( - 
snowflake_env.get_current_snowflake_version(self._session) - < model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION - ): - raise NotImplementedError("User_data has not been supported yet.") raw_user_data_json_string = self._model_client.show_versions( model_name=model_name, version_name=version_name, diff --git a/snowflake/ml/model/_client/ops/model_ops_test.py b/snowflake/ml/model/_client/ops/model_ops_test.py index dd041e3c..48729902 100644 --- a/snowflake/ml/model/_client/ops/model_ops_test.py +++ b/snowflake/ml/model/_client/ops/model_ops_test.py @@ -10,9 +10,8 @@ import pandas as pd import yaml from absl.testing import absltest -from packaging import version -from snowflake.ml._internal.utils import snowflake_env, sql_identifier +from snowflake.ml._internal.utils import sql_identifier from snowflake.ml.model import model_signature from snowflake.ml.model._client.ops import model_ops from snowflake.ml.model._model_composer.model_manifest import model_manifest_schema @@ -41,9 +40,6 @@ def setUp(self) -> None: database_name=sql_identifier.SqlIdentifier("TEMP"), schema_name=sql_identifier.SqlIdentifier("test", case_sensitive=True), ) - snowflake_env.get_current_snowflake_version = mock.MagicMock( - return_value=model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION - ) def test_prepare_model_stage_path(self) -> None: with mock.patch.object(self.m_ops._stage_client, "create_tmp_stage",) as mock_create_stage, mock.patch.object( @@ -356,29 +352,8 @@ def test_show_tags(self) -> None: statement_params=self.m_statement_params, ) - def test_set_tag_fail(self) -> None: - with mock.patch.object( - snowflake_env, - "get_current_snowflake_version", - return_value=version.parse("8.1.0+23d9c914e5"), - ), mock.patch.object(self.m_ops._tag_client, "set_tag_on_model") as mock_set_tag: - with self.assertRaisesRegex(NotImplementedError, "`set_tag` won't work before Snowflake version"): - self.m_ops.set_tag( - model_name=sql_identifier.SqlIdentifier("MODEL"), - 
tag_database_name=sql_identifier.SqlIdentifier("DB"), - tag_schema_name=sql_identifier.SqlIdentifier("schema", case_sensitive=True), - tag_name=sql_identifier.SqlIdentifier("MYTAG"), - tag_value="tag content", - statement_params=self.m_statement_params, - ) - mock_set_tag.assert_not_called() - def test_set_tag(self) -> None: - with mock.patch.object( - snowflake_env, - "get_current_snowflake_version", - return_value=version.parse("8.2.0+23d9c914e5"), - ), mock.patch.object(self.m_ops._tag_client, "set_tag_on_model") as mock_set_tag: + with mock.patch.object(self.m_ops._tag_client, "set_tag_on_model") as mock_set_tag: self.m_ops.set_tag( model_name=sql_identifier.SqlIdentifier("MODEL"), tag_database_name=sql_identifier.SqlIdentifier("DB"), @@ -396,28 +371,8 @@ def test_set_tag(self) -> None: statement_params=self.m_statement_params, ) - def test_unset_tag_fail(self) -> None: - with mock.patch.object( - snowflake_env, - "get_current_snowflake_version", - return_value=version.parse("8.1.0+23d9c914e5"), - ), mock.patch.object(self.m_ops._tag_client, "unset_tag_on_model") as mock_unset_tag: - with self.assertRaisesRegex(NotImplementedError, "`unset_tag` won't work before Snowflake version"): - self.m_ops.unset_tag( - model_name=sql_identifier.SqlIdentifier("MODEL"), - tag_database_name=sql_identifier.SqlIdentifier("DB"), - tag_schema_name=sql_identifier.SqlIdentifier("schema", case_sensitive=True), - tag_name=sql_identifier.SqlIdentifier("MYTAG"), - statement_params=self.m_statement_params, - ) - mock_unset_tag.assert_not_called() - def test_unset_tag(self) -> None: - with mock.patch.object( - snowflake_env, - "get_current_snowflake_version", - return_value=version.parse("8.2.0+23d9c914e5"), - ), mock.patch.object(self.m_ops._tag_client, "unset_tag_on_model") as mock_unset_tag: + with mock.patch.object(self.m_ops._tag_client, "unset_tag_on_model") as mock_unset_tag: self.m_ops.unset_tag( model_name=sql_identifier.SqlIdentifier("MODEL"), 
tag_database_name=sql_identifier.SqlIdentifier("DB"), diff --git a/snowflake/ml/model/_client/sql/BUILD.bazel b/snowflake/ml/model/_client/sql/BUILD.bazel index 8857ea8b..567ee9a7 100644 --- a/snowflake/ml/model/_client/sql/BUILD.bazel +++ b/snowflake/ml/model/_client/sql/BUILD.bazel @@ -22,7 +22,6 @@ py_test( srcs = ["model_test.py"], deps = [ ":model", - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/_internal/utils:sql_identifier", "//snowflake/ml/model/_model_composer/model_manifest:model_manifest_schema", "//snowflake/ml/test_utils:mock_data_frame", diff --git a/snowflake/ml/model/_client/sql/model.py b/snowflake/ml/model/_client/sql/model.py index f07c02dd..0ad47535 100644 --- a/snowflake/ml/model/_client/sql/model.py +++ b/snowflake/ml/model/_client/sql/model.py @@ -3,10 +3,8 @@ from snowflake.ml._internal.utils import ( identifier, query_result_checker, - snowflake_env, sql_identifier, ) -from snowflake.ml.model._model_composer.model_manifest import model_manifest_schema from snowflake.snowpark import row, session @@ -89,12 +87,8 @@ def show_versions( .has_column(ModelSQLClient.MODEL_VERSION_NAME_COL_NAME, allow_empty=True) .has_column(ModelSQLClient.MODEL_VERSION_COMMENT_COL_NAME, allow_empty=True) .has_column(ModelSQLClient.MODEL_VERSION_METADATA_COL_NAME, allow_empty=True) + .has_column(ModelSQLClient.MODEL_VERSION_USER_DATA_COL_NAME, allow_empty=True) ) - if ( - snowflake_env.get_current_snowflake_version(self._session) - >= model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION - ): - res = res.has_column(ModelSQLClient.MODEL_VERSION_USER_DATA_COL_NAME, allow_empty=True) if validate_result and version_name: res = res.has_dimensions(expected_rows=1) diff --git a/snowflake/ml/model/_client/sql/model_test.py b/snowflake/ml/model/_client/sql/model_test.py index ba6fab90..93a308e9 100644 --- a/snowflake/ml/model/_client/sql/model_test.py +++ b/snowflake/ml/model/_client/sql/model_test.py @@ -1,11 +1,9 @@ from typing import cast -from 
unittest import mock from absl.testing import absltest -from snowflake.ml._internal.utils import snowflake_env, sql_identifier +from snowflake.ml._internal.utils import sql_identifier from snowflake.ml.model._client.sql import model as model_sql -from snowflake.ml.model._model_composer.model_manifest import model_manifest_schema from snowflake.ml.test_utils import mock_data_frame, mock_session from snowflake.snowpark import Row, Session @@ -13,9 +11,6 @@ class ModelSQLTest(absltest.TestCase): def setUp(self) -> None: self.m_session = mock_session.MockSession(conn=None, test_case=self) - snowflake_env.get_current_snowflake_version = mock.MagicMock( - return_value=model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION - ) def test_show_models_1(self) -> None: m_statement_params = {"test": "1"} diff --git a/snowflake/ml/model/_client/sql/model_version.py b/snowflake/ml/model/_client/sql/model_version.py index 18ba0a55..7861d2a4 100644 --- a/snowflake/ml/model/_client/sql/model_version.py +++ b/snowflake/ml/model/_client/sql/model_version.py @@ -146,24 +146,29 @@ def invoke_method( returns: List[Tuple[str, spt.DataType, sql_identifier.SqlIdentifier]], statement_params: Optional[Dict[str, Any]] = None, ) -> dataframe.DataFrame: - tmp_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE) - INTERMEDIATE_TABLE_NAME = identifier.get_schema_level_object_identifier( - self._database_name.identifier(), - self._schema_name.identifier(), - tmp_table_name, - ) - input_df.write.save_as_table( # type: ignore[call-overload] - table_name=INTERMEDIATE_TABLE_NAME, - mode="errorifexists", - table_type="temporary", - statement_params=statement_params, - ) + with_statements = [] + if len(input_df.queries["queries"]) == 1 and len(input_df.queries["post_actions"]) == 0: + INTERMEDIATE_TABLE_NAME = "SNOWPARK_ML_MODEL_INFERENCE_INPUT" + with_statements.append(f"{INTERMEDIATE_TABLE_NAME} AS ({input_df.queries['queries'][0]})") + else: + tmp_table_name = 
snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE) + INTERMEDIATE_TABLE_NAME = identifier.get_schema_level_object_identifier( + self._database_name.identifier(), + self._schema_name.identifier(), + tmp_table_name, + ) + input_df.write.save_as_table( # type: ignore[call-overload] + table_name=INTERMEDIATE_TABLE_NAME, + mode="errorifexists", + table_type="temporary", + statement_params=statement_params, + ) INTERMEDIATE_OBJ_NAME = "TMP_RESULT" module_version_alias = "MODEL_VERSION_ALIAS" - model_version_alias_sql = ( - f"WITH {module_version_alias} AS " + with_statements.append( + f"{module_version_alias} AS " f"MODEL {self.fully_qualified_model_name(model_name)} VERSION {version_name.identifier()}" ) @@ -174,7 +179,7 @@ def invoke_method( args_sql = ", ".join(args_sql_list) sql = textwrap.dedent( - f"""{model_version_alias_sql} + f"""WITH {','.join(with_statements)} SELECT *, {module_version_alias}!{method_name.identifier()}({args_sql}) AS {INTERMEDIATE_OBJ_NAME} FROM {INTERMEDIATE_TABLE_NAME}""" diff --git a/snowflake/ml/model/_client/sql/model_version_test.py b/snowflake/ml/model/_client/sql/model_version_test.py index 0557fe77..0fc1c3e1 100644 --- a/snowflake/ml/model/_client/sql/model_version_test.py +++ b/snowflake/ml/model/_client/sql/model_version_test.py @@ -95,8 +95,9 @@ def test_get_file(self) -> None: collect_result=[Row(file="946964364/MANIFEST.yml", size=419, status="DOWNLOADED", message="")], collect_statement_params=m_statement_params, ) + path = pathlib.Path("/tmp").resolve().as_posix() self.m_session.add_mock_sql( - """GET 'snow://model/TEMP."test".MODEL/versions/v1/model.yaml' 'file:///tmp'""", m_df + f"""GET 'snow://model/TEMP."test".MODEL/versions/v1/model.yaml' 'file://{path}'""", m_df ) c_session = cast(Session, self.m_session) res = model_version_sql.ModelVersionSQLClient( @@ -126,6 +127,7 @@ def test_invoke_method(self) -> None: c_session = cast(Session, self.m_session) mock_writer = mock.MagicMock() 
m_df.__setattr__("write", mock_writer) + m_df.__setattr__("queries", {"queries": ["query_1", "query_2"], "post_actions": []}) with mock.patch.object(mock_writer, "save_as_table") as mock_save_as_table, mock.patch.object( snowpark_utils, "random_name_for_temp_object", return_value="SNOWPARK_TEMP_TABLE_ABCDEF0123" ) as mock_random_name_for_temp_object: @@ -150,6 +152,73 @@ def test_invoke_method(self) -> None: statement_params=m_statement_params, ) + def test_invoke_method_1(self) -> None: + m_statement_params = {"test": "1"} + m_df = mock_data_frame.MockDataFrame() + self.m_session.add_mock_sql( + """WITH MODEL_VERSION_ALIAS AS MODEL TEMP."test".MODEL VERSION V1 + SELECT *, + MODEL_VERSION_ALIAS!PREDICT(COL1, COL2) AS TMP_RESULT + FROM TEMP."test".SNOWPARK_TEMP_TABLE_ABCDEF0123""", + m_df, + ) + m_df.add_mock_with_columns(["OUTPUT_1"], [F.col("OUTPUT_1")]).add_mock_drop("TMP_RESULT") + c_session = cast(Session, self.m_session) + mock_writer = mock.MagicMock() + m_df.__setattr__("write", mock_writer) + m_df.__setattr__("queries", {"queries": ["query_1"], "post_actions": ["query_2"]}) + with mock.patch.object(mock_writer, "save_as_table") as mock_save_as_table, mock.patch.object( + snowpark_utils, "random_name_for_temp_object", return_value="SNOWPARK_TEMP_TABLE_ABCDEF0123" + ) as mock_random_name_for_temp_object: + model_version_sql.ModelVersionSQLClient( + c_session, + database_name=sql_identifier.SqlIdentifier("TEMP"), + schema_name=sql_identifier.SqlIdentifier("test", case_sensitive=True), + ).invoke_method( + model_name=sql_identifier.SqlIdentifier("MODEL"), + version_name=sql_identifier.SqlIdentifier("V1"), + method_name=sql_identifier.SqlIdentifier("PREDICT"), + input_df=cast(DataFrame, m_df), + input_args=[sql_identifier.SqlIdentifier("COL1"), sql_identifier.SqlIdentifier("COL2")], + returns=[("output_1", spt.IntegerType(), sql_identifier.SqlIdentifier("OUTPUT_1"))], + statement_params=m_statement_params, + ) + 
mock_random_name_for_temp_object.assert_called_once_with(snowpark_utils.TempObjectType.TABLE) + mock_save_as_table.assert_called_once_with( + table_name='TEMP."test".SNOWPARK_TEMP_TABLE_ABCDEF0123', + mode="errorifexists", + table_type="temporary", + statement_params=m_statement_params, + ) + + def test_invoke_method_2(self) -> None: + m_statement_params = {"test": "1"} + m_df = mock_data_frame.MockDataFrame() + self.m_session.add_mock_sql( + """WITH SNOWPARK_ML_MODEL_INFERENCE_INPUT AS (query_1), + MODEL_VERSION_ALIAS AS MODEL TEMP."test".MODEL VERSION V1 + SELECT *, + MODEL_VERSION_ALIAS!PREDICT(COL1, COL2) AS TMP_RESULT + FROM SNOWPARK_ML_MODEL_INFERENCE_INPUT""", + m_df, + ) + m_df.add_mock_with_columns(["OUTPUT_1"], [F.col("OUTPUT_1")]).add_mock_drop("TMP_RESULT") + c_session = cast(Session, self.m_session) + m_df.__setattr__("queries", {"queries": ["query_1"], "post_actions": []}) + model_version_sql.ModelVersionSQLClient( + c_session, + database_name=sql_identifier.SqlIdentifier("TEMP"), + schema_name=sql_identifier.SqlIdentifier("test", case_sensitive=True), + ).invoke_method( + model_name=sql_identifier.SqlIdentifier("MODEL"), + version_name=sql_identifier.SqlIdentifier("V1"), + method_name=sql_identifier.SqlIdentifier("PREDICT"), + input_df=cast(DataFrame, m_df), + input_args=[sql_identifier.SqlIdentifier("COL1"), sql_identifier.SqlIdentifier("COL2")], + returns=[("output_1", spt.IntegerType(), sql_identifier.SqlIdentifier("OUTPUT_1"))], + statement_params=m_statement_params, + ) + def test_set_metadata(self) -> None: m_statement_params = {"test": "1"} m_df = mock_data_frame.MockDataFrame(collect_result=[Row("")], collect_statement_params=m_statement_params) diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py b/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py index 4bbfa6df..9406ddae 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy_test.py @@ 
-85,7 +85,7 @@ def test_get_model_final_packages(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: env_utils._SNOWFLAKE_INFO_SCHEMA_PACKAGE_CACHE = {} with model_meta.create_model_metadata( - model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, _legacy_save=True ) as meta: meta.models["model1"] = _DUMMY_BLOB c_session = cast(session.Session, self.m_session) @@ -102,6 +102,7 @@ def test_get_model_final_packages_no_relax(self) -> None: model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB c_session = cast(session.Session, self.m_session) @@ -112,10 +113,7 @@ def test_get_model_final_packages_relax(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: env_utils._SNOWFLAKE_INFO_SCHEMA_PACKAGE_CACHE = {} with model_meta.create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures=_DUMMY_SIG, + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, _legacy_save=True ) as meta: meta.models["model1"] = _DUMMY_BLOB c_session = cast(session.Session, self.m_session) @@ -135,6 +133,7 @@ def test_get_model_final_packages_with_pip(self) -> None: model_type="custom", signatures=_DUMMY_SIG, pip_requirements=["python-package"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB c_session = cast(session.Session, self.m_session) @@ -150,6 +149,7 @@ def test_get_model_final_packages_with_other_channel(self) -> None: model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["conda-forge::python_package"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB c_session = cast(session.Session, self.m_session) @@ -176,6 +176,7 @@ def test_get_model_final_packages_with_non_exist_package(self) -> None: model_type="custom", signatures=_DUMMY_SIG, 
conda_dependencies=["python-package"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB c_session = cast(session.Session, self.m_session) diff --git a/snowflake/ml/model/_model_composer/model_manifest/BUILD.bazel b/snowflake/ml/model/_model_composer/model_manifest/BUILD.bazel index 0c6b509c..bef4b9c7 100644 --- a/snowflake/ml/model/_model_composer/model_manifest/BUILD.bazel +++ b/snowflake/ml/model/_model_composer/model_manifest/BUILD.bazel @@ -17,7 +17,6 @@ py_library( srcs = ["model_manifest.py"], deps = [ ":model_manifest_schema", - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/model/_model_composer/model_method", "//snowflake/ml/model/_model_composer/model_method:function_generator", "//snowflake/ml/model/_model_composer/model_runtime", @@ -43,7 +42,6 @@ py_test( deps = [ ":model_manifest", "//snowflake/ml/_internal:env_utils", - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_packager/model_meta", diff --git a/snowflake/ml/model/_model_composer/model_manifest/model_manifest.py b/snowflake/ml/model/_model_composer/model_manifest/model_manifest.py index 78a782cf..eed7a1bf 100644 --- a/snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +++ b/snowflake/ml/model/_model_composer/model_manifest/model_manifest.py @@ -4,7 +4,6 @@ import yaml -from snowflake.ml._internal.utils import snowflake_env from snowflake.ml.model import type_hints from snowflake.ml.model._model_composer.model_manifest import model_manifest_schema from snowflake.ml.model._model_composer.model_method import ( @@ -84,11 +83,7 @@ def save( ], ) - if ( - snowflake_env.get_current_snowflake_version(session) - >= model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION - ): - manifest_dict["user_data"] = self.generate_user_data_with_client_data(model_meta) + manifest_dict["user_data"] = self.generate_user_data_with_client_data(model_meta) with 
(self.workspace_path / ModelManifest.MANIFEST_FILE_REL_PATH).open("w", encoding="utf-8") as f: # Anchors are not supported in the server, avoid that. diff --git a/snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py b/snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py index bf2d5473..8c303be0 100644 --- a/snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +++ b/snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py @@ -2,14 +2,12 @@ from typing import Any, Dict, List, Literal, TypedDict -from packaging import version from typing_extensions import NotRequired, Required from snowflake.ml.model import model_signature MODEL_MANIFEST_VERSION = "1.0" -MANIFEST_USER_DATA_ENABLE_VERSION = version.parse("8.2.0") MANIFEST_CLIENT_DATA_KEY_NAME = "snowpark_ml_data" MANIFEST_CLIENT_DATA_SCHEMA_VERSION = "2024-02-01" diff --git a/snowflake/ml/model/_model_composer/model_manifest/model_manifest_test.py b/snowflake/ml/model/_model_composer/model_manifest/model_manifest_test.py index acc159cb..24409fe1 100644 --- a/snowflake/ml/model/_model_composer/model_manifest/model_manifest_test.py +++ b/snowflake/ml/model/_model_composer/model_manifest/model_manifest_test.py @@ -7,10 +7,8 @@ import importlib_resources import yaml from absl.testing import absltest -from packaging import version from snowflake.ml._internal import env_utils -from snowflake.ml._internal.utils import snowflake_env from snowflake.ml.model import model_signature, type_hints from snowflake.ml.model._model_composer.model_manifest import ( model_manifest, @@ -44,68 +42,6 @@ class ModelManifestTest(absltest.TestCase): def setUp(self) -> None: self.m_session = mock.MagicMock() - snowflake_env.get_current_snowflake_version = mock.MagicMock( - return_value=model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION - ) - - def test_model_manifest_old(self) -> None: - snowflake_env.get_current_snowflake_version = 
mock.MagicMock(return_value=version.parse("8.0.0")) - with tempfile.TemporaryDirectory() as workspace, tempfile.TemporaryDirectory() as tmpdir: - mm = model_manifest.ModelManifest(pathlib.Path(workspace)) - with model_meta.create_model_metadata( - model_dir_path=tmpdir, - name="model1", - model_type="custom", - signatures={"predict": _DUMMY_SIG["predict"], "__call__": _DUMMY_SIG["predict"]}, - python_version="3.8", - ) as meta: - meta.models["model1"] = _DUMMY_BLOB - with mock.patch.object( - env_utils, - "get_matched_package_versions_in_information_schema", - return_value={env_utils.SNOWPARK_ML_PKG_NAME: []}, - ): - mm.save( - self.m_session, - meta, - pathlib.PurePosixPath("model.zip"), - options=type_hints.BaseModelSaveOption( - method_options={ - "predict": type_hints.ModelMethodSaveOptions(case_sensitive=True), - "__call__": type_hints.ModelMethodSaveOptions(max_batch_size=10), - } - ), - ) - with open(os.path.join(workspace, "MANIFEST.yml"), encoding="utf-8") as f: - self.assertEqual( - ( - importlib_resources.files("snowflake.ml.model._model_composer.model_manifest") - .joinpath("fixtures") # type: ignore[no-untyped-call] - .joinpath("MANIFEST_0.yml") - .read_text() - ), - f.read(), - ) - with open(pathlib.Path(workspace, "functions", "predict.py"), encoding="utf-8") as f: - self.assertEqual( - ( - importlib_resources.files("snowflake.ml.model._model_composer.model_method") - .joinpath("fixtures") # type: ignore[no-untyped-call] - .joinpath("function_1.py") - .read_text() - ), - f.read(), - ) - with open(pathlib.Path(workspace, "functions", "__call__.py"), encoding="utf-8") as f: - self.assertEqual( - ( - importlib_resources.files("snowflake.ml.model._model_composer.model_method") - .joinpath("fixtures") # type: ignore[no-untyped-call] - .joinpath("function_2.py") - .read_text() - ), - f.read(), - ) def test_model_manifest_1(self) -> None: with tempfile.TemporaryDirectory() as workspace, tempfile.TemporaryDirectory() as tmpdir: diff --git 
a/snowflake/ml/model/_model_composer/model_runtime/model_runtime.py b/snowflake/ml/model/_model_composer/model_runtime/model_runtime.py index a68a0f87..0f02ea73 100644 --- a/snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +++ b/snowflake/ml/model/_model_composer/model_runtime/model_runtime.py @@ -62,7 +62,6 @@ def __init__( model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name) for dep in _UDF_INFERENCE_DEPENDENCIES ], - check_local_version=True, ) else: self.runtime_env.include_if_absent( @@ -70,7 +69,6 @@ def __init__( model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name) for dep in _UDF_INFERENCE_DEPENDENCIES + [snowml_pkg_spec] ], - check_local_version=True, ) def save(self, workspace_path: pathlib.Path) -> model_manifest_schema.ModelRuntimeDict: diff --git a/snowflake/ml/model/_model_composer/model_runtime/model_runtime_test.py b/snowflake/ml/model/_model_composer/model_runtime/model_runtime_test.py index 9ad8b4c3..cda06a09 100644 --- a/snowflake/ml/model/_model_composer/model_runtime/model_runtime_test.py +++ b/snowflake/ml/model/_model_composer/model_runtime/model_runtime_test.py @@ -1,7 +1,6 @@ import os import pathlib import tempfile -from importlib import metadata as importlib_metadata from unittest import mock import yaml @@ -28,19 +27,20 @@ _BASIC_DEPENDENCIES_TARGET = list( sorted( - map( - lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), - model_runtime._UDF_INFERENCE_DEPENDENCIES, - ) + map(lambda x: str(requirements.Requirement(x)), model_runtime._UDF_INFERENCE_DEPENDENCIES), ) ) _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML = list( sorted( - map( - lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), - model_runtime._UDF_INFERENCE_DEPENDENCIES + [env_utils.SNOWPARK_ML_PKG_NAME], - ) + list(map(lambda x: str(requirements.Requirement(x)), model_runtime._UDF_INFERENCE_DEPENDENCIES)) 
+ + [ + str( + env_utils.get_local_installed_version_of_pip_package( + requirements.Requirement(env_utils.SNOWPARK_ML_PKG_NAME) + ) + ) + ] ) ) @@ -118,12 +118,12 @@ def test_model_runtime_dup_basic_dep(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, - conda_dependencies=["pandas"], + conda_dependencies=["packaging"], ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") - dep_target.append("pandas") + dep_target.remove(next(filter(lambda x: x.startswith("packaging"), dep_target))) + dep_target.append("packaging") dep_target.sort() with mock.patch.object( @@ -148,12 +148,12 @@ def test_model_runtime_dup_basic_dep_other_channel(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, - conda_dependencies=["conda-forge::pandas"], + conda_dependencies=["conda-forge::packaging"], ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") - dep_target.append("conda-forge::pandas") + dep_target.remove(next(filter(lambda x: x.startswith("packaging"), dep_target))) + dep_target.append("conda-forge::packaging") dep_target.sort() with mock.patch.object( @@ -178,11 +178,11 @@ def test_model_runtime_dup_basic_dep_pip(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, - pip_requirements=["pandas"], + pip_requirements=["packaging"], ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") + dep_target.remove(next(filter(lambda x: x.startswith("packaging"), dep_target))) dep_target.sort() with mock.patch.object( diff --git a/snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py b/snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py index 
feb0eb46..e04da263 100644 --- a/snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py +++ b/snowflake/ml/model/_packager/model_handlers_test/mlflow_test.py @@ -84,9 +84,11 @@ def test_mlflow_model(self) -> None: sorted( [ "mlflow<3,>=2.3", + "numpy==1.23.4", "psutil==5.9.0", "scikit-learn==1.2.2", "scipy==1.9.3", + "typing-extensions==4.5.0", ] ), ) @@ -126,9 +128,11 @@ def test_mlflow_model(self) -> None: sorted( [ "mlflow<3,>=2.3", + "numpy==1.23.4", "psutil==5.9.0", "scikit-learn==1.2.2", "scipy==1.9.3", + "typing-extensions==4.5.0", ] ), ) diff --git a/snowflake/ml/model/_packager/model_meta/BUILD.bazel b/snowflake/ml/model/_packager/model_meta/BUILD.bazel index 8a55af08..1fd7fb57 100644 --- a/snowflake/ml/model/_packager/model_meta/BUILD.bazel +++ b/snowflake/ml/model/_packager/model_meta/BUILD.bazel @@ -20,6 +20,24 @@ py_library( srcs = [":gen_core_requirements"], ) +GEN_PACKAGING_REQ_CMD = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode version_requirements --format python --filter_by_tag model_packaging > $@" + +py_genrule( + name = "gen_packaging_requirements", + srcs = [ + "//:requirements.yml", + "//bazel/requirements:requirements.schema.json", + ], + outs = ["_packaging_requirements.py"], + cmd = GEN_PACKAGING_REQ_CMD, + tools = ["//bazel/requirements:parse_and_generate_requirements"], +) + +py_library( + name = "_packaging_requirements", + srcs = [":gen_packaging_requirements"], +) + py_library( name = "model_meta_schema", srcs = ["model_meta_schema.py"], @@ -41,6 +59,7 @@ py_library( srcs = ["model_meta.py"], deps = [ ":_core_requirements", + ":_packaging_requirements", ":model_blob_meta", ":model_meta_schema", "//snowflake/ml/_internal:env", diff --git a/snowflake/ml/model/_packager/model_meta/model_meta.py b/snowflake/ml/model/_packager/model_meta/model_meta.py index ae6a32e1..b2be8567 100644 --- 
a/snowflake/ml/model/_packager/model_meta/model_meta.py +++ b/snowflake/ml/model/_packager/model_meta/model_meta.py @@ -18,6 +18,7 @@ from snowflake.ml.model._packager.model_env import model_env from snowflake.ml.model._packager.model_meta import ( _core_requirements, + _packaging_requirements, model_blob_meta, model_meta_schema, ) @@ -26,7 +27,8 @@ MODEL_METADATA_FILE = "model.yaml" MODEL_CODE_DIR = "code" -_PACKAGING_CORE_DEPENDENCIES = _core_requirements.REQUIREMENTS +_PACKAGING_CORE_DEPENDENCIES = _core_requirements.REQUIREMENTS # Legacy Model only +_PACKAGING_REQUIREMENTS = _packaging_requirements.REQUIREMENTS # New Model only _SNOWFLAKE_PKG_NAME = "snowflake" _SNOWFLAKE_ML_PKG_NAME = f"{_SNOWFLAKE_PKG_NAME}.ml" @@ -73,6 +75,8 @@ def create_model_metadata( model_dir_path = os.path.normpath(model_dir_path) embed_local_ml_library = kwargs.pop("embed_local_ml_library", False) legacy_save = kwargs.pop("_legacy_save", False) + relax_version = kwargs.pop("relax_version", False) + if embed_local_ml_library: # Use the last one which is loaded first, that is mean, it is loaded from site-packages. # We could make sure that user does not overwrite our library with their code follow the same naming. 
@@ -94,6 +98,8 @@ def create_model_metadata( pip_requirements=pip_requirements, python_version=python_version, embed_local_ml_library=embed_local_ml_library, + legacy_save=legacy_save, + relax_version=relax_version, ) if embed_local_ml_library: @@ -146,6 +152,8 @@ def _create_env_for_model_metadata( pip_requirements: Optional[List[str]] = None, python_version: Optional[str] = None, embed_local_ml_library: bool = False, + legacy_save: bool = False, + relax_version: bool = False, ) -> model_env.ModelEnv: env = model_env.ModelEnv() @@ -154,11 +162,14 @@ def _create_env_for_model_metadata( env.pip_requirements = pip_requirements # type: ignore[assignment] env.python_version = python_version # type: ignore[assignment] env.snowpark_ml_version = snowml_env.VERSION + + requirements_to_add = _PACKAGING_CORE_DEPENDENCIES if legacy_save else _PACKAGING_REQUIREMENTS + if embed_local_ml_library: env.include_if_absent( [ model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name) - for dep in _PACKAGING_CORE_DEPENDENCIES + for dep in requirements_to_add ], check_local_version=True, ) @@ -166,11 +177,14 @@ def _create_env_for_model_metadata( env.include_if_absent( [ model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name) - for dep in _PACKAGING_CORE_DEPENDENCIES + [env_utils.SNOWPARK_ML_PKG_NAME] + for dep in requirements_to_add + [env_utils.SNOWPARK_ML_PKG_NAME] ], check_local_version=True, ) + if relax_version: + env.relax_version() + return env diff --git a/snowflake/ml/model/_packager/model_meta/model_meta_test.py b/snowflake/ml/model/_packager/model_meta/model_meta_test.py index 7d9993cf..30ecbbe7 100644 --- a/snowflake/ml/model/_packager/model_meta/model_meta_test.py +++ b/snowflake/ml/model/_packager/model_meta/model_meta_test.py @@ -41,12 +41,43 @@ ) ) +_PACKAGING_REQUIREMENTS_TARGET = list( + sorted( + map( + lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), + 
model_meta._PACKAGING_REQUIREMENTS, + ) + ) +) -class ModelMetaEnvTest(absltest.TestCase): +_PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML = list( + sorted( + map( + lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), + model_meta._PACKAGING_REQUIREMENTS + [env_utils.SNOWPARK_ML_PKG_NAME], + ) + ) +) + +_PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML_RELAXED = list( + sorted( + map( + lambda x: str( + env_utils.relax_requirement_version( + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x)) + ) + ), + model_meta._PACKAGING_REQUIREMENTS + [env_utils.SNOWPARK_ML_PKG_NAME], + ) + ) +) + + +class ModelMetaEnvLegacyTest(absltest.TestCase): def test_model_meta_dependencies_no_packages(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: with model_meta.create_model_metadata( - model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, _legacy_save=True ) as meta: meta.models["model1"] = _DUMMY_BLOB self.assertListEqual(meta.env.pip_requirements, []) @@ -67,6 +98,7 @@ def test_model_meta_dependencies_no_packages_embedded_snowml(self) -> None: model_type="custom", signatures=_DUMMY_SIG, embed_local_ml_library=True, + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB self.assertListEqual(meta.env.pip_requirements, []) @@ -86,12 +118,13 @@ def test_model_meta_dependencies_dup_basic_dep(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, - conda_dependencies=["pandas"], + conda_dependencies=["cloudpickle"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") - dep_target.append("pandas") + dep_target.remove(f"cloudpickle=={importlib_metadata.version('cloudpickle')}") + dep_target.append("cloudpickle") dep_target.sort() 
self.assertListEqual(meta.env.pip_requirements, []) @@ -110,12 +143,13 @@ def test_model_meta_dependencies_dup_basic_dep_other_channel(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, - conda_dependencies=["conda-forge::pandas"], + conda_dependencies=["conda-forge::cloudpickle"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") - dep_target.append("conda-forge::pandas") + dep_target.remove(f"cloudpickle=={importlib_metadata.version('cloudpickle')}") + dep_target.append("conda-forge::cloudpickle") dep_target.sort() self.assertListEqual(meta.env.pip_requirements, []) @@ -135,20 +169,21 @@ def test_model_meta_dependencies_dup_basic_dep_pip(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, - pip_requirements=["pandas"], + pip_requirements=["cloudpickle"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] - dep_target.remove(f"pandas=={importlib_metadata.version('pandas')}") + dep_target.remove(f"cloudpickle=={importlib_metadata.version('cloudpickle')}") dep_target.sort() - self.assertListEqual(meta.env.pip_requirements, ["pandas"]) + self.assertListEqual(meta.env.pip_requirements, ["cloudpickle"]) self.assertListEqual(meta.env.conda_dependencies, dep_target) with self.assertWarns(UserWarning): loaded_meta = model_meta.ModelMetadata.load(tmpdir) - self.assertListEqual(loaded_meta.env.pip_requirements, ["pandas"]) + self.assertListEqual(loaded_meta.env.pip_requirements, ["cloudpickle"]) self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) def test_model_meta_dependencies_conda(self) -> None: @@ -159,6 +194,7 @@ def test_model_meta_dependencies_conda(self) -> None: model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pytorch"], + _legacy_save=True, ) as meta: meta.models["model1"] = 
_DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] @@ -181,6 +217,7 @@ def test_model_meta_dependencies_pip(self) -> None: model_type="custom", signatures=_DUMMY_SIG, pip_requirements=["torch"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] @@ -203,6 +240,7 @@ def test_model_meta_dependencies_both(self) -> None: signatures=_DUMMY_SIG, conda_dependencies=["pytorch"], pip_requirements=["torch"], + _legacy_save=True, ) as meta: meta.models["model1"] = _DUMMY_BLOB dep_target = _BASIC_DEPENDENCIES_TARGET_WITH_SNOWML[:] @@ -217,6 +255,198 @@ def test_model_meta_dependencies_both(self) -> None: self.assertListEqual(loaded_meta.env.pip_requirements, ["torch"]) self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + +class ModelMetaEnvTest(absltest.TestCase): + def test_model_meta_dependencies_no_packages(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML) + self.assertEqual(meta.env.snowpark_ml_version, snowml_env.VERSION) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML) + self.assertEqual(meta.env.snowpark_ml_version, snowml_env.VERSION) + + def test_model_meta_dependencies_relax_version(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG, relax_version=True + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + 
self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML_RELAXED) + self.assertEqual(meta.env.snowpark_ml_version, snowml_env.VERSION) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML_RELAXED) + self.assertEqual(meta.env.snowpark_ml_version, snowml_env.VERSION) + + def test_model_meta_dependencies_no_packages_embedded_snowml(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + embed_local_ml_library=True, + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, _PACKAGING_REQUIREMENTS_TARGET) + self.assertIsNotNone(meta.env._snowpark_ml_version.local) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, _PACKAGING_REQUIREMENTS_TARGET) + self.assertIsNotNone(meta.env._snowpark_ml_version.local) + + def test_model_meta_dependencies_dup_basic_dep(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["cloudpickle"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML[:] + dep_target.remove(f"cloudpickle=={importlib_metadata.version('cloudpickle')}") + dep_target.append("cloudpickle") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta 
= model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_dup_basic_dep_other_channel(self) -> None: + with self.assertWarns(UserWarning): + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["conda-forge::cloudpickle"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML[:] + dep_target.remove(f"cloudpickle=={importlib_metadata.version('cloudpickle')}") + dep_target.append("conda-forge::cloudpickle") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + with self.assertWarns(UserWarning): + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_dup_basic_dep_pip(self) -> None: + with self.assertWarns(UserWarning): + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + pip_requirements=["cloudpickle"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML[:] + dep_target.remove(f"cloudpickle=={importlib_metadata.version('cloudpickle')}") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, ["cloudpickle"]) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + with self.assertWarns(UserWarning): + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, ["cloudpickle"]) + 
self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_conda(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["pytorch"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML[:] + dep_target.append("pytorch") + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, []) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, []) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_pip(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + pip_requirements=["torch"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML[:] + dep_target.sort() + + self.assertListEqual(meta.env.pip_requirements, ["torch"]) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, ["torch"]) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + + def test_model_meta_dependencies_both(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with model_meta.create_model_metadata( + model_dir_path=tmpdir, + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["pytorch"], + pip_requirements=["torch"], + ) as meta: + meta.models["model1"] = _DUMMY_BLOB + dep_target = _PACKAGING_REQUIREMENTS_TARGET_WITH_SNOWML[:] + dep_target.append("pytorch") + dep_target.sort() + 
+ self.assertListEqual(meta.env.pip_requirements, ["torch"]) + self.assertListEqual(meta.env.conda_dependencies, dep_target) + + loaded_meta = model_meta.ModelMetadata.load(tmpdir) + + self.assertListEqual(loaded_meta.env.pip_requirements, ["torch"]) + self.assertListEqual(loaded_meta.env.conda_dependencies, dep_target) + def test_model_meta_override_py_version(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: with model_meta.create_model_metadata( diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index 31a89bbf..8250e435 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -198,9 +198,12 @@ class BaseModelSaveOption(TypedDict): """Options for saving the model. embed_local_ml_library: Embedding local SnowML into the code directory of the folder. + relax_version: Whether or not to relax the version constraints of the dependencies if they are unresolvable. It detects any + ==x.y.z in specifiers and replaces it with >=x.y, <(x+1). Defaults to False.
""" embed_local_ml_library: NotRequired[bool] + relax_version: NotRequired[bool] _legacy_save: NotRequired[bool] method_options: NotRequired[Dict[str, ModelMethodSaveOptions]] diff --git a/snowflake/ml/modeling/_internal/distributed_hpo_trainer.py b/snowflake/ml/modeling/_internal/distributed_hpo_trainer.py index 8928f61b..0366c4e2 100644 --- a/snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +++ b/snowflake/ml/modeling/_internal/distributed_hpo_trainer.py @@ -4,11 +4,12 @@ import os import posixpath import sys -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import cloudpickle as cp import numpy as np from sklearn import model_selection +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils import ( @@ -41,23 +42,28 @@ def construct_cv_results( + estimator: Union[GridSearchCV, RandomizedSearchCV], + n_split: int, + param_grid: List[Dict[str, Any]], cv_results_raw_hex: List[Row], cross_validator_indices_length: int, parameter_grid_length: int, - search_cv_kwargs: Dict[str, Any], -) -> Tuple[bool, Dict[str, Any], int, Set[str]]: +) -> Tuple[bool, Dict[str, Any]]: """Construct the cross validation result from the UDF. Because we accelerate the process by the number of cross validation number, and the combination of parameter grids. Therefore, we need to stick them back together instead of returning the raw result to align with original sklearn result. Args: + estimator (Union[GridSearchCV, RandomizedSearchCV]): The sklearn object of estimator + GridSearchCV or RandomizedSearchCV + n_split (int): The number of split, which is determined by build_cross_validator.get_n_splits(X, y, groups) + param_grid (List[Dict[str, Any]]): the list of parameter grid or parameter sampler cv_results_raw_hex (List[Row]): the list of cv_results from each cv and parameter grid combination. 
Because UDxF can only return string, and numpy array/masked arrays cannot be encoded in a json format. Each cv_result is encoded into hex string. cross_validator_indices_length (int): the length of cross validator indices parameter_grid_length (int): the length of parameter grid combination - search_cv_kwargs (Dict[str, Any]): the kwargs for GridSearchCV/RandomSearchCV. Raises: ValueError: Retrieved empty cross validation results @@ -67,7 +73,7 @@ def construct_cv_results( RuntimeError: Cross validation results are unexpectedly empty for one fold. Returns: - Tuple[bool, Dict[str, Any], int, Set[str]]: returns multimetric, cv_results_, best_param_index, scorers + Tuple[bool, Dict[str, Any]]: returns multimetric, cv_results_ """ # Filter corner cases: either the snowpark dataframe result is empty; or index length is empty if len(cv_results_raw_hex) == 0: @@ -79,12 +85,8 @@ def construct_cv_results( if parameter_grid_length == 0: raise ValueError("Parameter index length is 0. Were there no candidates?") - from scipy.stats import rankdata - # cv_result maintains the original order multimetric = False - cv_results_ = dict() - scorers = set() # retrieve the cv_results from udtf table; results are encoded by hex and cloudpickle; # We are constructing the raw information back to original form if len(cv_results_raw_hex) != cross_validator_indices_length * parameter_grid_length: @@ -94,7 +96,9 @@ def construct_cv_results( "Please retry or contact snowflake support." ) - for param_cv_indices, each_cv_result_hex in enumerate(cv_results_raw_hex): + out = [] + + for each_cv_result_hex in cv_results_raw_hex: # convert the hex string back to cv_results_ hex_str = bytes.fromhex(each_cv_result_hex[0]) with io.BytesIO(hex_str) as f_reload: @@ -103,85 +107,46 @@ def construct_cv_results( raise RuntimeError( "Cross validation response is empty. This issue may be temporary - please try again." 
) - for k, v in each_cv_result.items(): - cur_cv_idx = param_cv_indices % cross_validator_indices_length - key = k - if "split0_test_" in k: + temp_dict = dict() + """ + This dictionary has the following keys + train_scores : dict of scorer name -> float + Score on training set (for all the scorers), + returned only if `return_train_score` is `True`. + test_scores : dict of scorer name -> float + Score on testing set (for all the scorers). + fit_time : float + Time spent for fitting in seconds. + score_time : float + Time spent for scoring in seconds. + """ + if estimator.return_train_score: + if each_cv_result.get("split0_train_score", None): + # for single scorer, the split0_train_score only contains an array with one value + temp_dict["train_scores"] = each_cv_result["split0_train_score"][0] + else: + # if multimetric situation, the format would be + # {metric_name1: value, metric_name2: value, ...} + temp_dict["train_scores"] = {} # For multi-metric evaluation, the scores for all the scorers are available in the # cv_results_ dict at the keys ending with that scorer’s name ('_') # instead of '_score'. 
- scorers.add(k[len("split0_test_") :]) - key = k.replace("split0_test", f"split{cur_cv_idx}_test") - if search_cv_kwargs.get("return_train_score", None) and "split0_train_" in k: - key = k.replace("split0_train", f"split{cur_cv_idx}_train") - elif k.startswith("param"): - if cur_cv_idx != 0: - continue - if key: - if key not in cv_results_: - cv_results_[key] = v - else: - cv_results_[key] = np.concatenate([cv_results_[key], v]) - - multimetric = len(scorers) > 1 - # Use numpy to re-calculate all the information in cv_results_ again - # Generally speaking, reshape all the results into the (scorers+2, idx_length, params_length) shape, - # and average them by the idx_length; - # idx_length is the number of cv folds; params_length is the number of parameter combinations - scores_test = [ - np.reshape( - np.concatenate( - [cv_results_[f"split{cur_cv}_test_{score}"] for cur_cv in range(cross_validator_indices_length)] - ), - (cross_validator_indices_length, -1), - ) - for score in scorers - ] - - fit_score_test_matrix = np.stack( - [ - np.reshape(cv_results_["mean_fit_time"], (cross_validator_indices_length, -1)), - np.reshape(cv_results_["mean_score_time"], (cross_validator_indices_length, -1)), - ] - + scores_test - ) - mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1) - std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1) - - if search_cv_kwargs.get("return_train_score", None): - scores_train = [ - np.reshape( - np.concatenate( - [cv_results_[f"split{cur_cv}_train_{score}"] for cur_cv in range(cross_validator_indices_length)] - ), - (cross_validator_indices_length, -1), - ) - for score in scorers - ] - mean_fit_score_train_matrix = np.mean(scores_train, axis=1) - std_fit_score_train_matrix = np.std(scores_train, axis=1) - - cv_results_["std_fit_time"] = std_fit_score_test_matrix[0] - cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0] - cv_results_["std_score_time"] = std_fit_score_test_matrix[1] - 
cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1] - for idx, score in enumerate(scorers): - cv_results_[f"std_test_{score}"] = std_fit_score_test_matrix[idx + 2] - cv_results_[f"mean_test_{score}"] = mean_fit_score_test_matrix[idx + 2] - if search_cv_kwargs.get("return_train_score", None): - cv_results_[f"std_train_{score}"] = std_fit_score_train_matrix[idx] - cv_results_[f"mean_train_{score}"] = mean_fit_score_train_matrix[idx] - # re-compute the ranking again with mean_test_. - cv_results_[f"rank_test_{score}"] = rankdata(-cv_results_[f"mean_test_{score}"], method="min") - # The best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared. - # If all scores are `nan`, `rankdata` will also produce an array of `nan` values. - # In that case, default to first index. - best_param_index = ( - np.where(cv_results_[f"rank_test_{score}"] == 1)[0][0] - if not np.isnan(cv_results_[f"rank_test_{score}"]).all() - else 0 - ) - return multimetric, cv_results_, best_param_index, scorers + for k, v in each_cv_result.items(): + if "split0_train_" in k: + temp_dict["train_scores"][k[len("split0_train_") :]] = v + if isinstance(each_cv_result.get("split0_test_score"), np.ndarray): + temp_dict["test_scores"] = each_cv_result["split0_test_score"][0] + else: + temp_dict["test_scores"] = {} + for k, v in each_cv_result.items(): + if "split0_test_" in k: + temp_dict["test_scores"][k[len("split0_test_") :]] = v + temp_dict["fit_time"] = each_cv_result["mean_fit_time"][0] + temp_dict["score_time"] = each_cv_result["mean_score_time"][0] + out.append(temp_dict) + first_test_score = out[0]["test_scores"] + multimetric = isinstance(first_test_score, dict) + return multimetric, estimator._format_results(param_grid, n_split, out) cp.register_pickle_by_value(inspect.getmodule(construct_cv_results)) @@ -288,7 +253,6 @@ def fit_search_snowpark( inspect.currentframe(), self.__class__.__name__ ), api_calls=[sproc], - custom_tags=dict([("autogen", 
True)]) if self._autogenerated else None, ) udtf_statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, @@ -297,7 +261,7 @@ def fit_search_snowpark( inspect.currentframe(), self.__class__.__name__ ), api_calls=[udtf], - custom_tags=dict([("autogen", True)]) if self._autogenerated else None, + custom_tags=dict([("hpo_udtf", True)]), ) # Put locally serialized estimator on stage. @@ -375,8 +339,12 @@ def _distributed_search( estimator = cp.load(local_estimator_file_obj)["estimator"] build_cross_validator = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator)) + from sklearn.utils.validation import indexable + + X, y, _ = indexable(X, y, None) + n_splits = build_cross_validator.get_n_splits(X, y, None) # store the cross_validator's test indices only to save space - cross_validator_indices = [test for _, test in build_cross_validator.split(X, y)] + cross_validator_indices = [test for _, test in build_cross_validator.split(X, y, None)] local_indices_file_name = get_temp_file_path() with open(local_indices_file_name, mode="w+b") as local_indices_file_obj: cp.dump(cross_validator_indices, local_indices_file_obj) @@ -529,14 +497,14 @@ def end_partition(self) -> None: ) ), ) - - multimetric, cv_results_, best_param_index, scorers = construct_cv_results( + # multimetric, cv_results_, best_param_index, scorers + multimetric, cv_results_ = construct_cv_results( + estimator, + n_splits, + list(param_grid), HP_raw_results.select("CV_RESULTS").sort(F.col("PARAM_CV_IND")).collect(), cross_validator_indices_length, parameter_grid_length, - { - "return_train_score": estimator.return_train_score, - }, # TODO(xjiang): support more kwargs in here ) estimator.cv_results_ = cv_results_ @@ -568,7 +536,7 @@ def end_partition(self) -> None: # With a non-custom callable, we can select the best score # based on the best index estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_] - estimator.best_params_ = 
cv_results_["params"][best_param_index] + estimator.best_params_ = cv_results_["params"][estimator.best_index_] if original_refit: estimator.best_estimator_ = clone(estimator.estimator).set_params( diff --git a/snowflake/ml/modeling/_internal/model_specifications_test.py b/snowflake/ml/modeling/_internal/model_specifications_test.py index a7ec2a26..785f922d 100644 --- a/snowflake/ml/modeling/_internal/model_specifications_test.py +++ b/snowflake/ml/modeling/_internal/model_specifications_test.py @@ -6,6 +6,7 @@ import numpy as np from absl.testing import absltest, parameterized from lightgbm import LGBMRegressor +from sklearn.decomposition import PCA from sklearn.linear_model import LinearRegression from sklearn.model_selection import GridSearchCV from xgboost import XGBRegressor @@ -142,6 +143,9 @@ SAMPLES: Dict[str, Dict[str, Any]] = { "basic": { + "estimator": GridSearchCV(estimator=PCA(), param_grid={"n_components": range(1, 3)}, cv=3), + "n_splits": 3, + "param_grid": [{"n_components": 1}, {"n_components": 2}], "each_cv_result": each_cv_result_basic_sample, "IDX_LENGTH": 3, "PARAM_LENGTH": 2, @@ -163,31 +167,26 @@ }, }, "return_train_score": { + "estimator": GridSearchCV(estimator=PCA(), param_grid={"n_components": range(1, 2)}, cv=2), + "n_splits": 2, + "param_grid": [{"n_components": 2}], "each_cv_result": each_cv_result_return_train, "IDX_LENGTH": 2, "PARAM_LENGTH": 1, "CV_RESULT_": { - "mean_fit_time": np.array( - [ - 0.00286627, - ] - ), + "mean_fit_time": np.array([0.00286627]), "std_fit_time": np.array([0.0002892]), "mean_score_time": np.array([0.00164152]), "std_score_time": np.array([0.00012303]), "param_n_components": np.ma.masked_array( - data=[2], mask=False, fill_value="?", dtype=object + data=[2], mask=[False], fill_value="?", dtype=object ), # type: ignore[no-untyped-call] - "params": np.array([{"n_components": 2}], dtype=object), - "mean_train_score": np.array([-11.09288916]), - "std_train_score": np.array([2.52275917]), - "mean_test_score": 
np.array([-11.09288916]), - "std_test_score": np.array([2.52275917]), - "rank_test_score": np.array([1]), + "params": [{"n_components": 2}], "split0_test_score": np.array([-13.61564833]), "split1_test_score": np.array([-8.57012999]), - "split0_train_score": np.array([-13.61564833]), - "split1_train_score": np.array([-8.57012999]), + "mean_test_score": np.array([-11.09288916]), + "std_test_score": np.array([2.52275917]), + "rank_test_score": np.array([1], dtype=np.int32), }, }, } @@ -276,40 +275,60 @@ def _compare_cv_results(self, cv_result_1: Dict[str, Any], cv_result_2: Dict[str # Do not compare the fit time def test_cv_result(self) -> None: - multimetric, cv_results_, best_param_index, scorers = construct_cv_results( + multimetric, cv_results_ = construct_cv_results( + SAMPLES["basic"]["estimator"], + SAMPLES["basic"]["n_splits"], + SAMPLES["basic"]["param_grid"], self.RAW_DATA_SP, SAMPLES["basic"]["IDX_LENGTH"], SAMPLES["basic"]["PARAM_LENGTH"], - {"return_train_score": False}, ) self.assertEqual(multimetric, False) - self.assertEqual(best_param_index, 0) self._compare_cv_results(cv_results_, SAMPLES["basic"]["CV_RESULT_"]) - self.assertEqual(scorers, {"score"}) def test_cv_result_return_train_score(self) -> None: - multimetric, cv_results_, best_param_index, scorers = construct_cv_results( + multimetric, cv_results_ = construct_cv_results( + SAMPLES["return_train_score"]["estimator"], + SAMPLES["return_train_score"]["n_splits"], + SAMPLES["return_train_score"]["param_grid"], [Row(val) for val in SAMPLES["return_train_score"]["combine_hex_cv_result"]], SAMPLES["return_train_score"]["IDX_LENGTH"], SAMPLES["return_train_score"]["PARAM_LENGTH"], - {"return_train_score": True}, ) self.assertEqual(multimetric, False) self._compare_cv_results(cv_results_, SAMPLES["return_train_score"]["CV_RESULT_"]) - self.assertEqual(scorers, {"score"}) def test_cv_result_incorrect_param_length(self) -> None: with self.assertRaises(ValueError): - 
construct_cv_results(self.RAW_DATA_SP, SAMPLES["basic"]["IDX_LENGTH"], 1, {"return_train_score": False}) + construct_cv_results( + SAMPLES["basic"]["estimator"], + SAMPLES["basic"]["n_splits"], + SAMPLES["basic"]["param_grid"], + self.RAW_DATA_SP, + SAMPLES["basic"]["IDX_LENGTH"], + 1, + ) def test_cv_result_nan(self) -> None: # corner cases with nan values with self.assertRaises(ValueError): - construct_cv_results(self.RAW_DATA_SP, 0, SAMPLES["basic"]["PARAM_LENGTH"], {"return_train_score": False}) + construct_cv_results( + SAMPLES["basic"]["estimator"], + SAMPLES["basic"]["n_splits"], + SAMPLES["basic"]["param_grid"], + self.RAW_DATA_SP, + 0, + SAMPLES["basic"]["PARAM_LENGTH"], + ) # empty list with self.assertRaises(ValueError): construct_cv_results( - [], SAMPLES["basic"]["IDX_LENGTH"], SAMPLES["basic"]["PARAM_LENGTH"], {"return_train_score": False} + SAMPLES["basic"]["estimator"], + SAMPLES["basic"]["n_splits"], + SAMPLES["basic"]["param_grid"], + [], + SAMPLES["basic"]["IDX_LENGTH"], + SAMPLES["basic"]["PARAM_LENGTH"], ) diff --git a/snowflake/ml/modeling/_internal/snowpark_handlers.py b/snowflake/ml/modeling/_internal/snowpark_handlers.py index 3adb0c54..ece6f771 100644 --- a/snowflake/ml/modeling/_internal/snowpark_handlers.py +++ b/snowflake/ml/modeling/_internal/snowpark_handlers.py @@ -306,7 +306,7 @@ def score_wrapper_sproc( input_cols: List[str], label_cols: List[str], sample_weight_col: Optional[str], - statement_params: Dict[str, str], + score_statement_params: Dict[str, str], ) -> float: import inspect import os @@ -317,13 +317,13 @@ def score_wrapper_sproc( importlib.import_module(import_name) for query in sql_queries[:-1]: - _ = session.sql(query).collect(statement_params=statement_params) + _ = session.sql(query).collect(statement_params=score_statement_params) sp_df = session.sql(sql_queries[-1]) - df: pd.DataFrame = sp_df.to_pandas(statement_params=statement_params) + df: pd.DataFrame = sp_df.to_pandas(statement_params=score_statement_params) 
df.columns = sp_df.columns local_score_file_name = get_temp_file_path() - session.file.get(stage_score_file_name, local_score_file_name, statement_params=statement_params) + session.file.get(stage_score_file_name, local_score_file_name, statement_params=score_statement_params) local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0]) with open(local_score_file_name_path, mode="r+b") as local_score_file_obj: @@ -348,7 +348,7 @@ def score_wrapper_sproc( return result # Call score sproc - statement_params = telemetry.get_function_usage_statement_params( + score_statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, subproject=self._subproject, function_name=telemetry.get_statement_params_full_func_name( @@ -357,6 +357,8 @@ def score_wrapper_sproc( api_calls=[Session.call], custom_tags=dict([("autogen", True)]) if self._autogenerated else None, ) + + kwargs = telemetry.get_sproc_statement_params_kwargs(score_wrapper_sproc, score_statement_params) score: float = score_wrapper_sproc( session, queries, @@ -364,7 +366,8 @@ def score_wrapper_sproc( input_cols, label_cols, sample_weight_col, - statement_params, + score_statement_params, + **kwargs, ) cleanup_temp_files([local_score_file_name]) diff --git a/snowflake/ml/modeling/metrics/classification.py b/snowflake/ml/modeling/metrics/classification.py index 23bf0960..82fd3cd2 100644 --- a/snowflake/ml/modeling/metrics/classification.py +++ b/snowflake/ml/modeling/metrics/classification.py @@ -228,16 +228,15 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_ Returns: Name of the UDTF. """ + batch_size = metrics_utils.BATCH_SIZE class ConfusionMatrixComputer: - BATCH_SIZE = 1000 - def __init__(self) -> None: self._initialized = False self._confusion_matrix = np.zeros((1, 1)) - # 2d array containing a batch of input rows. A batch contains self.BATCH_SIZE rows. + # 2d array containing a batch of input rows. 
A batch contains metrics_utils.BATCH_SIZE rows. # [sample_weight, y_true, y_pred] - self._batched_rows = np.zeros((self.BATCH_SIZE, 1)) + self._batched_rows = np.zeros((batch_size, 1)) # Number of columns in the dataset. self._n_cols = -1 # Running count of number of rows added to self._batched_rows. @@ -255,7 +254,7 @@ def process(self, input_row: List[float], n_label: int) -> None: # 1. Initialize variables. if not self._initialized: self._n_cols = len(input_row) - self._batched_rows = np.zeros((self.BATCH_SIZE, self._n_cols)) + self._batched_rows = np.zeros((batch_size, self._n_cols)) self._n_label = n_label self._confusion_matrix = np.zeros((self._n_label, self._n_label)) self._initialized = True @@ -264,7 +263,7 @@ def process(self, input_row: List[float], n_label: int) -> None: self._cur_count += 1 # 2. Compute incremental confusion matrix for the batch. - if self._cur_count >= self.BATCH_SIZE: + if self._cur_count >= batch_size: self.update_confusion_matrix() self._cur_count = 0 diff --git a/snowflake/ml/modeling/metrics/metrics_utils.py b/snowflake/ml/modeling/metrics/metrics_utils.py index 79cde616..55ffd17c 100644 --- a/snowflake/ml/modeling/metrics/metrics_utils.py +++ b/snowflake/ml/modeling/metrics/metrics_utils.py @@ -15,6 +15,7 @@ LABEL = "LABEL" INDEX = "INDEX" +BATCH_SIZE = 1000 def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, Any]) -> str: @@ -82,7 +83,7 @@ class ShardedDotAndSumComputer: """This class is registered as a UDTF and computes the sum and dot product of columns for each partition of rows. The computations across all the partitions happens in parallel using the nodes in the warehouse. In order to avoid keeping the entire partition - in memory, we batch the rows (size is 1000) and maintain a running sum and dot prod in self._sum_by_count, + in memory, we batch the rows and maintain a running sum and dot prod in self._sum_by_count, self._sum_by_countd and self._dot_prod respectively. 
We return these at the end of the partition. """ @@ -95,7 +96,7 @@ def __init__(self) -> None: # delta degree of freedom self._ddof = 0 # Setting the batch size to 1000 based on experimentation. Can be fine tuned later. - self._batch_size = 1000 + self._batch_size = BATCH_SIZE # 2d array containing a batch of input rows. A batch contains self._batch_size rows. self._batched_rows = np.zeros((self._batch_size, 1)) # 1d array of length = # of cols. Contains sum(col/count) for each column. @@ -224,7 +225,7 @@ def check_label_columns( TypeError: `y_true_col_names` and `y_pred_col_names` are of different types. ValueError: Multilabel `y_true_col_names` and `y_pred_col_names` are of different lengths. """ - if type(y_true_col_names) != type(y_pred_col_names): + if type(y_true_col_names) is not type(y_pred_col_names): raise TypeError( "Label columns should be of the same type." f"Got y_true_col_names={type(y_true_col_names)} vs y_pred_col_names={type(y_pred_col_names)}." @@ -300,6 +301,7 @@ def validate_average_pos_label(average: Optional[str] = None, pos_label: Union[s "average != 'binary' (got %r). You may use " "labels=[pos_label] to specify a single positive class." 
% (pos_label, average), UserWarning, + stacklevel=2, ) diff --git a/snowflake/ml/modeling/metrics/ranking.py b/snowflake/ml/modeling/metrics/ranking.py index 19a2e1c1..540a94da 100644 --- a/snowflake/ml/modeling/metrics/ranking.py +++ b/snowflake/ml/modeling/metrics/ranking.py @@ -122,7 +122,8 @@ def precision_recall_curve_anon_sproc(session: snowpark.Session) -> bytes: result_module = cloudpickle.loads(pickled_result_module) return result_module.serialize(session, (precision, recall, thresholds)) # type: ignore[no-any-return] - result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session)) + kwargs = telemetry.get_sproc_statement_params_kwargs(precision_recall_curve_anon_sproc, statement_params) + result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session, **kwargs)) res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object return res @@ -271,7 +272,8 @@ def roc_auc_score_anon_sproc(session: snowpark.Session) -> bytes: result_module = cloudpickle.loads(pickled_result_module) return result_module.serialize(session, auc) # type: ignore[no-any-return] - result_object = result.deserialize(session, roc_auc_score_anon_sproc(session)) + kwargs = telemetry.get_sproc_statement_params_kwargs(roc_auc_score_anon_sproc, statement_params) + result_object = result.deserialize(session, roc_auc_score_anon_sproc(session, **kwargs)) auc: Union[float, npt.NDArray[np.float_]] = result_object return auc @@ -372,7 +374,9 @@ def roc_curve_anon_sproc(session: snowpark.Session) -> bytes: result_module = cloudpickle.loads(pickled_result_module) return result_module.serialize(session, (fpr, tpr, thresholds)) # type: ignore[no-any-return] - result_object = result.deserialize(session, roc_curve_anon_sproc(session)) + kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params) + result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs)) + res: 
Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object return res diff --git a/snowflake/ml/modeling/metrics/regression.py b/snowflake/ml/modeling/metrics/regression.py index 1fcfb79e..28d80b5b 100644 --- a/snowflake/ml/modeling/metrics/regression.py +++ b/snowflake/ml/modeling/metrics/regression.py @@ -108,7 +108,8 @@ def d2_absolute_error_score_anon_sproc(session: snowpark.Session) -> bytes: result_module = cloudpickle.loads(pickled_snowflake_result) return result_module.serialize(session, score) # type: ignore[no-any-return] - result_object = result.deserialize(session, d2_absolute_error_score_anon_sproc(session)) + kwargs = telemetry.get_sproc_statement_params_kwargs(d2_absolute_error_score_anon_sproc, statement_params) + result_object = result.deserialize(session, d2_absolute_error_score_anon_sproc(session, **kwargs)) score: Union[float, npt.NDArray[np.float_]] = result_object return score @@ -205,7 +206,8 @@ def d2_pinball_score_anon_sproc(session: snowpark.Session) -> bytes: result_module = cloudpickle.loads(pickled_result_module) return result_module.serialize(session, score) # type: ignore[no-any-return] - result_object = result.deserialize(session, d2_pinball_score_anon_sproc(session)) + kwargs = telemetry.get_sproc_statement_params_kwargs(d2_pinball_score_anon_sproc, statement_params) + result_object = result.deserialize(session, d2_pinball_score_anon_sproc(session, **kwargs)) score: Union[float, npt.NDArray[np.float_]] = result_object return score @@ -319,7 +321,8 @@ def explained_variance_score_anon_sproc(session: snowpark.Session) -> bytes: result_module = cloudpickle.loads(pickled_result_module) return result_module.serialize(session, score) # type: ignore[no-any-return] - result_object = result.deserialize(session, explained_variance_score_anon_sproc(session)) + kwargs = telemetry.get_sproc_statement_params_kwargs(explained_variance_score_anon_sproc, statement_params) + result_object = 
result.deserialize(session, explained_variance_score_anon_sproc(session, **kwargs)) score: Union[float, npt.NDArray[np.float_]] = result_object return score diff --git a/snowflake/ml/monitoring/BUILD.bazel b/snowflake/ml/monitoring/BUILD.bazel index 66cc4014..1a9e8e18 100644 --- a/snowflake/ml/monitoring/BUILD.bazel +++ b/snowflake/ml/monitoring/BUILD.bazel @@ -1,4 +1,4 @@ -load("//bazel:py_rules.bzl", "py_library", "py_package", "snowml_wheel") +load("//bazel:py_rules.bzl", "py_library", "py_package", "py_wheel") package_group( name = "monitoring", @@ -9,6 +9,10 @@ package_group( package(default_visibility = ["//visibility:public"]) +exports_files([ + "pyproject.toml", +]) + py_library( name = "monitoring_lib", srcs = [ @@ -28,19 +32,8 @@ py_package( ], ) -snowml_wheel( - name = "monitoring_wheel", - compatible_with_snowpark = True, - development_status = "PrPr", - extra_requires = {}, - requires = [ - "numpy", - "shap", - "snowflake-connector-python[pandas]", - "snowflake-snowpark-python>=1.4.0,<2", - ], - version = "0.1.0", - deps = [ - "//snowflake/ml/monitoring:monitoring_pkg", - ], +py_wheel( + name = "wheel", + pyproject_toml = ":pyproject.toml", + deps = ["//snowflake/ml/monitoring:monitoring_pkg"], ) diff --git a/snowflake/ml/monitoring/pyproject.toml b/snowflake/ml/monitoring/pyproject.toml new file mode 100644 index 00000000..e90e7152 --- /dev/null +++ b/snowflake/ml/monitoring/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "snowflake-ml-python" +version = "0.1.0" +authors = [ + {name = "Snowflake, Inc", email = "support@snowflake.com"} +] +description = "The machine learning client library that is used for interacting with Snowflake to build machine learning solutions." 
+license = {file = "LICENSE.txt"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Environment :: Other Environment", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Information Technology", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Database", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Application Frameworks", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering :: Information Analysis" +] +requires-python = ">=3.8, <4" +dependencies = [ + "numpy", + "shap", + "snowflake-connector-python[pandas]", + "snowflake-snowpark-python>=1.4.0,<2" +] + +[project.urls] +Homepage = "https://github.com/snowflakedb/snowflake-ml-python" +Documentation = "https://docs.snowflake.com/developer-guide/snowpark-ml" +Repository = "https://github.com/snowflakedb/snowflake-ml-python" +Issues = "https://github.com/snowflakedb/snowflake-ml-python/issues" +Changelog = "https://github.com/snowflakedb/snowflake-ml-python/blob/master/CHANGELOG.md" + +[tool.setuptools.packages.find] +where = ["."] +include = ["snowflake.ml.monitoring*"] diff --git a/snowflake/ml/registry/notebooks/Using MODEL via Registry in Snowflake.ipynb b/snowflake/ml/registry/notebooks/Using MODEL via Registry in Snowflake.ipynb index 9d2e5757..7bd2fc8f 100644 --- a/snowflake/ml/registry/notebooks/Using MODEL via Registry in Snowflake.ipynb +++ b/snowflake/ml/registry/notebooks/Using MODEL via Registry in Snowflake.ipynb @@ -321,7 +321,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Show and List models and versions\n" + "### Show and List models and 
versions" ] }, { @@ -384,6 +384,15 @@ "print(m.description)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reg.show_models()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -394,6 +403,15 @@ "print(mv.description)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.show_versions()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -465,6 +483,15 @@ "mv.show_metrics()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.show_versions()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -497,6 +524,81 @@ "m.default" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TAG management" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You could set Snowflake TAG on a model. You need to have your TAG pre-created before setting it. Below we will show how to use TAG to label a live version of the model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: When a tag is not set to a model, `get_tag` will return `None`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.get_tag(\"live_version\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.set_tag(\"live_version\", version_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.get_tag(\"live_version\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.show_tags()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.unset_tag(\"live_version\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m.show_tags()" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/snowflake/ml/registry/registry.py b/snowflake/ml/registry/registry.py index 2132d549..57fb8701 100644 --- a/snowflake/ml/registry/registry.py +++ b/snowflake/ml/registry/registry.py @@ -120,6 +120,8 @@ def log_model( - embed_local_ml_library: Embed local Snowpark ML into the code directory or folder. Override to True if the local Snowpark ML version is not available in the Snowflake Anaconda Channel. Otherwise, defaults to False + - relax_version: Whether or not relax the version constraints of the dependencies. + It detects any ==x.y.z in specifiers and replaced with >=x.y, <(x+1). Defaults to False. - method_options: Per-method saving options including: - case_sensitive: Indicates whether the method and its signature should be case sensitive. This means when you refer the method in the SQL, you need to double quote it. diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl deleted file mode 100755 index 8d5cd87b..00000000 --- a/snowflake/ml/requirements.bzl +++ /dev/null @@ -1,63 +0,0 @@ -# DO NOT EDIT! 
-# Generate by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements' - -EXTRA_REQUIREMENTS = { - "all": [ - "lightgbm==3.3.5", - "mlflow>=2.1.0,<2.4", - "peft>=0.5.0,<1", - "sentencepiece>=0.1.95,<0.2", - "shap==0.42.1", - "tensorflow>=2.9,<3,!=2.12.0", - "tokenizers>=0.10,<1", - "torchdata>=0.4,<1", - "transformers>=4.32.1,<5" - ], - "lightgbm": [ - "lightgbm==3.3.5" - ], - "llm": [ - "peft>=0.5.0,<1" - ], - "mlflow": [ - "mlflow>=2.1.0,<2.4" - ], - "shap": [ - "shap==0.42.1" - ], - "tensorflow": [ - "tensorflow>=2.9,<3,!=2.12.0" - ], - "torch": [ - "torchdata>=0.4,<1" - ], - "transformers": [ - "sentencepiece>=0.1.95,<0.2", - "tokenizers>=0.10,<1", - "transformers>=4.32.1,<5" - ] -} - -REQUIREMENTS = [ - "absl-py>=0.15,<2", - "anyio>=3.5.0,<4", - "cachetools>=3.1.1,<5", - "cloudpickle>=2.0.0", - "fsspec[http]>=2022.11,<2024", - "importlib_resources>=5.1.4, <6", - "numpy>=1.23,<2", - "packaging>=20.9,<24", - "pandas>=1.0.0,<2", - "pyarrow", - "pytimeparse>=1.1.8,<2", - "pyyaml>=6.0,<7", - "retrying>=1.3.3,<2", - "s3fs>=2022.11,<2024", - "scikit-learn>=1.2.1,<1.4", - "scipy>=1.9,<2", - "snowflake-connector-python[pandas]>=3.0.4,<4", - "snowflake-snowpark-python>=1.8.0,<2", - "sqlparse>=0.4,<1", - "typing-extensions>=4.1.0,<5", - "xgboost>=1.7.3,<2" -] diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 793bbe93..f660cf73 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. 
-VERSION = "1.2.0" +VERSION = "1.2.1" diff --git a/tests/integ/snowflake/ml/_internal/snowpark_handlers_test.py b/tests/integ/snowflake/ml/_internal/snowpark_handlers_test.py index ef038bd9..bb492901 100644 --- a/tests/integ/snowflake/ml/_internal/snowpark_handlers_test.py +++ b/tests/integ/snowflake/ml/_internal/snowpark_handlers_test.py @@ -48,7 +48,7 @@ def _get_test_dataset(self) -> Tuple[pd.DataFrame, List[str], List[str]]: return (input_df_pandas, input_cols, label_cols) - @common_test_base.CommonTestBase.sproc_test() + @common_test_base.CommonTestBase.sproc_test(additional_packages=["inflection"]) def test_batch_inference(self) -> None: sklearn_estimator = SkLinearRegression() input_df_pandas, input_cols, label_cols = self._get_test_dataset() @@ -75,7 +75,7 @@ def test_batch_inference(self) -> None: np.testing.assert_allclose(sklearn_numpy_arr, sf_numpy_arr, rtol=1.0e-1, atol=1.0e-2) - @common_test_base.CommonTestBase.sproc_test() + @common_test_base.CommonTestBase.sproc_test(additional_packages=["inflection"]) def test_score_snowpark(self) -> None: sklearn_estimator = SkLinearRegression() input_df_pandas, input_cols, label_cols = self._get_test_dataset() diff --git a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel index d34395ef..9f5058c9 100644 --- a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel +++ b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel @@ -68,6 +68,7 @@ py_test( shard_count = 4, deps = [ "//snowflake/ml/modeling/framework", + "//snowflake/ml/modeling/impute:knn_imputer", "//snowflake/ml/modeling/pipeline", "//snowflake/ml/modeling/preprocessing:min_max_scaler", "//snowflake/ml/modeling/preprocessing:one_hot_encoder", diff --git a/tests/integ/snowflake/ml/extra_tests/pipeline_with_ohe_and_xgbr_test.py b/tests/integ/snowflake/ml/extra_tests/pipeline_with_ohe_and_xgbr_test.py index 58ad8ec0..605dc95f 100644 --- a/tests/integ/snowflake/ml/extra_tests/pipeline_with_ohe_and_xgbr_test.py +++ 
b/tests/integ/snowflake/ml/extra_tests/pipeline_with_ohe_and_xgbr_test.py @@ -3,6 +3,7 @@ from absl.testing import absltest from importlib_resources import files from sklearn.compose import ColumnTransformer as SkColumnTransformer +from sklearn.impute import KNNImputer as SkKNNImputer from sklearn.pipeline import Pipeline as SkPipeline from sklearn.preprocessing import ( MinMaxScaler as SkMinMaxScaler, @@ -10,6 +11,7 @@ ) from xgboost import XGBClassifier as XGB_XGBClassifier +from snowflake.ml.modeling.impute import KNNImputer from snowflake.ml.modeling.pipeline import Pipeline from snowflake.ml.modeling.preprocessing import ( MinMaxScaler, @@ -48,7 +50,7 @@ feature_cols = categorical_columns + numerical_columns -class GridSearchCVTest(absltest.TestCase): +class PipelineXGBRTest(absltest.TestCase): def setUp(self): """Creates Snowpark and Snowflake environments for testing.""" self._session = Session.builder.configs(SnowflakeLoginOptions()).create() @@ -79,6 +81,7 @@ def test_fit_and_compare_results(self) -> None: output_cols=numerical_columns, ), ), + ("KNNImputer", KNNImputer(input_cols=numerical_columns, output_cols=numerical_columns)), ("regression", XGBClassifier(label_cols=label_column, passthrough_cols="ROW_INDEX")), ] ) @@ -94,6 +97,7 @@ def test_fit_and_compare_results(self) -> None: [ ("cat_transformer", SkOneHotEncoder(), categorical_columns), ("num_transforms", SkMinMaxScaler(), numerical_columns), + ("num_imputer", SkKNNImputer(), numerical_columns), ] ), ), diff --git a/tests/integ/snowflake/ml/model/_client/model/BUILD.bazel b/tests/integ/snowflake/ml/model/_client/model/BUILD.bazel index f3410613..2426fd20 100644 --- a/tests/integ/snowflake/ml/model/_client/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/_client/model/BUILD.bazel @@ -6,7 +6,6 @@ py_test( srcs = ["model_impl_integ_test.py"], deps = [ "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/registry", 
"//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", @@ -20,13 +19,11 @@ py_test( timeout = "long", srcs = ["model_version_impl_integ_test.py"], deps = [ - "//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/_internal/utils:sql_identifier", "//snowflake/ml/model/_client/model:model_version_impl", "//snowflake/ml/registry", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/test_utils:db_manager", "//tests/integ/snowflake/ml/test_utils:model_factory", - "//tests/integ/snowflake/ml/test_utils:test_env_utils", ], ) diff --git a/tests/integ/snowflake/ml/model/_client/model/model_impl_integ_test.py b/tests/integ/snowflake/ml/model/_client/model/model_impl_integ_test.py index 5256c1e0..de9a92cb 100644 --- a/tests/integ/snowflake/ml/model/_client/model/model_impl_integ_test.py +++ b/tests/integ/snowflake/ml/model/_client/model/model_impl_integ_test.py @@ -4,7 +4,7 @@ from absl.testing import absltest, parameterized from packaging import version -from snowflake.ml._internal.utils import identifier, snowflake_env +from snowflake.ml._internal.utils import identifier from snowflake.ml.registry import registry from snowflake.ml.utils import connection_params from snowflake.snowpark import Session @@ -19,14 +19,6 @@ VERSION_NAME2 = "V2" -@unittest.skipUnless( - test_env_utils.get_current_snowflake_version() >= version.parse("8.0.0"), - "New model only available when the Snowflake Version is newer than 8.0.0", -) -@unittest.skipUnless( - test_env_utils.get_current_snowflake_cloud_type() == snowflake_env.SnowflakeCloudType.AWS, - "New model only available in AWS", -) class TestModelImplInteg(parameterized.TestCase): @classmethod def setUpClass(self) -> None: diff --git a/tests/integ/snowflake/ml/model/_client/model/model_version_impl_integ_test.py b/tests/integ/snowflake/ml/model/_client/model/model_version_impl_integ_test.py index 0ac15c80..acc8b876 100644 --- 
a/tests/integ/snowflake/ml/model/_client/model/model_version_impl_integ_test.py +++ b/tests/integ/snowflake/ml/model/_client/model/model_version_impl_integ_test.py @@ -1,31 +1,16 @@ -import unittest import uuid from absl.testing import absltest, parameterized -from packaging import version -from snowflake.ml._internal.utils import snowflake_env from snowflake.ml.registry import registry from snowflake.ml.utils import connection_params from snowflake.snowpark import Session -from tests.integ.snowflake.ml.test_utils import ( - db_manager, - model_factory, - test_env_utils, -) +from tests.integ.snowflake.ml.test_utils import db_manager, model_factory MODEL_NAME = "TEST_MODEL" VERSION_NAME = "V1" -@unittest.skipUnless( - test_env_utils.get_current_snowflake_version() >= version.parse("8.0.0"), - "New model only available when the Snowflake Version is newer than 8.0.0", -) -@unittest.skipUnless( - test_env_utils.get_current_snowflake_cloud_type() == snowflake_env.SnowflakeCloudType.AWS, - "New model only available in AWS", -) class TestModelVersionImplInteg(parameterized.TestCase): @classmethod def setUpClass(self) -> None: diff --git a/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py b/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py index 6639d3da..8b62f4f9 100644 --- a/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/spcs_llm_model_integ_test.py @@ -1,5 +1,6 @@ import os import tempfile +import unittest import pandas as pd import pytest @@ -18,6 +19,7 @@ ) +@unittest.skip("release-1.2.1") @pytest.mark.conda_incompatible class TestSPCSLLMModelInteg(spcs_integ_test_base.SpcsIntegTestBase): def setUp(self) -> None: @@ -47,7 +49,7 @@ def test_text_generation_pipeline( stage_path = f"@{self._test_stage}/{self._run_id}" deployment_stage_path = f"@{self._test_stage}/{self._run_id}" - model_api.save_model( # type: ignore[call-overload] + model_api.save_model( name="model", session=self._session, 
stage_path=stage_path, @@ -76,7 +78,7 @@ def test_text_generation_pipeline( model_id=svc_func_name, platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES, options={ - **deployment_options, # type: ignore[arg-type] + **deployment_options, }, # type: ignore[call-overload] ) assert deploy_info is not None diff --git a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py index 14ecd17b..4701b497 100644 --- a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py +++ b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py @@ -66,7 +66,7 @@ def base_test_case( platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method=target_method_arg, options={ - **permanent_deploy_args, # type: ignore[arg-type] + **permanent_deploy_args, **additional_deploy_options, }, # type: ignore[call-overload] ) diff --git a/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel b/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel index 4ce78a9a..d5d65045 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel @@ -1,3 +1,4 @@ +load("@rules_python//python:defs.bzl", "py_library") load("//bazel:py_rules.bzl", "py_test") package(default_visibility = ["//visibility:public"]) @@ -18,7 +19,9 @@ py_test( py_test( name = "accuracy_score_test", srcs = ["accuracy_score_test.py"], + shard_count = SHARD_COUNT, deps = [ + ":generator", "//snowflake/ml/modeling/metrics:classification", "//snowflake/ml/utils:connection_params", "//tests/integ/snowflake/ml/modeling/framework:utils", @@ -29,8 +32,11 @@ py_test( name = "confusion_matrix_test", timeout = TIMEOUT, srcs = ["confusion_matrix_test.py"], + shard_count = SHARD_COUNT, deps = [ + ":generator", "//snowflake/ml/modeling/metrics:classification", + "//snowflake/ml/modeling/metrics:metrics_utils", "//snowflake/ml/utils:connection_params", 
"//tests/integ/snowflake/ml/modeling/framework:utils", ], @@ -248,3 +254,12 @@ py_test( "//tests/integ/snowflake/ml/modeling/framework:utils", ], ) + +py_library( + name = "generator", + srcs = ["generator.py"], + deps = [ + "//snowflake/ml/modeling/metrics:metrics_utils", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) diff --git a/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py b/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py index f89a0d75..f566dacb 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/accuracy_score_test.py @@ -1,5 +1,6 @@ -from typing import Any, Dict +from typing import Optional +import numpy as np from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics @@ -8,21 +9,15 @@ from snowflake.ml.modeling import metrics as snowml_metrics from snowflake.ml.utils import connection_params from tests.integ.snowflake.ml.modeling.framework import utils +from tests.integ.snowflake.ml.modeling.metrics import generator -_ROWS = 100 _TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] -_BINARY_DATA, _SF_SCHEMA = utils.gen_fuzz_data( - rows=_ROWS, - types=_TYPES, - low=0, - high=2, -) -_MULTICLASS_DATA, _ = utils.gen_fuzz_data( - rows=_ROWS, - types=_TYPES, - low=0, - high=5, -) +_BINARY_LOW, _BINARY_HIGH = 0, 2 +_MULTICLASS_LOW, _MULTICLASS_HIGH = 0, 5 +_BINARY_DATA_LIST, _SF_SCHEMA = generator.gen_test_cases(_TYPES, _BINARY_LOW, _BINARY_HIGH) +_MULTICLASS_DATA_LIST, _ = generator.gen_test_cases(_TYPES, _MULTICLASS_LOW, _MULTICLASS_HIGH) +_REGULAR_BINARY_DATA_LIST, _LARGE_BINARY_DATA = _BINARY_DATA_LIST[:-1], _BINARY_DATA_LIST[-1] +_REGULAR_MULTICLASS_DATA_LIST, _LARGE_MULTICLASS_DATA = _MULTICLASS_DATA_LIST[:-1], _MULTICLASS_DATA_LIST[-1] _Y_TRUE_COL = _SF_SCHEMA[1] _Y_PRED_COL = _SF_SCHEMA[2] _Y_TRUE_COLS = [_SF_SCHEMA[1], _SF_SCHEMA[2]] @@ -40,70 +35,115 @@ 
def setUp(self) -> None: def tearDown(self) -> None: self._session.close() - @parameterized.parameters( # type: ignore[misc] - { - "params": { - "sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL], - "values": [ - {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": _Y_PRED_COLS}, - {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, - ], - } - }, + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_BINARY_DATA_LIST))), + sample_weight_col_name=[None, _SAMPLE_WEIGHT_COL], ) - def test_sample_weight(self, params: Dict[str, Any]) -> None: - for values in params["values"]: - data = values["data"] - y_true = values["y_true"] - y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) - - for sample_weight_col_name in params["sample_weight_col_name"]: - actual_score = snowml_metrics.accuracy_score( - df=input_df, - y_true_col_names=y_true, - y_pred_col_names=y_pred, - sample_weight_col_name=sample_weight_col_name, - ) - sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None - sklearn_score = sklearn_metrics.accuracy_score( - pandas_df[y_true], - pandas_df[y_pred], - sample_weight=sample_weight, - ) - self.assertAlmostEqual(sklearn_score, actual_score) - - @parameterized.parameters( # type: ignore[misc] - { - "params": { - "normalize": [True, False], - "values": [ - {"data": _BINARY_DATA, "y_true": _Y_TRUE_COLS, "y_pred": _Y_PRED_COLS}, - {"data": _MULTICLASS_DATA, "y_true": _Y_TRUE_COL, "y_pred": _Y_PRED_COL}, - ], - } - }, + def test_sample_weight_binary(self, data_index: int, sample_weight_col_name: Optional[str]) -> None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_BINARY_DATA_LIST[data_index], _SF_SCHEMA) + + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = 
pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + sample_weight=sample_weight, + ) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-6, atol=1.0e-6) + + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_MULTICLASS_DATA_LIST))), + sample_weight_col_name=[None, _SAMPLE_WEIGHT_COL], ) - def test_normalized(self, params: Dict[str, Any]) -> None: - for values in params["values"]: - data = values["data"] - y_true = values["y_true"] - y_pred = values["y_pred"] - pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) - - for normalize in params["normalize"]: - actual_score = snowml_metrics.accuracy_score( - df=input_df, - y_true_col_names=y_true, - y_pred_col_names=y_pred, - normalize=normalize, - ) - sklearn_score = sklearn_metrics.accuracy_score( - pandas_df[y_true], - pandas_df[y_pred], - normalize=normalize, - ) - self.assertAlmostEqual(sklearn_score, actual_score) + def test_sample_weight_multiclass(self, data_index: int, sample_weight_col_name: Optional[str]) -> None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_MULTICLASS_DATA_LIST[data_index], _SF_SCHEMA) + + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + sample_weight=sample_weight, + ) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-6, atol=1.0e-6) + + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_BINARY_DATA_LIST))), + normalize=[True, False], + ) + def test_normalized_binary(self, data_index: int, normalize: bool) -> 
None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_BINARY_DATA_LIST[data_index], _SF_SCHEMA) + + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + normalize=normalize, + ) + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + normalize=normalize, + ) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-6, atol=1.0e-6) + + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_MULTICLASS_DATA_LIST))), + normalize=[True, False], + ) + def test_normalized_multiclass(self, data_index: int, normalize: bool) -> None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_MULTICLASS_DATA_LIST[data_index], _SF_SCHEMA) + + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + normalize=normalize, + ) + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + normalize=normalize, + ) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-6, atol=1.0e-6) + + def test_with_large_num_of_rows_binary(self) -> None: + pandas_df, input_df = utils.get_df(self._session, _LARGE_BINARY_DATA, _SF_SCHEMA) + + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + ) + sklearn_score = sklearn_metrics.accuracy_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + ) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-6, atol=1.0e-6) + + def test_with_large_num_of_rows_multiclass(self) -> None: + pandas_df, input_df = utils.get_df(self._session, _LARGE_MULTICLASS_DATA, _SF_SCHEMA) + + actual_score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + ) + sklearn_score = sklearn_metrics.accuracy_score( + 
pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + ) + np.testing.assert_allclose(actual_score, sklearn_score, rtol=1.0e-6, atol=1.0e-6) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py b/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py index 97bd6f60..9d99c305 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py +++ b/tests/integ/snowflake/ml/modeling/metrics/confusion_matrix_test.py @@ -1,29 +1,22 @@ -from typing import Any, Dict +from typing import Optional import numpy as np +import numpy.typing as npt from absl.testing import parameterized from absl.testing.absltest import main from sklearn import metrics as sklearn_metrics from snowflake import snowpark from snowflake.ml.modeling import metrics as snowml_metrics +from snowflake.ml.modeling.metrics import metrics_utils from snowflake.ml.utils import connection_params from tests.integ.snowflake.ml.modeling.framework import utils +from tests.integ.snowflake.ml.modeling.metrics import generator _TYPES = [utils.DataType.INTEGER] * 2 + [utils.DataType.FLOAT] _LOW, _HIGH = 1, 5 -_SMALL_DATA, _SF_SCHEMA = utils.gen_fuzz_data( - rows=100, # data size < batch size - types=_TYPES, - low=-_LOW, - high=_HIGH, -) -_LARGE_DATA, _ = utils.gen_fuzz_data( - rows=1000 + 7, # data size > batch size - types=_TYPES, - low=-_LOW, - high=_HIGH, -) +_DATA_LIST, _SF_SCHEMA = generator.gen_test_cases(_TYPES, _LOW, _HIGH) +_REGULAR_DATA_LIST, _LARGE_DATA = _DATA_LIST[:-1], _DATA_LIST[-1] _Y_TRUE_COL = _SF_SCHEMA[1] _Y_PRED_COL = _SF_SCHEMA[2] _SAMPLE_WEIGHT_COL = _SF_SCHEMA[3] @@ -39,104 +32,82 @@ def setUp(self) -> None: def tearDown(self) -> None: self._session.close() - @parameterized.parameters( # type: ignore[misc] - {"params": {"data": [_SMALL_DATA, _LARGE_DATA], "labels": [None, [2, 0, 4]]}}, + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_DATA_LIST))), labels=[None, [2, 0, 4]] ) - def 
test_labels(self, params: Dict[str, Any]) -> None: - for data in params["data"]: - pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) + def test_labels(self, data_index: int, labels: Optional[npt.ArrayLike]) -> None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_DATA_LIST[data_index], _SF_SCHEMA) - for labels in params["labels"]: - actual_cm = snowml_metrics.confusion_matrix( - df=input_df, - y_true_col_name=_Y_TRUE_COL, - y_pred_col_name=_Y_PRED_COL, - labels=labels, - ) - sklearn_cm = sklearn_metrics.confusion_matrix( - pandas_df[_Y_TRUE_COL], - pandas_df[_Y_PRED_COL], - labels=labels, - ) - np.testing.assert_allclose(actual_cm, sklearn_cm) + actual_cm = snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + labels=labels, + ) + sklearn_cm = sklearn_metrics.confusion_matrix( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + labels=labels, + ) + np.testing.assert_allclose(actual_cm, sklearn_cm) - @parameterized.parameters( # type: ignore[misc] - {"params": {"data": [_SMALL_DATA, _LARGE_DATA], "sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_DATA_LIST))), sample_weight_col_name=[None, _SAMPLE_WEIGHT_COL] ) - def test_sample_weight(self, params: Dict[str, Any]) -> None: - for data in params["data"]: - pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) + def test_sample_weight(self, data_index: int, sample_weight_col_name: Optional[str]) -> None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_DATA_LIST[data_index], _SF_SCHEMA) - for sample_weight_col_name in params["sample_weight_col_name"]: - actual_cm = snowml_metrics.confusion_matrix( - df=input_df, - y_true_col_name=_Y_TRUE_COL, - y_pred_col_name=_Y_PRED_COL, - sample_weight_col_name=sample_weight_col_name, - ) - sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name 
else None - sklearn_cm = sklearn_metrics.confusion_matrix( - pandas_df[_Y_TRUE_COL], - pandas_df[_Y_PRED_COL], - sample_weight=sample_weight, - ) - np.testing.assert_allclose(actual_cm, sklearn_cm) + actual_cm = snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_cm = sklearn_metrics.confusion_matrix( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + sample_weight=sample_weight, + ) + np.testing.assert_allclose(actual_cm, sklearn_cm) - @parameterized.parameters( # type: ignore[misc] - {"params": {"data": [_SMALL_DATA, _LARGE_DATA], "normalize": ["true", "pred", "all", None]}}, + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_DATA_LIST))), normalize=["true", "pred", "all", None] ) - def test_normalize(self, params: Dict[str, Any]) -> None: - for data in params["data"]: - pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) + def test_normalize(self, data_index: int, normalize: Optional[str]) -> None: + pandas_df, input_df = utils.get_df(self._session, _REGULAR_DATA_LIST[data_index], _SF_SCHEMA) - for normalize in params["normalize"]: - actual_cm = snowml_metrics.confusion_matrix( - df=input_df, - y_true_col_name=_Y_TRUE_COL, - y_pred_col_name=_Y_PRED_COL, - normalize=normalize, - ) - sklearn_cm = sklearn_metrics.confusion_matrix( - pandas_df[_Y_TRUE_COL], - pandas_df[_Y_PRED_COL], - normalize=normalize, - ) - np.testing.assert_allclose(actual_cm, sklearn_cm) + actual_cm = snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + normalize=normalize, + ) + sklearn_cm = sklearn_metrics.confusion_matrix( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + normalize=normalize, + ) + np.testing.assert_allclose(actual_cm, sklearn_cm) - 
@parameterized.parameters( # type: ignore[misc] - {"params": {"labels": []}}, - {"params": {"labels": [100, -10]}}, - {"params": {"normalize": "invalid"}}, + @parameterized.product( # type: ignore[misc] + data_index=list(range(len(_REGULAR_DATA_LIST))), labels=[None, [], [100, -10]], normalize=[None, "invalid"] ) - def test_invalid_params(self, params: Dict[str, Any]) -> None: - input_df = self._session.create_dataframe(_SMALL_DATA, schema=_SF_SCHEMA) - - if "labels" in params: - with self.assertRaises(ValueError): - snowml_metrics.confusion_matrix( - df=input_df, - y_true_col_name=_Y_TRUE_COL, - y_pred_col_name=_Y_PRED_COL, - labels=params["labels"], - ) + def test_invalid_params(self, data_index: int, labels: Optional[npt.ArrayLike], normalize: Optional[str]) -> None: + input_df = self._session.create_dataframe(_REGULAR_DATA_LIST[data_index], schema=_SF_SCHEMA) - if "normalize" in params: + if labels is not None or normalize is not None: with self.assertRaises(ValueError): snowml_metrics.confusion_matrix( df=input_df, y_true_col_name=_Y_TRUE_COL, y_pred_col_name=_Y_PRED_COL, - normalize=params["normalize"], + labels=labels, + normalize=normalize, ) def test_with_large_num_of_rows(self) -> None: - data, sf_schema = utils.gen_fuzz_data( - rows=100 * 1000 + 7, - types=[utils.DataType.INTEGER] * 2 + [utils.DataType.FLOAT], - low=-1, - high=5, - ) - pandas_df, input_df = utils.get_df(self._session, data, sf_schema) + pandas_df, input_df = utils.get_df(self._session, _LARGE_DATA, _SF_SCHEMA) actual_cm = snowml_metrics.confusion_matrix( df=input_df, @@ -150,13 +121,13 @@ def test_with_large_num_of_rows(self) -> None: np.testing.assert_allclose(actual_cm, sklearn_cm) def test_with_divisible_num_of_rows(self) -> None: - data, sf_schema = utils.gen_fuzz_data( - rows=4 * 1000, - types=[utils.DataType.INTEGER] * 2 + [utils.DataType.FLOAT], - low=-1, - high=5, + data, _ = utils.gen_fuzz_data( + rows=metrics_utils.BATCH_SIZE * 4, + types=_TYPES, + low=-_LOW, + high=_HIGH, ) - 
pandas_df, input_df = utils.get_df(self._session, data, sf_schema) + pandas_df, input_df = utils.get_df(self._session, data, _SF_SCHEMA) actual_cm = snowml_metrics.confusion_matrix( df=input_df, diff --git a/tests/integ/snowflake/ml/modeling/metrics/generator.py b/tests/integ/snowflake/ml/modeling/metrics/generator.py new file mode 100644 index 00000000..012296d2 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/generator.py @@ -0,0 +1,42 @@ +from typing import Any, List, Tuple, Union + +from snowflake.ml.modeling.metrics import metrics_utils +from tests.integ.snowflake.ml.modeling.framework import utils +from tests.integ.snowflake.ml.modeling.framework.utils import MAX_INT, MIN_INT, DataType + +_NUM_ROWS = [ + metrics_utils.BATCH_SIZE // 2, # row # < batch size + metrics_utils.BATCH_SIZE + 7, # row # > batch size + metrics_utils.BATCH_SIZE * 4, # row # is a multiple of batch size +] +_NUM_ROW_LARGE = metrics_utils.BATCH_SIZE * 100 + 7 # large row # +_NUM_ROWS.append(_NUM_ROW_LARGE) + + +def gen_test_cases( + types: List[DataType], low: Union[int, List[int]] = MIN_INT, high: Union[int, List[int]] = MAX_INT +) -> Tuple[List[List[Any]], List[str]]: + """ + Generate metrics test cases. The last test case has a large data size. 
+ + Args: + types: type per column + low: lower bound(s) of the output interval (inclusive) + high: upper bound(s) of the output interval (exclusive) + + Returns: + A tuple of test data of multiple sizes and column names + """ + data_list = [] + snowflake_identifiers: List[str] = [] + for num_row in _NUM_ROWS: + data, identifiers = utils.gen_fuzz_data( + rows=num_row, + types=types, + low=low, + high=high, + ) + data_list.append(data) + if len(snowflake_identifiers) == 0: + snowflake_identifiers = identifiers + return data_list, snowflake_identifiers diff --git a/tests/integ/snowflake/ml/modeling/model_selection/check_output_hpo_integ_test.py b/tests/integ/snowflake/ml/modeling/model_selection/check_output_hpo_integ_test.py index 95326f58..d6ea8558 100644 --- a/tests/integ/snowflake/ml/modeling/model_selection/check_output_hpo_integ_test.py +++ b/tests/integ/snowflake/ml/modeling/model_selection/check_output_hpo_integ_test.py @@ -3,9 +3,11 @@ to match all kinds of input and output for GridSearchCV/RandomSearchCV. 
""" +import inspect from typing import Any, Dict, List, Tuple, Union from unittest import mock +import cloudpickle as cp import inflection import numpy as np import numpy.typing as npt @@ -13,6 +15,12 @@ from absl.testing import absltest, parameterized from sklearn.datasets import load_iris from sklearn.linear_model import LinearRegression as SkLinearRegression +from sklearn.metrics import ( + make_scorer, + mean_absolute_error, + mean_squared_error, + r2_score, +) from sklearn.model_selection import GridSearchCV as SkGridSearchCV, KFold from sklearn.model_selection._split import BaseCrossValidator @@ -37,7 +45,27 @@ def _load_iris_data() -> Tuple[pd.DataFrame, List[str], List[str]]: return input_df_pandas, input_cols, label_col -class GridSearchCVTest(parameterized.TestCase): +def matrix_scorer(clf: SkLinearRegression, X: npt.NDArray[Any], y: npt.NDArray[np.int_]) -> Dict[str, float]: + y_pred = clf.predict(X) + return { + "mean_absolute_error": mean_absolute_error(y, y_pred), + "mean_squared_error": mean_squared_error(y, y_pred), + "r2_score": r2_score(y, y_pred), + } + + +def refit_strategy(cv_results: Dict[str, Any]) -> Any: + precision_threshold = 0.3 + cv_results_ = pd.DataFrame(cv_results) + high_precision_cv_results = cv_results_[cv_results_["mean_test_score"] > precision_threshold] + return high_precision_cv_results["mean_test_score"].idxmin() + + +cp.register_pickle_by_value(inspect.getmodule(matrix_scorer)) +cp.register_pickle_by_value(inspect.getmodule(refit_strategy)) + + +class HPOCorrectness(parameterized.TestCase): def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" self._session = Session.builder.configs(SnowflakeLoginOptions()).create() @@ -65,11 +93,14 @@ def _compare_cv_results(self, cv_result_1: Dict[str, Any], cv_result_2: Dict[str np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-7, atol=1.0e-7) # Do not compare the fit time - def _compare_global_variables(self, sk_obj: SkLinearRegression, 
sklearn_reg: SkLinearRegression) -> None: + def _compare_global_variables(self, sk_obj: SkGridSearchCV, sklearn_reg: SkGridSearchCV) -> None: # the result of SnowML grid search cv should behave the same as sklearn's - # TODO - check scorer_ - assert isinstance(sk_obj.refit_time_, float) - np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) + if hasattr(sk_obj, "refit_time_"): + # if refit = False, there is no attribute called refit_time_ + assert isinstance(sk_obj.refit_time_, float) + if hasattr(sk_obj, "best_score_"): + # if refit = callable and no best_score specified, then this attribute is empty + np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) if hasattr(sk_obj, "n_splits_"): # n_splits_ is only available in RandomSearchCV @@ -97,7 +128,7 @@ def _compare_global_variables(self, sk_obj: SkLinearRegression, sklearn_reg: SkL rtol=1.0e-7, atol=1.0e-7, ) - self.assertEqual(sk_obj.n_features_in_, sklearn_reg.n_features_in_) + self.assertEqual(sk_obj.n_features_in_, sklearn_reg.n_features_in_) if hasattr(sk_obj, "feature_names_in_") and hasattr( sklearn_reg, "feature_names_in_" ): # feature_names_in_ variable is only available when `best_estimator_` is defined @@ -112,7 +143,7 @@ def _compare_global_variables(self, sk_obj: SkLinearRegression, sklearn_reg: SkL # Standard Sklearn sample { "is_single_node": False, - "params": {"copy_X": [True, False], "fit_intercept": [True, False]}, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, "cv": 5, "kwargs": dict(), }, @@ -120,8 +151,8 @@ def _compare_global_variables(self, sk_obj: SkLinearRegression, sklearn_reg: SkL { "is_single_node": False, "params": [ - {"copy_X": [True], "fit_intercept": [True, False]}, - {"copy_X": [False], "fit_intercept": [True, False]}, + {"fit_intercept": [True], "positive": [True, False]}, + {"fit_intercept": 
[False], "positive": [True, False]}, ], "cv": 5, "kwargs": dict(), @@ -130,18 +161,18 @@ def _compare_global_variables(self, sk_obj: SkLinearRegression, sklearn_reg: SkL { "is_single_node": False, "params": [ - {"copy_X": [True], "fit_intercept": [True, False]}, - {"copy_X": [False], "fit_intercept": [True, False]}, + {"fit_intercept": [True], "positive": [True, False]}, + {"fit_intercept": [False], "positive": [True, False]}, ], "cv": KFold(5), "kwargs": dict(), }, - # cv: iterator + # cv: iterator with numpy array { "is_single_node": False, "params": [ - {"copy_X": [True], "fit_intercept": [True, False]}, - {"copy_X": [False], "fit_intercept": [True, False]}, + {"fit_intercept": [True], "positive": [True, False]}, + {"fit_intercept": [False], "positive": [True, False]}, ], "cv": [ ( @@ -167,11 +198,12 @@ def _compare_global_variables(self, sk_obj: SkLinearRegression, sklearn_reg: SkL ], "kwargs": dict(), }, + # cv: iterator with list { "is_single_node": False, "params": [ - {"copy_X": [True], "fit_intercept": [True, False]}, - {"copy_X": [False], "fit_intercept": [True, False]}, + {"fit_intercept": [True], "positive": [True, False]}, + {"fit_intercept": [False], "positive": [True, False]}, ], "cv": [ ( @@ -197,22 +229,129 @@ def _compare_global_variables(self, sk_obj: SkLinearRegression, sklearn_reg: SkL ], "kwargs": dict(), }, - # TODO: scoring + # scoring: single score with a string + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(scoring="neg_mean_absolute_error"), + }, + # scoring: single score with a callable + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(scoring=make_scorer(mean_absolute_error)), + }, + # scoring: multiple scores with list { "is_single_node": False, - "params": {"copy_X": [True, False], "fit_intercept": [True, False]}, + "params": {"fit_intercept": [True, False], "positive": 
[True, False]}, "cv": 5, - "kwargs": dict(scoring=["accuracy", "f1_macro"], refit="f1_macro", return_train_score=True), + "kwargs": dict( + scoring=["neg_mean_absolute_error", "neg_mean_squared_error"], + refit="neg_mean_squared_error", + return_train_score=True, + ), + }, + # scoring: multiple scores with tuple + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict( + scoring=("neg_mean_absolute_error", "neg_mean_squared_error"), + refit="neg_mean_squared_error", + return_train_score=True, + ), + }, + # scoring: multiple scores with custom function + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(scoring=matrix_scorer, refit="mean_squared_error"), + }, + # scoring: multiple scores with dictionary of callable + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict( + scoring={ + "mean_absolute_error": make_scorer(mean_absolute_error), + "mean_squared_error": make_scorer(mean_squared_error), + }, + refit="mean_squared_error", + return_train_score=True, + ), + }, + # refit: True + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(refit=False), + }, + # refit: str + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(refit="neg_mean_squared_error"), + }, + # refit: callable + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(refit=refit_strategy, error_score=0.98), + }, + # error_score: float + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(error_score=0), + }, + # error_score: 'raise' + { + "is_single_node": False, + 
"params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict( + scoring=["neg_mean_absolute_error", "neg_mean_squared_error"], + refit="neg_mean_squared_error", + return_train_score=True, + ), }, - # TODO: refit - # TODO: error_score # return_train_score: True { "is_single_node": False, - "params": {"copy_X": [True, False], "fit_intercept": [True, False]}, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, "cv": 5, "kwargs": dict(return_train_score=True), }, + # n_jobs = -1 + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(n_jobs=-1), + }, + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(n_jobs=3), + }, + # pre_dispatch: ignored + { + "is_single_node": False, + "params": {"fit_intercept": [True, False], "positive": [True, False]}, + "cv": 5, + "kwargs": dict(pre_dispatch=5), + }, ) @mock.patch("snowflake.ml.modeling._internal.model_trainer_builder.is_single_node") def test_fit_and_compare_results( diff --git a/tests/integ/snowflake/ml/modeling/model_selection/grid_search_integ_test.py b/tests/integ/snowflake/ml/modeling/model_selection/grid_search_integ_test.py index 7465acab..aa5f41f1 100644 --- a/tests/integ/snowflake/ml/modeling/model_selection/grid_search_integ_test.py +++ b/tests/integ/snowflake/ml/modeling/model_selection/grid_search_integ_test.py @@ -1,4 +1,5 @@ -from typing import List, Tuple +import numbers +from typing import Any, Dict, List, Tuple from unittest import mock import inflection @@ -46,7 +47,7 @@ def setUp(self): def tearDown(self): self._session.close() - def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: + def _compare_cv_results(self, cv_result_1: Dict[str, Any], cv_result_2: Dict[str, Any]) -> None: # compare the keys self.assertEqual(cv_result_1.keys(), cv_result_2.keys()) # compare the values @@ -60,6 
+61,71 @@ def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-1, atol=1.0e-2) # Do not compare the fit time + def _compare_global_variables( + self, + sk_obj: SkGridSearchCV, + sklearn_reg: SkGridSearchCV, + ) -> None: + # the result of SnowML grid search cv should behave the same as sklearn's + if hasattr(sk_obj, "refit_time_"): + # if refit = False, there is no attribute called refit_time_ + assert isinstance(sk_obj.refit_time_, float) + if hasattr(sk_obj, "best_score_"): + # if refit = callable and no best_score specified, then this attribute is empty + np.testing.assert_allclose( + sk_obj.best_score_, + sklearn_reg.best_score_, + rtol=1.0e-5, + atol=1.0e-4, + ) + self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) + self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) + if hasattr(sk_obj, "n_splits_"): # n_splits_ is only available in RandomSearchCV + self.assertEqual(sk_obj.n_splits_, sklearn_reg.n_splits_) + if hasattr(sk_obj, "best_estimator_"): + for variable_name in sk_obj.best_estimator_.__dict__.keys(): + if variable_name not in ("n_jobs", "estimator", "estimator_", "estimators_", "_Booster"): + target_element = getattr(sk_obj.best_estimator_, variable_name) + if isinstance(target_element, np.ndarray): + if getattr(sk_obj.best_estimator_, variable_name).dtype == "object": + self.assertEqual( + getattr(sk_obj.best_estimator_, variable_name).tolist(), + getattr(sklearn_reg.best_estimator_, variable_name).tolist(), + ) + else: + np.testing.assert_allclose( + target_element, + getattr(sklearn_reg.best_estimator_, variable_name), + rtol=1.0e-5, + atol=1.0e-4, + ) + elif isinstance(target_element, (str, bool, tuple, dict)): + self.assertEqual( + target_element, + getattr(sklearn_reg.best_estimator_, variable_name), + ) + elif isinstance(target_element, numbers.Number): + np.testing.assert_allclose( + target_element, + getattr(sklearn_reg.best_estimator_, variable_name), + 
rtol=1.0e-5, + atol=1.0e-4, + ) + elif isinstance(target_element, type(None)): + assert isinstance(getattr(sklearn_reg.best_estimator_, variable_name), type(None)) + else: + raise ValueError(variable_name, target_element) + self.assertEqual(sk_obj.n_features_in_, sklearn_reg.n_features_in_) + if hasattr(sk_obj, "feature_names_in_") and hasattr( + sklearn_reg, "feature_names_in_" + ): # feature_names_in_ variable is only available when `best_estimator_` is defined + self.assertEqual(sk_obj.feature_names_in_.tolist(), sklearn_reg.feature_names_in_.tolist()) + if hasattr(sk_obj, "classes_"): + np.testing.assert_allclose(sk_obj.classes_, sklearn_reg.classes_) + self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) + if not sk_obj.multimetric_: + self.assertEqual(sk_obj.best_params_, sklearn_reg.best_params_) + @mock.patch("snowflake.ml.modeling._internal.model_trainer_builder.is_single_node") def test_fit_and_compare_results(self, mock_is_single_node) -> None: mock_is_single_node.return_value = True # falls back to HPO implementation @@ -156,15 +222,7 @@ def test_fit_and_compare_results_distributed( sk_obj = reg.to_sklearn() # the result of SnowML grid search cv should behave the same as sklearn's - np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) - self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) - - # self.assertEqual(sklearn_reg.multimetric_, False) - self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) - self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) - - if not sk_obj.multimetric_: - self.assertEqual(sk_obj.best_params_, sklearn_reg.best_params_) + self._compare_global_variables(sk_obj, sklearn_reg) actual_arr = reg.predict(self._input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() sklearn_numpy_arr = sklearn_reg.predict(self._input_df_pandas[self._input_cols]) @@ -256,6 +314,10 @@ def test_transform(self, mock_is_single_node) -> None: reg.fit(self._input_df) 
sklearn_reg.fit(X=self._input_df_pandas[self._input_cols], y=self._input_df_pandas[self._label_col].squeeze()) + # the result of SnowML grid search cv should behave the same as sklearn's + sk_obj = reg.to_sklearn() + self._compare_global_variables(sk_obj, sklearn_reg) + transformed = reg.transform(self._input_df).to_pandas().sort_values(by="INDEX") sk_transformed = sklearn_reg.transform(self._input_df_pandas[self._input_cols]) diff --git a/tests/integ/snowflake/ml/modeling/model_selection/randomized_search_integ_test.py b/tests/integ/snowflake/ml/modeling/model_selection/randomized_search_integ_test.py index dc6391bb..a0457bd7 100644 --- a/tests/integ/snowflake/ml/modeling/model_selection/randomized_search_integ_test.py +++ b/tests/integ/snowflake/ml/modeling/model_selection/randomized_search_integ_test.py @@ -1,4 +1,5 @@ -from typing import List, Tuple +import numbers +from typing import Any, Dict, List, Tuple from unittest import mock import inflection @@ -46,7 +47,7 @@ def setUp(self): def tearDown(self): self._session.close() - def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: + def _compare_cv_results(self, cv_result_1: Dict[str, Any], cv_result_2: Dict[str, Any]) -> None: # compare the keys self.assertEqual(cv_result_1.keys(), cv_result_2.keys()) # compare the values @@ -60,6 +61,71 @@ def _compare_cv_results(self, cv_result_1, cv_result_2) -> None: np.testing.assert_allclose(v, cv_result_2[k], rtol=1.0e-1, atol=1.0e-2) # Do not compare the fit time + def _compare_global_variables( + self, + sk_obj: SkRandomizedSearchCV, + sklearn_reg: SkRandomizedSearchCV, + ) -> None: + # the result of SnowML grid search cv should behave the same as sklearn's + if hasattr(sk_obj, "refit_time_"): + # if refit = False, there is no attribute called refit_time_ + assert isinstance(sk_obj.refit_time_, float) + if hasattr(sk_obj, "best_score_"): + # if refit = callable and no best_score specified, then this attribute is empty + np.testing.assert_allclose( + 
sk_obj.best_score_, + sklearn_reg.best_score_, + rtol=1.0e-5, + atol=1.0e-4, + ) + self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) + self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) + if hasattr(sk_obj, "n_splits_"): # n_splits_ is only available in RandomSearchCV + self.assertEqual(sk_obj.n_splits_, sklearn_reg.n_splits_) + if hasattr(sk_obj, "best_estimator_"): + for variable_name in sk_obj.best_estimator_.__dict__.keys(): + if variable_name not in ("n_jobs", "estimator", "estimator_", "estimators_", "_Booster"): + target_element = getattr(sk_obj.best_estimator_, variable_name) + if isinstance(target_element, np.ndarray): + if getattr(sk_obj.best_estimator_, variable_name).dtype == "object": + self.assertEqual( + getattr(sk_obj.best_estimator_, variable_name).tolist(), + getattr(sklearn_reg.best_estimator_, variable_name).tolist(), + ) + else: + np.testing.assert_allclose( + target_element, + getattr(sklearn_reg.best_estimator_, variable_name), + rtol=1.0e-5, + atol=1.0e-4, + ) + elif isinstance(target_element, (str, bool, tuple, dict)): + self.assertEqual( + target_element, + getattr(sklearn_reg.best_estimator_, variable_name), + ) + elif isinstance(target_element, numbers.Number): + np.testing.assert_allclose( + target_element, + getattr(sklearn_reg.best_estimator_, variable_name), + rtol=1.0e-5, + atol=1.0e-4, + ) + elif isinstance(target_element, type(None)): + assert isinstance(getattr(sklearn_reg.best_estimator_, variable_name), type(None)) + else: + raise ValueError(variable_name, target_element) + self.assertEqual(sk_obj.n_features_in_, sklearn_reg.n_features_in_) + if hasattr(sk_obj, "feature_names_in_") and hasattr( + sklearn_reg, "feature_names_in_" + ): # feature_names_in_ variable is only available when `best_estimator_` is defined + self.assertEqual(sk_obj.feature_names_in_.tolist(), sklearn_reg.feature_names_in_.tolist()) + if hasattr(sk_obj, "classes_"): + np.testing.assert_allclose(sk_obj.classes_, 
sklearn_reg.classes_) + self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) + if not sk_obj.multimetric_: + self.assertEqual(sk_obj.best_params_, sklearn_reg.best_params_) + @parameterized.parameters( { "is_single_node": True, @@ -109,15 +175,7 @@ def test_fit_and_compare_results( sk_obj = reg.to_sklearn() # the result of SnowML grid search cv should behave the same as sklearn's - np.testing.assert_allclose(sk_obj.best_score_, sklearn_reg.best_score_) - self.assertEqual(sk_obj.multimetric_, sklearn_reg.multimetric_) - - # self.assertEqual(sklearn_reg.multimetric_, False) - self.assertEqual(sk_obj.best_index_, sklearn_reg.best_index_) - self._compare_cv_results(sk_obj.cv_results_, sklearn_reg.cv_results_) - - if not sk_obj.multimetric_: - self.assertEqual(sk_obj.best_params_, sklearn_reg.best_params_) + self._compare_global_variables(sk_obj, sklearn_reg) actual_arr = reg.predict(self._input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() sklearn_numpy_arr = sklearn_reg.predict(self._input_df_pandas[self._input_cols]) @@ -209,6 +267,10 @@ def test_transform(self, mock_is_single_node) -> None: reg.fit(self._input_df) sklearn_reg.fit(X=self._input_df_pandas[self._input_cols], y=self._input_df_pandas[self._label_col].squeeze()) + # the result of SnowML grid search cv should behave the same as sklearn's + sk_obj = reg.to_sklearn() + self._compare_global_variables(sk_obj, sklearn_reg) + transformed = reg.transform(self._input_df).to_pandas().sort_values(by="INDEX") sk_transformed = sklearn_reg.transform(self._input_df_pandas[self._input_cols]) diff --git a/tests/integ/snowflake/ml/registry/model/BUILD.bazel b/tests/integ/snowflake/ml/registry/model/BUILD.bazel index 727e7374..4dd54706 100644 --- a/tests/integ/snowflake/ml/registry/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/registry/model/BUILD.bazel @@ -7,7 +7,6 @@ py_library( testonly = True, srcs = ["registry_model_test_base.py"], deps = [ - 
"//snowflake/ml/_internal/utils:snowflake_env", "//snowflake/ml/model:type_hints", "//snowflake/ml/registry", "//snowflake/ml/utils:connection_params", diff --git a/tests/integ/snowflake/ml/registry/model/registry_custom_model_test.py b/tests/integ/snowflake/ml/registry/model/registry_custom_model_test.py index 8f68db1f..96371706 100644 --- a/tests/integ/snowflake/ml/registry/model/registry_custom_model_test.py +++ b/tests/integ/snowflake/ml/registry/model/registry_custom_model_test.py @@ -7,6 +7,7 @@ from absl.testing import absltest from snowflake.ml.model import custom_model +from snowflake.snowpark._internal import utils as snowpark_utils from tests.integ.snowflake.ml.registry.model import registry_model_test_base from tests.integ.snowflake.ml.test_utils import dataframe_utils @@ -107,6 +108,26 @@ def test_custom_demo_model_sp( }, ) + def test_custom_demo_model_sp_one_query( + self, + ) -> None: + lm = DemoModel(custom_model.ModelContext()) + arr = [[1, 2, 3], [4, 2, 5]] + sp_df = self._session.create_dataframe(arr, schema=['"c1"', '"c2"', '"c3"']) + table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE) + sp_df.write.save_as_table(table_name, mode="errorifexists", table_type="temporary") + sp_df_2 = self._session.table(table_name) + assert len(sp_df_2.queries["queries"]) == 1, sp_df_2.queries + assert len(sp_df_2.queries["post_actions"]) == 0, sp_df_2.queries + y_df_expected = pd.DataFrame([[1, 2, 3, 1], [4, 2, 5, 4]], columns=["c1", "c2", "c3", "output"]) + self._test_registry_model( + model=lm, + sample_input=sp_df_2, + prediction_assert_fns={ + "predict": (sp_df_2, lambda res: dataframe_utils.check_sp_df_res(res, y_df_expected, check_dtype=False)) + }, + ) + def test_custom_demo_model_sp_quote( self, ) -> None: diff --git a/tests/integ/snowflake/ml/registry/model/registry_model_test_base.py b/tests/integ/snowflake/ml/registry/model/registry_model_test_base.py index 13a85a09..c8ee0936 100644 --- 
a/tests/integ/snowflake/ml/registry/model/registry_model_test_base.py +++ b/tests/integ/snowflake/ml/registry/model/registry_model_test_base.py @@ -1,12 +1,9 @@ import inspect -import unittest import uuid from typing import Any, Callable, Dict, List, Optional, Tuple from absl.testing import absltest -from packaging import version -from snowflake.ml._internal.utils import snowflake_env from snowflake.ml.model import type_hints as model_types from snowflake.ml.registry import registry from snowflake.ml.utils import connection_params @@ -14,14 +11,6 @@ from tests.integ.snowflake.ml.test_utils import db_manager, test_env_utils -@unittest.skipUnless( - test_env_utils.get_current_snowflake_version() >= version.parse("8.0.0"), - "New model only available when the Snowflake Version is newer than 8.0.0", -) -@unittest.skipUnless( - test_env_utils.get_current_snowflake_cloud_type() == snowflake_env.SnowflakeCloudType.AWS, - "New model only available in AWS", -) class RegistryModelTestBase(absltest.TestCase): def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" diff --git a/tests/integ/snowflake/ml/test_utils/BUILD.bazel b/tests/integ/snowflake/ml/test_utils/BUILD.bazel index 2e8ca7d8..4587f2e6 100644 --- a/tests/integ/snowflake/ml/test_utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/test_utils/BUILD.bazel @@ -1,8 +1,8 @@ -load("//bazel:py_rules.bzl", "py_genrule", "py_library") +load("//bazel:py_rules.bzl", "py_genrule", "py_library", "py_test") package(default_visibility = ["//tests/integ/snowflake/ml:__subpackages__"]) -GEN_SNOWML_REQ_CMD = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode dev_version --format python --snowflake_channel_only > $@" +GEN_SNOWML_REQ_CMD = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location 
//bazel/requirements:requirements.schema.json) --mode version_requirements --format python --snowflake_channel_only > $@" py_genrule( name = "gen_snowml_requirements", @@ -93,3 +93,11 @@ py_library( "//tests/integ/snowflake/ml/test_utils:db_manager", ], ) + +py_test( + name = "common_test_base_test", + srcs = ["common_test_base_test.py"], + deps = [ + ":common_test_base", + ], +) diff --git a/tests/integ/snowflake/ml/test_utils/common_test_base.py b/tests/integ/snowflake/ml/test_utils/common_test_base.py index 11724c09..f5994365 100644 --- a/tests/integ/snowflake/ml/test_utils/common_test_base.py +++ b/tests/integ/snowflake/ml/test_utils/common_test_base.py @@ -2,11 +2,22 @@ import itertools import os import tempfile -from typing import Any, Callable, List, Literal, Optional, Tuple, Type, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + Type, + TypeVar, + Union, +) import cloudpickle from absl.testing import absltest, parameterized -from packaging import requirements +from packaging import requirements, specifiers from typing_extensions import Concatenate, ParamSpec from snowflake.ml._internal import env, env_utils, file_utils @@ -36,6 +47,22 @@ def get_function_body(func: Callable[..., Any]) -> str: return "".join([line[indentation:] for line in source_lines_generator]) +def get_modified_test_cases( + test_cases: Union[List[Dict[str, Any]], List[Tuple[Any, ...]]], + additional_cases: List[Any], + additional_case_arg_name: str, + naming_type: object, +) -> Union[List[Dict[str, Any]], List[Tuple[Any, ...]]]: + if all(isinstance(tc, dict) for tc in test_cases): + modified_test_cases = [{**t1, additional_case_arg_name: t2} for t1 in test_cases for t2 in additional_cases] + if naming_type is parameterized._NAMED: + for tc in modified_test_cases: + tc["testcase_name"] += f"_{tc[additional_case_arg_name]}" + else: + raise ValueError("Unsupported parameterized test argument input.") + return modified_test_cases + + 
class CommonTestBase(parameterized.TestCase): def setUp(self) -> None: """Creates Snowpark and Snowflake environments for testing.""" @@ -47,7 +74,10 @@ def tearDown(self) -> None: @classmethod def sproc_test( - kclass: Type[_V], local: bool = True, test_callers_rights: bool = True + kclass: Type[_V], + local: bool = True, + test_callers_rights: bool = True, + additional_packages: Optional[List[str]] = None, ) -> Callable[ [Callable[Concatenate[_V, _T_args], None]], Union[parameterized._ParameterizedTestIter, Callable[Concatenate[_V, _T_args], None]], @@ -63,12 +93,18 @@ def decorator( original_name = fn._original_name naming_type = fn._naming_type test_cases = list(fn.testcases) + if naming_type is parameterized._NAMED: + method_list = [method.__name__ for method in fn] + else: + method_list = [f"{original_name}{i}" for i, _ in enumerate(fn)] else: actual_method = fn original_name = fn.__name__ naming_type = parameterized._ARGUMENT_REPR test_cases = [{}] + method_list = [original_name] + assert len(method_list) > 0 test_module = inspect.getmodule(actual_method) assert test_module assert test_module.__file__ @@ -78,7 +114,6 @@ def decorator( rel_path = test_module_path[ind:] rel_path = os.path.splitext(rel_path)[0] test_module_name = rel_path.replace(os.sep, ".") - method_list = [func for func in dir(kclass) if func.startswith(original_name)] def test_wrapper( self: _V, @@ -100,12 +135,25 @@ def _in_sproc_test(execute_as: Literal["owner", "caller"] = "owner") -> None: file_utils.zip_python_package(tests_zip_module_filename, "tests") imports = [snowml_zip_module_filename, tests_zip_module_filename] - packages = [ - req - for req in _snowml_requirements.REQUIREMENTS + packages = additional_packages or [] + for req_str in _snowml_requirements.REQUIREMENTS: + req = requirements.Requirement(req_str) # Remove "_" not in req once Snowpark 1.11.0 available, it is a workaround for their bug. 
- if not any(offending in req for offending in ["snowflake-connector-python", "pyarrow", "_"]) - ] + if any( + offending in req.name for offending in ["snowflake-connector-python", "pyarrow", "_"] + ): + continue + # != and ~= are not supported by Snowpark + req.specifier = specifiers.SpecifierSet( + ",".join( + [ + str(spec) + for spec in req.specifier + if spec.operator in ["<", "<=", ">", ">=", "=="] + ] + ) + ) + packages.append(str(req)) cloudpickle.register_pickle_by_value(test_module) @@ -133,23 +181,23 @@ def test_in_sproc(sess: session.Session, test_name: str) -> None: if result.testsRun == 0: raise RuntimeError("Unit test does not run any test.") - for method in method_list: - test_in_sproc(self.session, f"{test_module_name}.{self.__class__.__qualname__}.{method}") + for method_name in method_list: + test_in_sproc( + self.session, f"{test_module_name}.{self.__class__.__qualname__}.{method_name}" + ) cloudpickle.unregister_pickle_by_value(test_module) _in_sproc_test(execute_as=_sproc_test_mode) - additional_cases = [ - {"_sproc_test_mode": "owner"}, - ] + additional_cases = ["owner"] if local: - additional_cases.append({"_sproc_test_mode": "local"}) + additional_cases.append("local") if test_callers_rights: - additional_cases.append({"_sproc_test_mode": "caller"}) + additional_cases.append("caller") - modified_test_cases = [{**t1, **t2} for t1 in test_cases for t2 in additional_cases] + modified_test_cases = get_modified_test_cases(test_cases, additional_cases, "_sproc_test_mode", naming_type) return parameterized._ParameterizedTestIter( test_wrapper, modified_test_cases, naming_type, original_name=original_name @@ -183,10 +231,7 @@ def decorator( def test_wrapper(self: _V, /, *args: _T_args.args, _snowml_pkg_ver: str, **kwargs: _T_args.kwargs) -> None: prepare_fn, prepare_fn_args = prepare_fn_factory(self) - if additional_packages: - packages = additional_packages - else: - packages = [] + packages = additional_packages or [] _, _, return_type, 
input_types = udf_utils.extract_return_input_types( prepare_fn, return_type=None, input_types=None, object_type=snowpark_utils.TempObjectType.PROCEDURE @@ -237,7 +282,7 @@ def {func_name}({first_arg_name}: snowflake.snowpark.Session, {", ".join(arg_lis actual_method(self, *args, **kwargs) additional_cases = [ - {"_snowml_pkg_ver": str(pkg_ver)} + str(pkg_ver) for pkg_ver in env_utils.get_matched_package_versions_in_information_schema( test_env_utils.get_available_session(), [requirements.Requirement(f"{env_utils.SNOWPARK_ML_PKG_NAME}{version_range}")], @@ -245,7 +290,7 @@ def {func_name}({first_arg_name}: snowflake.snowpark.Session, {", ".join(arg_lis )[env_utils.SNOWPARK_ML_PKG_NAME] ] - modified_test_cases = [{**t1, **t2} for t1 in test_cases for t2 in additional_cases] + modified_test_cases = get_modified_test_cases(test_cases, additional_cases, "_snowml_pkg_ver", naming_type) return parameterized._ParameterizedTestIter( test_wrapper, modified_test_cases, naming_type, original_name=original_name diff --git a/tests/integ/snowflake/ml/test_utils/common_test_base_test.py b/tests/integ/snowflake/ml/test_utils/common_test_base_test.py new file mode 100644 index 00000000..4a475eef --- /dev/null +++ b/tests/integ/snowflake/ml/test_utils/common_test_base_test.py @@ -0,0 +1,21 @@ +from absl.testing import absltest, parameterized + +from tests.integ.snowflake.ml.test_utils import common_test_base + + +class CommonTestBaseTest(common_test_base.CommonTestBase): + @common_test_base.CommonTestBase.sproc_test() + @parameterized.parameters({"content": "hello"}, {"content": "snowflake"}) # type: ignore[misc] + def test_parameterized_dict(self, content: str) -> None: + print(content) + + @common_test_base.CommonTestBase.sproc_test() + @parameterized.named_parameters( # type: ignore[misc] + {"testcase_name": "Normal", "content": "hello"}, {"testcase_name": "Surprise", "content": "snowflake"} + ) + def test_named_parameterized_dict(self, content: str) -> None: + print(content) + + 
+if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/test_utils/test_env_utils.py b/tests/integ/snowflake/ml/test_utils/test_env_utils.py index 74a066f1..74fd170c 100644 --- a/tests/integ/snowflake/ml/test_utils/test_env_utils.py +++ b/tests/integ/snowflake/ml/test_utils/test_env_utils.py @@ -10,7 +10,7 @@ def get_available_session() -> session.Session: - return ( + return ( # type: ignore[no-any-return] session._get_active_session() if snowpark_utils.is_in_stored_procedure() # type: ignore[no-untyped-call] # else session.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() diff --git a/yamlfix.toml b/yamlfix.toml new file mode 100644 index 00000000..98116301 --- /dev/null +++ b/yamlfix.toml @@ -0,0 +1,7 @@ +sequence_style = "block_style" +line_length = 120 +whitelines = 1 +indent_mapping = 2 +indent_offset = 2 +indent_sequence = 4 +comments_min_spaces_from_content = 1