From 20435b18c1b61903eea0cf461cffc8d4b804df60 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 9 Aug 2023 12:32:35 -0700 Subject: [PATCH 1/5] Add description field with make_model_config_json function (#203) * Add description field Signed-off-by: Thanawan Atchariyachanvanit * Restore notebook Signed-off-by: Thanawan Atchariyachanvanit * Debug test Signed-off-by: Thanawan Atchariyachanvanit * Resolve linting issues Signed-off-by: Thanawan Atchariyachanvanit * Update CHANGELOG.md Signed-off-by: Thanawan Atchariyachanvanit * Debug test_sentencetransformermodel_pytest.py Signed-off-by: Thanawan Atchariyachanvanit * Improve test coverage Signed-off-by: Thanawan Atchariyachanvanit * Edit test name Signed-off-by: Thanawan Atchariyachanvanit * Change CHANGELOG.md & Add comment to sentencetransformermodel.py Signed-off-by: Thanawan Atchariyachanvanit * Correct linting Signed-off-by: Thanawan Atchariyachanvanit * Improve add description Signed-off-by: Thanawan Atchariyachanvanit * Correct linting Signed-off-by: Thanawan Atchariyachanvanit * Loosen restriction Signed-off-by: Thanawan Atchariyachanvanit * Update sentencetransformermodel.py Signed-off-by: Thanawan Atchariyachanvanit * Change function name + Add comment + Add default description Signed-off-by: Thanawan Atchariyachanvanit * Debug Signed-off-by: Thanawan Atchariyachanvanit --------- Signed-off-by: Thanawan Atchariyachanvanit --- CHANGELOG.md | 9 ++ docs/requirements-docs.txt | 2 + .../ml_models/sentencetransformermodel.py | 95 +++++++++++ requirements-dev.txt | 1 + tests/conftest.py | 2 +- tests/ml_commons/test_ml_commons_client.py | 4 +- .../test_sentencetransformermodel_pytest.py | 150 +++++++++++++++++- 7 files changed, 253 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da9401cb..5368d028 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,15 @@ # CHANGELOG Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +## [1.2.0] + +### Added + 
+### Changed + +### Fixed +- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) + ## [1.1.0] ### Added diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 3b3943ae..d4e8a521 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -8,6 +8,7 @@ sphinx_rtd_theme nbsphinx pandoc deprecated + # using in SentenceTransformerModel torch pyyaml @@ -15,6 +16,7 @@ accelerate sentence_transformers transformers tqdm +mdutils # traitlets has been having all sorts of release problems lately. traitlets<5.1 diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 8fbcb1a0..05db5270 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -10,6 +10,7 @@ import pickle import platform import random +import re import shutil import subprocess import time @@ -23,6 +24,7 @@ import torch import yaml from accelerate import Accelerator, notebook_launcher +from mdutils.fileutils import MarkDownFile from sentence_transformers import SentenceTransformer from sentence_transformers.models import Normalize, Pooling, Transformer from torch.utils.data import DataLoader @@ -1006,6 +1008,74 @@ def set_up_accelerate_config( "Failed to open config file for ml common upload: " + file_path + "\n" ) + def _get_model_description_from_readme_file(self, readme_file_path) -> str: + """ + Get description of the model from README.md file in the model folder + after the model is saved in local directory + + See example here: + https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/blob/main/README.md) + + This function assumes that the README.md has the following format: + + # sentence-transformers/msmarco-distilbert-base-tas-b + This is [ ... further description ... ] + + # [ ... 
Next section ...] + ... + + :param readme_file_path: Path to README.md file + :type readme_file_path: string + :return: Description of the model + :rtype: string + """ + readme_data = MarkDownFile.read_file(readme_file_path) + + # Find the description section + start_str = f"# {self.model_id}" + start = readme_data.find(start_str) + if start == -1: + model_name = self.model_id.split("/")[1] + start_str = f"# {model_name}" + start = readme_data.find(start_str) + end = readme_data.find("\n#", start + len(start_str)) + + # If we cannot find the scope of description section, raise error. + if start == -1 or end == -1: + assert False, "Cannot find description in README.md file" + + # Parse out the description section + description = readme_data[start + len(start_str) + 1 : end].strip() + description = description.split("\n")[0] + + # Remove hyperlink and reformat text + description = re.sub(r"\(.*?\)", "", description) + description = re.sub(r"[\[\]]", "", description) + description = re.sub(r"\*", "", description) + + # Remove unnecessary part if exists (i.e. " For an introduction to ...") + # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md) + unnecessary_part = description.find(" For an introduction to") + if unnecessary_part != -1: + description = description[:unnecessary_part] + + return description + + def _generate_default_model_description(self, embedding_dimension) -> str: + """ + Generate default model description of the model based on embedding_dimension + + ::param embedding_dimension: Embedding dimension of the model. 
+ :type embedding_dimension: int + :return: Description of the model + :rtype: string + """ + print( + "Using default description from embedding_dimension instead (You can overwrite this by specifying description parameter in make_model_config_json function" + ) + description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space." + return description + def make_model_config_json( self, model_name: str = None, @@ -1014,6 +1084,7 @@ def make_model_config_json( embedding_dimension: int = None, pooling_mode: str = None, normalize_result: bool = None, + description: str = None, all_config: str = None, model_type: str = None, verbose: bool = False, @@ -1040,6 +1111,9 @@ def make_model_config_json( :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained hugging-face model object. :type normalize_result: bool + :param description: Optional, the description of the model. If None, get description from the README.md + file in the model folder. + :type description: str :param all_config: Optional, the all_config of the model. 
If None, parse all contents from the config file of pre-trained hugging-face model @@ -1087,6 +1161,26 @@ def make_model_config_json( f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" ) + if description is None: + readme_file_path = os.path.join(self.folder_path, "README.md") + if os.path.exists(readme_file_path): + try: + if verbose: + print("reading README.md file") + description = self._get_model_description_from_readme_file( + readme_file_path + ) + except Exception as e: + print(f"Cannot scrape model description from README.md file: {e}") + description = self._generate_default_model_description( + embedding_dimension + ) + else: + print("Cannot find README.md file to scrape model description") + description = self._generate_default_model_description( + embedding_dimension + ) + if all_config is None: if not os.path.exists(config_json_file_path): raise Exception( @@ -1114,6 +1208,7 @@ def make_model_config_json( model_config_content = { "name": model_name, "version": version_number, + "description": description, "model_format": model_format, "model_task_type": "TEXT_EMBEDDING", "model_config": { diff --git a/requirements-dev.txt b/requirements-dev.txt index 2734a6fa..0f74e77a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ sentence_transformers tqdm transformers deprecated +mdutils # # Testing diff --git a/tests/conftest.py b/tests/conftest.py index 1e230f4a..93502fa0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -109,7 +109,7 @@ def check_values(self, oml_obj, pd_obj): def check_exception(self, ed_exc, pd_exc): """Checks that either an exception was raised or not from both opensearch_py_ml and pandas""" - assert (ed_exc is None) == (pd_exc is None) and type(ed_exc) == type(pd_exc) + assert (ed_exc is None) == (pd_exc is None) and isinstance(ed_exc, type(pd_exc)) if pd_exc is not None: raise pd_exc diff --git a/tests/ml_commons/test_ml_commons_client.py 
b/tests/ml_commons/test_ml_commons_client.py index ae32edd9..10be2c16 100644 --- a/tests/ml_commons/test_ml_commons_client.py +++ b/tests/ml_commons/test_ml_commons_client.py @@ -68,8 +68,8 @@ def clean_test_folder(TEST_FOLDER): def test_init(): - assert type(ml_client._client) == OpenSearch - assert type(ml_client._model_uploader) == ModelUploader + assert isinstance(ml_client._client, OpenSearch) + assert isinstance(ml_client._model_uploader, ModelUploader) def test_execute(): diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index de76f1a7..7bf0c95b 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -204,7 +204,12 @@ def test_make_model_config_json_for_torch_script(): assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) + ), "Missing or Wrong model_format in torch script model config file" + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources." 
+ ), "Missing or Wrong model description in onnx model config file'" assert ( "model_config" in model_config_data_torch ), "Missing 'model_config' in torch script model config file" @@ -248,11 +253,16 @@ def test_make_model_config_json_for_onnx(): assert ( "name" in model_config_data_onnx and model_config_data_onnx["name"] == model_id - ), "Missing or Wrong model name in onnx model config file'" + ), "Missing or Wrong model name in onnx model config file" assert ( "model_format" in model_config_data_onnx and model_config_data_onnx["model_format"] == "ONNX" - ) + ), "Missing or Wrong model_format in onnx model config file" + assert ( + "description" in model_config_data_onnx + and model_config_data_onnx["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search." + ), "Missing or Wrong model description in onnx model config file" assert ( "model_config" in model_config_data_onnx ), "Missing 'model_config' in onnx model config file" @@ -310,7 +320,7 @@ def test_overwrite_fields_in_model_config(): assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) + ), "Missing or Wrong model_format in onnx model config file" assert ( "model_config" in model_config_data_torch ), "Missing 'model_config' in torch script model config file" @@ -354,7 +364,7 @@ def test_overwrite_fields_in_model_config(): assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) + ), "Missing or Wrong model_format in torch script model config file" assert ( "model_config" in model_config_data_torch ), "Missing 'model_config' in torch script model config file" @@ -372,10 +382,42 @@ def test_overwrite_fields_in_model_config(): clean_test_folder(TEST_FOLDER) -def test_truncation_parameter(): +def test_missing_readme_md_file(): model_id = 
"sentence-transformers/msmarco-distilbert-base-tas-b" - MAX_LENGTH_TASB = 512 + clean_test_folder(TEST_FOLDER) + test_model9 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + temp_path = os.path.join( + TEST_FOLDER, + "README.md", + ) + os.remove(temp_path) + model_config_path_torch = test_model9.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space." + ), "Should use default model description when README.md file is missing" + + clean_test_folder(TEST_FOLDER) + + +def test_missing_expected_description_in_readme_file(): + model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2" clean_test_folder(TEST_FOLDER) test_model10 = SentenceTransformerModel( folder_path=TEST_FOLDER, @@ -383,6 +425,100 @@ def test_truncation_parameter(): ) test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + temp_path = os.path.join( + TEST_FOLDER, + "README.md", + ) + with open(temp_path, "w") as f: + f.write("No model description here") + model_config_path_torch = test_model10.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a 
sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space." + ), "Should use default model description when description is missing from README.md" + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_description(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + clean_test_folder(TEST_FOLDER) + test_model11 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model11.make_model_config_json( + model_format="TORCH_SCRIPT", description="Expected Description" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == "Expected Description" + ), "Cannot overwrite description in model config file" + + clean_test_folder(TEST_FOLDER) + + +def test_long_description(): + model_id = "sentence-transformers/gtr-t5-base" + clean_test_folder(TEST_FOLDER) + test_model12 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model12.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. 
The model was specifically trained for the task of sematic search." + ), "Missing or Wrong model description in torch_script model config file" + + clean_test_folder(TEST_FOLDER) + + +def test_truncation_parameter(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + MAX_LENGTH_TASB = 512 + + clean_test_folder(TEST_FOLDER) + test_model13 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"]) tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") try: From 69e8021a54dee3ac214a1541ec66331d7625fae2 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 9 Aug 2023 13:48:23 -0700 Subject: [PATCH 2/5] Initiate PR #1 Model Auto-tracing & Uploading Signed-off-by: Thanawan Atchariyachanvanit --- .ci/run-repository.sh | 40 +- .ci/run-tests | 2 +- .github/workflows/build_deploy_doc.yml | 2 +- .../delete_model_uploader_branch.yml | 16 + .github/workflows/integration.yml | 2 +- .github/workflows/model_uploader.yml | 412 ++++++++++++ noxfile.py | 25 +- utils/{ => lint}/license-headers.py | 0 utils/model_uploader/model_autotracing.py | 597 ++++++++++++++++++ .../save_model_file_path_to_env.py | 90 +++ utils/model_uploader/update_changelog_md.py | 74 +++ .../update_models_upload_history_md.py | 285 +++++++++ 12 files changed, 1532 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/delete_model_uploader_branch.yml create mode 100644 .github/workflows/model_uploader.yml rename utils/{ => lint}/license-headers.py (100%) create mode 100644 utils/model_uploader/model_autotracing.py create mode 100644 utils/model_uploader/save_model_file_path_to_env.py create mode 100644 utils/model_uploader/update_changelog_md.py create mode 100644 utils/model_uploader/update_models_upload_history_md.py diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index ba0cea2b..7aadbdda 100755 --- a/.ci/run-repository.sh +++ 
b/.ci/run-repository.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Called by entry point `run-test` use this script to add your repository specific test commands +# Called by entry point `run-test` use this script to add your repository specific task commands # Once called opensearch is up and running and the following parameters are available to this script # OPENSEARCH_VERSION -- version e.g Major.Minor.Patch(-Prelease) @@ -16,7 +16,7 @@ set -e echo -e "\033[34;1mINFO:\033[0m URL ${opensearch_url}\033[0m" echo -e "\033[34;1mINFO:\033[0m EXTERNAL OS URL ${external_opensearch_url}\033[0m" echo -e "\033[34;1mINFO:\033[0m VERSION ${OPENSEARCH_VERSION}\033[0m" -echo -e "\033[34;1mINFO:\033[0m IS_DOC: ${IS_DOC}\033[0m" +echo -e "\033[34;1mINFO:\033[0m TASK_TYPE: ${TASK_TYPE}\033[0m" echo -e "\033[34;1mINFO:\033[0m TEST_SUITE ${TEST_SUITE}\033[0m" echo -e "\033[34;1mINFO:\033[0m PYTHON_VERSION ${PYTHON_VERSION}\033[0m" echo -e "\033[34;1mINFO:\033[0m PYTHON_CONNECTION_CLASS ${PYTHON_CONNECTION_CLASS}\033[0m" @@ -33,7 +33,8 @@ docker build \ echo -e "\033[1m>>>>> Run [opensearch-project/opensearch-py-ml container] >>>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m" -if [[ "$IS_DOC" == "false" ]]; then +if [[ "$TASK_TYPE" == "test" ]]; then + # Set up OpenSearch cluster & Run integration and unit tests (Invoked by integration.yml workflow) docker run \ --network=${network_name} \ --env "STACK_VERSION=${STACK_VERSION}" \ @@ -45,10 +46,11 @@ if [[ "$IS_DOC" == "false" ]]; then --name opensearch-py-ml-test-runner \ opensearch-project/opensearch-py-ml \ nox -s "test-${PYTHON_VERSION}(pandas_version='${PANDAS_VERSION}')" + docker cp opensearch-py-ml-test-runner:/code/opensearch-py-ml/junit/ ./junit/ - docker rm opensearch-py-ml-test-runner -else +elif [[ "$TASK_TYPE" == "doc" ]]; then + # Set up OpenSearch cluster & Run docs (Invoked by build_deploy_doc.yml workflow) docker run \ --network=${network_name} \ --env "STACK_VERSION=${STACK_VERSION}" \ @@ -60,7 +62,31 @@ else --name 
opensearch-py-ml-doc-runner \ opensearch-project/opensearch-py-ml \ nox -s docs + docker cp opensearch-py-ml-doc-runner:/code/opensearch-py-ml/docs/build/ ./docs/ - docker rm opensearch-py-ml-doc-runner -fi \ No newline at end of file +elif [[ "$TASK_TYPE" == "trace" ]]; then + # Set up OpenSearch cluster & Run model autotracing (Invoked by model_uploader.yml workflow) + echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m" + echo -e "\033[34;1mINFO:\033[0m TRACING_FORMAT: ${TRACING_FORMAT}\033[0m" + echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-Default}\033[0m" + echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-Default}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-Default}\033[0m" + + docker run \ + --network=${network_name} \ + --env "STACK_VERSION=${STACK_VERSION}" \ + --env "OPENSEARCH_URL=${opensearch_url}" \ + --env "OPENSEARCH_VERSION=${OPENSEARCH_VERSION}" \ + --env "TEST_SUITE=${TEST_SUITE}" \ + --env "PYTHON_CONNECTION_CLASS=${PYTHON_CONNECTION_CLASS}" \ + --env "TEST_TYPE=server" \ + --name opensearch-py-ml-trace-runner \ + opensearch-project/opensearch-py-ml \ + nox -s trace -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} + + docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ + docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ + docker rm opensearch-py-ml-trace-runner +fi diff --git a/.ci/run-tests b/.ci/run-tests index abfeac34..258da8a5 100755 --- a/.ci/run-tests +++ b/.ci/run-tests @@ -10,7 +10,7 @@ export PYTHON_CONNECTION_CLASS="${PYTHON_CONNECTION_CLASS:=Urllib3HttpConnection export CLUSTER="${1:-opensearch}" export SECURE_INTEGRATION="${2:-true}" export OPENSEARCH_VERSION="${3:-latest}" -export IS_DOC="${4:-false}" 
+export TASK_TYPE="${4:-test}" if [[ "$SECURE_INTEGRATION" == "true" ]]; then export OPENSEARCH_URL_EXTENSION="https" else diff --git a/.github/workflows/build_deploy_doc.yml b/.github/workflows/build_deploy_doc.yml index 9321b32e..876a73ea 100644 --- a/.github/workflows/build_deploy_doc.yml +++ b/.github/workflows/build_deploy_doc.yml @@ -20,7 +20,7 @@ jobs: - name: Checkout Repository uses: actions/checkout@v2 - name: Integ ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} - run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} true" + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} doc" - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: diff --git a/.github/workflows/delete_model_uploader_branch.yml b/.github/workflows/delete_model_uploader_branch.yml new file mode 100644 index 00000000..a68e451c --- /dev/null +++ b/.github/workflows/delete_model_uploader_branch.yml @@ -0,0 +1,16 @@ +name: Delete merged branch for model-uploader & model-listing-uploader +on: + pull_request: + types: + - closed + +jobs: + delete-branch: + runs-on: ubuntu-latest + if: startsWith(github.event.pull_request.head.ref,'model-uploader/') || startsWith(github.event.pull_request.head.ref,'model-listing-uploader/') + steps: + - name: Delete closed PR branch + uses: dawidd6/action-delete-branch@v3 + with: + GITHUB_TOKEN: ${{github.token}} + numbers: ${{github.event.pull_request.number}} diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 607bd356..e36c7735 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -18,7 +18,7 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Integ ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} - run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ 
matrix.entry.opensearch_version }}" + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} test" - name: Upload coverage to Codecov uses: codecov/codecov-action@v2 with: diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml new file mode 100644 index 00000000..3a8aadd8 --- /dev/null +++ b/.github/workflows/model_uploader.yml @@ -0,0 +1,412 @@ +name: Model Auto-tracing & Uploading +on: + # Step 1: Initiate the workflow + workflow_dispatch: + inputs: + model_source: + description: "Model source (e.g. huggingface)" + required: true + type: string + default: "huggingface" + model_id: + description: "Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)" + required: true + type: string + model_version: + description: "Model version number (e.g. 1.0.1)" + required: true + type: string + tracing_format: + description: "Model format for auto-tracing (torch_script/onnx)" + required: true + type: choice + options: + - "BOTH" + - "TORCH_SCRIPT" + - "ONNX" + embedding_dimension: + description: "(Optional) Embedding Dimension (Specify here if it does not exist in original config.json file, or you want to overwrite it.)" + required: false + type: int + pooling_mode: + description: "(Optional) Pooling Mode (Specify here if it does not exist in original config.json file or you want to overwrite it.)" + required: false + type: choice + options: + - "" + - "CLS" + - "MEAN" + - "MAX" + - "MEAN_SQRT_LEN" + model_description: + description: "(Optional) Description (Specify here if you want to overwrite the default model description)" + required: false + type: string + +jobs: + # Step 2: Initiate workflow variable + init-workflow-var: + runs-on: 'ubuntu-latest' + steps: + - name: Fail if branch is not main + if: github.ref == 'refs/heads/main' + run: | + echo "This workflow should only be triggered on 'main' branch" + exit 1 + - name: Initiate folders + id: 
init_folders + run: | + model_id=${{ github.event.inputs.model_id }} + echo "model_folder=ml-models/${{github.event.inputs.model_source}}/${model_id}" >> $GITHUB_OUTPUT + echo "sentence_transformer_folder=ml-models/${{github.event.inputs.model_source}}/${model_id%%/*}/" >> $GITHUB_OUTPUT + - name: Initiate workflow_info + id: init_workflow_info + run: | + embedding_dimension=${{ github.event.inputs.embedding_dimension }} + pooling_mode=${{ github.event.inputs.pooling_mode }} + model_description="${{ github.event.inputs.model_description }}" + + workflow_info=" + ============= Workflow Details ============== + - Workflow Name: ${{ github.workflow }} + - Workflow Run ID: ${{ github.run_id }} + - Workflow Initiator: @${{ github.actor }} + + ========= Workflow Input Information ========= + - Model ID: ${{ github.event.inputs.model_id }} + - Model Version: ${{ github.event.inputs.model_version }} + - Tracing Format: ${{ github.event.inputs.tracing_format }} + - Embedding Dimension: ${embedding_dimension:-Default} + - Pooling Mode: ${pooling_mode:-Default} + - Model Description: ${model_description:-Default} + + ======== Workflow Output Information ========= + - Embedding Verification: Passed" + + echo "workflow_info<> $GITHUB_OUTPUT + echo "${workflow_info@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${workflow_info@E}" + - name: Initiate license_line + id: init_license_line + run: | + echo "verified=:white_check_mark: — It is verified that this model is licensed under Apache 2.0" >> $GITHUB_OUTPUT + echo "unverified=- [ ] :warning: The license cannot be verified. 
Please confirm by yourself that the model is licensed under Apache 2.0 :warning:" >> $GITHUB_OUTPUT + outputs: + model_folder: ${{ steps.init_folders.outputs.model_folder }} + sentence_transformer_folder: ${{ steps.init_folders.outputs.sentence_transformer_folder }} + workflow_info: ${{ steps.init_workflow_info.outputs.workflow_info }} + verified_license_line: ${{ steps.init_license_line.outputs.verified }} + unverified_license_line: ${{ steps.init_license_line.outputs.unverified }} + + # Step 3: Check if the model already exists in the model hub + checking-out-model-hub: + needs: init-workflow-var + runs-on: 'ubuntu-latest' + permissions: + id-token: write + contents: read + environment: opensearch-py-ml-cicd-env + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Set Up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }} + role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }} + role-session-name: checking-out-model-hub + - name: Check if TORCH_SCRIPT Model Exists + if: github.event.inputs.tracing_format == 'TORCH_SCRIPT' || github.event.inputs.tracing_format == 'BOTH' + run: | + TORCH_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \ + ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \ + ${{ github.event.inputs.model_version }} TORCH_SCRIPT) + aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $TORCH_FILE_PATH > /dev/null 2>&1 || TORCH_MODEL_NOT_EXIST=true + if [[ -z $TORCH_MODEL_NOT_EXIST ]] + then + echo "${{ github.event.inputs.model_id }} already exists on model hub for TORCH_SCRIPT format and ${{ github.event.inputs.model_version }} version." 
+ exit 1 + fi + - name: Check if ONNX Model Exists + if: github.event.inputs.tracing_format == 'ONNX' || github.event.inputs.tracing_format == 'BOTH' + run: | + ONNX_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \ + ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \ + ${{ github.event.inputs.model_version }} ONNX) + aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $ONNX_FILE_PATH > /dev/null 2>&1 || ONNX_MODEL_NOT_EXIST=true + if [[ -z $ONNX_MODEL_NOT_EXIST ]] + then + echo "${{ github.event.inputs.model_id }} already exists on model hub for ONNX format and ${{ github.event.inputs.model_version }} version." + exit 1 + fi + + # Step 4: Trace the model, Verify the embeddings & Upload the model files as artifacts + model-auto-tracing: + needs: [init-workflow-var, checking-out-model-hub] + name: model-auto-tracing + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + environment: opensearch-py-ml-cicd-env + strategy: + matrix: + cluster: ["opensearch"] + secured: ["true"] + entry: + - { opensearch_version: 2.7.0 } + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Export Arguments + run: | + echo "MODEL_ID=${{ github.event.inputs.model_id }}" >> $GITHUB_ENV + echo "MODEL_VERSION=${{ github.event.inputs.model_version }}" >> $GITHUB_ENV + echo "TRACING_FORMAT=${{ github.event.inputs.tracing_format }}" >> $GITHUB_ENV + echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV + echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV + echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV + - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" + - name: License Verification + id: 
license_verification + run: | + apache_verified=$(> $GITHUB_OUTPUT + echo "license_info=Automatically Verified" >> $GITHUB_OUTPUT + else + echo "license_line=${{ needs.init-workflow-var.outputs.unverified_license_line }}" >> $GITHUB_OUTPUT + echo "license_info=Manually Verified" >> $GITHUB_OUTPUT + fi + - name: Model Description Info + id: model_description_info + run: | + model_description_info="$(> $GITHUB_OUTPUT + echo "$model_description_info" + - name: Upload Artifact + uses: actions/upload-artifact@v3 + with: + name: upload + path: ./upload/ + retention-days: 5 + if-no-files-found: error + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }} + role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }} + role-session-name: model-auto-tracing + - name: Dryrun model uploading + id: dryrun_model_uploading + run: | + dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} --dryrun \ + | sed 's|s3://${{ secrets.MODEL_BUCKET }}/|s3://(MODEL_BUCKET)/|' + ) + echo "dryrun_output<> $GITHUB_OUTPUT + echo "${dryrun_output@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${dryrun_output@E}" + outputs: + license_line: ${{ steps.license_verification.outputs.license_line }} + license_info: ${{ steps.license_verification.outputs.license_info }} + model_description_info: ${{ steps.model_description_info.outputs.model_description_info }} + dryrun_output: ${{ steps.dryrun_model_uploading.outputs.dryrun_output }} + + # Step 5: Ask for manual approval from the CODEOWNERS + manual-approval: + needs: [init-workflow-var, model-auto-tracing] + runs-on: 'ubuntu-latest' + permissions: + issues: write + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Get Approvers + id: get_approvers + run: | + echo "approvers=$(cat .github/CODEOWNERS | grep @ | tr -d '* ' | sed 's/@/,/g' | sed 
's/,//1')" >> $GITHUB_OUTPUT + - name: Create Issue Body + id: create_issue_body + run: | + issue_body="Please approve or deny opensearch-py-ml model uploading: + + ${{ needs.model-auto-tracing.outputs.license_line }} + + ${{ needs.init-workflow-var.outputs.workflow_info }} + ${{ needs.model-auto-tracing.outputs.model_description_info }} + + ===== Dry Run of Model Uploading ===== + ${{ needs.model-auto-tracing.outputs.dryrun_output }}" + + echo "issue_body<> $GITHUB_OUTPUT + echo "${issue_body@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${issue_body@E}" + - uses: trstringer/manual-approval@v1 + with: + secret: ${{ github.TOKEN }} + approvers: ${{ steps.get_approvers.outputs.approvers }} + minimum-approvals: 1 + issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})" + issue-body: ${{ steps.create_issue_body.outputs.issue_body }} + exclude-workflow-initiator-as-approver: false + + # Step 6: Download the artifacts & Upload it to the S3 bucket + model-uploading: + needs: [init-workflow-var, manual-approval] + runs-on: 'ubuntu-latest' + permissions: + id-token: write + contents: read + environment: opensearch-py-ml-cicd-env + steps: + - name: Download Artifact + uses: actions/download-artifact@v2 + with: + name: upload + path: ./upload/ + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }} + role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }} + role-session-name: model-uploading + - name: Copy Files to the Bucket + id: copying_to_bucket + run: | + aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} + echo "upload_time=$(TZ='America/Los_Angeles' date "+%Y-%m-%d %T")" >> $GITHUB_OUTPUT + outputs: + upload_time: ${{ steps.copying_to_bucket.outputs.upload_time }} + + # Step 7: Update MODEL_UPLOAD_HISTORY.md & supported_models.json + history-update: + needs: 
[init-workflow-var, model-uploading] + runs-on: 'ubuntu-latest' + permissions: + id-token: write + contents: write + pull-requests: write + env: + model_info: ${{ github.event.inputs.model_id }} (v.${{ github.event.inputs.model_version }})(${{ github.event.inputs.tracing_format }}) + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Set Up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install Packages + run: + python -m pip install mdutils + - name: Update Model Upload History + run: | + model_description="${{ github.event.inputs.model_description }}" + python utils/model_uploader/update_models_upload_history_md.py \ + ${{ github.event.inputs.model_id }} \ + ${{ github.event.inputs.model_version }} \ + ${{ github.event.inputs.tracing_format }} \ + -ed ${{ github.event.inputs.embedding_dimension }} \ + -pm ${{ github.event.inputs.pooling_mode }} \ + -md ${model_description:+"$model_description"} \ + -u ${{ github.actor }} -t "${{ needs.model-uploading.outputs.upload_time }}" + - name: Create PR Body + id: create_pr_body + run: | + pr_body=" + - [ ] This PR made commit to only these three files: MODEL_UPLOAD_HISTORY.md, supported_models.json, and CHANGELOG.md. + - [ ] CHANGELOG.md has been updated by the workflow or by you if the workflow fails to do so. + - [ ] Merge conflicts have been resolved. 
+ + ${{ needs.init-workflow-var.outputs.workflow_info }} + ${{ needs.model-auto-tracing.outputs.license_info }} + ${{ needs.model-auto-tracing.outputs.model_description_info }}" + + echo "pr_body<> $GITHUB_OUTPUT + echo "${pr_body@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${pr_body@E}" + - name: Create a Branch & Raise a PR + uses: peter-evans/create-pull-request@v5 + id: create_pr + with: + committer: github-actions[bot] + commit-message: 'GitHub Actions Workflow: Update Model Upload History - ${{ env.model_info }}' + signoff: true + title: 'Update Model Upload History - ${{ env.model_info }}' + body: ${{ steps.create_pr_body.outputs.pr_body }} + labels: ModelUploading + branch: model-uploader/${{ github.run_id }} + delete-branch: true + add-paths: | + ./utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md + ./utils/model_uploader/upload_history/supported_models.json + - name: Checkout Repository + uses: actions/checkout@v3 + with: + ref: model-uploader/${{ github.run_id }} + - name: Create a line for updating CHANGELOG.md + id: create_changelog_line + continue-on-error: true + run: | + pr_ref="([#${{ steps.create_pr.outputs.pull-request-number }}](${{ steps.create_pr.outputs.pull-request-url }}))" + changelog_line="Update model upload history - ${{ env.model_info }} by @${{ github.actor }} $pr_ref" + echo "changelog_line=$changelog_line" >> $GITHUB_OUTPUT + - name: Warning Comment on PR if create_changelog_line fails + if: steps.create_changelog_line.outcome == 'failure' + uses: thollander/actions-comment-pull-request@v2 + with: + pr_number: ${{ steps.create_pr.outputs.pull-request-number }} + message: | + Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please update CHANGELOG.md manually. 
+ - name: Update CHANGELOG.md + if: steps.create_changelog_line.outcome == 'success' + id: update_changelog + continue-on-error: true + run: | + python utils/model_uploader/update_changelog_md.py "${{ steps.create_changelog_line.outputs.changelog_line }}" + - name: Commit Updates + if: steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'success' + uses: stefanzweifel/git-auto-commit-action@v4 + id: commit + with: + branch: model-uploader/${{ github.run_id }} + commit_user_email: "github-actions[bot]@users.noreply.github.com" + commit_message: 'GitHub Actions Workflow: Update CHANGELOG.md - ${{ env.model_info }}' + commit_options: '--signoff' + file_pattern: CHANGELOG.md + - name: Warning Comment on PR if update_changelog fails + if: steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'failure' + uses: thollander/actions-comment-pull-request@v2 + with: + pr_number: ${{ steps.create_pr.outputs.pull-request-number }} + message: | + Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please add the following line manually. 
+ >>> + ${{ steps.create_changelog_line.outputs.changelog_line }} + + # Step 8: Trigger Jenkins ml-models workflow + trigger-ml-models-release-workflow: + needs: [init-workflow-var, history-update] + runs-on: 'ubuntu-latest' + permissions: + contents: read + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Trigger Jenkins Workflow with Generic Webhook + run: | + jenkins_trigger_token=${{ secrets.JENKINS_ML_MODELS_RELEASE_GENERIC_WEBHOOK_TOKEN }} + base_download_path=${{ needs.init-workflow-var.outputs.model_folder }} + version=${{ github.event.inputs.model_version }} + format=${{ github.event.inputs.tracing_format }} + jenkins_params="{\"BASE_DOWNLOAD_PATH\":\"$base_download_path\", \"VERSION\":\"$version\", \"FORMAT\":\"$format\"}" + sh utils/model_uploader/trigger_ml_models_release.sh $jenkins_trigger_token "$jenkins_params" diff --git a/noxfile.py b/noxfile.py index 448c3990..03809192 100644 --- a/noxfile.py +++ b/noxfile.py @@ -61,7 +61,7 @@ @nox.session(reuse_venv=True) def format(session): session.install("black", "isort", "flynt") - session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES) + session.run("python", "utils/lint/license-headers.py", "fix", *SOURCE_FILES) session.run("flynt", *SOURCE_FILES) session.run("black", "--target-version=py38", *SOURCE_FILES) session.run("isort", "--profile=black", *SOURCE_FILES) @@ -73,7 +73,7 @@ def lint(session): # Install numpy to use its mypy plugin # https://numpy.org/devdocs/reference/typing.html#mypy-plugin session.install("black", "flake8", "mypy", "isort", "numpy") - session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES) + session.run("python", "utils/lint/license-headers.py", "check", *SOURCE_FILES) session.run("black", "--check", "--target-version=py38", *SOURCE_FILES) session.run("isort", "--check", "--profile=black", *SOURCE_FILES) session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES) @@ -142,10 +142,29 @@ def test(session, 
pandas_version: str): @nox.session(reuse_venv=True) def docs(session): # Run this so users get an error if they don't have Pandoc installed. - session.install("-r", "docs/requirements-docs.txt") session.install(".") session.cd("docs") session.run("make", "clean", external=True) session.run("make", "html", external=True) + + +# While nox is typically used for automating testing, in this case, we utilize it +# to automate the action workflow, leveraging its ability to set up the environment +# required for model autotracing. +@nox.session +def trace(session): + session.install( + "-r", + "requirements-dev.txt", + "--timeout", + "1500", + ) + session.install(".") + + session.run( + "python", + "utils/model_uploader/model_autotracing.py", + *(session.posargs), + ) diff --git a/utils/license-headers.py b/utils/lint/license-headers.py similarity index 100% rename from utils/license-headers.py rename to utils/lint/license-headers.py diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py new file mode 100644 index 00000000..840432c8 --- /dev/null +++ b/utils/model_uploader/model_autotracing.py @@ -0,0 +1,597 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +# This program is run by "Model Auto-tracing & Uploading" workflow +# (See model_uploader.yml) to perform model auto-tracing and prepare +# files for uploading to OpenSearch model hub. 
+ +import argparse +import json +import os +import shutil +import sys +import warnings +from typing import List, Optional, Tuple +from zipfile import ZipFile + +import numpy as np +from mdutils.fileutils import MarkDownFile +from numpy.typing import DTypeLike +from sentence_transformers import SentenceTransformer + +# We need to append ROOT_DIR path so that we can import +# OPENSEARCH_TEST_CLIENT and opensearch_py_ml since this +# python script is not in the root directory. +THIS_DIR = os.path.dirname(__file__) +ROOT_DIR = os.path.join(THIS_DIR, "../..") +sys.path.append(ROOT_DIR) + +LICENSE_PATH = "LICENSE" +from opensearch_py_ml.ml_commons import MLCommonClient +from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel +from tests import OPENSEARCH_TEST_CLIENT + +BOTH_FORMAT = "BOTH" +TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT" +ONNX_FORMAT = "ONNX" + +TEMP_MODEL_PATH = "temp_model_path" +ORIGINAL_FOLDER_PATH = "sentence-transformers-original/" +TORCHSCRIPT_FOLDER_PATH = "sentence-transformers-torchscript/" +ONNX_FOLDER_PATH = "sentence-transformers-onnx/" +UPLOAD_FOLDER_PATH = "upload/" +MODEL_CONFIG_FILE_NAME = "ml-commons_model_config.json" +OUTPUT_DIR = "trace_output/" +LICENSE_VAR_FILE = "apache_verified.txt" +DESCRIPTION_VAR_FILE = "description.txt" +TEST_SENTENCES = [ + "First test sentence", + "This is a very long sentence used for testing model embedding outputs.", +] +RTOL_TEST = 1e-03 +ATOL_TEST = 1e-05 +ML_BASE_URI = "/_plugins/_ml" + + +def verify_license_in_md_file() -> bool: + """ + Verify that the model is licensed under Apache 2.0 + + :return: Whether the model is licensed under Apache 2.0 + :rtype: Bool + """ + try: + readme_data = MarkDownFile.read_file(TEMP_MODEL_PATH + "/" + "README.md") + except Exception as e: + print(f"Cannot verify the license: {e}") + return False + + start = readme_data.find("---") + end = readme_data.find("---", start + 3) + if start == -1 or end == -1: + return False + model_info = 
readme_data[start + 3 : end] + if "apache-2.0" in model_info.lower(): + print("\nFound apache-2.0 license at " + TEMP_MODEL_PATH + "README.md") + return True + else: + print("\nDid not find apache-2.0 license at " + TEMP_MODEL_PATH + "README.md") + return False + + +def trace_sentence_transformer_model( + model_id: str, + model_version: str, + model_format: str, + embedding_dimension: Optional[int] = None, + pooling_mode: Optional[str] = None, + model_description: Optional[str] = None, +) -> Tuple[str, str]: + """ + Trace the pretrained sentence transformer model, create a model config file, + and return a path to the model file and a path to the model config file required for model registration + + :param model_id: Model ID of the pretrained model + :type model_id: string + :param model_version: Version of the pretrained model for registration + :type model_version: string + :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") + :type model_format: string + :param embedding_dimension: Embedding dimension input + :type embedding_dimension: int + :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None) + :type pooling_mode: string + :param model_description: Model description input + :type model_description: string + :return: Tuple of model_path (path to model zip file) and model_config_path (path to model config json file) + :rtype: Tuple[str, str] + """ + folder_path = ( + TORCHSCRIPT_FOLDER_PATH + if model_format == TORCH_SCRIPT_FORMAT + else ONNX_FOLDER_PATH + ) + + # 1.) Initiate a sentence transformer model class object + pre_trained_model = None + try: + pre_trained_model = SentenceTransformerModel( + model_id=model_id, folder_path=folder_path, overwrite=True + ) + except Exception as e: + assert ( + False + ), f"Raised Exception in tracing {model_format} model\ + during initiating a sentence transformer model class object: {e}" + + # 2.) 
Save the model in the specified format + model_path = None + try: + if model_format == TORCH_SCRIPT_FORMAT: + model_path = pre_trained_model.save_as_pt( + model_id=model_id, sentences=TEST_SENTENCES + ) + else: + model_path = pre_trained_model.save_as_onnx(model_id=model_id) + except Exception as e: + assert False, f"Raised Exception during saving model as {model_format}: {e}" + + # 3.) Create a model config json file + try: + pre_trained_model.make_model_config_json( + version_number=model_version, + model_format=model_format, + embedding_dimension=embedding_dimension, + pooling_mode=pooling_mode, + description=model_description, + ) + except Exception as e: + assert ( + False + ), f"Raised Exception during making model config file for {model_format} model: {e}" + + # 4.) Return model_path & model_config_path for model registration + model_config_path = folder_path + MODEL_CONFIG_FILE_NAME + + return model_path, model_config_path + + +def register_and_deploy_sentence_transformer_model( + ml_client: "MLCommonClient", + model_path: str, + model_config_path: str, + model_format: str, +) -> List["DTypeLike"]: + """ + Register the pretrained sentence transformer model by using the model file and the model config file, + deploy the model to generate embeddings for the TEST_SENTENCES, + and return the embeddings for model verification + + :param ml_client: A client that communicates to the ml-common plugin for OpenSearch + :type ml_client: MLCommonClient + :param model_path: Path to model file + :type model_path: string + :param model_config_path: Path to model config file + :type model_config_path: string + :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") + :type model_format: string + :return: List of embedding data for TEST_SENTENCES + :rtype: List["DTypeLike"] + """ + embedding_data = None + + # 1.) 
Register & Deploy the model + model_id = "" + try: + model_id = ml_client.register_model( + model_path=model_path, + model_config_path=model_config_path, + deploy_model=True, + isVerbose=True, + ) + print(f"\n{model_format}_model_id:", model_id) + assert model_id != "" or model_id is not None + except Exception as e: + assert ( + False + ), f"Raised Exception in {model_format} model registration/deployment: {e}" + + # 2.) Check model status + try: + ml_model_status = ml_client.get_model_info(model_id) + print("\nModel Status:") + print(ml_model_status) + assert ml_model_status.get("model_state") == "DEPLOYED" + assert ml_model_status.get("model_format") == model_format + assert ml_model_status.get("algorithm") == "TEXT_EMBEDDING" + except Exception as e: + assert False, f"Raised Exception in getting {model_format} model info: {e}" + + # 3.) Generate embeddings + try: + embedding_output = ml_client.generate_embedding(model_id, TEST_SENTENCES) + assert len(embedding_output.get("inference_results")) == 2 + embedding_data = [ + embedding_output["inference_results"][i]["output"][0]["data"] + for i in range(len(TEST_SENTENCES)) + ] + except Exception as e: + assert ( + False + ), f"Raised Exception in generating sentence embedding with {model_format} model: {e}" + + # 4.) Undeploy the model + try: + ml_client.undeploy_model(model_id) + ml_model_status = ml_client.get_model_info(model_id) + assert ml_model_status.get("model_state") == "UNDEPLOYED" + except Exception as e: + assert False, f"Raised Exception in {model_format} model undeployment: {e}" + + # 5.) Delete the model + try: + delete_model_obj = ml_client.delete_model(model_id) + assert delete_model_obj.get("result") == "deleted" + except Exception as e: + assert False, f"Raised Exception in deleting {model_format} model: {e}" + + # 6.) 
Return embedding outputs for model verification + return embedding_data + + +def verify_embedding_data( + original_embedding_data: List["DTypeLike"], + tracing_embedding_data: List["DTypeLike"], +) -> bool: + """ + Verify the embeddings generated by the traced model with those of original model + + :param original_embedding_data: Embedding outputs of TEST_SENTENCES generated by the original model + :type original_embedding_data: List['DTypeLike'] + :param tracing_embedding_data: Embedding outputs of TEST_SENTENCES generated by the traced model + :type tracing_embedding_data: List['DTypeLike'] + :return: Whether the embeddings generated by the traced model match with those of original model + :rtype: bool + """ + failed_cases = [] + for i in range(len(TEST_SENTENCES)): + try: + np.testing.assert_allclose( + original_embedding_data[i], + tracing_embedding_data[i], + rtol=RTOL_TEST, + atol=ATOL_TEST, + ) + except Exception as e: + failed_cases.append((TEST_SENTENCES[i], e)) + + if len(failed_cases): + print( + "\nOriginal embeddings DOES NOT matches the embeddings in the following case(s):" + ) + for sentence, e in failed_cases: + print(sentence) + print(e) + return False + else: + return True + + +def prepare_files_for_uploading( + model_id: str, + model_version: str, + model_format: str, + src_model_path: str, + src_model_config_path: str, +) -> None: + """ + Prepare files for uploading by storing them in UPLOAD_FOLDER_PATH + + :param model_id: Model ID of the pretrained model + :type model_id: string + :param model_version: Version of the pretrained model for registration + :type model_version: string + :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") + :type model_format: string + :param src_model_path: Path to model files for uploading + :type src_model_path: string + :param src_model_config_path: Path to model config files for uploading + :type src_model_config_path: string + :return: Tuple of dst_model_path (path to model zip file) and 
dst_model_config_path + (path to model config json file) in the UPLOAD_FOLDER_PATH + :rtype: Tuple[str, str] + """ + model_name = str(model_id.split("/")[-1]) + model_format = model_format.lower() + folder_to_delete = ( + TORCHSCRIPT_FOLDER_PATH if model_format == "torch_script" else ONNX_FOLDER_PATH + ) + + # Store to be uploaded files in UPLOAD_FOLDER_PATH + try: + dst_model_dir = ( + f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}" + ) + os.makedirs(dst_model_dir, exist_ok=True) + dst_model_filename = ( + f"sentence-transformers_{model_name}-{model_version}-{model_format}.zip" + ) + dst_model_path = dst_model_dir + "/" + dst_model_filename + with ZipFile(src_model_path, "a") as zipObj: + zipObj.write(filename=LICENSE_PATH, arcname="LICENSE") + shutil.copy(src_model_path, dst_model_path) + print(f"\nCopied {src_model_path} to {dst_model_path}") + + dst_model_config_dir = ( + f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}" + ) + os.makedirs(dst_model_config_dir, exist_ok=True) + dst_model_config_filename = "config.json" + dst_model_config_path = dst_model_config_dir + "/" + dst_model_config_filename + shutil.copy(src_model_config_path, dst_model_config_path) + print(f"Copied {src_model_config_path} to {dst_model_config_path}") + except Exception as e: + assert ( + False + ), f"Raised Exception during preparing {model_format} files for uploading: {e}" + + # Delete model folder downloaded from HuggingFace during model tracing + try: + shutil.rmtree(folder_to_delete) + except Exception as e: + assert False, f"Raised Exception while deleting {folder_to_delete}: {e}" + + return dst_model_path, dst_model_config_path + + +def store_license_verified_variable(license_verified: bool) -> None: + """ + Store whether the model is licensed under Apache 2.0 in OUTPUT_DIR/LICENSE_VAR_FILE + to be used to generate issue body for manual approval + + :param license_verified: Whether the model is licensed under Apache 2.0 + :type model_path: bool 
+ :return: No return value expected + :rtype: None + """ + try: + os.makedirs(OUTPUT_DIR, exist_ok=True) + license_var_filepath = OUTPUT_DIR + "/" + LICENSE_VAR_FILE + with open(license_var_filepath, "w") as f: + f.write(str(license_verified)) + except Exception as e: + print( + f"Cannot store license_verified ({license_verified}) in {license_var_filepath}: {e}" + ) + + +def store_description_variable(config_path_for_checking_description: str) -> None: + """ + Store model description in OUTPUT_DIR/DESCRIPTION_VAR_FILE + to be used to generate issue body for manual approval + + :param config_path_for_checking_description: Path to config json file + :type config_path_for_checking_description: str + :return: No return value expected + :rtype: None + """ + try: + os.makedirs(OUTPUT_DIR, exist_ok=True) + description_var_filepath = OUTPUT_DIR + "/" + DESCRIPTION_VAR_FILE + with open(config_path_for_checking_description, "r") as f: + config_dict = json.load(f) + description = ( + config_dict["description"] if "description" in config_dict else "-" + ) + print(f"Storing the following description at {description_var_filepath}") + print(description) + with open(description_var_filepath, "w") as f: + f.write(description) + except Exception as e: + print( + f"Cannot store description ({description}) in {description_var_filepath}: {e}" + ) + + +def main( + model_id: str, + model_version: str, + tracing_format: str, + embedding_dimension: Optional[int] = None, + pooling_mode: Optional[str] = None, + model_description: Optional[str] = None, +) -> None: + """ + Perform model auto-tracing and prepare files for uploading to OpenSearch model hub + + :param model_id: Model ID of the pretrained model + :type model_id: string + :param model_version: Version of the pretrained model for registration + :type model_version: string + :param tracing_format: Tracing format ("TORCH_SCRIPT", "ONNX", or "BOTH") + :type tracing_format: string + :param embedding_dimension: Embedding dimension input 
+ :type embedding_dimension: int + :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None) + :type pooling_mode: string + :param model_description: Model description input + :type model_description: string + :return: No return value expected + :rtype: None + """ + + print("\n=== Begin running model_autotracing.py ===") + print("Model ID: ", model_id) + print("Model Version: ", model_version) + print("Tracing Format: ", tracing_format) + print( + "Embedding Dimension: ", + embedding_dimension if embedding_dimension is not None else "Default", + ) + print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "Default") + print( + "Model Description: ", + model_description if model_description is not None else "Default", + ) + print("==========================================") + + ml_client = MLCommonClient(OPENSEARCH_TEST_CLIENT) + + pre_trained_model = SentenceTransformer(model_id) + original_embedding_data = list( + pre_trained_model.encode(TEST_SENTENCES, convert_to_numpy=True) + ) + + pre_trained_model.save(path=TEMP_MODEL_PATH) + license_verified = verify_license_in_md_file() + try: + shutil.rmtree(TEMP_MODEL_PATH) + except Exception as e: + assert False, f"Raised Exception while deleting {TEMP_MODEL_PATH}: {e}" + + if tracing_format in [TORCH_SCRIPT_FORMAT, BOTH_FORMAT]: + print("--- Begin tracing a model in TORCH_SCRIPT ---") + ( + torchscript_model_path, + torchscript_model_config_path, + ) = trace_sentence_transformer_model( + model_id, + model_version, + TORCH_SCRIPT_FORMAT, + embedding_dimension, + pooling_mode, + model_description, + ) + + torchscript_embedding_data = register_and_deploy_sentence_transformer_model( + ml_client, + torchscript_model_path, + torchscript_model_config_path, + TORCH_SCRIPT_FORMAT, + ) + pass_test = verify_embedding_data( + original_embedding_data, torchscript_embedding_data + ) + assert ( + pass_test + ), f"Failed while verifying embeddings of {model_id} model in TORCH_SCRIPT format" + 
+ ( + torchscript_dst_model_path, + torchscript_dst_model_config_path, + ) = prepare_files_for_uploading( + model_id, + model_version, + TORCH_SCRIPT_FORMAT, + torchscript_model_path, + torchscript_model_config_path, + ) + + config_path_for_checking_description = torchscript_dst_model_config_path + print("--- Finished tracing a model in TORCH_SCRIPT ---") + + if tracing_format in [ONNX_FORMAT, BOTH_FORMAT]: + print("--- Begin tracing a model in ONNX ---") + ( + onnx_model_path, + onnx_model_config_path, + ) = trace_sentence_transformer_model( + model_id, + model_version, + ONNX_FORMAT, + embedding_dimension, + pooling_mode, + model_description, + ) + + onnx_embedding_data = register_and_deploy_sentence_transformer_model( + ml_client, onnx_model_path, onnx_model_config_path, ONNX_FORMAT + ) + + pass_test = verify_embedding_data(original_embedding_data, onnx_embedding_data) + assert ( + pass_test + ), f"Failed while verifying embeddings of {model_id} model in ONNX format" + + onnx_dst_model_path, onnx_dst_model_config_path = prepare_files_for_uploading( + model_id, + model_version, + ONNX_FORMAT, + onnx_model_path, + onnx_model_config_path, + ) + + config_path_for_checking_description = onnx_dst_model_config_path + print("--- Finished tracing a model in ONNX ---") + + store_license_verified_variable(license_verified) + store_description_variable(config_path_for_checking_description) + + print("\n=== Finished running model_autotracing.py ===") + + +if __name__ == "__main__": + warnings.filterwarnings("ignore", category=DeprecationWarning) + warnings.filterwarnings("ignore", category=FutureWarning) + warnings.filterwarnings("ignore", message="Unverified HTTPS request") + warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor") + warnings.filterwarnings( + "ignore", message="using SSL with verify_certs=False is insecure." 
+ ) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "model_id", + type=str, + help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)", + ) + parser.add_argument( + "model_version", type=str, help="Model version number (e.g. 1.0.1)" + ) + parser.add_argument( + "tracing_format", + choices=["BOTH", "TORCH_SCRIPT", "ONNX"], + help="Model format for auto-tracing", + ) + parser.add_argument( + "-ed", + "--embedding_dimension", + type=int, + nargs="?", + default=None, + const=None, + help="Embedding dimension of the model to use if it does not exist in original config.json", + ) + parser.add_argument( + "-pm", + "--pooling_mode", + type=str, + nargs="?", + default=None, + const=None, + choices=["CLS", "MEAN", "MAX", "MEAN_SQRT_LEN"], + help="Pooling mode if it does not exist in original config.json", + ) + parser.add_argument( + "-md", + "--model_description", + type=str, + nargs="?", + default=None, + const=None, + help="Model description if you want to overwrite the default description", + ) + args = parser.parse_args() + + main( + args.model_id, + args.model_version, + args.tracing_format, + args.embedding_dimension, + args.pooling_mode, + args.model_description, + ) diff --git a/utils/model_uploader/save_model_file_path_to_env.py b/utils/model_uploader/save_model_file_path_to_env.py new file mode 100644 index 00000000..93576fff --- /dev/null +++ b/utils/model_uploader/save_model_file_path_to_env.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +# This program is run by "Model Auto-tracing & Uploading" workflow +# (See model_uploader.yml) to verify if the model already exists in +# model hub before continuing the workflow. 
import argparse
import re

# Version strings are 1–4 dot-separated numeric components with no leading zeros.
VERSION_PATTERN = r"^([1-9]\d*|0)(\.(([1-9]\d*)|0)){0,3}$"


def verify_inputs(model_id: str, model_version: str) -> None:
    """
    Validate the formats of model_id and model_version, raising AssertionError otherwise

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :return: No return value expected
    :rtype: None
    """
    # A well-formed model ID contains exactly one "/" (namespace/name).
    assert model_id.count("/") == 1, f"Invalid Model ID: {model_id}"
    assert (
        re.fullmatch(VERSION_PATTERN, model_version) is not None
    ), f"Invalid Model Version: {model_version}"


def get_model_file_path(
    model_folder: str, model_id: str, model_version: str, model_format: str
) -> str:
    """
    Construct the expected model file path on model hub

    :param model_folder: Model folder for uploading
    :type model_folder: string
    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :return: Expected model file path on model hub
    :rtype: string
    """
    base_name = model_id.split("/")[-1]
    fmt = model_format.lower()
    zip_name = f"sentence-transformers_{base_name}-{model_version}-{fmt}.zip"
    # Layout: <folder><name>/<version>/<format>/<zip>
    return f"{model_folder}{base_name}/{model_version}/{fmt}/{zip_name}"


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "model_folder",
        type=str,
        help="Model folder for uploading (e.g. ml-models/huggingface/sentence-transformers/)",
    )
    parser.add_argument(
        "model_id",
        type=str,
        help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)",
    )
    parser.add_argument(
        "model_version", type=str, help="Model version number (e.g. 1.0.1)"
    )
    parser.add_argument(
        "model_format",
        choices=["TORCH_SCRIPT", "ONNX"],
        help="Model format for auto-tracing",
    )

    cli_args = parser.parse_args()
    verify_inputs(cli_args.model_id, cli_args.model_version)
    expected_path = get_model_file_path(
        cli_args.model_folder,
        cli_args.model_id,
        cli_args.model_version,
        cli_args.model_format,
    )

    # Print the model file path so that the workflow can store it in the variable (See model_uploader.yml)
    print(expected_path)
# NOTE: argparse, MarkDownFile and CHANGELOG_DIRNAME are imported/defined at
# the head of this file (previous chunk).
CHANGELOG_FILENAME = "CHANGELOG.md"
SECTION_NAME = "Changed"


def update_changelog_file(
    changelog_line: str,
) -> None:
    """
    Update CHANGELOG.md

    Append ``changelog_line`` as a bullet under the "### Changed" sub-section
    of the most recent version section, creating that sub-section at the end
    of the section when it does not exist, then rewrite the file in place.

    :param changelog_line: Line to be added to CHANGELOG.md
    :type changelog_line: string
    :return: No return value expected
    :rtype: None
    """
    changelog_data = MarkDownFile.read_file(f"{CHANGELOG_DIRNAME}/{CHANGELOG_FILENAME}")

    # Locate the most recent version section ("## [x.y.z]") and slice it out.
    this_version_ptr = changelog_data.find("## [")
    assert this_version_ptr != -1, "Cannot find a version section in the CHANGELOG.md"
    # NOTE(review): when this is the LAST version section, find() returns -1 and
    # the slices below drop and re-append the file's final character — a later
    # patch in this series reworks this pointer handling.
    next_version_ptr = changelog_data.find("## [", this_version_ptr + 1)
    this_version_section = changelog_data[this_version_ptr:next_version_ptr]

    this_subsection_ptr = this_version_section.find(f"### {SECTION_NAME}")
    if this_subsection_ptr != -1:
        # Sub-section exists: append the new bullet at its end.
        next_subsection_ptr = this_version_section.find("### ", this_subsection_ptr + 1)
        this_subsection = this_version_section[
            this_subsection_ptr:next_subsection_ptr
        ].strip()
        this_subsection += "\n- " + changelog_line + "\n\n"
        new_version_section = (
            this_version_section[:this_subsection_ptr]
            + this_subsection
            + this_version_section[next_subsection_ptr:]
        )
    else:
        # Sub-section missing: create it at the end of the version section.
        this_subsection = this_version_section.strip()
        this_subsection += "\n\n" + f"### {SECTION_NAME}\n- " + changelog_line + "\n\n"
        new_version_section = this_subsection

    # Splice the updated version section back into the document and rewrite it.
    new_changelog_data = (
        changelog_data[:this_version_ptr]
        + new_version_section
        + changelog_data[next_version_ptr:]
    )

    mdFile = MarkDownFile(CHANGELOG_FILENAME, dirname=CHANGELOG_DIRNAME)
    mdFile.rewrite_all_file(data=new_changelog_data)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "changelog_line",
        type=str,
        help="Line to be added to CHANGELOG.md",
    )
    args = parser.parse_args()
    update_changelog_file(args.changelog_line)

# ---------------------------------------------------------------------------
# patch metadata (chunk boundary): the patch opens a third new file here.
# diff --git a/utils/model_uploader/update_models_upload_history_md.py b/utils/model_uploader/update_models_upload_history_md.py
# new file
# patch metadata (continued from previous chunk):
# mode 100644
# index 00000000..6a45b8f2
# --- /dev/null
# +++ b/utils/model_uploader/update_models_upload_history_md.py
# @@ -0,0 +1,285 @@

# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# This program is run by "Model Auto-tracing & Uploading" workflow
# (See model_uploader.yml) to update MODEL_UPLOAD_HISTORY.md & supported_models.json
# after uploading the model to our model hub.

import argparse
import json
import os
from typing import Dict, List, Optional

BOTH_FORMAT = "BOTH"
TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT"
ONNX_FORMAT = "ONNX"

MD_FILENAME = "MODEL_UPLOAD_HISTORY.md"
JSON_FILENAME = "supported_models.json"
DIRNAME = "utils/model_uploader/upload_history"
MODEL_JSON_FILEPATH = os.path.join(DIRNAME, JSON_FILENAME)
# Column order shared by the JSON records and the markdown table.
KEYS = [
    "Upload Time",
    "Model Uploader",
    "Model ID",
    "Model Version",
    "Model Format",
    "Embedding Dimension",
    "Pooling Mode",
    "Model Description",
]
MD_HEADER = "# Pretrained Model Upload History\n\nThe model-serving framework supports a variety of open-source pretrained models that can assist with a range of machine learning (ML) search and analytics use cases. \n\n\n## Uploaded Pretrained Models\n\n\n### Sentence transformers\n\nSentence transformer models map sentences and paragraphs across a dimensional dense vector space. The number of vectors depends on the model. Use these models for use cases such as clustering and semantic search. \n\nThe following table shows sentence transformer model upload history.\n\n[//]: # (This may be the most platform independent comment)\n"


def create_model_json_obj(
    model_id: str,
    model_version: str,
    model_format: str,
    embedding_dimension: Optional[int] = None,
    pooling_mode: Optional[str] = None,
    model_description: Optional[str] = None,
    model_uploader: Optional[str] = None,
    upload_time: Optional[str] = None,
) -> Dict:
    """
    Create a model dict obj to be added to supported_models.json

    All values are stored as strings; missing optional inputs are recorded as
    "-" (uploader/time) or "Default" (dimension/pooling/description).

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :param embedding_dimension: Embedding dimension input
    :type embedding_dimension: int
    :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
    :type pooling_mode: string
    :param model_description: Model description input
    :type model_description: string
    :param model_uploader: Model uploader input
    :type model_uploader: string
    :param upload_time: Upload time input
    :type upload_time: string
    :return: Model dictionary object to be added to supported_models.json
    :rtype: dict
    """
    model_obj = {
        "Model Uploader": "@" + model_uploader if model_uploader is not None else "-",
        "Upload Time": upload_time if upload_time is not None else "-",
        "Model ID": model_id,
        "Model Version": model_version,
        "Model Format": model_format,
        "Embedding Dimension": str(embedding_dimension)
        if embedding_dimension is not None
        else "Default",
        "Pooling Mode": pooling_mode if pooling_mode is not None else "Default",
        "Model Description": model_description
        if model_description is not None
        else "Default",
    }
    return model_obj


def sort_models(models: List[Dict]) -> List[Dict]:
    """
    Sort models

    Orders records by upload time, then version, ID, and format (all string
    comparisons — the values are the strings built by create_model_json_obj).

    :param models: List of model dictionary objects to be sorted
    :type models: list[dict]
    :return: Sorted list of model dictionary objects
    :rtype: list[dict]
    """
    models = sorted(
        models,
        key=lambda d: (
            d["Upload Time"],
            d["Model Version"],
            d["Model ID"],
            d["Model Format"],
        ),
    )
    return models


def update_model_json_file(
    model_id: str,
    model_version: str,
    tracing_format: str,
    embedding_dimension: Optional[int] = None,
    pooling_mode: Optional[str] = None,
    model_description: Optional[str] = None,
    model_uploader: Optional[str] = None,
    upload_time: Optional[str] = None,
) -> None:
    """
    Update supported_models.json

    Loads the existing JSON records (if any), appends one record per traced
    format ("BOTH" produces two), de-duplicates, sorts, and writes back.

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param tracing_format: Tracing format ("TORCH_SCRIPT", "ONNX", or "BOTH")
    :type tracing_format: string
    :param embedding_dimension: Embedding dimension input
    :type embedding_dimension: int
    :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
    :type pooling_mode: string
    :param model_description: Model description input
    :type model_description: string
    :param model_uploader: Model uploader input
    :type model_uploader: string
    :param upload_time: Upload time input
    :type upload_time: string
    :return: No return value expected
    :rtype: None
    """
    models = []
    if os.path.isfile(MODEL_JSON_FILEPATH):
        with open(MODEL_JSON_FILEPATH, "r") as f:
            models = json.load(f)
    elif not os.path.isdir(DIRNAME):
        os.makedirs(DIRNAME)

    if tracing_format == TORCH_SCRIPT_FORMAT or tracing_format == BOTH_FORMAT:
        model_obj = create_model_json_obj(
            model_id,
            model_version,
            TORCH_SCRIPT_FORMAT,
            embedding_dimension,
            pooling_mode,
            model_description,
            model_uploader,
            upload_time,
        )
        models.append(model_obj)

    if tracing_format == ONNX_FORMAT or tracing_format == BOTH_FORMAT:
        model_obj = create_model_json_obj(
            model_id,
            model_version,
            ONNX_FORMAT,
            embedding_dimension,
            pooling_mode,
            model_description,
            model_uploader,
            upload_time,
        )
        models.append(model_obj)

    # De-duplicate: records are flat str->str dicts, so tuple(items()) is
    # hashable and equal records collapse in the set.
    models = [dict(t) for t in {tuple(m.items()) for m in models}]
    models = sort_models(models)
    with open(MODEL_JSON_FILEPATH, "w") as f:
        json.dump(models, f, indent=4)


def update_md_file():
    """
    Update MODEL_UPLOAD_HISTORY.md

    Rebuilds the markdown upload-history table from supported_models.json.

    :return: No return value expected
    :rtype: None
    """
    # Imported here so the pure JSON helpers above remain importable without
    # the third-party mdutils package installed.
    from mdutils.fileutils import MarkDownFile
    from mdutils.tools.Table import Table

    models = []
    if os.path.exists(MODEL_JSON_FILEPATH):
        with open(MODEL_JSON_FILEPATH, "r") as f:
            models = json.load(f)
    models = sort_models(models)
    # First row is the header; each model contributes one row in KEYS order.
    table_data = KEYS[:]
    for m in models:
        for k in KEYS:
            if k == "Model ID":
                table_data.append(f"`{m[k]}`")
            else:
                table_data.append(m[k])

    table = Table().create_table(
        columns=len(KEYS), rows=len(models) + 1, text=table_data, text_align="center"
    )

    mdFile = MarkDownFile(MD_FILENAME, dirname=DIRNAME)
    mdFile.rewrite_all_file(data=MD_HEADER + table)
    print(f"Finished updating {MD_FILENAME}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "model_id",
        type=str,
        help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)",
    )
    parser.add_argument(
        "model_version", type=str, help="Model version number (e.g. 1.0.1)"
    )
    parser.add_argument(
        "tracing_format",
        choices=["BOTH", "TORCH_SCRIPT", "ONNX"],
        help="Model format for auto-tracing",
    )
    parser.add_argument(
        "-ed",
        "--embedding_dimension",
        type=int,
        nargs="?",
        default=None,
        const=None,
        help="Embedding dimension of the model to use if it does not exist in original config.json",
    )
    parser.add_argument(
        "-pm",
        "--pooling_mode",
        type=str,
        nargs="?",
        default=None,
        const=None,
        choices=["CLS", "MEAN", "MAX", "MEAN_SQRT_LEN"],
        help="Pooling mode if it does not exist in original config.json",
    )
    parser.add_argument(
        "-md",
        "--model_description",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Model description if you want to overwrite the default description",
    )
    parser.add_argument(
        "-u",
        "--model_uploader",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Model Uploader",
    )
    parser.add_argument(
        "-t",
        "--upload_time",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Upload Time",
    )
    args = parser.parse_args()

    update_model_json_file(
        args.model_id,
        args.model_version,
        args.tracing_format,
        args.embedding_dimension,
        args.pooling_mode,
        args.model_description,
        args.model_uploader,
        args.upload_time,
    )

    update_md_file()

# ---------------------------------------------------------------------------
# patch metadata (chunk boundary): next commit in the series.
# From 1ccec37d191116608b6821acf8766a68140e3522 Mon Sep 17 00:00:00 2001
# From: Thanawan Atchariyachanvanit
# Date: Wed, 9 Aug 2023 15:06:38 -0700
# Subject: [PATCH 3/5] Improve update_changelog_md.py & sentencetransformermodel.py
#  scraping
#
# Signed-off-by: Thanawan Atchariyachanvanit
# ---
#  .../ml_models/sentencetransformermodel.py   |  4 +-
#  utils/model_uploader/update_changelog_md.py | 75 +++++++++++++++----
#  2 files changed, 64 insertions(+), 15 deletions(-)
#
# diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
# index 05db5270..d30238eb 100644
# --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
# +++
b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -1032,11 +1032,11 @@ def _get_model_description_from_readme_file(self, readme_file_path) -> str: readme_data = MarkDownFile.read_file(readme_file_path) # Find the description section - start_str = f"# {self.model_id}" + start_str = f"\n# {self.model_id}" start = readme_data.find(start_str) if start == -1: model_name = self.model_id.split("/")[1] - start_str = f"# {model_name}" + start_str = f"\n# {model_name}" start = readme_data.find(start_str) end = readme_data.find("\n#", start + len(start_str)) diff --git a/utils/model_uploader/update_changelog_md.py b/utils/model_uploader/update_changelog_md.py index e5239a44..0b1cbba6 100644 --- a/utils/model_uploader/update_changelog_md.py +++ b/utils/model_uploader/update_changelog_md.py @@ -5,9 +5,10 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. -# This program is run by "Model Auto-tracing & Uploading" workflow -# (See model_uploader.yml) to update CHANGELOG.md after uploading the model -# to our model hub. +# This program is run by "Model Auto-tracing & Uploading" +# & "Model Listing Uploading" workflow (See model_uploader.yml +# & model_listing_uploader.yml) to update CHANGELOG.md after +# uploading the model to our model hub. import argparse @@ -15,7 +16,8 @@ CHANGELOG_DIRNAME = "." 
CHANGELOG_FILENAME = "CHANGELOG.md" -SECTION_NAME = "Changed" +SUBSECTION_NAME = "Changed" +PREV_SUBSECTION_NAME = "Added" def update_changelog_file( @@ -31,14 +33,24 @@ def update_changelog_file( """ changelog_data = MarkDownFile.read_file(f"{CHANGELOG_DIRNAME}/{CHANGELOG_FILENAME}") - this_version_ptr = changelog_data.find("## [") - assert this_version_ptr != -1, "Cannot find a version section in the CHANGELOG.md" - next_version_ptr = changelog_data.find("## [", this_version_ptr + 1) + # Find the most recent version section and pull it out + this_version_ptr = changelog_data.find("\n## ") + 1 + assert this_version_ptr != 0, "Cannot find a version section in the CHANGELOG.md" + next_version_ptr = changelog_data.find("\n## ", this_version_ptr + 1) + 1 + if next_version_ptr == 0: + next_version_ptr = -1 this_version_section = changelog_data[this_version_ptr:next_version_ptr] - this_subsection_ptr = this_version_section.find(f"### {SECTION_NAME}") - if this_subsection_ptr != -1: - next_subsection_ptr = this_version_section.find("### ", this_subsection_ptr + 1) + # Find the sub-section SUBSECTION_NAME + this_subsection_ptr = this_version_section.find(f"\n### {SUBSECTION_NAME}") + 1 + if this_subsection_ptr != 0: + # Case 1: Section SUBSECTION_NAME exists + # Append a change_log line to the end of that subsection if it exists + next_subsection_ptr = ( + this_version_section.find("\n### ", this_subsection_ptr + 1) + 1 + ) + if next_subsection_ptr == 0: + next_subsection_ptr = -1 this_subsection = this_version_section[ this_subsection_ptr:next_subsection_ptr ].strip() @@ -49,10 +61,47 @@ def update_changelog_file( + this_version_section[next_subsection_ptr:] ) else: - this_subsection = this_version_section.strip() - this_subsection += "\n\n" + f"### {SECTION_NAME}\n- " + changelog_line + "\n\n" - new_version_section = this_subsection + # Case 2: Sub-section SUBSECTION_NAME does not exist + # Create sub-section SUBSECTION_NAME and add a change_log line + this_subsection = 
f"### {SUBSECTION_NAME}\n- {changelog_line}\n\n" + prev_subsection_ptr = ( + this_version_section.find(f"\n### {PREV_SUBSECTION_NAME}") + 1 + ) + if prev_subsection_ptr != 0: + # Case 2.1: Sub-section PREV_SUBSECTION_NAME exist + # Add a sub-section SUBSECTION_NAME after PREV_SUBSECTION_NAME if PREV_SUBSECTION_NAME exists + next_subsection_ptr = ( + this_version_section.find("\n### ", prev_subsection_ptr + 1) + 1 + ) + prev_subsection = this_version_section[ + prev_subsection_ptr:next_subsection_ptr + ].strip() + new_version_section = ( + this_version_section[:prev_subsection_ptr] + + prev_subsection + + "\n\n" + + this_subsection + + this_version_section[next_subsection_ptr:] + ) + else: + # Case 2.2: Sub-section PREV_SUBSECTION_NAME does not exist + next_subsection_ptr = this_version_section.find("\n### ") + 1 + if next_subsection_ptr != 0: + # Case 2.2.1: There exists other sub-section in this version section + # Add a sub-section SECTION_NAME before other sub-sections + new_version_section = ( + this_version_section[:next_subsection_ptr] + + this_subsection + + this_version_section[next_subsection_ptr:] + ) + else: + # Case 2.2.2: There isn't any other sub-section in this version section + # Add a sub-section SECTION_NAME after version headline + new_version_section = ( + this_version_section.strip() + "\n\n" + this_subsection + ) + # Insert new_version_section back to the document new_changelog_data = ( changelog_data[:this_version_ptr] + new_version_section From 037c1a229e5ef72c7fa044ccced045207ac99a3f Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 9 Aug 2023 15:18:11 -0700 Subject: [PATCH 4/5] Change default to N/A Signed-off-by: Thanawan Atchariyachanvanit --- .ci/run-repository.sh | 6 +++--- .github/workflows/model_uploader.yml | 6 +++--- utils/model_uploader/model_autotracing.py | 6 +++--- utils/model_uploader/update_models_upload_history_md.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git 
a/.ci/run-repository.sh b/.ci/run-repository.sh index 7aadbdda..6725e47d 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -70,9 +70,9 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m" echo -e "\033[34;1mINFO:\033[0m TRACING_FORMAT: ${TRACING_FORMAT}\033[0m" - echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-Default}\033[0m" - echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-Default}\033[0m" - echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-Default}\033[0m" + echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" docker run \ --network=${network_name} \ diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 3a8aadd8..d5264836 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -76,9 +76,9 @@ jobs: - Model ID: ${{ github.event.inputs.model_id }} - Model Version: ${{ github.event.inputs.model_version }} - Tracing Format: ${{ github.event.inputs.tracing_format }} - - Embedding Dimension: ${embedding_dimension:-Default} - - Pooling Mode: ${pooling_mode:-Default} - - Model Description: ${model_description:-Default} + - Embedding Dimension: ${embedding_dimension:-N/A} + - Pooling Mode: ${pooling_mode:-N/A} + - Model Description: ${model_description:-N/A} ======== Workflow Output Information ========= - Embedding Verification: Passed" diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 840432c8..e99616b0 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -430,12 +430,12 @@ def main( 
print("Tracing Format: ", tracing_format) print( "Embedding Dimension: ", - embedding_dimension if embedding_dimension is not None else "Default", + embedding_dimension if embedding_dimension is not None else "N/A", ) - print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "Default") + print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "N/A") print( "Model Description: ", - model_description if model_description is not None else "Default", + model_description if model_description is not None else "N/A", ) print("==========================================") diff --git a/utils/model_uploader/update_models_upload_history_md.py b/utils/model_uploader/update_models_upload_history_md.py index 6a45b8f2..8cdea9ce 100644 --- a/utils/model_uploader/update_models_upload_history_md.py +++ b/utils/model_uploader/update_models_upload_history_md.py @@ -78,11 +78,11 @@ def create_model_json_obj( "Model Format": model_format, "Embedding Dimension": str(embedding_dimension) if embedding_dimension is not None - else "Default", - "Pooling Mode": pooling_mode if pooling_mode is not None else "Default", + else "N/A", + "Pooling Mode": pooling_mode if pooling_mode is not None else "N/A", "Model Description": model_description if model_description is not None - else "Default", + else "N/A", } return model_obj From 95d266af314f9b8d3332574fe46033b4dac240fe Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 9 Aug 2023 15:31:22 -0700 Subject: [PATCH 5/5] Update update_models_upload_history_md.py Signed-off-by: Thanawan Atchariyachanvanit --- .github/workflows/model_uploader.yml | 4 +-- .../update_models_upload_history_md.py | 30 +++++++++---------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index d5264836..bc792d30 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -316,8 +316,8 @@ jobs: ${{ 
github.event.inputs.tracing_format }} \ -ed ${{ github.event.inputs.embedding_dimension }} \ -pm ${{ github.event.inputs.pooling_mode }} \ - -md ${model_description:+"$model_description"} \ - -u ${{ github.actor }} -t "${{ needs.model-uploading.outputs.upload_time }}" + -id ${{ github.run_id }} -u ${{ github.actor }} \ + -t "${{ needs.model-uploading.outputs.upload_time }}" - name: Create PR Body id: create_pr_body run: | diff --git a/utils/model_uploader/update_models_upload_history_md.py b/utils/model_uploader/update_models_upload_history_md.py index 8cdea9ce..38232c5a 100644 --- a/utils/model_uploader/update_models_upload_history_md.py +++ b/utils/model_uploader/update_models_upload_history_md.py @@ -33,7 +33,7 @@ "Model Format", "Embedding Dimension", "Pooling Mode", - "Model Description", + "Workflow Run ID", ] MD_HEADER = "# Pretrained Model Upload History\n\nThe model-serving framework supports a variety of open-source pretrained models that can assist with a range of machine learning (ML) search and analytics use cases. \n\n\n## Uploaded Pretrained Models\n\n\n### Sentence transformers\n\nSentence transformer models map sentences and paragraphs across a dimensional dense vector space. The number of vectors depends on the model. Use these models for use cases such as clustering and semantic search. 
\n\nThe following table shows sentence transformer model upload history.\n\n[//]: # (This may be the most platform independent comment)\n" @@ -44,7 +44,7 @@ def create_model_json_obj( model_format: str, embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, - model_description: Optional[str] = None, + workflow_id: Optional[str] = None, model_uploader: Optional[str] = None, upload_time: Optional[str] = None, ) -> Dict: @@ -61,8 +61,8 @@ def create_model_json_obj( :type embedding_dimension: int :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None) :type pooling_mode: string - :param model_description: Model description input - :type model_description: string + :param workflow_id: Workflow run id + :type workflow_id: string :param model_uploader: Model uploader input :type model_uploader: string :param uploader_time: Upload time input @@ -80,9 +80,7 @@ def create_model_json_obj( if embedding_dimension is not None else "N/A", "Pooling Mode": pooling_mode if pooling_mode is not None else "N/A", - "Model Description": model_description - if model_description is not None - else "N/A", + "Workflow Run ID": workflow_id if workflow_id is not None else "-" } return model_obj @@ -114,7 +112,7 @@ def update_model_json_file( tracing_format: str, embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, - model_description: Optional[str] = None, + workflow_id: Optional[str] = None, model_uploader: Optional[str] = None, upload_time: Optional[str] = None, ) -> None: @@ -131,8 +129,8 @@ def update_model_json_file( :type embedding_dimension: int :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None) :type pooling_mode: string - :param model_description: Model description input - :type model_description: string + :param workflow_id: Workflow run id + :type workflow_id: string :param model_uploader: Model uploader input :type model_uploader: string :param uploader_time: 
Upload time input @@ -154,7 +152,7 @@ def update_model_json_file( TORCH_SCRIPT_FORMAT, embedding_dimension, pooling_mode, - model_description, + workflow_id, model_uploader, upload_time, ) @@ -167,7 +165,7 @@ def update_model_json_file( ONNX_FORMAT, embedding_dimension, pooling_mode, - model_description, + workflow_id, model_uploader, upload_time, ) @@ -243,13 +241,13 @@ def update_md_file(): help="Pooling mode if it does not exist in original config.json", ) parser.add_argument( - "-md", - "--model_description", + "-id", + "--workflow_id", type=str, nargs="?", default=None, const=None, - help="Model description if you want to overwrite the default description", + help="Workflow Run ID", ) parser.add_argument( "-u", @@ -277,7 +275,7 @@ def update_md_file(): args.tracing_format, args.embedding_dimension, args.pooling_mode, - args.model_description, + args.workflow_id, args.model_uploader, args.upload_time, )