Add tests for PR2

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>
thanawan-atc · Aug 17, 2023 · bc4cc78 · bc4cc78
1 parent 68198c8
commit bc4cc78
Show file tree

Hide file tree

Showing 10 changed files with 217 additions and 27 deletions.
diff --git a/tests/ml_model_listing/samples/config_folder/intfloat/e5-small-v2/1.0.1/onnx/config.json b/tests/ml_model_listing/samples/config_folder/intfloat/e5-small-v2/1.0.1/onnx/config.json
@@ -0,0 +1 @@
+{"name": "intfloat/e5-small-v2", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space.", "model_format": "ONNX", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "bert", "embedding_dimension": 384, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/intfloat_e5-small-v2/\", \"architectures\": [\"BertModel\"], \"attention_probs_dropout_prob\": 0.1, \"classifier_dropout\": null, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 384, \"initializer_range\": 0.02, \"intermediate_size\": 1536, \"layer_norm_eps\": 1e-12, \"max_position_embeddings\": 512, \"model_type\": \"bert\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 0, \"position_embedding_type\": \"absolute\", \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"type_vocab_size\": 2, \"use_cache\": true, \"vocab_size\": 30522}"}}
diff --git a/..._listing/samples/config_folder/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json b/..._listing/samples/config_folder/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "jhgan/ko-sroberta-multitask", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "roberta", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": false, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/jhgan_ko-sroberta-multitask/\", \"architectures\": [\"RobertaModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"classifier_dropout\": null, \"eos_token_id\": 2, \"gradient_checkpointing\": false, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"roberta\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"position_embedding_type\": \"absolute\", \"tokenizer_class\": \"BertTokenizer\", \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"type_vocab_size\": 1, \"use_cache\": true, \"vocab_size\": 32000}"}}
diff --git a/...folder/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json b/...folder/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/clip-ViT-B-32-multilingual-v1", "version": "1.0.1", "description": "This is a multi-lingual version of the OpenAI CLIP-ViT-B32 model. You can map text  and images to a common dense vector space such that images and the matching texts are close. This model can be used for image search  and for multi-lingual zero-shot image classification .", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "distilbert", "embedding_dimension": 512, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": false, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_clip-ViT-B-32-multilingual-v1/\", \"activation\": \"gelu\", \"architectures\": [\"DistilBertModel\"], \"attention_dropout\": 0.1, \"dim\": 768, \"dropout\": 0.1, \"hidden_dim\": 3072, \"initializer_range\": 0.02, \"max_position_embeddings\": 512, \"model_type\": \"distilbert\", \"n_heads\": 12, \"n_layers\": 6, \"output_past\": true, \"pad_token_id\": 0, \"qa_dropout\": 0.1, \"seq_classif_dropout\": 0.2, \"sinusoidal_pos_embds\": false, \"tie_weights_\": true, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 119547}"}}
diff --git a/...les/config_folder/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json b/...les/config_folder/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M  pairs from diverse sources.", "model_format": "ONNX", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
diff --git a/...ig_folder/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json b/...ig_folder/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M  pairs from diverse sources.", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
diff --git a/...ig_folder/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json b/...ig_folder/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "2.0.0", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M  pairs from diverse sources. (New Version)", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
diff --git a/tests/ml_model_listing/samples/config_paths.txt b/tests/ml_model_listing/samples/config_paths.txt
@@ -0,0 +1 @@
+ml-models/huggingface/intfloat/e5-small-v2/1.0.1/onnx/config.json ml-models/huggingface/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json
diff --git a/tests/ml_model_listing/samples/pretrained_model_listing.json b/tests/ml_model_listing/samples/pretrained_model_listing.json
@@ -0,0 +1,53 @@
+[
+  {
+    "name": "huggingface/intfloat/e5-small-v2",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "onnx"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
+      }
+    }
+  },
+  {
+    "name": "huggingface/jhgan/ko-sroberta-multitask",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "torch_script"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search."
+      }
+    }
+  },
+  {
+    "name": "huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "torch_script"
+        ],
+        "description": "This is a multi-lingual version of the OpenAI CLIP-ViT-B32 model. You can map text  and images to a common dense vector space such that images and the matching texts are close. This model can be used for image search  and for multi-lingual zero-shot image classification ."
+      }
+    }
+  },
+  {
+    "name": "huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "onnx",
+          "torch_script"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M  pairs from diverse sources."
+      },
+      "2.0.0": {
+        "format": [
+          "torch_script"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M  pairs from diverse sources. (New Version)"
+      }
+    }
+  }
+]
diff --git a/tests/ml_model_listing/test_update_pretrained_model_listing.py b/tests/ml_model_listing/test_update_pretrained_model_listing.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+# Any modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
+
+# We need to append UTILS_MODEL_UPLOADER_DIR path so that we can import
+# functions from update_pretrained_model_listing.py
+# since this python script is not in the root directory.
+
+import json
+import os
+import shutil
+import sys
+
+import pytest
+
+# Directory that contains this test module; every fixture path below is built from it.
+THIS_DIR = os.path.dirname(__file__)
+UTILS_MODEL_UPLOADER_DIR = os.path.join(THIS_DIR, "../../utils/model_uploader")
+# Make update_pretrained_model_listing.py importable (see the import at the end of this section).
+sys.path.append(UTILS_MODEL_UPLOADER_DIR)
+
+# Sample fixtures stored next to this test module.
+SAMPLE_FOLDER = os.path.join(THIS_DIR, "samples")
+# Names (relative to a samples folder) of the config-path list and of the config-file tree.
+CONFIG_PATHS_TXT_FILENAME = "config_paths.txt"
+CONFIG_FOLDERNAME = "config_folder"
+# Expected listing that the generated output is compared against.
+SAMPLE_PRETRAINED_MODEL_LISTING = os.path.join(
+    SAMPLE_FOLDER, "pretrained_model_listing.json"
+)
+# Scratch copy of SAMPLE_FOLDER; created by copy_samples_folder() and removed by
+# clean_samples_folder_copy().
+SAMPLE_FOLDER_COPY = os.path.join(THIS_DIR, "samples_copy")
+# Sub-folder of the copied config folder that test_missing_config_file() deletes.
+SAMPLE_MISSING_CONFIG_SUBFOLDERNAME = "sentence-transformers"
+# Output file the tests ask create_new_pretrained_model_listing() to write; removed by
+# clean_test_file().
+TEST_FILE = os.path.join(THIS_DIR, "test_pretrained_model_listing.json")
+
+# This import must stay below the sys.path.append() call above; the module is not
+# importable before UTILS_MODEL_UPLOADER_DIR has been added to sys.path.
+from update_pretrained_model_listing import create_new_pretrained_model_listing
+
+
+def clean_test_file():
+    """Delete the listing file written by the tests (TEST_FILE) when it exists as a regular file."""
+    if not os.path.isfile(TEST_FILE):
+        return
+    os.remove(TEST_FILE)
+
+
+def copy_samples_folder():
+    """Copy the whole samples folder to SAMPLE_FOLDER_COPY.
+
+    The destination must not exist yet: shutil.copytree() is called with its
+    default settings and raises if the target directory is already present.
+    """
+    shutil.copytree(src=SAMPLE_FOLDER, dst=SAMPLE_FOLDER_COPY)
+
+
+def clean_samples_folder_copy():
+    """Remove the scratch copy of the samples folder, if there is one.
+
+    The folder's direct entries are removed first: plain files with
+    os.remove(), everything else with shutil.rmtree(). A failing rmtree on an
+    entry is only reported on stdout. The final removal of the folder itself
+    is not guarded, so any error there propagates to the caller.
+    """
+    if not os.path.exists(SAMPLE_FOLDER_COPY):
+        return
+
+    for entry_name in os.listdir(SAMPLE_FOLDER_COPY):
+        entry_path = os.path.join(SAMPLE_FOLDER_COPY, entry_name)
+        if os.path.isfile(entry_path):
+            os.remove(entry_path)
+            continue
+        try:
+            shutil.rmtree(entry_path)
+        except OSError as err:
+            message = (
+                "Fail to delete files, please delete all files in "
+                + str(SAMPLE_FOLDER_COPY)
+                + " "
+                + str(err)
+            )
+            print(message)
+
+    shutil.rmtree(SAMPLE_FOLDER_COPY)
+
+
+# These calls execute when the module is imported, before any test in it runs
+# (presumably to clear artifacts left behind by an earlier, aborted run).
+clean_samples_folder_copy()
+clean_test_file()
+
+
+def test_create_new_pretrained_model_listing():
+    """Build a listing from the sample configs and compare it with the expected JSON.
+
+    create_new_pretrained_model_listing() is run against the checked-in sample
+    config paths and config folder, writing to TEST_FILE. The parsed output
+    must equal the parsed content of SAMPLE_PRETRAINED_MODEL_LISTING.
+    """
+    clean_test_file()
+    try:
+        # Errors raised while generating or reading the files are deliberately
+        # not caught: pytest then reports the original exception and traceback
+        # instead of a message-less AssertionError.
+        create_new_pretrained_model_listing(
+            os.path.join(SAMPLE_FOLDER, CONFIG_PATHS_TXT_FILENAME),
+            os.path.join(SAMPLE_FOLDER, CONFIG_FOLDERNAME),
+            pretrained_model_listing_json_filepath=TEST_FILE,
+        )
+
+        with open(SAMPLE_PRETRAINED_MODEL_LISTING, "r") as f:
+            sample_pretrained_model_listing = json.load(f)
+
+        with open(TEST_FILE, "r") as f:
+            test_pretrained_model_listing = json.load(f)
+
+        # The message must be a string; `assert cond, print(...)` would attach
+        # None because print() returns None.
+        assert (
+            test_pretrained_model_listing == sample_pretrained_model_listing
+        ), "Incorrect pretrained model listing"
+    finally:
+        # Remove the generated file even when the test fails.
+        clean_test_file()
+
+
+def test_missing_config_file():
+    """Check that a missing config file makes the listing creation raise.
+
+    The samples folder is copied and the SAMPLE_MISSING_CONFIG_SUBFOLDERNAME
+    sub-folder of the copied config folder is deleted. The copied
+    config_paths.txt is left as is. create_new_pretrained_model_listing() is
+    then expected to raise a plain Exception whose message contains
+    "Cannot open".
+    """
+    clean_test_file()
+    clean_samples_folder_copy()
+
+    try:
+        copy_samples_folder()
+        shutil.rmtree(
+            os.path.join(
+                SAMPLE_FOLDER_COPY,
+                CONFIG_FOLDERNAME,
+                SAMPLE_MISSING_CONFIG_SUBFOLDERNAME,
+            )
+        )
+
+        with pytest.raises(Exception) as exc_info:
+            create_new_pretrained_model_listing(
+                os.path.join(SAMPLE_FOLDER_COPY, CONFIG_PATHS_TXT_FILENAME),
+                os.path.join(SAMPLE_FOLDER_COPY, CONFIG_FOLDERNAME),
+                pretrained_model_listing_json_filepath=TEST_FILE,
+            )
+        # Exactly Exception, not a subclass such as FileNotFoundError.
+        assert exc_info.type is Exception
+        assert "Cannot open" in str(exc_info.value)
+    finally:
+        # Always remove the scratch copy and any output, even on failure, so a
+        # failed run does not leave samples_copy behind.
+        clean_test_file()
+        clean_samples_folder_copy()
+
+
+# NOTE(review): although placed after the tests, these module-level calls run
+# when the module is imported, i.e. before any test executes; they do not act
+# as a teardown.
+clean_samples_folder_copy()
+clean_test_file()
diff --git a/utils/model_uploader/update_pretrained_model_listing.py b/utils/model_uploader/update_pretrained_model_listing.py
@@ -27,53 +27,55 @@
 
 
 def get_sentence_transformer_model_description(
-    config_folder_name: str, config_filepath: str
+    config_folderpath: str, config_filepath: str
 ) -> Optional[str]:
     """
     Get description of the pretrained sentence transformer model from config file
 
-    :param config_folder_name: Name of the local folder that stores config files (e.g. 'config_folder')
-    :type config_folder_name: string
+    :param config_folderpath: Path to the folder that stores copies of config files from S3 (e.g. 'config_folder')
+    :type config_folderpath: string
     :param config_filepath: Path to local config file
     (e.g. 'sentence-transformers/all-MiniLM-L12-v2/2.0.0/onnx/config.json')
     :type config_filepath: string
     :return: Description of the model
     :rtype: string or None
     """
-    filepath = os.path.join(config_folder_name, config_filepath)
+    filepath = os.path.join(config_folderpath, config_filepath)
     try:
         with open(filepath, "r") as f:
             model_config = json.load(f)
     except Exception as e:
-        print(f"Cannot open {filepath} to get model description: {e}")
-        return None
+        raise Exception(f"Cannot open {filepath} to get model description: {e}")
     if "description" in model_config:
         return model_config["description"]
     else:
         return None
 
 
 def create_new_pretrained_model_listing(
-    config_paths_txt_filename: str, config_foldername: str
+    config_paths_txt_filepath: str,
+    config_folderpath: str,
+    pretrained_model_listing_json_filepath: str = PRETRAINED_MODEL_LISTING_JSON_FILEPATH,
 ):
     """
-    Create a new pretrained model listing and store it at PRETRAINED_MODEL_LISTING_JSON_FILEPATH
-    based on current models in config_paths_txt_filename and their config files in config_foldername
+    Create a new pretrained model listing and store it at pretrained_model_listing_json_filepath
+    based on current models in config_paths_txt_filepath and their config files in config_folderpath
 
-    :param config_paths_txt_filename: Name of the txt file that stores paths to config file
+    :param config_paths_txt_filepath: Path to the txt file that stores a list of config paths from S3
     in the ml-models/huggingface/ folder of the S3 bucket
-    :type config_paths_txt_filename: string
-    :param config_foldername: Name of the local folder that stores config files
-    :type config_foldername: string
+    :type config_paths_txt_filepath: string
+    :param config_folderpath: Path to the folder that stores copies of config files from S3
+    :type config_folderpath: string
     :return: No return value expected
+    :param pretrained_model_listing_json_filepath: Path to the json file that stores new model listing
     :rtype: None
     """
     print("\n=== Begin running update_pretrained_model_listing.py ===")
-    print(f"--- Reading {config_paths_txt_filename} ---")
-    with open(config_paths_txt_filename, "r") as f:
+    print(f"--- Reading {config_paths_txt_filepath} ---")
+    with open(config_paths_txt_filepath, "r") as f:
         config_paths_lst = f.read().split()
 
-    print("---  Creating New Model Listing --- ")
+    print("\n---  Creating New Model Listing --- ")
     new_model_listing_dict = {}
     for config_filepath in config_paths_lst:
         if config_filepath.startswith(PREFIX_HUGGINGFACE_MODEL_FILEPATH):
@@ -96,7 +98,7 @@ def create_new_pretrained_model_listing(
             versions_content[model_version]["format"].append(model_format)
             if "description" not in versions_content[model_version]:
                 description = get_sentence_transformer_model_description(
-                    config_foldername, local_config_filepath
+                    config_folderpath, local_config_filepath
                 )
                 if description is not None:
                     versions_content[model_version]["description"] = description
@@ -107,34 +109,36 @@ def create_new_pretrained_model_listing(
         model_dict["versions"] = dict(sorted(model_dict["versions"].items()))
 
     print(
-        f"---  Dumping New Model Listing in {PRETRAINED_MODEL_LISTING_JSON_FILEPATH} --- "
+        f"\n---  Dumping New Model Listing in {pretrained_model_listing_json_filepath} --- "
     )
     if not os.path.isdir(JSON_DIRNAME):
         os.makedirs(JSON_DIRNAME)
-    with open(PRETRAINED_MODEL_LISTING_JSON_FILEPATH, "w") as f:
+    with open(pretrained_model_listing_json_filepath, "w") as f:
         json.dump(new_model_listing_lst, f, indent=2)
     print("\n=== Finished running update_pretrained_model_listing.py ===")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
-        "config_paths_txt_filename",
+        "config_paths_txt_filepath",
         type=str,
-        help="Name of the file that stores config paths in S3",
+        help="Path to the txt file that stores a list of config paths from S3",
     )
     parser.add_argument(
-        "config_foldername",
+        "config_folderpath",
         type=str,
-        help="Name of the local folder that stores copies of config files from S3",
+        help="Path to the folder that stores copies of config files from S3",
     )
 
     args = parser.parse_args()
 
-    if not args.config_paths_txt_filename.endswith(".txt"):
-        assert False, "Invalid arguments"
+    if not args.config_paths_txt_filepath.endswith(".txt"):
+        raise Exception(
+            "Invalid argument: config_paths_txt_filepath should be .txt file"
+        )
 
     create_new_pretrained_model_listing(
-        args.config_paths_txt_filename,
-        args.config_foldername,
+        args.config_paths_txt_filepath,
+        args.config_folderpath,
     )