Skip to content

Commit

Permalink
Add tests for PR2
Browse files Browse the repository at this point in the history
Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>
  • Loading branch information
thanawan-atc committed Aug 17, 2023
1 parent 68198c8 commit bc4cc78
Show file tree
Hide file tree
Showing 10 changed files with 217 additions and 27 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "intfloat/e5-small-v2", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space.", "model_format": "ONNX", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "bert", "embedding_dimension": 384, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/intfloat_e5-small-v2/\", \"architectures\": [\"BertModel\"], \"attention_probs_dropout_prob\": 0.1, \"classifier_dropout\": null, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 384, \"initializer_range\": 0.02, \"intermediate_size\": 1536, \"layer_norm_eps\": 1e-12, \"max_position_embeddings\": 512, \"model_type\": \"bert\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 0, \"position_embedding_type\": \"absolute\", \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"type_vocab_size\": 2, \"use_cache\": true, \"vocab_size\": 30522}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "jhgan/ko-sroberta-multitask", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "roberta", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": false, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/jhgan_ko-sroberta-multitask/\", \"architectures\": [\"RobertaModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"classifier_dropout\": null, \"eos_token_id\": 2, \"gradient_checkpointing\": false, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"roberta\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"position_embedding_type\": \"absolute\", \"tokenizer_class\": \"BertTokenizer\", \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"type_vocab_size\": 1, \"use_cache\": true, \"vocab_size\": 32000}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "sentence-transformers/clip-ViT-B-32-multilingual-v1", "version": "1.0.1", "description": "This is a multi-lingual version of the OpenAI CLIP-ViT-B32 model. You can map text and images to a common dense vector space such that images and the matching texts are close. This model can be used for image search and for multi-lingual zero-shot image classification .", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "distilbert", "embedding_dimension": 512, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": false, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_clip-ViT-B-32-multilingual-v1/\", \"activation\": \"gelu\", \"architectures\": [\"DistilBertModel\"], \"attention_dropout\": 0.1, \"dim\": 768, \"dropout\": 0.1, \"hidden_dim\": 3072, \"initializer_range\": 0.02, \"max_position_embeddings\": 512, \"model_type\": \"distilbert\", \"n_heads\": 12, \"n_layers\": 6, \"output_past\": true, \"pad_token_id\": 0, \"qa_dropout\": 0.1, \"seq_classif_dropout\": 0.2, \"sinusoidal_pos_embds\": false, \"tie_weights_\": true, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 119547}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources.", "model_format": "ONNX", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources.", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "2.0.0", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources. (New Version)", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
1 change: 1 addition & 0 deletions tests/ml_model_listing/samples/config_paths.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ml-models/huggingface/intfloat/e5-small-v2/1.0.1/onnx/config.json ml-models/huggingface/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json
53 changes: 53 additions & 0 deletions tests/ml_model_listing/samples/pretrained_model_listing.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
[
{
"name": "huggingface/intfloat/e5-small-v2",
"versions": {
"1.0.1": {
"format": [
"onnx"
],
"description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
}
}
},
{
"name": "huggingface/jhgan/ko-sroberta-multitask",
"versions": {
"1.0.1": {
"format": [
"torch_script"
],
"description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search."
}
}
},
{
"name": "huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1",
"versions": {
"1.0.1": {
"format": [
"torch_script"
],
"description": "This is a multi-lingual version of the OpenAI CLIP-ViT-B32 model. You can map text and images to a common dense vector space such that images and the matching texts are close. This model can be used for image search and for multi-lingual zero-shot image classification ."
}
}
},
{
"name": "huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1",
"versions": {
"1.0.1": {
"format": [
"onnx",
"torch_script"
],
"description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources."
},
"2.0.0": {
"format": [
"torch_script"
],
"description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources. (New Version)"
}
}
}
]
126 changes: 126 additions & 0 deletions tests/ml_model_listing/test_update_pretrained_model_listing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# We need to append UTILS_MODEL_UPLOADER_DIR path so that we can import
# functions from update_pretrained_model_listing.py
# since this python script is not in the root directory.

import json
import os
import shutil
import sys

import pytest

# Directory that contains this test module; every other path is derived from it.
THIS_DIR = os.path.dirname(__file__)
# Folder holding update_pretrained_model_listing.py; it is appended to sys.path
# so the import that follows these constants can resolve the script.
UTILS_MODEL_UPLOADER_DIR = os.path.join(THIS_DIR, "../../utils/model_uploader")
sys.path.append(UTILS_MODEL_UPLOADER_DIR)

# Read-only fixtures used as input and as the expected output.
SAMPLE_FOLDER = os.path.join(THIS_DIR, "samples")
# Names (relative to a samples folder) of the config-path list and of the
# folder with the config file copies; the tests join them onto SAMPLE_FOLDER
# or SAMPLE_FOLDER_COPY.
CONFIG_PATHS_TXT_FILENAME = "config_paths.txt"
CONFIG_FOLDERNAME = "config_folder"
# Expected listing that the generated TEST_FILE is compared against.
SAMPLE_PRETRAINED_MODEL_LISTING = os.path.join(
SAMPLE_FOLDER, "pretrained_model_listing.json"
)
# Scratch copy of SAMPLE_FOLDER created by copy_samples_folder() so that
# test_missing_config_file can delete parts of it without touching the fixtures.
SAMPLE_FOLDER_COPY = os.path.join(THIS_DIR, "samples_copy")
# Subfolder of the copied config folder that test_missing_config_file removes.
SAMPLE_MISSING_CONFIG_SUBFOLDERNAME = "sentence-transformers"
# Output path passed to create_new_pretrained_model_listing; removed by
# clean_test_file().
TEST_FILE = os.path.join(THIS_DIR, "test_pretrained_model_listing.json")

from update_pretrained_model_listing import create_new_pretrained_model_listing


def clean_test_file():
    """Delete the generated listing at TEST_FILE when it exists as a regular file."""
    if not os.path.isfile(TEST_FILE):
        # Nothing was generated (or it was already cleaned up).
        return
    os.remove(TEST_FILE)


def copy_samples_folder():
    """Recursively copy SAMPLE_FOLDER to SAMPLE_FOLDER_COPY.

    The destination must not exist yet; callers run
    clean_samples_folder_copy() first.
    """
    source_dir, target_dir = SAMPLE_FOLDER, SAMPLE_FOLDER_COPY
    shutil.copytree(source_dir, target_dir)


def clean_samples_folder_copy():
    """Remove SAMPLE_FOLDER_COPY and everything inside it, if it exists.

    Top-level files are deleted one by one and sub-directories are removed
    recursively; a sub-directory that cannot be removed is reported on stdout
    instead of aborting the loop. The (then hopefully empty) folder itself is
    removed last.
    """
    if not os.path.exists(SAMPLE_FOLDER_COPY):
        return

    for entry_name in os.listdir(SAMPLE_FOLDER_COPY):
        entry_path = os.path.join(SAMPLE_FOLDER_COPY, entry_name)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            continue
        try:
            shutil.rmtree(entry_path)
        except OSError as err:
            # Best effort: report the problem and keep going with the
            # remaining entries.
            message = (
                "Fail to delete files, please delete all files in "
                + str(SAMPLE_FOLDER_COPY)
                + " "
                + str(err)
            )
            print(message)

    shutil.rmtree(SAMPLE_FOLDER_COPY)


# Executed when this module is imported: remove any scratch folder or output
# file that is still on disk (presumably left over from an earlier run) before
# the tests below use those paths.
clean_samples_folder_copy()
clean_test_file()


def _load_json_or_fail(filepath, failure_message):
    """Return the parsed JSON content of filepath.

    Any error while opening or parsing the file is re-raised as an
    AssertionError that starts with failure_message and is chained to the
    original exception, so the test report shows both.
    """
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    except Exception as e:
        raise AssertionError(f"{failure_message}: {e}") from e


def test_create_new_pretrained_model_listing():
    """Build a listing from the sample configs and compare it with the expected one.

    Runs create_new_pretrained_model_listing on the fixtures in SAMPLE_FOLDER,
    writing to TEST_FILE, then checks that the generated JSON equals
    SAMPLE_PRETRAINED_MODEL_LISTING. TEST_FILE is removed before the test and
    afterwards, whether the test passes or fails.
    """
    clean_test_file()
    try:
        try:
            create_new_pretrained_model_listing(
                os.path.join(SAMPLE_FOLDER, CONFIG_PATHS_TXT_FILENAME),
                os.path.join(SAMPLE_FOLDER, CONFIG_FOLDERNAME),
                pretrained_model_listing_json_filepath=TEST_FILE,
            )
        except Exception as e:
            # Use a real message string: `assert False, print(...)` would
            # attach None to the assertion and only write to stdout.
            raise AssertionError(
                f"Failed while creating new pretrained model listing: {e}"
            ) from e

        sample_pretrained_model_listing = _load_json_or_fail(
            SAMPLE_PRETRAINED_MODEL_LISTING,
            f"Cannot open {SAMPLE_PRETRAINED_MODEL_LISTING} to use it for verification",
        )
        test_pretrained_model_listing = _load_json_or_fail(
            TEST_FILE,
            f"Cannot open {TEST_FILE} to verify its content",
        )

        assert (
            test_pretrained_model_listing == sample_pretrained_model_listing
        ), "Incorrect pretrained model listing"
    finally:
        # Always remove the generated file, even when an assertion failed.
        clean_test_file()


def test_missing_config_file():
    """Check that a missing config file makes the listing creation fail loudly.

    Works on a scratch copy of the sample folder in which the
    SAMPLE_MISSING_CONFIG_SUBFOLDERNAME sub-folder of the config folder has
    been deleted, and expects create_new_pretrained_model_listing to raise a
    plain Exception whose message contains "Cannot open". The scratch copy and
    the output file are removed before the test and afterwards, whether the
    test passes or fails.
    """
    clean_test_file()
    clean_samples_folder_copy()

    try:
        copy_samples_folder()
        shutil.rmtree(
            os.path.join(
                SAMPLE_FOLDER_COPY,
                CONFIG_FOLDERNAME,
                SAMPLE_MISSING_CONFIG_SUBFOLDERNAME,
            )
        )

        with pytest.raises(Exception) as exc_info:
            create_new_pretrained_model_listing(
                os.path.join(SAMPLE_FOLDER_COPY, CONFIG_PATHS_TXT_FILENAME),
                os.path.join(SAMPLE_FOLDER_COPY, CONFIG_FOLDERNAME),
                pretrained_model_listing_json_filepath=TEST_FILE,
            )
        # These checks must sit after the `with` block: statements placed
        # inside it after the raising call would never execute.
        assert exc_info.type is Exception
        assert "Cannot open" in str(exc_info.value)
    finally:
        # Do not leave the scratch folder or output file behind on failure.
        clean_test_file()
        clean_samples_folder_copy()


# NOTE: these calls also execute at import time, i.e. when the module is
# loaded and before any test function runs -- not after the tests finish.
# Cleanup after a test has to happen inside the test itself.
clean_samples_folder_copy()
clean_test_file()
58 changes: 31 additions & 27 deletions utils/model_uploader/update_pretrained_model_listing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,53 +27,55 @@


def get_sentence_transformer_model_description(
config_folder_name: str, config_filepath: str
config_folderpath: str, config_filepath: str
) -> Optional[str]:
"""
Get description of the pretrained sentence transformer model from config file
:param config_folder_name: Name of the local folder that stores config files (e.g. 'config_folder')
:type config_folder_name: string
:param config_folderpath: Path to the folder that stores copies of config files from S3 (e.g. 'config_folder')
:type config_folderpath: string
:param config_filepath: Path to local config file
(e.g. 'sentence-transformers/all-MiniLM-L12-v2/2.0.0/onnx/config.json')
:type config_filepath: string
:return: Description of the model
:rtype: string or None
"""
filepath = os.path.join(config_folder_name, config_filepath)
filepath = os.path.join(config_folderpath, config_filepath)
try:
with open(filepath, "r") as f:
model_config = json.load(f)
except Exception as e:
print(f"Cannot open {filepath} to get model description: {e}")
return None
raise Exception(f"Cannot open {filepath} to get model description: {e}")
if "description" in model_config:
return model_config["description"]
else:
return None


def create_new_pretrained_model_listing(
config_paths_txt_filename: str, config_foldername: str
config_paths_txt_filepath: str,
config_folderpath: str,
pretrained_model_listing_json_filepath: str = PRETRAINED_MODEL_LISTING_JSON_FILEPATH,
):
"""
Create a new pretrained model listing and store it at PRETRAINED_MODEL_LISTING_JSON_FILEPATH
based on current models in config_paths_txt_filename and their config files in config_foldername
Create a new pretrained model listing and store it at pretrained_model_listing_json_filepath
based on current models in config_paths_txt_filepath and their config files in config_folderpath
:param config_paths_txt_filename: Name of the txt file that stores paths to config file
:param config_paths_txt_filepath: Path to the txt file that stores a list of config paths from S3
in the ml-models/huggingface/ folder of the S3 bucket
:type config_paths_txt_filename: string
:param config_foldername: Name of the local folder that stores config files
:type config_foldername: string
:type config_paths_txt_filepath: string
:param config_folderpath: Path to the folder that stores copies of config files from S3
:type config_folderpath: string
:return: No return value expected
:param pretrained_model_listing_json_filepath: Path to the json file that stores new model listing
:rtype: None
"""
print("\n=== Begin running update_pretrained_model_listing.py ===")
print(f"--- Reading {config_paths_txt_filename} ---")
with open(config_paths_txt_filename, "r") as f:
print(f"--- Reading {config_paths_txt_filepath} ---")
with open(config_paths_txt_filepath, "r") as f:
config_paths_lst = f.read().split()

print("--- Creating New Model Listing --- ")
print("\n--- Creating New Model Listing --- ")
new_model_listing_dict = {}
for config_filepath in config_paths_lst:
if config_filepath.startswith(PREFIX_HUGGINGFACE_MODEL_FILEPATH):
Expand All @@ -96,7 +98,7 @@ def create_new_pretrained_model_listing(
versions_content[model_version]["format"].append(model_format)
if "description" not in versions_content[model_version]:
description = get_sentence_transformer_model_description(
config_foldername, local_config_filepath
config_folderpath, local_config_filepath
)
if description is not None:
versions_content[model_version]["description"] = description
Expand All @@ -107,34 +109,36 @@ def create_new_pretrained_model_listing(
model_dict["versions"] = dict(sorted(model_dict["versions"].items()))

print(
f"--- Dumping New Model Listing in {PRETRAINED_MODEL_LISTING_JSON_FILEPATH} --- "
f"\n--- Dumping New Model Listing in {pretrained_model_listing_json_filepath} --- "
)
if not os.path.isdir(JSON_DIRNAME):
os.makedirs(JSON_DIRNAME)
with open(PRETRAINED_MODEL_LISTING_JSON_FILEPATH, "w") as f:
with open(pretrained_model_listing_json_filepath, "w") as f:
json.dump(new_model_listing_lst, f, indent=2)
print("\n=== Finished running update_pretrained_model_listing.py ===")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"config_paths_txt_filename",
"config_paths_txt_filepath",
type=str,
help="Name of the file that stores config paths in S3",
help="Path to the txt file that stores a list of config paths from S3",
)
parser.add_argument(
"config_foldername",
"config_folderpath",
type=str,
help="Name of the local folder that stores copies of config files from S3",
help="Path to the folder that stores copies of config files from S3",
)

args = parser.parse_args()

if not args.config_paths_txt_filename.endswith(".txt"):
assert False, "Invalid arguments"
if not args.config_paths_txt_filepath.endswith(".txt"):
raise Exception(
"Invalid argument: config_paths_txt_filepath should be .txt file"
)

create_new_pretrained_model_listing(
args.config_paths_txt_filename,
args.config_foldername,
args.config_paths_txt_filepath,
args.config_folderpath,
)

0 comments on commit bc4cc78

Please sign in to comment.