Skip to content

Commit

Permalink
Merge final-pr-1.0-model
Browse files Browse the repository at this point in the history
Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>
  • Loading branch information
thanawan-atc committed Aug 9, 2023
2 parents 7662b0e + 95d266a commit 0f42710
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 43 deletions.
7 changes: 4 additions & 3 deletions .ci/run-repository.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then
echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m"
echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m"
echo -e "\033[34;1mINFO:\033[0m TRACING_FORMAT: ${TRACING_FORMAT}\033[0m"
echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-Default}\033[0m"
echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-Default}\033[0m"
echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-Default}\033[0m"
echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m"
echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m"
echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m"


docker run \
--network=${network_name} \
Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/model_uploader.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,15 @@ jobs:
- Model ID: ${{ github.event.inputs.model_id }}
- Model Version: ${{ github.event.inputs.model_version }}
- Tracing Format: ${{ github.event.inputs.tracing_format }}
- Embedding Dimension: ${embedding_dimension:-N/A}
- Pooling Mode: ${pooling_mode:-N/A}
- Model Description: ${model_description:-N/A}

======== Workflow Output Information =========
- Embedding Verification: Passed"
Expand Down Expand Up @@ -316,8 +322,8 @@ jobs:
${{ github.event.inputs.tracing_format }} \
-ed ${{ github.event.inputs.embedding_dimension }} \
-pm ${{ github.event.inputs.pooling_mode }} \
-md ${model_description:+"$model_description"} \
-u ${{ github.actor }} -t "${{ needs.model-uploading.outputs.upload_time }}"
-id ${{ github.run_id }} -u ${{ github.actor }} \
-t "${{ needs.model-uploading.outputs.upload_time }}"
- name: Create PR Body
id: create_pr_body
run: |
Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
# CHANGELOG
Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [1.2.0]

### Added

### Changed

### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))

## [1.1.0]

### Added
Expand Down
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def trace(session):
"1500",
)
session.install(".")

session.run(
"python",
"utils/model_uploader/model_autotracing.py",
Expand Down
4 changes: 2 additions & 2 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1032,11 +1032,11 @@ def _get_model_description_from_readme_file(self, readme_file_path) -> str:
readme_data = MarkDownFile.read_file(readme_file_path)

# Find the description section
start_str = f"# {self.model_id}"
start_str = f"\n# {self.model_id}"
start = readme_data.find(start_str)
if start == -1:
model_name = self.model_id.split("/")[1]
start_str = f"# {model_name}"
start_str = f"\n# {model_name}"
start = readme_data.find(start_str)
end = readme_data.find("\n#", start + len(start_str))

Expand Down
17 changes: 9 additions & 8 deletions utils/model_uploader/model_autotracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,20 @@

import argparse
import json
import numpy as np
import os
import shutil
import sys
import warnings
from typing import List, Optional, Tuple
from zipfile import ZipFile

import numpy as np
from mdutils.fileutils import MarkDownFile
from numpy.typing import DTypeLike
from sentence_transformers import SentenceTransformer
from typing import List, Optional, Tuple
from zipfile import ZipFile

# We need to append ROOT_DIR path so that we can import
# OPENSEARCH_TEST_CLIENT and opensearch_py_ml since this
# We need to append ROOT_DIR path so that we can import
# OPENSEARCH_TEST_CLIENT and opensearch_py_ml since this
# python script is not in the root directory.
THIS_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.join(THIS_DIR, "../..")
Expand Down Expand Up @@ -429,12 +430,12 @@ def main(
print("Tracing Format: ", tracing_format)
print(
"Embedding Dimension: ",
embedding_dimension if embedding_dimension is not None else "Default",
embedding_dimension if embedding_dimension is not None else "N/A",
)
print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "Default")
print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "N/A")
print(
"Model Description: ",
model_description if model_description is not None else "Default",
model_description if model_description is not None else "N/A",
)
print("==========================================")

Expand Down
75 changes: 62 additions & 13 deletions utils/model_uploader/update_changelog_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# This program is run by "Model Auto-tracing & Uploading" workflow
# (See model_uploader.yml) to update CHANGELOG.md after uploading the model
# to our model hub.
# This program is run by "Model Auto-tracing & Uploading"
# & "Model Listing Uploading" workflow (See model_uploader.yml
# & model_listing_uploader.yml) to update CHANGELOG.md after
# uploading the model to our model hub.

import argparse
from mdutils.fileutils import MarkDownFile

CHANGELOG_DIRNAME = "."
CHANGELOG_FILENAME = "CHANGELOG.md"
SECTION_NAME = "Changed"
SUBSECTION_NAME = "Changed"
PREV_SUBSECTION_NAME = "Added"


def update_changelog_file(
Expand All @@ -30,14 +32,24 @@ def update_changelog_file(
"""
changelog_data = MarkDownFile.read_file(f"{CHANGELOG_DIRNAME}/{CHANGELOG_FILENAME}")

this_version_ptr = changelog_data.find("## [")
assert this_version_ptr != -1, "Cannot find a version section in the CHANGELOG.md"
next_version_ptr = changelog_data.find("## [", this_version_ptr + 1)
# Find the most recent version section and pull it out
this_version_ptr = changelog_data.find("\n## ") + 1
assert this_version_ptr != 0, "Cannot find a version section in the CHANGELOG.md"
next_version_ptr = changelog_data.find("\n## ", this_version_ptr + 1) + 1
if next_version_ptr == 0:
next_version_ptr = -1
this_version_section = changelog_data[this_version_ptr:next_version_ptr]

this_subsection_ptr = this_version_section.find(f"### {SECTION_NAME}")
if this_subsection_ptr != -1:
next_subsection_ptr = this_version_section.find("### ", this_subsection_ptr + 1)
# Find the sub-section SUBSECTION_NAME
this_subsection_ptr = this_version_section.find(f"\n### {SUBSECTION_NAME}") + 1
if this_subsection_ptr != 0:
# Case 1: Section SUBSECTION_NAME exists
# Append a change_log line to the end of that subsection if it exists
next_subsection_ptr = (
this_version_section.find("\n### ", this_subsection_ptr + 1) + 1
)
if next_subsection_ptr == 0:
next_subsection_ptr = -1
this_subsection = this_version_section[
this_subsection_ptr:next_subsection_ptr
].strip()
Expand All @@ -48,10 +60,47 @@ def update_changelog_file(
+ this_version_section[next_subsection_ptr:]
)
else:
this_subsection = this_version_section.strip()
this_subsection += "\n\n" + f"### {SECTION_NAME}\n- " + changelog_line + "\n\n"
new_version_section = this_subsection
# Case 2: Sub-section SUBSECTION_NAME does not exist
# Create sub-section SUBSECTION_NAME and add a change_log line
this_subsection = f"### {SUBSECTION_NAME}\n- {changelog_line}\n\n"
prev_subsection_ptr = (
this_version_section.find(f"\n### {PREV_SUBSECTION_NAME}") + 1
)
if prev_subsection_ptr != 0:
# Case 2.1: Sub-section PREV_SUBSECTION_NAME exist
# Add a sub-section SUBSECTION_NAME after PREV_SUBSECTION_NAME if PREV_SUBSECTION_NAME exists
next_subsection_ptr = (
this_version_section.find("\n### ", prev_subsection_ptr + 1) + 1
)
prev_subsection = this_version_section[
prev_subsection_ptr:next_subsection_ptr
].strip()
new_version_section = (
this_version_section[:prev_subsection_ptr]
+ prev_subsection
+ "\n\n"
+ this_subsection
+ this_version_section[next_subsection_ptr:]
)
else:
# Case 2.2: Sub-section PREV_SUBSECTION_NAME does not exist
next_subsection_ptr = this_version_section.find("\n### ") + 1
if next_subsection_ptr != 0:
# Case 2.2.1: There exists other sub-section in this version section
# Add a sub-section SUBSECTION_NAME before other sub-sections
new_version_section = (
this_version_section[:next_subsection_ptr]
+ this_subsection
+ this_version_section[next_subsection_ptr:]
)
else:
# Case 2.2.2: There isn't any other sub-section in this version section
# Add a sub-section SUBSECTION_NAME after version headline
new_version_section = (
this_version_section.strip() + "\n\n" + this_subsection
)

# Insert new_version_section back to the document
new_changelog_data = (
changelog_data[:this_version_ptr]
+ new_version_section
Expand Down
37 changes: 23 additions & 14 deletions utils/model_uploader/update_models_upload_history_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import json
import os
from typing import Dict, List, Optional

from mdutils.fileutils import MarkDownFile
from mdutils.tools.Table import Table

Expand All @@ -32,7 +33,7 @@
"Model Format",
"Embedding Dimension",
"Pooling Mode",
"Model Description",
"Workflow Run ID",
]
MD_HEADER = "# Pretrained Model Upload History\n\nThe model-serving framework supports a variety of open-source pretrained models that can assist with a range of machine learning (ML) search and analytics use cases. \n\n\n## Uploaded Pretrained Models\n\n\n### Sentence transformers\n\nSentence transformer models map sentences and paragraphs across a dimensional dense vector space. The number of vectors depends on the model. Use these models for use cases such as clustering and semantic search. \n\nThe following table shows sentence transformer model upload history.\n\n[//]: # (This may be the most platform independent comment)\n"

Expand All @@ -43,7 +44,7 @@ def create_model_json_obj(
model_format: str,
embedding_dimension: Optional[int] = None,
pooling_mode: Optional[str] = None,
model_description: Optional[str] = None,
workflow_id: Optional[str] = None,
model_uploader: Optional[str] = None,
upload_time: Optional[str] = None,
) -> Dict:
Expand All @@ -60,8 +61,13 @@ def create_model_json_obj(
:type embedding_dimension: int
:param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
:type pooling_mode: string
:param workflow_id: Workflow run id
:type workflow_id: string
:param model_uploader: Model uploader input
:type model_uploader: string
:param upload_time: Upload time input
Expand All @@ -77,11 +83,9 @@ def create_model_json_obj(
"Model Format": model_format,
"Embedding Dimension": str(embedding_dimension)
if embedding_dimension is not None
else "Default",
"Pooling Mode": pooling_mode if pooling_mode is not None else "Default",
"Model Description": model_description
if model_description is not None
else "Default",
else "N/A",
"Pooling Mode": pooling_mode if pooling_mode is not None else "N/A",
"Workflow Run ID": workflow_id if workflow_id is not None else "-"
}
return model_obj

Expand Down Expand Up @@ -113,7 +117,7 @@ def update_model_json_file(
tracing_format: str,
embedding_dimension: Optional[int] = None,
pooling_mode: Optional[str] = None,
model_description: Optional[str] = None,
workflow_id: Optional[str] = None,
model_uploader: Optional[str] = None,
upload_time: Optional[str] = None,
) -> None:
Expand All @@ -130,8 +134,13 @@ def update_model_json_file(
:type embedding_dimension: int
:param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
:type pooling_mode: string
:param workflow_id: Workflow run id
:type workflow_id: string
:param model_uploader: Model uploader input
:type model_uploader: string
:param upload_time: Upload time input
Expand All @@ -153,7 +162,7 @@ def update_model_json_file(
TORCH_SCRIPT_FORMAT,
embedding_dimension,
pooling_mode,
model_description,
workflow_id,
model_uploader,
upload_time,
)
Expand All @@ -166,7 +175,7 @@ def update_model_json_file(
ONNX_FORMAT,
embedding_dimension,
pooling_mode,
model_description,
workflow_id,
model_uploader,
upload_time,
)
Expand Down Expand Up @@ -242,13 +251,13 @@ def update_md_file():
help="Pooling mode if it does not exist in original config.json",
)
parser.add_argument(
"-md",
"--model_description",
"-id",
"--workflow_id",
type=str,
nargs="?",
default=None,
const=None,
help="Model description if you want to overwrite the default description",
help="Workflow Run ID",
)
parser.add_argument(
"-u",
Expand Down Expand Up @@ -276,7 +285,7 @@ def update_md_file():
args.tracing_format,
args.embedding_dimension,
args.pooling_mode,
args.model_description,
args.workflow_id,
args.model_uploader,
args.upload_time,
)
Expand Down

0 comments on commit 0f42710

Please sign in to comment.