Merge pull request #2 from mlcommons/master
Merge with latest mlcommons master
mnmehta committed Jan 30, 2024
2 parents 207a29b + 3ad8534 commit cb93b59
Showing 6 changed files with 33 additions and 47 deletions.
18 changes: 17 additions & 1 deletion README.md
@@ -15,7 +15,23 @@ Please see the [MLPerf Inference benchmark paper](https://arxiv.org/abs/1911.025
```
## MLPerf Inference v4.0 (submission deadline February 23, 2024)

-Code freeze coming soon...
+An extra one-week extension is allowed only for llama2-70b submissions. For submissions, please use the master branch at any commit since the [4.0 seed release](https://github.com/mlcommons/inference/commit/8e36925bd36a503e39fcbbc488e9e46126f079ed), although the latest commit is preferred. The v4.0 tag will be created from the master branch after the results are published.

For power submissions, please use [SPEC PTD 1.10](https://github.com/mlcommons/power/tree/main/inference_v1.0) (requires special access) and any commit of the power-dev repository after the [code freeze](https://github.com/mlcommons/power-dev/commit/4e026f43481f46ad57d2464d28924018444b0428).

| model | reference app | framework | dataset | category |
| ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | tensorflow, onnx, tvm, ncnn | imagenet2012 | edge,datacenter |
| retinanet 800x800 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | pytorch, onnx | openimages resized to 800x800 | edge,datacenter |
| bert | [language/bert](https://github.com/mlcommons/inference/tree/master/language/bert) | tensorflow, pytorch, onnx | squad-1.1 | edge,datacenter |
| dlrm-v2 | [recommendation/dlrm_v2](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorch) | pytorch | Multihot Criteo Terabyte | datacenter |
| 3d-unet | [vision/medical_imaging/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 | edge,datacenter |
| rnnt | [speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/master/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus | edge,datacenter |
| gpt-j | [language/gpt-j](https://github.com/mlcommons/inference/tree/master/language/gpt-j) | pytorch | CNN-Daily Mail | edge,datacenter |
| stable-diffusion-xl | [text_to_image](https://github.com/mlcommons/inference/tree/master/text_to_image) | pytorch | COCO 2014 | edge,datacenter |
| llama2-70b | [language/llama2-70b](https://github.com/mlcommons/inference/tree/master/language/llama2-70b) | pytorch | OpenOrca | datacenter |

* The framework listed is that of the reference implementation; submitters are free to use their own frameworks to run the benchmark.

## MLPerf Inference v3.1 (submission deadline August 18, 2023)
Please use [v3.1 tag](https://github.com/mlcommons/inference/releases/tag/v3.1) (```git checkout v3.1```) if you would like to reproduce the v3.1 results.
12 changes: 6 additions & 6 deletions language/llama2-70b/README.md
@@ -195,15 +195,15 @@ if [ -e ${ACCURACY_LOG_FILE} ]; then
fi
```

-The ServerSUT was not tested for GPU runs. You can try setting `--device cuda:0`, but YMMV.
+The ServerSUT was not tested for GPU runs.


## Accuracy Target
Running the GPU implementation in FP32 precision resulted in the following FP32 accuracy targets (ROUGE scores normalized from a 0.0-1.0 scale to a 0-100 scale):
-- Rouge1: 43.88
-- Rouge2: 21.7108
-- RougeL: 28.2502
-- RougeLsum: 41.4821
+- Rouge1: 44.4312
+- Rouge2: 22.0352
+- RougeL: 28.6162
+- Tokens per sample: 294.45

-This was run an 8xH100 node. Total runtime was ~4.5 days.
+This was run on a DGX-H100 node. Total runtime was ~4.5 days.
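As a hedged illustration of how these targets become pass/fail thresholds, here is a small sketch mirroring the submission checker's 99% and 99.9% multipliers on the ROUGE targets and its fixed 90% floor on tokens per sample; the function names are hypothetical, not part of the repository:

```python
# Illustrative sketch (hypothetical helper names): turn the published
# FP32 targets into minimum accepted scores per accuracy tier.
FP32_TARGETS = {
    "ROUGE1": 44.4312,
    "ROUGE2": 22.0352,
    "ROUGEL": 28.6162,
}
TOKENS_PER_SAMPLE = 294.45

def thresholds(tier: float) -> dict:
    """Minimum accepted score per metric for a tier (0.99 or 0.999)."""
    t = {name: target * tier for name, target in FP32_TARGETS.items()}
    # Token count uses a fixed 90% floor rather than the tier multiplier.
    t["TOKENS_PER_SAMPLE"] = TOKENS_PER_SAMPLE * 0.9
    return t

def passes(scores: dict, tier: float = 0.99) -> bool:
    """True if every measured metric meets its threshold."""
    req = thresholds(tier)
    return all(scores.get(k, 0.0) >= v for k, v in req.items())
```

A submission matching the FP32 reference exactly passes even the 99.9% tier, while a ROUGE1 regression below 99% of the target fails.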
4 changes: 2 additions & 2 deletions text_to_image/coco.py
@@ -174,9 +174,9 @@ def save_images(self, ids, ds):
for id in ids:
    caption = ds.get_caption(id)
    generated = Image.fromarray(self.results[idx[id]])
-    image_path_tmp = f"images/{self.content_ids[id]}.png"
+    image_path_tmp = f"images/{self.content_ids[idx[id]]}.png"
    generated.save(image_path_tmp)
-    info.append((self.content_ids[id], caption))
+    info.append((self.content_ids[idx[id]], caption))
with open("images/captions.txt", "w+") as f:
    for id, caption in info:
        f.write(f"{id} {caption}\n")
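A toy sketch of why the lookup must go through `idx` (the names below are illustrative stand-ins, not the benchmark's actual classes): results and content ids are stored in run order, while the ids passed in are dataset ids, so both lookups need the id-to-position map.

```python
# Hypothetical miniature of the fix: `idx` maps a dataset id to the
# position where that sample's result and content id were stored.
content_ids = ["c42", "c7"]       # stored in run order
results = ["img_a", "img_b"]      # same order as content_ids
idx = {42: 0, 7: 1}               # dataset id -> storage position

def caption_info(ids):
    info = []
    for i in ids:
        # Indexing content_ids[i] directly (the old code) would use the
        # raw dataset id as a list index, picking the wrong entry or
        # raising IndexError; routing through idx[i] is the fix.
        info.append((content_ids[idx[i]], results[idx[i]]))
    return info
```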
2 changes: 2 additions & 0 deletions tools/submission/power/power_checker.py
@@ -408,6 +408,8 @@ def get_avg_power(power_path: str, run_path: str) -> Tuple[float, float]:

with open(spl_fname) as f:
    for line in f:
+        if not line.startswith("Time"):
+            continue
        timestamp = (
            datetime.strptime(line.split(",")[1], datetime_format)
        ).replace(tzinfo=timezone.utc)
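A self-contained sketch of the guard added above: power logs can contain lines that are not power samples, so only lines beginning with `Time` are parsed, which keeps `strptime` from crashing on headers or status lines. The exact log format string here is an assumption for illustration:

```python
from datetime import datetime, timezone

# Assumed sample-line layout: "Time,<timestamp>,Watts,<value>,..."
DATETIME_FORMAT = "%m-%d-%Y %H:%M:%S.%f"  # illustrative format string

def parse_timestamps(lines):
    """Collect UTC timestamps from power-sample lines, skipping the rest."""
    stamps = []
    for line in lines:
        if not line.startswith("Time"):
            continue  # non-sample line: skip instead of crashing strptime
        raw = line.split(",")[1]
        stamps.append(
            datetime.strptime(raw, DATETIME_FORMAT).replace(tzinfo=timezone.utc)
        )
    return stamps
```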
38 changes: 2 additions & 36 deletions tools/submission/power/sources_checksums.json
@@ -1,38 +1,4 @@
[
{
"server.py": "c3f90f2f7eeb4db30727556d0c815ebc89b3d28b",
"__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"client.py": "33ca4f26368777ac06e01f9567b714a4b8063886",
"tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a",
"tests/unit/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671",
"lib/time_sync.py": "3210db56eb0ff0df57bf4293dc4d4b03fffd46f1",
"lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18",
"lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df",
"lib/server.py": "8054263a14dedddcf8e1c01adc19596c21bad591",
"lib/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"lib/summary.py": "aa92f0a3f975eecd44d3c0cd0236342ccc9f941d",
"lib/client.py": "c146491755e219a28d440b31f83998dbd5532483",
"lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d",
"lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709"
},
{
"__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"client.py": "33ca4f26368777ac06e01f9567b714a4b8063886",
"lib/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"lib/client.py": "ac2aa093c8e8bbc9569b9e2a3471bc64e58a2258",
"lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df",
"lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d",
"lib/server.py": "c7af63c31bb2fbedea4345f571f6e3507d268ada",
"lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18",
"lib/summary.py": "aa92f0a3f975eecd44d3c0cd0236342ccc9f941d",
"lib/time_sync.py": "122eba67a9abc85635223e054def53be1367ade2",
"server.py": "c3f90f2f7eeb4db30727556d0c815ebc89b3d28b",
"tests/unit/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671",
"tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a"
},
{
"__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"client.py": "33ca4f26368777ac06e01f9567b714a4b8063886",
@@ -41,7 +7,7 @@
"lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df",
"lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d",
-"lib/server.py": "c7af63c31bb2fbedea4345f571f6e3507d268ada",
+"lib/server.py": "99303c836c683aa9017ec565104e636161d02acb",
"lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18",
"lib/summary.py": "aa92f0a3f975eecd44d3c0cd0236342ccc9f941d",
"lib/time_sync.py": "80894ef2389e540781ff78de94db16aa4203a14e",
@@ -50,4 +16,4 @@
"tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671",
"tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a"
}
]
6 changes: 4 additions & 2 deletions tools/submission/submission_checker.py
@@ -1124,8 +1124,8 @@
"3d-unet-99.9": ("DICE", 0.86170 * 0.999),
"gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9),
"gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9),
-"llama2-70b-99" : ("ROUGE1", 43.88 * 0.99, "ROUGE2", 21.7108 * 0.99, "ROUGEL", 28.2502 * 0.99, "TOKENS_PER_SAMPLE", 293.3*0.9),
-"llama2-70b-99.9" : ("ROUGE1", 43.88 * 0.999, "ROUGE2", 21.7108 * 0.999, "ROUGEL", 28.2502 * 0.999, "TOKENS_PER_SAMPLE", 293.3*0.9),
+"llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9),
+"llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9),
"stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758)
},
"accuracy-upper-limit": {
@@ -1785,6 +1785,8 @@ def check_extra_files(path, target_files):
if target_file not in files:
    check_pass = False
    missing_files.append(f"{os.path.join(path, dir, target_file)}.png")
+if "captions" not in files:
+    missing_files.append(f"{os.path.join(path, dir, 'captions.txt')}")
return check_pass, missing_files
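A simplified, hypothetical stand-in for the check above (flat file names instead of the real `os.path.join` paths): every expected id must have a generated `.png`, and the newly added lines also report a missing captions file.

```python
def check_extra_files(files, target_files):
    """files: names present in a results dir; target_files: expected ids.

    Simplified sketch of the submission-checker logic, not the real
    implementation: paths are flattened to bare names for illustration.
    """
    check_pass = True
    missing_files = []
    for target_file in target_files:
        if target_file not in files:
            check_pass = False
            missing_files.append(f"{target_file}.png")
    # The added check: a missing captions entry is recorded in
    # missing_files (the diff does not flip check_pass for it).
    if "captions" not in files:
        missing_files.append("captions.txt")
    return check_pass, missing_files
```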


