Fix sdxl submitter (#46)
* Add support for sdxl to generate_tables

* Update code_axs.py to fix getting accuracy_report

* Rename generate_tables to generate_table, fix order of fetching FID_SCORE and CLIP_SCORE from sdxl accuracy experiments

* Remove debug print statement.

* Display sdxl accuracy ranges as target for sdxl
sahelib25 authored Jun 25, 2024
1 parent 7367028 commit 94e88c3
Showing 1 changed file with 54 additions and 9 deletions.
63 changes: 54 additions & 9 deletions submitter/code_axs.py
@@ -515,7 +515,7 @@ def copy_readmes_for_code(experiment_entries, division, submitter, submitted_tre
             print(f" NOT Copying: {file_to_copy_source_path} --> {code_model_program_path}", file=sys.stderr)


-def generate_tables(experiment_entries, division, submitter, power, __entry__):
+def generate_table(experiment_entries, division, submitter, power, __entry__):

     col_names = ["SUT", "Scenario", "Mode / Compliance?", "Status", "Target metric", "Actual metric", "Power", "Efficiency"]
     table_data = []
@@ -542,7 +542,8 @@ def generate_tables(experiment_entries, division, submitter, power, __entry__):
         target_qps = experiment_entry.get("loadgen_target_qps")
         target_latency = experiment_entry.get("loadgen_target_latency")
         compliance_test_name = experiment_entry.get('loadgen_compliance_test')
-        accuracy_metric = experiment_entry.get("accuracy_report")
+        if mode == "Accuracy":
+            accuracy_metric = experiment_entry.get("accuracy_report")

         # Function to extract the actual performance metric
         def get_samples_per_second(file_path):
@@ -601,6 +602,18 @@ def extract_map(accuracy_metric):
                 map_value = map_part.split('%')[0].strip()
                 return map_value
             return "mAP value not found"
+
+        def extract_accuracy_sdxl(accuracy_metric):
+            if accuracy_metric is not None and "\'FID_SCORE\'" in accuracy_metric and "\'CLIP_SCORE\'" in accuracy_metric:
+                fid_score_part = accuracy_metric.split('\'FID_SCORE\':')[1]
+                fid_score_value = fid_score_part.split(',')[0].strip()
+
+                clip_score_part = accuracy_metric.split('\'CLIP_SCORE\':')[1]
+                clip_score_value = clip_score_part.split('}')[0].strip()
+
+                return float(fid_score_value), float(clip_score_value)
+            return "Scores not found."
+

         if power and "power_loadgen_output" in experiment_entry["tags"]:
             target_entry = get_testing_entry(experiment_entry)
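The new extract_accuracy_sdxl helper is plain string surgery over the report, so it assumes a dict-shaped string containing both keys. A minimal usage sketch, assuming the helper above is in scope (the report value below is made up for illustration):

    sample_report = "{'FID_SCORE': 23.4212, 'CLIP_SCORE': 31.7421}"  # hypothetical report string
    fid, clip = extract_accuracy_sdxl(sample_report)
    # fid -> 23.4212, clip -> 31.7421
    # A report missing either key returns the string "Scores not found." instead of a tuple.

Note the mixed return type (tuple on success, str on failure); the isinstance check added later in this commit guards against exactly that.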
@@ -615,17 +628,26 @@ def extract_map(accuracy_metric):
                 "resnet50": round(76.46 * 0.99, 3),
                 "retinanet": round(37.55 * 0.99, 3),
                 "bert-99": round(90.874 * 0.99, 3),
-                "bert-99.9": round(90.874 * 0.999, 3)
+                "bert-99.9": round(90.874 * 0.999, 3),
+                "stable-diffusion-xl": ("FID_SCORE", 23.01085758, "CLIP_SCORE", 31.68631873)
             }

             # Actual accuracy for workloads
             actual_accuracy = {
                 "resnet50": extract_accuracy_ic(accuracy_metric),
                 "retinanet": extract_map(accuracy_metric),
                 "bert-99": extract_accuracy_bert(accuracy_metric),
-                "bert-99.9": extract_accuracy_bert(accuracy_metric)
+                "bert-99.9": extract_accuracy_bert(accuracy_metric),
+                "stable-diffusion-xl": extract_accuracy_sdxl(accuracy_metric)
             }

+            # Accuracy upper limit
+            accuracy_upper_limit = {
+                "stable-diffusion-xl": ("FID_SCORE", 23.95007626, "CLIP_SCORE", 31.81331801)
+            }
+
+            target_acc = target_accuracy[model_name]
+            actual_acc = actual_accuracy[model_name]

         if "power_loadgen_output" in experiment_entry["tags"] and power:
             power_experiment_entry = experiment_entry
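A note on the shape of these entries: the SDXL values are flat 4-tuples interleaving labels and numbers, so the numeric bounds sit at indices 1 and 3, which is how the comparison code below reads them. A small sketch of that access pattern:

    target_acc = ("FID_SCORE", 23.01085758, "CLIP_SCORE", 31.68631873)
    target_fid_score = target_acc[1]   # 23.01085758
    target_clip_score = target_acc[3]  # 31.68631873

A dict keyed by metric name would arguably be less index-fragile; the flat tuple is simply the layout this commit uses.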
@@ -644,11 +666,34 @@ def extract_map(accuracy_metric):
                 status = get_result_status(mlperf_log_path)

             else:
-                if (target_accuracy[model_name]) <= float(actual_accuracy[model_name]):
-                    status = "VALID"
-                actual_metric = actual_accuracy[model_name]
-                target = target_accuracy[model_name]
-                energy_eff = "N/A"
+                if model_name == "stable-diffusion-xl":
+                    target_fid_score = target_acc[1]
+                    target_clip_score = target_acc[3]
+                    upper_fid_score = accuracy_upper_limit[model_name][1]
+                    upper_clip_score = accuracy_upper_limit[model_name][3]
+                    # Extract actual values
+                    if isinstance(actual_acc, tuple) and len(actual_acc) == 2:
+                        actual_fid_score, actual_clip_score = actual_acc
+                    else:
+                        raise ValueError("Invalid format for actual accuracy values")
+
+                    # Compare values within the range
+                    if target_fid_score <= actual_fid_score <= upper_fid_score and target_clip_score <= actual_clip_score <= upper_clip_score:
+                        status = "VALID"
+                    else:
+                        status = "INVALID"
+
+                    actual_metric = f"FID_SCORE: {actual_fid_score}\nCLIP_SCORE: {actual_clip_score}"
+                    target = f"FID_SCORE range: [{target_fid_score}, {upper_fid_score}]\nCLIP_SCORE range: [{target_clip_score}, {upper_clip_score}]"
+
+                else:
+                    if (float(actual_acc) >= target_acc):
+                        status = "VALID"
+                    else:
+                        status = "INVALID"
+                    actual_metric = actual_acc
+                    target = target_acc
+                energy_eff = "N/A"

         if scenario in ["Offline", "Server"] and mode.lower() == "performance" and not power:
             target = target_qps
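Putting the two tables together: an SDXL accuracy run is VALID only when both measured scores land inside their closed intervals. A worked example with hypothetical measured values:

    # Bounds taken from target_accuracy and accuracy_upper_limit above
    fid, clip = 23.42, 31.75  # hypothetical measured scores
    valid = (23.01085758 <= fid <= 23.95007626) and (31.68631873 <= clip <= 31.81331801)
    # valid -> True, so status would be "VALID"

For every other model the check stays one-sided: the measured value only has to meet or exceed the 99% (or 99.9%) target.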
