LLM detokeniser (#44)

* Added detokenisation to llama2 experiments
* Removed redundant parent
* Switched endianness to little
* Fixed transformers loading
krai · Jun 25, 2024 · 2445b09 · 2445b09
1 parent d94e5f7
commit 2445b09
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 3 deletions.
diff --git a/base_llama2_loadgen_experiment/code_axs.py b/base_llama2_loadgen_experiment/code_axs.py
@@ -1,7 +1,32 @@
+import json
+
+from transformers import AutoTokenizer
+
+
 def get_accuracy_dict(accuracy_dict_full):
     accuracy_dict = {}
     for k in accuracy_dict_full.keys():
-        if k in [ "rouge1", "rouge2", "rougeL", "tokens_per_sample" ]:
+        if k in ["rouge1", "rouge2", "rougeL", "tokens_per_sample"]:  # copy only these four metric keys; every other key is dropped
             accuracy_dict[k] = accuracy_dict_full[k]
     return accuracy_dict
 
+def detokenise(  # Decode the hex-encoded token IDs in a JSON log into text and write the decoded strings to a new JSON file.
+    checkpoint_path: str, tokenised_accuracy_log_path: str, output_log_path: str  # tokenizer source, input JSON log, output JSON file
+):
+    tokeniser = AutoTokenizer.from_pretrained(checkpoint_path)  # tokenizer is loaded from checkpoint_path
+
+    with open(tokenised_accuracy_log_path) as f:  # NOTE(review): no explicit encoding, so the platform default is used
+        log = json.load(f)  # iterated below as a sequence of entries, each read via its "data" key
+
+    output_log = []  # decoded strings, in the same order as the input entries
+    for item in log:
+        hex_str = item["data"]  # hex string holding the entry's token IDs back to back
+        hex_tokens = [hex_str[i : i + 8] for i in range(0, len(hex_str), 8)]  # 8 hex digits = 4 bytes per token; NOTE(review): a shorter trailing chunk is kept as-is, confirm length is always a multiple of 8
+        tokens = [  # one integer token ID per chunk
+            int.from_bytes(bytes.fromhex(tok), byteorder="little") for tok in hex_tokens  # bytes interpreted as an unsigned little-endian integer
+        ]
+        output_log.append(tokeniser.decode(tokens))  # only the decoded text is kept; other fields of the entry are not carried over
+
+    with open(output_log_path, "w") as f:  # overwrites any existing file at output_log_path
+        json.dump(output_log, f, indent=2)  # written as a JSON array of strings
+    return output_log_path  # returns the path that was written, not the decoded data
diff --git a/base_llama2_loadgen_experiment/data_axs.json b/base_llama2_loadgen_experiment/data_axs.json
@@ -1,5 +1,13 @@
 {
-    "_parent_entries": [ [ "^", "byname", "base_loadgen_experiment" ] , [ "^", "byname", "shell" ] ],
+    "_parent_entries": [ [ "^", "byname", "base_loadgen_experiment" ] , [ "^", "byname", "shell" ], [ "^", "byname", "python_in_shell" ] ],
+
+    "transformers_query": [ "python_package", "package_name=transformers", ["desired_python_version", ["^", "kernel_python_major_dot_minor"]] ],
+
+    "_BEFORE_CODE_LOADING": [ "^^", "execute", [[
+        [ "get_kernel" ],
+        [ "byquery", [[ "^^", "get", "transformers_query" ]] ],
+        [ "use" ]
+    ]] ],
 
     "mlperf_inference_git_entry": [ "^", "byquery", "git_repo,repo_name=mlperf_inference_git" ],
 
@@ -68,6 +76,14 @@
     "gen_len": [ "^^" , "dig","accuracy_dict.gen_len" ],
     "gen_num": [ "^^" , "dig","accuracy_dict.gen_num" ],
 
+    "accuracy_range_dict": { "rouge1": [ 43.986888, null ], "rouge2": [ 21.814848, null ], "rougeL": [ 28.330038, null ], "tokens_per_sample": [ 265.005, null ] },
+
+    "abs_path": [ "^^", "get_path" ],
+    "rel_log_path": "mlperf_log_accuracy.json",
+    "tokenised_accuracy_log_path": [ "^^", "substitute", "#{abs_path}#/#{rel_log_path}#" ],
+
+    "rel_output_log_path": "detokenised_mlperf_log.json",
+    "output_log_path": [ "^^", "substitute", "#{abs_path}#/#{rel_output_log_path}#" ],
 
-    "accuracy_range_dict": { "rouge1": [ 43.986888, null ], "rouge2": [ 21.814848, null ], "rougeL": [ 28.330038, null ], "tokens_per_sample": [ 265.005, null ] }
+    "detokenised_log": [ "^^", "detokenise" ]
 }