Skip to content

Commit

Permalink
Llm detokeniser (#44)
Browse files Browse the repository at this point in the history
* Added detokenisation to llama2 experiments

* Removed redundant parent

* Switched endianness to little

* Fixed transformers loading
  • Loading branch information
Akshat-Tripathi authored Jun 25, 2024
1 parent d94e5f7 commit 2445b09
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 3 deletions.
27 changes: 26 additions & 1 deletion base_llama2_loadgen_experiment/code_axs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,32 @@
import json

from transformers import AutoTokenizer


# Metrics kept when filtering a full accuracy report down to the headline numbers.
_ACCURACY_KEYS = frozenset({"rouge1", "rouge2", "rougeL", "tokens_per_sample"})


def get_accuracy_dict(accuracy_dict_full):
    """Return a dict containing only the recognised accuracy metrics.

    Filters *accuracy_dict_full* down to the ROUGE scores and
    tokens_per_sample entries; any other keys are dropped.

    NOTE(review): the source diff left two copies of the membership test
    (one with the old spacing, one reformatted), which made the literal
    text a SyntaxError — this keeps the single intended condition.
    """
    return {
        key: value
        for key, value in accuracy_dict_full.items()
        if key in _ACCURACY_KEYS
    }

def detokenise(
    checkpoint_path: str, tokenised_accuracy_log_path: str, output_log_path: str
):
    """Decode a tokenised MLPerf accuracy log back into text.

    Each entry's "data" field is a hex string holding 4-byte little-endian
    token ids. The tokens are decoded with the tokenizer loaded from
    *checkpoint_path* and the resulting strings are dumped as a JSON list
    to *output_log_path*.

    Returns *output_log_path*.
    """
    tokeniser = AutoTokenizer.from_pretrained(checkpoint_path)

    with open(tokenised_accuracy_log_path) as log_file:
        tokenised_log = json.load(log_file)

    decoded_entries = []
    for entry in tokenised_log:
        data = entry["data"]
        # 8 hex chars == one 4-byte little-endian token id
        token_ids = [
            int.from_bytes(bytes.fromhex(data[pos : pos + 8]), byteorder="little")
            for pos in range(0, len(data), 8)
        ]
        decoded_entries.append(tokeniser.decode(token_ids))

    with open(output_log_path, "w") as out_file:
        json.dump(decoded_entries, out_file, indent=2)
    return output_log_path
20 changes: 18 additions & 2 deletions base_llama2_loadgen_experiment/data_axs.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
{
    "_parent_entries": [ [ "^", "byname", "base_loadgen_experiment" ] , [ "^", "byname", "shell" ], [ "^", "byname", "python_in_shell" ] ],

"transformers_query": [ "python_package", "package_name=transformers", ["desired_python_version", ["^", "kernel_python_major_dot_minor"]] ],

"_BEFORE_CODE_LOADING": [ "^^", "execute", [[
[ "get_kernel" ],
[ "byquery", [[ "^^", "get", "transformers_query" ]] ],
[ "use" ]
]] ],

"mlperf_inference_git_entry": [ "^", "byquery", "git_repo,repo_name=mlperf_inference_git" ],

Expand Down Expand Up @@ -68,6 +76,14 @@
"gen_len": [ "^^" , "dig","accuracy_dict.gen_len" ],
"gen_num": [ "^^" , "dig","accuracy_dict.gen_num" ],

"accuracy_range_dict": { "rouge1": [ 43.986888, null ], "rouge2": [ 21.814848, null ], "rougeL": [ 28.330038, null ], "tokens_per_sample": [ 265.005, null ] },

"abs_path": [ "^^", "get_path" ],
"rel_log_path": "mlperf_log_accuracy.json",
"tokenised_accuracy_log_path": [ "^^", "substitute", "#{abs_path}#/#{rel_log_path}#" ],

"rel_output_log_path": "detokenised_mlperf_log.json",
"output_log_path": [ "^^", "substitute", "#{abs_path}#/#{rel_output_log_path}#" ],

    "detokenised_log": [ "^^", "detokenise" ]
}

0 comments on commit 2445b09

Please sign in to comment.