forked from boostcampaitech7/level2-mrc-nlp-16
-
Notifications
You must be signed in to change notification settings - Fork 0
/
context_dense_embedding.py
82 lines (68 loc) · 2.31 KB
/
context_dense_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import argparse
import json
import os
import pickle

import numpy as np
import torch
import wandb
from transformers import AutoTokenizer

from data_modules.data_sets import ContextDataset
from models.model import RetrievalModel
from utils.embedding import context_embedding
def main(arg):
    """Embed every wikipedia context with a trained retrieval model and cache the result.

    Downloads the model checkpoint from a wandb artifact, encodes all contexts
    from ``data/wikipedia_documents.json``, and pickles the dense embedding
    matrix (float32) to ``data/embedding/context_dense_embedding.bin``.

    Args:
        arg: parsed CLI namespace with ``model_path`` (wandb artifact path),
            ``model_name`` (checkpoint file name inside the artifact) and
            ``batch_size``.
    """
    model_path = arg.model_path  # wandb artifact path where the model is stored
    model_name = arg.model_name  # checkpoint file name inside the artifact
    batch_size = arg.batch_size

    ## context dataset load
    context_path = "data/wikipedia_documents.json"
    with open(context_path, "r", encoding="utf-8") as f:
        contexts = json.load(f)
    contexts = {value["document_id"]: value["text"] for value in contexts.values()}

    ## model/config loading
    wandb.login()
    run = wandb.init()
    try:
        artifact = run.use_artifact(model_path)
        model_dir = artifact.download()
    finally:
        run.finish()  # always close the wandb run, even if the download fails

    with open(f"{model_dir}/config_retrieval.json", "r", encoding="utf-8") as f:
        config = json.load(f)
    tokenizer = AutoTokenizer.from_pretrained(config["MODEL_NAME"])
    retrieval = RetrievalModel(dict(config))
    # map_location="cpu" so a checkpoint saved on GPU also loads on CPU-only hosts
    checkpoint = torch.load(f"{model_dir}/{model_name}", map_location="cpu")
    retrieval.load_state_dict(checkpoint["state_dict"])

    ## dataset setting
    context_dataset = ContextDataset(
        context=list(contexts.values()),
        document_id=list(contexts.keys()),
        tokenizer=tokenizer,
        max_length=config["CONTEXT_MAX_LEN"],
    )

    ## embedding
    contexts_emb = context_embedding(contextdataset=context_dataset, retrieval=retrieval, batch_size=batch_size)
    # .cpu() before .numpy(): Tensor.numpy() raises on CUDA tensors
    c_emb = contexts_emb["contexts_embedding"].detach().cpu().numpy().astype("float32")

    contexts_embedding_path = "data/embedding/context_dense_embedding.bin"
    # create the output directory so the first run doesn't fail with FileNotFoundError
    os.makedirs(os.path.dirname(contexts_embedding_path), exist_ok=True)
    with open(contexts_embedding_path, "wb") as f:
        pickle.dump(c_emb, f)
if __name__ == "__main__":
    # CLI entry point: parse arguments and run the embedding pipeline.
    # model_path / model_name are required=True so a missing flag fails fast
    # with a clear argparse error instead of an opaque wandb crash on None.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-mp",
        "--model_path",
        required=True,
        type=str,
        help="wandb artifact path for the model (required)",
    )
    parser.add_argument(
        "-mn",
        "--model_name",
        required=True,
        type=str,
        help="checkpoint file name inside the artifact (required)",
    )
    parser.add_argument(
        "-b",
        "--batch_size",
        default=2,
        type=int,
        help="batch size (default: 2)",
    )
    arg = parser.parse_args()
    main(arg)