forked from Esmail-ibraheem/Tinyllamas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinference.py
147 lines (122 loc) · 6.18 KB
/
inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import torch
import torch.nn as nn
from model import Transformer
from config import LlamaConfig
from sentencepiece import SentencePieceProcessor
import time
import json
from tqdm import tqdm
from pathlib import Path
from typing import Optional
class LLaMA:
    """Inference wrapper bundling a Transformer, its tokenizer and its config.

    Build an instance with :meth:`build`, then call :meth:`text_completion`
    to generate continuations for a batch of prompts.
    """

    def __init__(self, model: "Transformer", tokenizer: "SentencePieceProcessor", model_args: "LlamaConfig"):
        """Store the model, the tokenizer and the configuration they were built with."""
        self.model = model
        self.tokenizer = tokenizer
        self.args = model_args

    @staticmethod
    def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_len: int, max_batch_size: int, device: str) -> "LLaMA":
        """Create a ready-to-use ``LLaMA`` from a checkpoint directory.

        Args:
            checkpoints_dir: Directory holding ``params.json`` and, when
                ``load_model`` is true, at least one ``*.pth`` checkpoint.
            tokenizer_path: Path of the SentencePiece model file.
            load_model: Whether to load weights from the first (sorted)
                ``*.pth`` file; when false the model keeps its initial weights.
            max_seq_len: Maximum sequence length forwarded to the config.
            max_batch_size: Maximum batch size forwarded to the config.
            device: Device string; ``"cuda"`` selects half precision, anything
                else selects bfloat16.

        Returns:
            A ``LLaMA`` wrapping the constructed model, tokenizer and config.

        Raises:
            AssertionError: If ``load_model`` is true and no ``*.pth`` file exists.

        Note:
            This changes the process-wide default tensor type as a side effect.
        """
        prev_time = time.time()
        checkpoint = None
        if load_model:
            checkpoints = sorted(Path(checkpoints_dir).glob("*.pth"))
            assert len(checkpoints) > 0, f"no checkpoint files found in {checkpoints_dir}"
            ckpt_path = checkpoints[0]
            print(f'Loading checkpoint "{ckpt_path}"')
            # NOTE(security): torch.load unpickles the file, which can execute
            # arbitrary code. Only load checkpoints from a trusted source
            # (consider weights_only=True where the installed torch supports it).
            checkpoint = torch.load(ckpt_path, map_location="cpu")
            print(f"Loaded checkpoint in {time.time() - prev_time:.2f}s")
            prev_time = time.time()

        # params.json is read regardless of load_model: it defines the architecture.
        with open(Path(checkpoints_dir) / "params.json", "r") as f:
            params = json.load(f)

        model_args: LlamaConfig = LlamaConfig(
            max_seq_len=max_seq_len,
            max_batch_size=max_batch_size,
            device=device,
            **params
        )

        tokenizer = SentencePieceProcessor()
        tokenizer.load(tokenizer_path)
        # The vocabulary size comes from the tokenizer, not from params.json.
        model_args.vocab_size = tokenizer.vocab_size()

        # Set the default tensor type before constructing the model.
        if device == "cuda":
            torch.set_default_tensor_type(torch.cuda.HalfTensor)
        else:
            torch.set_default_tensor_type(torch.BFloat16Tensor)

        model = Transformer(model_args).to(device)

        if load_model:
            # 'rope.freqs' is dropped before the strict load; pop() tolerates
            # checkpoints that do not contain the key at all.
            checkpoint.pop('rope.freqs', None)
            model.load_state_dict(checkpoint, strict=True)
            print(f"Loaded state dict in {time.time() - prev_time:.2f}s")

        return LLaMA(model, tokenizer, model_args)

    def text_completion(self, prompts: list[str], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None) -> tuple[list[list[int]], list[str]]:
        """Generate a continuation for every prompt in the batch.

        Args:
            prompts: Prompts to complete; an empty list yields ``([], [])``.
            temperature: Softmax temperature. Values > 0 enable top-p sampling;
                otherwise the most likely token is picked (greedy decoding).
            top_p: Cumulative-probability threshold used when sampling.
            max_gen_len: Maximum number of generated tokens; defaults to
                ``max_seq_len - 1``.

        Returns:
            ``(out_tokens, out_text)``: per prompt, the token ids (prompt plus
            generation, cut before the first EOS token) and their decoded text.

        Raises:
            AssertionError: If the batch or the longest prompt exceeds the
                limits stored in the config.
        """
        if not prompts:
            return ([], [])
        if max_gen_len is None:
            max_gen_len = self.args.max_seq_len - 1

        # Use the device recorded in the config by build() rather than relying
        # on a module-level global that only exists when run as a script.
        device = self.args.device
        # eos_id/pad_id are tokenizer methods: they must be *called* to get the ids.
        eos_id = self.tokenizer.eos_id()
        pad_id = self.tokenizer.pad_id()

        # Convert each prompt into tokens
        prompt_tokens = [self.tokenizer.encode(prompt, out_type=int, add_bos=True, add_eos=False) for prompt in prompts]
        # Make sure the batch size is not too large
        batch_size = len(prompt_tokens)
        assert batch_size <= self.args.max_batch_size, f"batch size must be less than or equal to {self.args.max_batch_size}"
        max_prompt_len = max(len(prompt) for prompt in prompt_tokens)
        # Make sure the prompt length is not larger than the maximum sequence length
        assert max_prompt_len <= self.args.max_seq_len, f"prompt length must be less than or equal to {self.args.max_seq_len}"
        total_len = min(self.args.max_seq_len, max_gen_len + max_prompt_len)

        # Buffer holding prompt tokens followed by generated tokens; unused
        # positions hold pad_id.
        tokens = torch.full((batch_size, total_len), pad_id, dtype=torch.long, device=device)
        for k, t in enumerate(prompt_tokens):
            # Populate the initial tokens with the prompt tokens
            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=device)

        eos_reached = torch.zeros(batch_size, dtype=torch.bool, device=device)
        # True where a position belongs to the prompt, False where it must be generated.
        prompt_tokens_mask = tokens != pad_id

        for cur_pos in tqdm(range(1, total_len), desc="Generating tokens"):
            # One token per step is fed together with its position
            # (presumably the model caches earlier positions -- confirm
            # against Transformer.forward).
            with torch.no_grad():
                logits = self.model.forward(tokens[:, cur_pos - 1:cur_pos], cur_pos)
            if temperature > 0:
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
                next_token = self._sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits[:, -1], dim=-1)

            next_token = next_token.reshape(-1)
            # Inside the prompt, keep the prompt token instead of the prediction.
            next_token = torch.where(prompt_tokens_mask[:, cur_pos], tokens[:, cur_pos], next_token)
            tokens[:, cur_pos] = next_token
            # EOS only counts when it was generated, not when it is part of a prompt.
            eos_reached |= (~prompt_tokens_mask[:, cur_pos]) & (next_token == eos_id)
            if eos_reached.all():
                break

        out_tokens = []
        out_text = []
        for current_prompt_tokens in tokens.tolist():
            # Cut to the EOS token if present
            if eos_id in current_prompt_tokens:
                current_prompt_tokens = current_prompt_tokens[:current_prompt_tokens.index(eos_id)]
            out_tokens.append(current_prompt_tokens)
            out_text.append(self.tokenizer.decode(current_prompt_tokens))
        return (out_tokens, out_text)

    def _sample_top_p(self, probs, p):
        """Sample one token id per row using nucleus (top-p) sampling.

        Args:
            probs: Probabilities over the vocabulary along the last dimension.
            p: Cumulative-probability threshold.

        Returns:
            Tensor of sampled vocabulary indices with a trailing dimension of 1.
        """
        probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
        probs_sum = torch.cumsum(probs_sort, dim=-1)
        # Drop a token when the mass of the tokens ranked before it already
        # exceeds p; the top-ranked token is therefore always kept.
        mask = probs_sum - probs_sort > p
        probs_sort[mask] = 0.0
        # Renormalise the surviving probabilities so they sum to 1.
        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
        next_token = torch.multinomial(probs_sort, num_samples=1)
        # Map the position in the sorted order back to the vocabulary index.
        next_token = torch.gather(probs_idx, -1, next_token)
        return next_token
# Script entry point: build the model from a local checkpoint directory and
# print a completion for each demo prompt.
if __name__ == '__main__':
    # Fixed seed so that sampled completions are reproducible between runs.
    torch.manual_seed(0)

    # CUDA is only used when it is both available and explicitly allowed here.
    allow_cuda = False
    device = 'cuda' if torch.cuda.is_available() and allow_cuda else 'cpu'

    # Every entry needs its own trailing comma: adjacent string literals are
    # implicitly concatenated, which would silently merge two prompts into one.
    prompts = [
        "Simulate the motion of a projectile launched at a certain angle and velocity, including factors like gravity and air resistance.",
        "Create a program that calculates the gravitational force between two objects based on their masses and distances.",
        "Develop a program to simulate the behavior of ideal gases using the laws of thermodynamics.",
    ]

    model = LLaMA.build(
        checkpoints_dir='llama-2-7b/',
        tokenizer_path='tokenizer.model',
        load_model=True,
        max_seq_len=1024,
        max_batch_size=len(prompts),
        device=device
    )

    out_tokens, out_texts = model.text_completion(prompts, max_gen_len=64)
    assert len(out_texts) == len(prompts)
    for text in out_texts:
        print(f'{text}')
        print('-' * 50)