-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathsft_dataset.py
55 lines (43 loc) · 1.74 KB
/
sft_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import torch
import json
import random
from tqdm import tqdm
from torch.utils.data import Dataset
class SFTDataset(Dataset):
    """Supervised fine-tuning dataset built from a JSON-lines file.

    Every record is read with the keys ``context``, ``prompt`` and
    ``response``. Each record yields a positive sample (its own context,
    question and answer) and, when a suitable distractor exists, a negative
    sample that pairs the question with another record's context and the
    answer "I don't know.". Samples are stored as ``torch.LongTensor``
    sequences produced by ``tokenizer.encode``.
    """

    # Upper bound on random draws when looking for a distractor context, so
    # datasets dominated by a single context cannot stall preprocessing.
    _MAX_DISTRACTOR_TRIES = 16

    def __init__(self, data_path, tokenizer):
        """Load ``data_path`` and tokenize every record.

        Args:
            data_path: Path to a text file holding one JSON object per line.
            tokenizer: Object exposing ``encode(text)``; its result is passed
                to ``torch.LongTensor`` (presumably a list of token ids).
        """
        super().__init__()
        print(f"Reading in data from file: {data_path}")
        data = self._read_jsonl(data_path)
        print(f"Got {len(data)} examples, preprocess...")
        data_dict = self.preprocess(data, tokenizer)
        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.input_ids)

    def __getitem__(self, i):
        """Return sample ``i`` as a dict with ``input_ids`` and ``labels``."""
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

    @staticmethod
    def _read_jsonl(data_path):
        """Parse a JSON-lines file, skipping blank and malformed lines."""
        records = []
        # Explicit encoding: the locale default is not UTF-8 on every platform.
        with open(data_path, "r", encoding="utf-8") as file:
            for line in file:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not errors.
                    continue
                try:
                    records.append(json.loads(line))
                except (json.JSONDecodeError, RecursionError) as e:
                    # Best effort: report the bad line and keep loading.
                    print("json processing exception", e)
        return records

    @staticmethod
    def _pick_distractor(examples, ex):
        """Return a random example whose context differs from ``ex``'s.

        Returns ``None`` when no such example is drawn within
        ``_MAX_DISTRACTOR_TRIES`` attempts (for instance when every example
        shares the same context).
        """
        for _ in range(SFTDataset._MAX_DISTRACTOR_TRIES):
            candidate = random.choice(examples)
            if candidate["context"] != ex["context"]:
                return candidate
        return None

    def preprocess(self, examples, tokenizer):
        """Tokenize ``examples`` into shuffled positive and negative samples.

        Args:
            examples: List of dicts with ``context``, ``prompt`` and
                ``response`` keys.
            tokenizer: Object exposing ``encode(text)``.

        Returns:
            A dict with keys ``input_ids`` and ``labels``. Both refer to the
            same shuffled list of ``torch.LongTensor`` sequences.

        Raises:
            KeyError: If an example lacks one of the required keys.
        """
        all_input_ids = []
        skipped = 0
        print("Tokenizing dataset...")
        for ex in tqdm(examples):
            # Positive example: the question with its own context and answer.
            text = f"{ex['context']}\n\nQ: {ex['prompt']}\nA: {ex['response']}\n"
            all_input_ids.append(torch.LongTensor(tokenizer.encode(text)))

            # Negative example: the same question over an unrelated context.
            # Pairing it with its own context would teach the model to answer
            # "I don't know." to an answerable question, so such draws are
            # rejected.
            distractor = self._pick_distractor(examples, ex)
            if distractor is None:
                skipped += 1
                continue
            text = f"{distractor['context']}\n\nQ: {ex['prompt']}\nA: I don't know.\n"
            all_input_ids.append(torch.LongTensor(tokenizer.encode(text)))

        if skipped:
            print(
                f"Skipped {skipped} negative examples "
                "(no example with a different context found)"
            )
        random.shuffle(all_input_ids)
        return dict(input_ids=all_input_ids, labels=all_input_ids)