select_data.py
from datasets import load_dataset
import random
import numpy as np

np.random.seed(101)

# Dataset to sample from and the name of its text field.
dataset = "cola"
seq_keys = {
    'cola': 'sentence',
    'sst2': 'sentence',
    'rotten_tomatoes': 'text'
}
seq_key = seq_keys[dataset]

# GLUE tasks are loaded through the 'glue' builder; other datasets by name.
if dataset in ['cola', 'sst2']:
    full = load_dataset('glue', dataset)['train']
else:
    full = load_dataset(dataset)['train']

# Shuffle example indices with a fixed seed so the selection is reproducible.
idxs = list(range(len(full)))
np.random.shuffle(idxs)
if dataset == 'cola':
    assert idxs[0] == 2310  # with seed 101

# Keep the first n_samples shuffled examples.
n_samples = 128
sentences = []
labels = []
for i in range(n_samples):
    sentences.append(full[idxs[i]][seq_key])
    labels.append(full[idxs[i]]['label'])
# Alternative selection logic, left commented out: val/test split handling plus
# length-stratified sampling over examples sorted by sequence length.
#
# if split == 'test':
#     assert n_samples <= 1000
#     idxs = idxs[:n_samples]
# elif split == 'val':
#     idxs = idxs[1000:]  # first 1000 saved for testing
#     assert len(idxs) >= n_samples
#
# zipped = [(idx, len(full[idx][seq_key])) for idx in idxs]
# zipped = sorted(zipped, key=lambda x: x[1])
# chunk_sz = len(zipped) // n_samples
# idxs = []
# for i in range(n_samples):
#     tmp = chunk_sz * i + np.random.randint(0, chunk_sz)
#     idxs.append(zipped[tmp][0])
# np.random.shuffle(idxs)
with open(f"data/{dataset}_data_{n_samples}.txt", "w") as f:
for i in range(len(sentences)):
f.write(f"{sentences[i]}\n")
f.write(f"{labels[i]}\n")