-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode_sample.py
More file actions
55 lines (39 loc) · 1.45 KB
/
code_sample.py
File metadata and controls
55 lines (39 loc) · 1.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from cgitb import text
import csv
def load_data(data_path):
    """Load a two-column (text, label) CSV dataset.

    Expects a header row followed by rows of the form ``text,label``.
    Leading/trailing whitespace is stripped from both columns and the
    label is parsed as an int.

    Args:
        data_path: Path to the CSV file.

    Returns:
        A pair ``(texts, labels)`` of parallel lists.

    Raises:
        ValueError: If a label column cannot be parsed as an int.
        IndexError: If a row has fewer than two columns.
    """
    texts = []
    labels = []
    # newline='' is required by the csv module so quoted fields that
    # contain embedded newlines are parsed correctly; encoding is pinned
    # so behavior does not depend on the platform default.
    with open(data_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip header
        for row in reader:
            texts.append(row[0].strip())
            labels.append(int(row[1].strip()))
    return texts, labels
def unique(xs):
    """Return the set of distinct elements of *xs*.

    The manual add-in-a-loop version re-implemented the built-in
    ``set`` constructor; delegate to it directly.
    """
    return set(xs)
def calculate_avg_length(texts):
    """Return the average whitespace-token count per text.

    Args:
        texts: Iterable of strings.

    Returns:
        Mean number of ``str.split()`` tokens per text, or ``0.0`` for
        an empty input (the original raised ZeroDivisionError there).
    """
    if not texts:
        return 0.0
    return sum(len(t.split()) for t in texts) / len(texts)
if __name__ == '__main__':
    # Load data
    train_texts, train_labels = load_data('data/yelp_train.csv')

    # Hold out the last ~20% of the training data for validation.
    # Compute an explicit split index instead of negative slices: when
    # len(train_texts) < 5, int(len/5) == 0, and the original
    # `train_texts[-0:]` returned the WHOLE list as the validation set
    # while `[:-0]` left the train set empty — silently swapping splits.
    split = len(train_texts) - len(train_texts) // 5
    valid_texts, valid_labels = train_texts[split:], train_labels[split:]
    train_texts, train_labels = train_texts[:split], train_labels[:split]
    test_texts, test_labels = load_data('data/yelp_test.csv')

    # Print basic statistics
    print("Training set size:", len(train_texts))
    print("Validation set size:", len(valid_texts))
    print("Test set size:", len(test_texts))
    print("Unique labels:", unique(train_labels))
    print("Avg. length:", calculate_avg_length(train_texts + valid_texts + test_texts))

    # Extract features from the texts
    # Train the model and evaluate it on the valid set
    # Test the best performing model on the test set