-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_corpora.py
73 lines (59 loc) · 2.57 KB
/
process_corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
@Author Tana Deeg and Sandro Weick
"""
import preprocess, parse_cmv, parse_micro, parse_essay , parse_usdeb, parse_mardy
import random
import pickle
# Pipeline overview:
# 1. Get the tuple lists from all 5 corpora.
# 2. Shuffle each list, then split it into train and validation/test
#    (in the Schaefer paper: 90% train, 10% validation & test).
# 3. Merge the train lists for the leave-one-out approach.
# 4. Save all lists to disk for easy access.
def get_corpus_ready(corpus: str):
match corpus:
case "cmv":
loadedcorpus = parse_cmv.parse_cmv_corpus()
case "essay":
loadedcorpus = parse_essay.parse_essay_corpus()
case "usdeb":
loadedcorpus = parse_usdeb.parse_usdeb_corpus()
case "mardy":
loadedcorpus = parse_mardy.parse_mardy_corpus()
case "micro":
loadedcorpus = parse_micro.parse_micro_corpus()
finallist = preprocess.convert_corpus(loadedcorpus)
random.shuffle(finallist)
train = finallist[0:int(len(finallist)*0.9)]
validate = finallist[int(len(finallist)*0.9):]
with open(f"./processed_data_results/train_{corpus}_file.pkl", "wb") as file_train:
pickle.dump(train, file_train)
with open(f"./processed_data_results/val_{corpus}_file.pkl", "wb") as file_val:
pickle.dump(validate, file_val)
# Prepare all 5 leave-one-out training sets; each one merges the 90% train splits of the other four corpora.
def get_leave_one_out(leave_out: str, corpora=("cmv", "essay", "mardy", "micro", "usdeb")):
    """Merge the train splits of all corpora except one and pickle the result.

    For every name in ``corpora`` other than ``leave_out`` the file
    ``./processed_data_results/train_{name}_file.pkl`` is loaded and its
    items are appended, in the order of ``corpora``, to one list. That list
    is written to
    ``./processed_data_results/train_without_{leave_out}_file.pkl``.

    Args:
        leave_out: Name of the corpus whose train split is excluded.
        corpora: Names of all corpora to consider. Defaults to the five
            corpora of this project (an immutable tuple, so the default
            cannot be modified by accident between calls).

    Raises:
        FileNotFoundError: If a required train split has not been written yet.
    """
    merged = []
    for corpus in corpora:
        if corpus == leave_out:
            continue
        # NOTE: pickle.load can execute arbitrary code; this is only safe
        # because the files are expected to be produced by this pipeline.
        with open(f"./processed_data_results/train_{corpus}_file.pkl", "rb") as g:
            merged.extend(pickle.load(g))
    with open(f"./processed_data_results/train_without_{leave_out}_file.pkl", "wb") as file:
        pickle.dump(merged, file)
# Input: the name of a corpus whose evaluation (validation) split has already been pickled.
# Output: a pickled list with the gold values (0 or 1) of that split.
def create_gold_list(corpus: str):
    """Extract the gold labels of a corpus' validation split and pickle them.

    Loads ``./processed_data_results/val_{corpus}_file.pkl``, takes the
    element at index 2 of every entry (the gold value; 0 or 1 according to
    the original comment in this file) and stores the resulting list in
    ``./processed_data_results/gold_{corpus}_list.pkl``.

    Args:
        corpus: Name of the corpus whose validation split is read.
    """
    val_path = f"./processed_data_results/val_{corpus}_file.pkl"
    gold_path = f"./processed_data_results/gold_{corpus}_list.pkl"

    with open(val_path, "rb") as source:
        entries = pickle.load(source)

    # Index 2 of each entry holds the gold value.
    gold_values = [entry[2] for entry in entries]

    with open(gold_path, "wb") as target:
        pickle.dump(gold_values, target)
def process_all_corpora(corpora=("cmv", "essay", "mardy", "micro", "usdeb")):
    """Run the whole preprocessing pipeline for every corpus.

    First each corpus is split and pickled (``get_corpus_ready``) and its
    gold list is written (``create_gold_list``). Afterwards one
    leave-one-out training set per corpus is built
    (``get_leave_one_out``).

    Args:
        corpora: Names of the corpora to process. Defaults to the five
            corpora of this project (an immutable tuple, so the default
            cannot be modified by accident between calls).
    """
    # Materialise once: the names are iterated twice below, so a one-shot
    # iterator would otherwise leave the second loop with nothing to do.
    corpora = tuple(corpora)

    for corpus in corpora:
        get_corpus_ready(corpus)
        create_gold_list(corpus)

    # Separate pass: every train split must exist before any merge starts.
    for corpus in corpora:
        get_leave_one_out(corpus, corpora)
# Script entry point: run the full pipeline with the default corpus list.
if __name__ == "__main__":
    process_all_corpora()