-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMakeMasterfile.py
309 lines (274 loc) · 11.3 KB
/
MakeMasterfile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 09:36:17 2022
@author: elliemorgenroth
"""
import glob
import itertools
import os
import sys
import time
import warnings
import numpy as np
import seaborn as sn
from matplotlib import pyplot as plt
from pandas import DataFrame, read_csv
from scipy import stats
from constants_emo_film import ALL_PARTICIPANTS_ANNOTATION as all_participants
from constants_emo_film import ITS
from constants_emo_film import MOVIES_DICT as movies
from helper_annot import lins_ccc, load_data
# Set important paths
# adapt USER and paths for your local environment
def parse_flag(value):
    """Interpret a command-line flag as a boolean.

    Command-line arguments always arrive as strings, whereas the script tests
    its flags with ``is True``; an unparsed string would therefore never
    enable a step.  Real booleans are passed through unchanged, the strings
    "1", "true", "t", "yes", "y" and "on" (case-insensitive) give True and
    every other value gives False.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in {"1", "true", "t", "yes", "y", "on"}


# Optional positional arguments:
#   1: root   - project folder the script changes into
#   2: temp   - folder receiving the intermediate z-scored files
#   3: zfiles - whether the z-scored files are (re)computed
#   4: saving - whether the final time courses are written to `out`
root = sys.argv[1] if len(sys.argv) > 1 else "/Volumes/Sinergia_Emo/Emo-FilM"
temp = sys.argv[2] if len(sys.argv) > 2 else "/Volumes/Sinergia_Emo/Emo-FilM/temp"
zfiles = parse_flag(sys.argv[3]) if len(sys.argv) > 3 else True
saving = parse_flag(sys.argv[4]) if len(sys.argv) > 4 else True
out = os.path.join(root, "Annotstudy", "derivatives")
# Changes directory to your local path
os.chdir(root)

# Make sure the intermediate files go to an empty folder: reuse `temp` when it
# is missing or empty, otherwise switch to a fresh time-stamped sibling folder
# so that existing files are never mixed with the new ones.
# NOTE(review): when `zfiles` is not True the files are read from `temp`
# further down, so switching to a fresh (empty) folder here leaves nothing to
# read - confirm this is the intended workflow.
if os.path.exists(temp):
    visible_entries = [f for f in os.listdir(temp) if not f.startswith(".")]
    if not visible_entries:
        print(f"{temp} folder exists but it is empty")
    else:
        warnings.warn(f"{temp} folder exists and it is not empty")
        temp = f"{temp}_{time.strftime('%Y%m%d%H%M%S')}"

# exist_ok covers the "exists but empty" case, where the folder is reused
os.makedirs(temp, exist_ok=True)
print(f"Using temp folder: {temp}")
# Rejection thresholds
max_zscore = 15  # z-value handed to load_data for outlier-based file removal
threshold = 0.2  # minimal agreement gain that justifies dropping an annotator

# ## Quality-control bookkeeping
excluded = [0] * 2  # [removed for flat, removed for outliers]
bad_ann = list()  # annotations dropped for agreement, kept so they can be traced
five = list()  # annotations dropped because they were the worst of five

# How many annotations make up each ground truth; a leading '_' marks cases
# where one annotation was removed (for agreement or as the worst of five)
numberAnnot = dict.fromkeys(("3", "4", "_3", "_4"), 0)
# ## Z-score each annotator's ratings per item over all of their films and
# ## write one file per film into the temp folder
if zfiles is True:
    for p in all_participants:  # Loop over participants to find files
        for n in ITS:  # Loop over items
            group = np.array([])  # ratings of this participant x item, films appended in order
            val_films = []  # films whose file was kept, in the order they were loaded
            for movie in movies:  # Loop over films
                files = glob.glob(
                    os.path.join(
                        root,
                        "Annotstudy",
                        f"sub-{p}",
                        "beh",
                        f"sub-{p}_task-{movie}_recording-{n}_stim.tsv.gz",
                    )
                )
                if len(files) > 4:
                    print(f"greater 4, {len(files)}")
                for m in files:
                    pre_excluded = sum(excluded)
                    group, excluded = load_data(
                        m, max_zscore, group, excluded
                    )  # Load data and add to group (or not)
                    # presumably load_data raises a counter in `excluded` when
                    # it rejects a file; only files that were kept are sliced
                    # back out of the z-scored series below
                    if not pre_excluded < sum(excluded):
                        val_films.append(movie)

            if not val_films:
                # Nothing was kept for this participant x item, so there is
                # nothing to z-score or to write
                continue

            # z score data over all kept films at once
            zgroup = stats.zscore(group)
            for val in val_films:
                expected = movies[val]
                zdata = zgroup[0:expected]
                # A slice shorter than this film's own length means the
                # concatenated series is misaligned; comparing against the
                # lengths of all films would let some truncations slip through
                if len(zdata) != expected:
                    raise Warning(
                        f"ALERT: expected {expected} samples for sub-{p} "
                        f"{val} {n}, found {len(zdata)}"
                    )
                zgroup = zgroup[expected:]
                np.savetxt(
                    os.path.join(temp, f"sub-{p}_task-{val}_recording-{n}_stim.tsv"),
                    zdata,
                    fmt="%.6f",
                    delimiter="\t",
                )
# ## Prepare variables for Agreement, Weights and final Time Courses
ccc = {}  # All Agreement scores, keyed "<film>_<item>_<rater a>_<rater b>"
mean_ccc = np.ones((len(movies), len(ITS)))  # Mean Agreement scores
MeanTC = {}  # Final time courses as a dictionary


def _pair_key(film, item, first, second):
    """Return the ``ccc`` key for the score of raters *first* and *second*."""
    return film + "_" + item + "_" + str(first) + "_" + str(second)


def _forget_rater(film, item, pairs, rater):
    """Delete from ``ccc`` every score of *film* x *item* that involves *rater*."""
    for first, second in pairs:
        if rater in (first, second):
            del ccc[_pair_key(film, item, first, second)]


for mix, movie in enumerate(movies):  # Loop over films
    for iix, n in enumerate(ITS):  # Loop over items
        columns = []  # One (samples x 1) array per annotator of this pairing
        labels = []  # Who is annotating this pairing, same order as `columns`
        for p in all_participants:  # Loop over participants to find files
            files = glob.glob(
                os.path.join(temp, f"sub-{p}_task-{movie}_recording-{n}_stim.tsv")
            )
            for m in files:
                labels.append(p)  # Append participant labels here
                series = read_csv(m, header=None, delimiter="\t", names=["y"])
                columns.append(series.to_numpy())

        if len(columns) <= 2:
            # Agreement-based selection needs at least three annotators; this
            # also covers pairings for which no file was found at all
            raise Warning(
                f"ALERT, only {len(columns)} raters left {movie}_{n}; "
                "this case should NOT happen when all films and items are selected"
            )
        group = np.hstack(columns)  # samples x annotators

        # ## All things agreement start here as group is completed for this pairing
        # First calculate all cccs for this filmxitem combination.  The scores
        # are also kept per rater pair so that later selections do not depend
        # on searching the text of the keys (which misfires when a film or
        # item name contains a digit or is a prefix of another name).
        pair_ccc = {}
        for a, b in itertools.combinations(range(group.shape[1]), 2):
            score = lins_ccc(group[:, a], group[:, b])
            pair_ccc[(a, b)] = score
            ccc[_pair_key(movie, n, a, b)] = score

        # Calculate the mean_ccc for this filmxitem combination
        mean_ccc[mix, iix] = np.mean(list(pair_ccc.values()))

        # ## Find out if leaving one out will improve agreement
        # Mean CCC of the remaining pairs when annotator q is left out
        ccc_loo = {
            q: np.mean([score for pair, score in pair_ccc.items() if q not in pair])
            for q in range(group.shape[1])
        }
        # Leaving out the worst rater yields the best remaining agreement;
        # on ties the first such annotator is taken
        wr_idx = max(ccc_loo, key=ccc_loo.get)
        best_ccc = ccc_loo[wr_idx]  # Best CCC possible after leaving out a participant
        worst_rater = labels[wr_idx]  # Worst raters label (to find index overall)

        if group.shape[1] <= 3:  # If there are only 3 or less annotators
            if (
                best_ccc - mean_ccc[mix, iix]
            ) >= threshold:  # Check that there isn't a major outlier in these
                warnings.warn(
                    f"ALERT, only {str(group.shape[1])} raters left {movie}_{n}"
                )
                warnings.warn(
                    f"Agreement is {str(mean_ccc[mix,iix])} instead of {str(best_ccc)}"
                )
                warnings.warn("Keeping 3 raters anyway")
            numberAnnot["3"] += 1
        elif group.shape[1] == 4:  # Standard case is if there are 4 annotators
            if (
                best_ccc - mean_ccc[mix, iix]
            ) >= threshold:  # Check if exclusion would make the mean better
                # Erase worst rater from everything if True
                _forget_rater(movie, n, pair_ccc, wr_idx)
                bad_ann.append(f"{movie}_{worst_rater}_{n}")
                group = np.delete(group, wr_idx, 1)
                mean_ccc[mix, iix] = best_ccc
                numberAnnot["_3"] += 1
            else:
                numberAnnot["4"] += 1
        elif (
            group.shape[1] == 5
        ):  # In the rare case of 5 annotators we remove the worst in any case
            print(f"Group size is 5 for item {n} and film {movie}")
            # Erase worst rater from everything
            _forget_rater(movie, n, pair_ccc, wr_idx)
            five.append(f"{movie}_{worst_rater}_{n}")
            group = np.delete(group, wr_idx, 1)
            mean_ccc[mix, iix] = best_ccc
            numberAnnot["_4"] += 1
        else:  # More than 5 annotators
            # NOTE(review): this branch records two removals and counts the
            # pairing as "_4", but only the single worst rater is actually
            # removed - it looks like unfinished support for dropping two
            # raters.  Behaviour is kept as it was; confirm the intent.
            print(f"Group size is 6 for item {n} and film {movie}")
            # Erase worst rater from everything
            _forget_rater(movie, n, pair_ccc, wr_idx)
            five.append(f"{movie}_{worst_rater}_{n}")
            five.append(f"{movie}_second_{n}")
            group = np.delete(group, wr_idx, 1)
            mean_ccc[mix, iix] = best_ccc
            numberAnnot["_4"] += 1

        # ## Add final group mean to MeanTC, this is the ground truth
        MeanTC[movie + "_" + n] = np.mean(group, axis=1)
# ## Report the quality-control counters gathered while building the ground truth
qc_report = [
    "",  # blank separator line
    f"{str(sum(excluded))} annotations excluded",
    f"{str(excluded[0])} annotations removed for flat",
    f"{str(excluded[1])} annotations removed for outliers",
    f"{len(bad_ann)} annotations removed for agreement",
    f"{len(five)} annotations removed as they were worst of 5",
    numberAnnot,
    "Mean of completed CCC values:",
]
for entry in qc_report:
    print(entry)

# All pairwise agreement scores that survived the selection, NaNs ignored in the mean
ccct = np.array(list(ccc.values()))
print(np.nanmean(ccct))

# Store the raw scores and the film x item table of mean agreement
ccc_path = os.path.join(out, "ccc_values")
np.save(ccc_path, ccct)
cccixm_df = DataFrame(data=mean_ccc, index=movies, columns=ITS)
mean_ccc_path = os.path.join(out, "mean_ccc")
np.save(mean_ccc_path, cccixm_df)
# ## Concatenate the ground-truth time courses: one row per item, with the
# ## films joined along time in the order of `movies`
durs = []  # Number of samples of each film, read from the first item
mfilm = np.zeros([len(ITS), len(movies)])  # Mean rating per item x film
item_rows = []  # One mean-centred, film-concatenated series per item
for iix, n in enumerate(ITS):  # Loop over items
    film_tcs = []
    for mix, movie in enumerate(movies):  # Loop over films
        TC = MeanTC[movie + "_" + n]
        if iix == 0:
            durs.append(np.shape(TC)[0])
        mfilm[iix, mix] = np.mean(TC)  # Taken before centring
        film_tcs.append(TC)
    MTC = np.hstack(film_tcs)
    item_rows.append(MTC - np.mean(MTC))  # Centre each item over all films

# Stacking once always yields an items x samples matrix, also for a single
# item, so that the column slicing done when saving keeps working
AMTC = np.vstack(item_rows)
# ## Heatmap of the mean rating per film (rows) and item (columns)
fig = plt.figure(figsize=(15, 5), dpi=300)
heatmap_options = dict(
    square=True,
    xticklabels=ITS,
    yticklabels=movies,
    cmap="coolwarm",
    center=0,
)
sn.heatmap(np.transpose(mfilm), **heatmap_options)
figure_path = os.path.join(out, "Figure2.png")
fig.savefig(figure_path, bbox_inches="tight")

# Summary statistics over the whole item x time matrix; the inner call already
# reduces over every entry, so the outer np.mean leaves the value unchanged
print(f"mean df = {np.mean(np.mean(AMTC))}")
print(f"var df = {np.mean(np.var(AMTC))}")
print(f"max df = {np.mean(np.max(AMTC))}")
print(f"min df = {np.mean(np.min(AMTC))}")
# ## Write the ground-truth time courses: one file with all films
# ## concatenated, then one file per film (rows = samples, columns = items)
if saving is True:
    saveTC = DataFrame(AMTC.transpose())
    saveTC.to_csv(
        os.path.join(out, "C_Annot_FILMS_stim.tsv"), sep="\t", header=False, index=False
    )

    # Walk through the concatenated matrix with a running offset rather than
    # trimming AMTC itself, so that AMTC is still intact afterwards
    start = 0
    for movie in movies:
        stop = start + movies[movie]  # movies maps each film to its length in samples
        df = DataFrame(AMTC[:, start:stop].transpose())
        df.to_csv(
            os.path.join(out, f"Annot_{movie}_stim.tsv"),
            sep="\t",
            header=False,
            index=False,
        )
        start = stop