-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers.py
121 lines (90 loc) · 2.85 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import numpy
import pandas as pd
import random
import math
def sigmoid(vector):
    """Apply the logistic function 1 / (1 + e^-x) to every element of *vector*.

    Args:
        vector: Iterable of real numbers.

    Returns:
        A new list with the sigmoid of each element, in input order.
    """
    out = []
    for x in vector:
        if x >= 0:
            # exp(-x) <= 1 here, so it cannot overflow.
            out.append(1 / (1 + math.exp(-x)))
        else:
            # For negative x, exp(-x) would overflow once x drops below
            # roughly -709; the algebraically equal form e^x / (1 + e^x)
            # only ever underflows towards 0.0.
            e = math.exp(x)
            out.append(e / (1 + e))
    return out
def probably(chance):
    """Return True when a fresh uniform draw from [0, 1) falls below *chance*."""
    roll = random.random()
    return roll < chance
def flatten(tensor):
    """Reshape *tensor* into a single row, then squeeze out size-1 axes.

    The argument must provide ``reshape`` and the reshaped result must
    provide ``squeeze``; the squeezed result is returned.
    """
    return tensor.reshape(1, -1).squeeze()
def sum_lists(a1, a2):
    """Return the element-wise sums of *a1* and *a2* as a list.

    Pairing is done with ``zip``, so the result is as long as the shorter
    of the two inputs.
    """
    return [left + right for left, right in zip(a1, a2)]
def divide_list(a1, n):
    """Return a list holding every element of *a1* divided by *n*."""
    return [value / n for value in a1]
def invert_list(a1):
    """Return a list with the reciprocal (1 / x) of every element of *a1*."""
    return [1 / value for value in a1]
def average_list(a1):
    """Return the arithmetic mean of *a1* (its sum divided by its length)."""
    total = sum(a1)
    count = len(a1)
    return total / count
def flatten_list(t):
    """Concatenate the items of every sub-iterable of *t* into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat
def softmax(vector):
    """Return the softmax of *vector*: exp(x) normalised by the total sum.

    The normalisation runs over all elements of the input, whatever its
    number of dimensions.

    Args:
        vector: Array-like of real numbers.

    Returns:
        The exponentiated values divided by their sum.
    """
    if numpy.size(vector) == 0:
        # numpy.max rejects empty input; an empty input maps to an empty result.
        return numpy.exp(vector)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps every exponent <= 0, so exp() cannot overflow to inf (which would
    # otherwise produce inf / inf = nan).
    shifted = numpy.subtract(vector, numpy.max(vector))
    exps = numpy.exp(shifted)
    return exps / exps.sum()
def load_langdata(lang, ex=False):
    """Load the per-language CSV tables found under ./data/document_embeds/<lang>.

    Every ``*.csv`` file in the directory is read as a space-separated table
    whose first column is the index, and stored under its file name without
    the extension. The column labels of the "lemma" table are then used to
    relabel the positional row labels (0, 1, ...) and column labels
    ("0", "1", ...) of every table.

    Args:
        lang: Name of the sub-directory of ./data/document_embeds to read.
        ex: When true, the tables "add", "add_w", "mult" and "mult_w" are
            skipped in addition to "results", which is always skipped.

    Returns:
        dict mapping table name to a pandas DataFrame.

    Raises:
        KeyError: if the directory contains no "lemma" table.
    """
    path = "./data/document_embeds/" + lang
    if ex:
        excluded = {"results", "add", "add_w", "mult", "mult_w"}
    else:
        excluded = {"results"}

    distances = {}
    for filename in os.listdir(path):
        stem, extension = os.path.splitext(filename)
        # Compare the real extension: a substring test would also pick up
        # stray files such as "lemma.csv.bak".
        if extension != ".csv" or stem in excluded:
            continue
        distances[stem] = pd.read_csv(
            os.path.join(path, filename), sep=" ", engine="python", index_col=0
        )

    # The "lemma" table carries the labels applied to every table.
    chunks = distances["lemma"].columns
    row_names = dict(enumerate(chunks))
    col_names = {str(i): chunk for i, chunk in enumerate(chunks)}

    for name, frame in distances.items():
        # A leftover "Unnamed: 0" first column is promoted to the index.
        if len(frame.columns) > 0 and frame.columns[0] == "Unnamed: 0":
            frame = frame.set_index("Unnamed: 0")
        frame = frame.rename(index=row_names, columns=col_names)
        if name == "bert":
            # Values of the "bert" table are replaced by 1 - value
            # (presumably turning similarities into distances - confirm).
            frame = 1 - frame
        distances[name] = frame
    return distances
def get_langs(path="./data/document_embeds"):
    """Return the names of the immediate sub-directories of *path*.

    Only the first entry produced by ``os.walk`` is consulted, so nested
    directories are not listed.
    """
    _root, subdirs, _files = next(os.walk(path))
    return subdirs
def a_n(lang, plus=False):
    """Group the chunk names of a language by author and by novel.

    Chunk names are the column labels of the "lemma" table returned by
    ``load_langdata(lang)``. Each name is split on "_": the first field is
    taken as the author, and the first two fields joined by "_" identify the
    novel.

    Args:
        lang: Language name, passed on to ``load_langdata``.
        plus: When true, also return the flat author, novel and chunk lists.

    Returns:
        ``authors_novels`` - a dict mapping author -> {novel -> [chunk, ...]}.
        With ``plus=True``, the tuple
        ``(authors_novels, authors, novels, chunks)``.

    Raises:
        ValueError: if a chunk name contains no "_" and therefore cannot be
            split into author and novel.
    """
    data = load_langdata(lang)
    chunks = [x for x in data["lemma"].columns if x != "Unnamed: 0"]

    # Fail loudly on unusable names instead of printing them and then
    # crashing later on an unassigned variable.
    malformed = [x for x in chunks if "_" not in x]
    if malformed:
        raise ValueError(
            "chunk names without an '_' separator: " + repr(malformed)
        )

    # One pass over the chunks; dicts keep first-seen order, so the author
    # and novel ordering is deterministic.
    authors_novels = {}
    for chunk in chunks:
        fields = chunk.split("_")
        author = fields[0]
        novel = author + "_" + fields[1]
        authors_novels.setdefault(author, {}).setdefault(novel, []).append(chunk)

    if plus:
        authors = list(authors_novels)
        novels = [n for by_novel in authors_novels.values() for n in by_novel]
        return authors_novels, authors, novels, chunks
    return authors_novels
def get_author_single(authors_novels):
    """Return the keys of *authors_novels* whose value has fewer than two entries.

    Keys are returned in the mapping's iteration order.
    """
    return [
        author
        for author in authors_novels
        if len(authors_novels[author]) < 2
    ]