create_word_embeddings_for_hslld.py
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
import pickle
import os
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
import time
import gensim


class wordEmbeddings:
    def __init__(self):
        self.database_from_HSLLD = []  # tokenised sentences read from the HSLLD transcripts
        self.dimension_size = 300      # dimensionality of the trained word vectors
        # food_words is expected to be populated externally (e.g. from a pickled
        # word list) before the PCA plotting helpers below are called.
        self.food_words = []
        # self.HSLLD_file_Loc = self.load('food_files.pickle')
        # self.names_of_file_with_hand_labels()
        # self.HSLLD_file_Loc = self.correct_file_location(self.HSLLD_file_Loc)
        # self.Read_files_from_HSLLD = self.read_files_HSLLD(self.HSLLD_file_Loc)  # we do this in training.py now
        print "Length of sentences HSLLD:", len(self.database_from_HSLLD)
        # self.sentences_to_wordEmbeddings_practice(self.database_from_HSLLD)
        # self.sentences_to_Google_wordEmbeddings_practice(None)

    def all_sentences(self):
        return self.database_from_HSLLD

    def names_of_file_with_hand_labels(self):
        # Files in this directory already carry hand labels and are excluded
        # from embedding training.
        loc = "solutions/HSLLD/HV1/MT/"
        list_of_files = os.listdir(loc)
        return list_of_files

    def food_words_database(self):
        return self.food_words

    def load(self, fileLocation):
        with open(fileLocation, 'r') as f:
            return pickle.load(f)

    def from_this_folder_load(self, fileLocation):
        with open(fileLocation, 'r') as f:
            return pickle.load(f)

    def save(self, fileLocation, variable):
        with open(fileLocation, 'w') as f:
            pickle.dump(variable, f)

    def correct_file_location(self, fileLocation):
        # Sanity-check the stored paths and warn about any that do not exist.
        for index, fileLoc in enumerate(fileLocation):
            fileLocation[index] = fileLoc
            if not os.path.exists(fileLocation[index]):
                print "Path of the file doesn't exist", fileLocation[index]
        return fileLocation

    def read_files_HSLLD(self, fileLoc):
        files_already_annotated = self.names_of_file_with_hand_labels()
        for file in fileLoc:
            temp_file_name = file.split('/')[-1]
            # To prevent overfitting, skip files that were already hand-annotated.
            if temp_file_name in files_already_annotated:
                # print "Skipping file ", temp_file_name
                continue
            with open(file, 'r') as f:
                for sentences in f:
                    # Speaker utterances in the transcripts start with '*';
                    # drop the speaker prefix before tokenising.
                    if '*' in sentences:
                        sentences = sentences[6:]
                        self.database_from_HSLLD.append(sentences.split())

    def sentences_to_wordEmbeddings(self, google_word_embeddings=0):
        if google_word_embeddings:
            start = time.time()
            model = gensim.models.KeyedVectors.load_word2vec_format(
                '/home/pritish/CCPP/wordEmbeddings/GoogleNews-vectors-negative300.bin.gz',
                binary=True)
            print "Time to load data ", time.time() - start
            return model
        return Word2Vec.load('word_embeddings_HSLLD_300.bin')

    def sentences_to_Google_wordEmbeddings_practice(self, sentence, min_word_count=1):
        # Project the food-word vectors from the pretrained Google News
        # embeddings to 2-D with PCA and plot them. (Originally marked
        # "Not Working": the plot loop indexed the PCA result with positions
        # from the full vocabulary; it now indexes only the food words that
        # were actually embedded.)
        # model = Word2Vec(sentence, size = self.dimension_size, min_count = min_word_count)
        # model.save('word_embeddings_HSLLD_300.bin')
        model = self.sentences_to_wordEmbeddings(1)
        words = model.vocab  # word -> Vocab mapping in gensim <= 3.x
        # Keep only the food words that have a Google News vector.
        plotted_words = [w for w in self.food_words if w in words]
        X = [model.word_vec(w) for w in plotted_words]
        print "....Starting PCA......"
        pca = PCA(n_components=2)
        result = pca.fit_transform(X)
        print "Done with PCA......"
        for i, word in enumerate(plotted_words):
            pyplot.scatter(result[i, 0], result[i, 1])
            pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
        pyplot.show()

    def sentences_to_wordEmbeddings_practice(self, sentence, min_word_count=1):
        # Train a Word2Vec model on the HSLLD sentences, save it, and plot a
        # 2-D PCA projection of the vectors for the food words.
        model = Word2Vec(sentence, size=self.dimension_size, min_count=min_word_count)
        model.save('word_embeddings_HSLLD_300.bin')
        print(model)
        words = list(model.wv.vocab)
        X = model[words]
        pca = PCA(n_components=2)
        result = pca.fit_transform(X)
        for i, word in enumerate(words):
            if word in self.food_words:
                pyplot.scatter(result[i, 0], result[i, 1])
                pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
        pyplot.show()


if __name__ == '__main__':
    Embeddings = wordEmbeddings()
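
A minimal end-to-end driver, sketched here for clarity rather than taken from the original file: it assumes the same Python 2 / gensim 3.x environment as the rest of the script, relies on the pickled file list referenced in the commented-out lines of __init__ ('food_files.pickle'), and the query word 'milk' is purely illustrative.

# Hypothetical driver (a sketch, not part of the original script): what the
# __main__ block might do to train and query HSLLD embeddings end to end.
we = wordEmbeddings()
file_locations = we.correct_file_location(we.load('food_files.pickle'))  # pickle name from the commented-out __init__ code
we.read_files_HSLLD(file_locations)
we.sentences_to_wordEmbeddings_practice(we.all_sentences())  # trains and saves word_embeddings_HSLLD_300.bin

model = we.sentences_to_wordEmbeddings()                     # reload the saved HSLLD model
print model.wv.most_similar('milk', topn=5)                  # 'milk' is an illustrative query word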