create_vocabulary.py
import h5py
import numpy as np
import os
import csv
import nltk
import json

# Raise the CSV field size limit so very long text fields can be parsed.
maxsize = 10000000
csv.field_size_limit(maxsize)

def text_to_list(T):
    """Take a text string and return a clean list of words."""
    # word_tokenize requires NLTK's "punkt" models (nltk.download('punkt')).
    tokens = nltk.word_tokenize(T)
    return clean_list_words(tokens)

def clean_list_words(L):
    """Keep only the purely alphabetic tokens, dropping numbers and punctuation."""
    clean_list = []
    for w in L:
        if w.isalpha():
            clean_list.append(w)
    return clean_list
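
# For instance (assuming the punkt models are installed),
# text_to_list("Hello, world 42!") returns ['Hello', 'world']:
# the punctuation and the number are filtered out by isalpha().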

#################################### create_vocabulary ####################################
def create_vocabulary(h5py_file, data):
    """Build a word-frequency vocabulary from the texts in `data` and store it."""
    with open(data, newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        # Assumes every row has the same number of fields, with the text in column 0.
        A = np.array(list(spamreader), dtype=list)
    # Count how often each word appears across all the texts.
    dico = {}
    for t in A[:, 0]:
        l = text_to_list(t)
        for w in l:
            fill_dict(dico, w)
    # Assign each word an integer index; column 0 of M stores its term
    # frequency (column 1 is left as zeros here).
    D = {}
    M = np.zeros((len(dico), 2))
    i = 0
    for k, v in dico.items():
        D[k] = i
        M[i, 0] = v
        i += 1
    # Persist the word-to-index mapping as JSON and the frequencies in HDF5.
    with open("dictionary_voc.json", 'w') as f:
        json.dump(D, f)
    h5py_file.create_dataset("Vocabulary", data=M)
    return None

def fill_dict(D, elt):
    """Increment the count of `elt` in D, inserting it with count 1 if absent."""
    if elt in D:
        D[elt] += 1
    else:
        D[elt] = 1
    return None
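
# A minimal sketch (not part of the original script) of how the saved
# vocabulary could be read back later; it assumes the file names written
# by create_vocabulary above ("dictionary_voc.json", "mydatas.hdf5").
def load_vocabulary(json_path="dictionary_voc.json", h5_path="mydatas.hdf5"):
    """Load the word-to-index mapping and the frequency matrix saved by create_vocabulary."""
    with open(json_path) as f:
        word_to_index = json.load(f)  # word -> row index into M
    with h5py.File(h5_path, 'r') as hf:
        M = hf["Vocabulary"][:]  # column 0 holds each word's term frequency
    return word_to_index, M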

if __name__ == '__main__':
    # Create an h5py file to hold all the data.
    with h5py.File("mydatas.hdf5", 'w') as hf:
        # Build the vocabulary (word indices and term frequencies) from the
        # texts in train.csv and store it in the HDF5 file.
        create_vocabulary(h5py_file=hf, data="train.csv")