# process_data.py
# Forked from yoonkim/CNN_sentence.
import numpy as np
from collections import defaultdict


def load_sentences(train_file, tagField=1, textField=2, lower=True):
    """
    Loads labeled sentences from a file in TSV format.
    :param train_file: filename containing labeled sentences in TSV format.
    :param tagField: index of the field containing the label.
    :param textField: index of the field containing the text.
    :param lower: whether to lowercase the text.
    :return: sents (word lists paired with label ids), word document
        frequencies, list of labels.
    """
    sents = []
    tags = {}                   # label -> numeric id, in order of appearance
    word_df = defaultdict(int)  # word -> number of documents containing it
    with open(train_file, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            text = fields[textField]
            tag = fields[tagField]
            if tag not in tags:
                tags[tag] = len(tags)
            clean_text = text.lower() if lower else text
            words = clean_text.split()
            for word in set(words):  # count each word once per document
                word_df[word] += 1
            sents.append((words, tags[tag]))
    labels = [0] * len(tags)
    for tag, i in tags.items():
        labels[i] = tag
    return sents, word_df, labels
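
# Example usage (a sketch; "train.tsv" is a hypothetical file whose second
# and third tab-separated fields hold the label and the text, matching the
# defaults above):
#
#   sents, word_df, labels = load_sentences("train.tsv")
#   word_index = {w: i for i, w in enumerate(sorted(word_df))}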


def load_vectors(fname, binary=True):
    """
    Loads all word vectors from a file in word2vec format.
    :param fname: name of file in word2vec format (binary or text).
    :return: vectors and word list.
    """
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, embeddings_size = map(int, header.split())
        words = [''] * vocab_size
        vectors = np.empty((vocab_size, embeddings_size), dtype='float32')
        if binary:
            binary_len = np.dtype('float32').itemsize * embeddings_size
            for i in range(vocab_size):
                # each entry is a space-terminated word followed by its vector
                chars = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # skip the newline separating entries
                        chars.append(ch)
                words[i] = b''.join(chars).decode('utf-8')
                vectors[i, :] = np.frombuffer(f.read(binary_len), dtype='float32')
        else:  # text
            for i, line in enumerate(f):
                items = line.decode('utf-8').split()
                words[i] = items[0]
                vectors[i, :] = np.array(items[1:], dtype='float32')
    return vectors, words
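
# Example usage (a sketch; "vectors.bin" stands for any word2vec-format file,
# e.g. the GoogleNews binary used by the upstream CNN_sentence code):
#
#   vectors, words = load_vectors("vectors.bin", binary=True)
#   word_index = {w: i for i, w in enumerate(words)}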


def load_word_vectors(fname, word_index, binary=True):
    """
    Loads vectors just for the words in :param word_index: from a file in
    word2vec format; words not found in the file get random vectors.
    :param fname: name of file in word2vec format (binary or text).
    :param word_index: dictionary mapping each word to its row index.
    :return: array of vectors aligned to :param word_index:.
    """
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, embeddings_size = map(int, header.split())
        vectors = np.zeros((len(word_index), embeddings_size), dtype='float32')
        if binary:
            binary_len = np.dtype('float32').itemsize * embeddings_size
            for i in range(vocab_size):
                chars = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':
                        chars.append(ch)
                word = b''.join(chars).decode('utf-8')
                if word in word_index:
                    vectors[word_index[word], :] = np.frombuffer(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)  # skip the vector of out-of-index words
        else:  # text
            for line in f:
                items = line.decode('utf-8').split()
                word = items[0]
                if word in word_index:
                    vectors[word_index[word], :] = np.array(items[1:], dtype='float32')
    # initialize words missing from the file with small uniform noise
    high = 2.38 / np.sqrt(len(vectors) + embeddings_size)  # see (Bottou '88)
    for i, v in enumerate(vectors):
        if np.count_nonzero(v) == 0:
            vectors[i] = np.random.uniform(-high, high, embeddings_size)
    return np.asarray(vectors, dtype="float32")
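
# Example usage (a sketch; unlike load_vectors, this reads rows only for the
# words in word_index, keeping memory proportional to the task vocabulary
# rather than the full embedding vocabulary):
#
#   vectors = load_word_vectors("vectors.bin", word_index, binary=True)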


def add_unknown_words(vectors, words, word_df, k, min_df=1):
    """
    Creates random vectors for words in :param word_df: that occur in at
    least :param min_df: documents and have no vector yet; extends
    :param vectors: and :param words: in place.
    :param word_df: dictionary of word document frequencies.
    :param k: size of embedding vectors.
    """
    wordset = set(words)
    high = 2.38 / np.sqrt(len(vectors))  # see (Bottou '88)
    start = len(vectors)
    end = start
    for word in word_df:
        if word not in wordset and word_df[word] >= min_df:
            end += 1
            words.append(word)
    vectors.resize((end, k), refcheck=False)  # grow the array in place
    for i in range(start, end):
        vectors[i] = np.random.uniform(-high, high, k)
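

# A minimal end-to-end sketch of how these helpers fit together. The default
# file names below ("train.tsv", "vectors.bin") are hypothetical placeholders,
# not files shipped with this repository.
if __name__ == "__main__":
    import sys
    train_file = sys.argv[1] if len(sys.argv) > 1 else "train.tsv"
    vec_file = sys.argv[2] if len(sys.argv) > 2 else "vectors.bin"
    sents, word_df, labels = load_sentences(train_file)
    word_index = {w: i for i, w in enumerate(sorted(word_df))}
    vectors = load_word_vectors(vec_file, word_index)
    print("%d sentences, %d labels, vectors %s" %
          (len(sents), len(labels), vectors.shape))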