-
Notifications
You must be signed in to change notification settings - Fork 0
/
input.py
51 lines (40 loc) · 1.55 KB
/
input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
import re
import itertools
from collections import Counter
def load_data_and_labels():
    """Load positive/negative examples from disk and build one-hot labels.

    Reads ``./data/pru.csv`` (positive class) and ``./data/npru.csv``
    (negative class), one example per line. Each line is stripped of
    surrounding whitespace and tokenized by splitting on single spaces.

    Returns:
        ``[x_text, y]`` where ``x_text`` is a list of token lists
        (positives first, then negatives) and ``y`` is a numpy array of
        one-hot labels: ``[0, 1]`` for positive, ``[1, 0]`` for negative.
    """
    # Context managers guarantee the file handles are closed; the original
    # code leaked them (open(...).readlines() with no close()).
    with open("./data/pru.csv", "r", encoding='utf-8') as f:
        positive_examples = [line.strip() for line in f]
    with open("./data/npru.csv", "r", encoding='utf-8') as f:
        negative_examples = [line.strip() for line in f]
    # Tokenize: positives first so label order below matches example order.
    x_text = [s.split(" ") for s in positive_examples + negative_examples]
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def pad_sentences(sentences, padding_word="<PAD/>"):
    """Pad all sentences to the length of the longest sentence.

    Args:
        sentences: list of token lists.
        padding_word: token appended to shorter sentences until every
            sentence has the same length.

    Returns:
        A new list of padded sentences; the inputs are not mutated.
    """
    max_len = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (max_len - len(sentence))
        for sentence in sentences
    ]
def build_vocab(sentences):
    """Build a word-to-index mapping from tokenized sentences.

    Args:
        sentences: list of token lists.

    Returns:
        ``[vocabulary, vocabulary_inv]`` where ``vocabulary_inv`` is the
        alphabetically sorted list of unique words and ``vocabulary`` maps
        each word to its index in that list.
    """
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # Sorting the unique words alphabetically fixes a deterministic index
    # order regardless of word frequency.
    vocabulary_inv = sorted(word_counts)
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
def build_input_data(sentences, labels, vocabulary):
    """Map sentences and labels to numpy arrays of vocabulary indices.

    Args:
        sentences: list of equal-length token lists (already padded).
        labels: list of label rows.
        vocabulary: dict mapping each word to its integer index.

    Returns:
        ``[x, y]`` where ``x`` is a 2-D numpy array of word indices and
        ``y`` is the labels as a numpy array.
    """
    indexed = [[vocabulary[word] for word in sentence] for sentence in sentences]
    x = np.array(indexed)
    y = np.array(labels)
    return [x, y]
def load_data():
    """Load, pad, and index the dataset end to end.

    Pipeline: read raw examples and labels from disk, pad every sentence
    to a common length, build the vocabulary from the padded sentences,
    then map words to indices.

    Returns:
        ``[x, y, vocabulary, vocabulary_inv]`` — index matrix, label
        array, word-to-index dict, and index-to-word list.
    """
    raw_sentences, labels = load_data_and_labels()
    padded = pad_sentences(raw_sentences)
    vocabulary, vocabulary_inv = build_vocab(padded)
    x, y = build_input_data(padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]