Update process_data.py #30

Open
wants to merge 1 commit into base: master
27 changes: 13 additions & 14 deletions process_data.py
@@ -1,21 +1,20 @@
 import numpy as np
 import theano
-import cPickle
+import pickle as cPickle
 from collections import defaultdict
 import sys, re
 import pandas as pd
 import csv
 import getpass


 def build_data_cv(datafile, cv=10, clean_string=True):
     """
     Loads data and split into 10 folds.
     """
     revs = []
     vocab = defaultdict(float)

-    with open(datafile, "rb") as csvf:
+    with open(datafile, "r") as csvf:
         csvreader=csv.reader(csvf,delimiter=',',quotechar='"')
         first_line=True
         for line in csvreader:
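The two edits in this hunk alias Python 3's pickle under the old cPickle name and open the CSV in text mode, so csv.reader receives str rows instead of bytes. A minimal sketch of the ported reading pattern, using a placeholder file name; note the csv docs also recommend newline='' on the open call, which this PR does not add:

    import csv
    import pickle as cPickle  # keeps the existing cPickle.dump(...) call sites working in Python 3

    # "essays.csv" is a placeholder path for illustration only.
    with open("essays.csv", "r", newline='') as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        for row in csvreader:
            pass  # each row arrives as a list of str fields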
@@ -93,7 +92,7 @@ def load_bin_vec(fname, vocab):
         header = f.readline()
         vocab_size, layer1_size = map(int, header.split())
         binary_len = np.dtype(theano.config.floatX).itemsize * layer1_size
-        for line in xrange(vocab_size):
+        for line in range(vocab_size):
             word = []
             while True:
                 ch = f.read(1)
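Swapping xrange for range is sufficient here, since Python 3's range is lazy like Python 2's xrange. A separate porting concern this hunk leaves untouched: assuming the vectors file is opened in binary mode (as is usual for word2vec .bin files), f.read(1) returns bytes under Python 3, so the character comparisons in the surrounding loop need bytes literals and an explicit decode. An illustrative helper, not part of this PR:

    def read_word(f):
        # Read one space-terminated word token from a word2vec-style binary file.
        # In Python 3, f.read(1) on a file opened with "rb" returns bytes.
        chars = []
        while True:
            ch = f.read(1)
            if ch == b' ' or ch == b'':
                break
            if ch != b'\n':
                chars.append(ch)
        return b''.join(chars).decode('latin-1')  # the encoding choice is an assumption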
@@ -116,7 +115,7 @@ def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
     for word in vocab:
         if word not in word_vecs and vocab[word] >= min_df:
             word_vecs[word] = np.random.uniform(-0.25,0.25,k)
-            print word
+            print(word)

 def clean_str(string, TREC=False):
     """
@@ -159,24 +158,24 @@ def get_mairesse_features(file_name):
     w2v_file = sys.argv[1]
     data_folder = sys.argv[2]
     mairesse_file = sys.argv[3]
-    print "loading data...",
+    print("loading data...", end='')
     revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
     num_words=pd.DataFrame(revs)["num_words"]
     max_l = np.max(num_words)
-    print "data loaded!"
-    print "number of status: " + str(len(revs))
-    print "vocab size: " + str(len(vocab))
-    print "max sentence length: " + str(max_l)
-    print "loading word2vec vectors...",
+    print("data loaded!")
+    print("number of status: " + str(len(revs)))
+    print("vocab size: " + str(len(vocab)))
+    print("max sentence length: " + str(max_l))
+    print("loading word2vec vectors...", end='')
     w2v = load_bin_vec(w2v_file, vocab)
-    print "word2vec loaded!"
-    print "num words already in word2vec: " + str(len(w2v))
+    print("word2vec loaded!")
+    print("num words already in word2vec: " + str(len(w2v)))
     add_unknown_words(w2v, vocab)
     W, word_idx_map = get_W(w2v)
     rand_vecs = {}
     add_unknown_words(rand_vecs, vocab)
     W2, _ = get_W(rand_vecs)
     mairesse = get_mairesse_features(mairesse_file)
     cPickle.dump([revs, W, W2, word_idx_map, vocab, mairesse], open("essays_mairesse.p", "wb"))
-    print "dataset created!"
+    print("dataset created!")