Support for multiclass, word embeddings, configuration file and new datasets #77

Open · wants to merge 11 commits into master
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ optional arguments:
-h, --help show this help message and exit
--embedding_dim EMBEDDING_DIM
Dimensionality of character embedding (default: 128)
--enable_word_embeddings
Enable/disable the word embeddings (default: True)
--filter_sizes FILTER_SIZES
Comma-separated filter sizes (default: '3,4,5')
--num_filters NUM_FILTERS
56 changes: 56 additions & 0 deletions config.yml
@@ -0,0 +1,56 @@
word_embeddings:
  # Two word embedding algorithms (word2vec and GloVe) are supported.
  # Set "default" to an empty string to disable word embeddings.
default: word2vec
word2vec:
path: ../../data/input/word_embeddings/GoogleNews-vectors-negative300.bin
dimension: 300
binary: True
glove:
path: ../../data/glove.6B.100d.txt
dimension: 100
length: 400000

datasets:
  # Three datasets are currently supported: mrpolarity, 20newsgroup and localdata
default: 20newsgroup
mrpolarity:
positive_data_file:
path: "data/rt-polaritydata/rt-polarity.pos"
info: "Data source for the positive data"
negative_data_file:
path: "data/rt-polaritydata/rt-polarity.neg"
info: "Data source for the negative data"
20newsgroup:
    # The dataset includes the following 20 newsgroups:
# alt.atheism, comp.windows.x, rec.sport.hockey, soc.religion.christian
# comp.graphics, misc.forsale, sci.crypt, talk.politics.guns
# comp.os.ms-windows.misc, rec.autos, sci.electronics, talk.politics.mideast
# comp.sys.ibm.pc.hardware, rec.motorcycles, sci.med, talk.politics.misc
# comp.sys.mac.hardware, rec.sport.baseball, sci.space, talk.religion.misc
categories:
- alt.atheism
- comp.graphics
- sci.med
- soc.religion.christian
shuffle: True
random_state: 42
localdata:
    # Load text files with categories as subfolder names.
    # Individual samples are assumed to be files stored in
    # a two-level folder structure such as the following:
# container_folder/
# category_1_folder/
# file_1.txt file_2.txt ... file_42.txt
# category_2_folder/
# file_43.txt file_44.txt ...
#
    # As an example, the SentenceCorpus dataset from
    # https://archive.ics.uci.edu/ml/datasets/Sentence+Classification
    # has been used. The dataset includes the following 3 domains:
# arxiv, jdm and plos
container_path: ../../data/input/SentenceCorpus
categories:
shuffle: True
random_state: 42
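
For orientation, the configuration above is consumed with PyYAML; the following is a minimal sketch of reading it and resolving the active dataset and embedding settings (the file name and keys match this diff, everything else is illustrative):

# Minimal sketch: load config.yml and pick the active dataset / embedding settings.
import yaml

with open("config.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

dataset_name = cfg["datasets"]["default"]            # e.g. "20newsgroup"
embedding_name = cfg["word_embeddings"]["default"]   # e.g. "word2vec", or "" to disable

if embedding_name:
    embedding_dimension = cfg["word_embeddings"][embedding_name]["dimension"]
else:
    embedding_dimension = None  # fall back to randomly initialised embeddings

print(dataset_name, embedding_name, embedding_dimension)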

150 changes: 128 additions & 22 deletions data_helpers.py
@@ -1,7 +1,7 @@
import numpy as np
import re
import itertools
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files


def clean_str(string):
@@ -25,26 +25,6 @@ def clean_str(string):
return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
"""
Loads MR polarity data from files, splits the data into words and generates labels.
Returns split sentences and labels.
"""
# Load data from files
positive_examples = list(open(positive_data_file, "r").readlines())
positive_examples = [s.strip() for s in positive_examples]
negative_examples = list(open(negative_data_file, "r").readlines())
negative_examples = [s.strip() for s in negative_examples]
# Split by words
x_text = positive_examples + negative_examples
x_text = [clean_str(sent) for sent in x_text]
# Generate labels
positive_labels = [[0, 1] for _ in positive_examples]
negative_labels = [[1, 0] for _ in negative_examples]
y = np.concatenate([positive_labels, negative_labels], 0)
return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
"""
Generates a batch iterator for a dataset.
@@ -63,3 +43,129 @@ def batch_iter(data, batch_size, num_epochs, shuffle=True):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size)
yield shuffled_data[start_index:end_index]
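
batch_iter is unchanged context from the base repository; a typical call (as in the base train.py, shown only as a reminder, with x_train/y_train assumed to be prepared elsewhere) looks like:

# Typical usage: iterate over shuffled mini-batches for a fixed number of epochs.
batches = data_helpers.batch_iter(list(zip(x_train, y_train)), batch_size=64, num_epochs=200)
for batch in batches:
    x_batch, y_batch = zip(*batch)
    # ... feed x_batch / y_batch to the model ...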


def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
"""
Retrieve data from 20 newsgroups
    :param subset: 'train', 'test' or 'all'
    :param categories: list of newsgroup names to load (all if None)
    :param shuffle: whether to shuffle the dataset
    :param random_state: seed used when shuffling the dataset
    :return: the 20 newsgroups data and labels
"""
datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
return datasets


def get_datasets_mrpolarity(positive_data_file, negative_data_file):
"""
    Loads MR polarity data from files.
    Returns a dataset dict with the raw sentences, integer targets and target names.
"""
# Load data from files
positive_examples = list(open(positive_data_file, "r").readlines())
positive_examples = [s.strip() for s in positive_examples]
negative_examples = list(open(negative_data_file, "r").readlines())
negative_examples = [s.strip() for s in negative_examples]

datasets = dict()
datasets['data'] = positive_examples + negative_examples
target = [0 for x in positive_examples] + [1 for x in negative_examples]
datasets['target'] = target
datasets['target_names'] = ['positive_examples', 'negative_examples']
return datasets


def get_datasets_localdata(container_path=None, categories=None, load_content=True,
encoding='utf-8', shuffle=True, random_state=42):
"""
Load text files with categories as subfolder names.
    Individual samples are assumed to be files stored in a two-level folder structure.
    :param container_path: path of the container folder
    :param categories: list of classes to load; all classes are loaded if empty or omitted
    :param shuffle: whether to shuffle the dataset
    :param random_state: seed used when shuffling the dataset
    :return: data and labels of the dataset
"""
datasets = load_files(container_path=container_path, categories=categories,
load_content=load_content, shuffle=shuffle, encoding=encoding,
random_state=random_state)
return datasets


def load_data_labels(datasets):
"""
    Split the dataset into cleaned sentences and one-hot labels.
    :param datasets: dataset dict with 'data', 'target' and 'target_names' keys
    :return: cleaned sentences and a one-hot label array
"""
# Split by words
x_text = datasets['data']
x_text = [clean_str(sent) for sent in x_text]
# Generate labels
labels = []
for i in range(len(x_text)):
label = [0 for j in datasets['target_names']]
label[datasets['target'][i]] = 1
labels.append(label)
y = np.array(labels)
return [x_text, y]
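
To make the one-hot encoding concrete, here is a toy call with illustrative values only:

# Toy example: three target names, two samples.
toy = {
    'data': ["God exists!", "OpenGL rendering is fast."],
    'target': [0, 1],
    'target_names': ['alt.atheism', 'comp.graphics', 'sci.med'],
}
x_text, y = load_data_labels(toy)
# x_text -> cleaned, lower-cased sentences
# y      -> array([[1, 0, 0],
#                  [0, 1, 0]])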


def load_embedding_vectors_word2vec(vocabulary, filename, binary):
    # Load embedding vectors from a word2vec-format file (binary or text).
encoding = 'utf-8'
with open(filename, "rb") as f:
header = f.readline()
vocab_size, vector_size = map(int, header.split())
# initial matrix with random uniform
embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
if binary:
binary_len = np.dtype('float32').itemsize * vector_size
for line_no in range(vocab_size):
word = []
while True:
ch = f.read(1)
if ch == b' ':
break
if ch == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
if ch != b'\n':
word.append(ch)
word = str(b''.join(word), encoding=encoding, errors='strict')
                idx = vocabulary.get(word)
                # Only copy vectors for words that are in the vocabulary
                # (index 0 is reserved by the VocabularyProcessor for unknown/padding).
                if idx is not None and idx != 0:
                    embedding_vectors[idx] = np.frombuffer(f.read(binary_len), dtype='float32')
                else:
                    f.seek(binary_len, 1)
else:
for line_no in range(vocab_size):
line = f.readline()
if line == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
parts = str(line.rstrip(), encoding=encoding, errors='strict').split(" ")
if len(parts) != vector_size + 1:
raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, vector = parts[0], list(map(np.float32, parts[1:]))
                idx = vocabulary.get(word)
                # Only copy vectors for words that are in the vocabulary.
                if idx is not None and idx != 0:
                    embedding_vectors[idx] = vector
return embedding_vectors


def load_embedding_vectors_glove(vocabulary, filename, vector_size):
    # Load embedding vectors from a GloVe text file.
    # Initialise the matrix with random uniform values; known words are overwritten below.
    embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            idx = vocabulary.get(word)
            # Only copy vectors for words that are in the vocabulary.
            if idx is not None and idx != 0:
                embedding_vectors[idx] = vector
return embedding_vectors
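
The embedding loaders expect a word-to-index mapping. train.py is not part of this excerpt, so the following is only a sketch of how they could be wired up, assuming the vocabulary comes from a tf.contrib.learn VocabularyProcessor (via its internal _mapping dict) and that the model exposes its embedding matrix as cnn.W, as in the base repository:

# Sketch only: initialise the model's embedding matrix from pre-trained vectors.
# Assumes `cfg` was loaded from config.yml, `vocab_processor` is a fitted
# VocabularyProcessor, `cnn` is a TextCNN instance and `sess` is the active session.
import data_helpers

vocabulary = vocab_processor.vocabulary_._mapping  # word -> index dict
embedding_name = cfg["word_embeddings"]["default"]

if embedding_name == "word2vec":
    init_w = data_helpers.load_embedding_vectors_word2vec(
        vocabulary,
        cfg["word_embeddings"]["word2vec"]["path"],
        cfg["word_embeddings"]["word2vec"]["binary"])
elif embedding_name == "glove":
    init_w = data_helpers.load_embedding_vectors_glove(
        vocabulary,
        cfg["word_embeddings"]["glove"]["path"],
        cfg["word_embeddings"]["glove"]["dimension"])

# The model's embedding_dim must match the pre-trained vector size.
sess.run(cnn.W.assign(init_w))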
65 changes: 53 additions & 12 deletions eval.py
@@ -3,19 +3,28 @@
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import csv
from sklearn import metrics
import yaml


def softmax(x):
"""Compute softmax values for each sets of scores in x."""
if x.ndim == 1:
x = x.reshape((1, -1))
max_x = np.max(x, axis=1).reshape((-1, 1))
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))

with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Parameters
# ==================================================

# Data Parameters
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.")

# Eval Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
@@ -34,13 +43,32 @@
print("{}={}".format(attr.upper(), value))
print("")

datasets = None

# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
if FLAGS.eval_train:
x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
if dataset_name == "mrpolarity":
datasets = data_helpers.get_datasets_mrpolarity(cfg["datasets"][dataset_name]["positive_data_file"]["path"],
cfg["datasets"][dataset_name]["negative_data_file"]["path"])
elif dataset_name == "20newsgroup":
datasets = data_helpers.get_datasets_20newsgroup(subset="test",
categories=cfg["datasets"][dataset_name]["categories"],
shuffle=cfg["datasets"][dataset_name]["shuffle"],
random_state=cfg["datasets"][dataset_name]["random_state"])
x_raw, y_test = data_helpers.load_data_labels(datasets)
y_test = np.argmax(y_test, axis=1)
print("Total number of test examples: {}".format(len(y_test)))
else:
x_raw = ["a masterpiece four years in the making", "everything is off."]
y_test = [1, 0]
if dataset_name == "mrpolarity":
datasets = {"target_names": ['positive_examples', 'negative_examples']}
x_raw = ["a masterpiece four years in the making", "everything is off."]
y_test = [1, 0]
else:
datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']}
x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
"I am in the market for a 24-bit graphics card for a PC"]
y_test = [2, 1]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
@@ -68,6 +96,9 @@
# input_y = graph.get_operation_by_name("input_y").outputs[0]
dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        scores = graph.get_operation_by_name("output/scores").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

@@ -76,20 +107,30 @@

# Collect the predictions here
all_predictions = []
all_probabilities = None

for x_test_batch in batches:
batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
all_predictions = np.concatenate([all_predictions, batch_predictions])
batch_predictions_scores = sess.run([predictions, scores], {input_x: x_test_batch, dropout_keep_prob: 1.0})
all_predictions = np.concatenate([all_predictions, batch_predictions_scores[0]])
probabilities = softmax(batch_predictions_scores[1])
if all_probabilities is not None:
all_probabilities = np.concatenate([all_probabilities, probabilities])
else:
all_probabilities = probabilities

# Print accuracy if y_test is defined
if y_test is not None:
correct_predictions = float(sum(all_predictions == y_test))
print("Total number of test examples: {}".format(len(y_test)))
print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
print(metrics.classification_report(y_test, all_predictions, target_names=datasets['target_names']))
print(metrics.confusion_matrix(y_test, all_predictions))

# Save the evaluation to a csv
predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
predictions_human_readable = np.column_stack((np.array(x_raw),
[int(prediction) for prediction in all_predictions],
[ "{}".format(probability) for probability in all_probabilities]))
out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
print("Saving evaluation to {0}".format(out_path))
with open(out_path, 'w') as f:
csv.writer(f).writerows(predictions_human_readable)
csv.writer(f).writerows(predictions_human_readable)
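
Each row of prediction.csv now carries three columns: the raw text, the predicted class index and the probability vector. A small sketch of reading it back (assuming the file was written as above):

# Sketch only: read back the three columns written by eval.py.
import csv

with open("prediction.csv") as f:
    for text, predicted_class, probabilities in csv.reader(f):
        print(predicted_class, probabilities, text[:60])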
1 change: 1 addition & 0 deletions text_cnn.py
@@ -15,6 +15,7 @@ def __init__(
self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")

# Keeping track of l2 regularization loss (optional)
l2_loss = tf.constant(0.0)
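
The diff only adds the learning_rate placeholder; the training script that consumes it is not shown. A minimal sketch of how it might be fed, assuming an Adam optimizer and a simple decay schedule (both assumptions, not part of this diff):

# Sketch only: consume cnn.learning_rate in the training loop (TensorFlow 1.x style).
# Assumes `cnn` is a TextCNN instance, `sess` the active session and `batches` a batch iterator.
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
train_op = optimizer.minimize(cnn.loss, global_step=global_step)

learning_rate = 1e-3
for step, batch in enumerate(batches):
    x_batch, y_batch = zip(*batch)
    feed_dict = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: 0.5,
        cnn.learning_rate: learning_rate,
    }
    _, loss = sess.run([train_op, cnn.loss], feed_dict)
    if step > 0 and step % 1000 == 0:
        learning_rate *= 0.95  # purely illustrative decay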