Support for multiclass, word embeddings, configuration file and new datasets #77

Open · wants to merge 11 commits into master
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ optional arguments:
-h, --help show this help message and exit
--embedding_dim EMBEDDING_DIM
Dimensionality of character embedding (default: 128)
--enable_word_embeddings
Enable/disable the word embeddings (default: True)
--filter_sizes FILTER_SIZES
Comma-separated filter sizes (default: '3,4,5')
--num_filters NUM_FILTERS
56 changes: 56 additions & 0 deletions config.yml
@@ -0,0 +1,56 @@
word_embeddings:
  # Two word embedding algorithms (word2vec and GloVe) are supported.
  # Set "default" to an empty string to disable word embeddings.
default: word2vec
word2vec:
path: ../../data/input/word_embeddings/GoogleNews-vectors-negative300.bin
dimension: 300
binary: True
glove:
path: ../../data/glove.6B.100d.txt
dimension: 100
length: 400000

datasets:
  # Three datasets are currently supported: mrpolarity, 20newsgroup and localdata
default: 20newsgroup
mrpolarity:
positive_data_file:
path: "data/rt-polaritydata/rt-polarity.pos"
info: "Data source for the positive data"
negative_data_file:
path: "data/rt-polaritydata/rt-polarity.neg"
info: "Data source for the negative data"
20newsgroup:
    # The dataset includes the following 20 newsgroups:
# alt.atheism, comp.windows.x, rec.sport.hockey, soc.religion.christian
# comp.graphics, misc.forsale, sci.crypt, talk.politics.guns
# comp.os.ms-windows.misc, rec.autos, sci.electronics, talk.politics.mideast
# comp.sys.ibm.pc.hardware, rec.motorcycles, sci.med, talk.politics.misc
# comp.sys.mac.hardware, rec.sport.baseball, sci.space, talk.religion.misc
categories:
- alt.atheism
- comp.graphics
- sci.med
- soc.religion.christian
shuffle: True
random_state: 42
localdata:
    # Load text files with categories as subfolder names.
    # Individual samples are assumed to be files stored in
    # a two-level folder structure such as the following:
# container_folder/
# category_1_folder/
# file_1.txt file_2.txt ... file_42.txt
# category_2_folder/
# file_43.txt file_44.txt ...
#
    # As an example, the SentenceCorpus dataset from
    # https://archive.ics.uci.edu/ml/datasets/Sentence+Classification
    # has been used. The dataset includes the following 3 domains:
# arxiv, jdm and plos
container_path: ../../data/input/SentenceCorpus
categories:
shuffle: True
random_state: 42
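
For orientation, the configuration above is consumed with PyYAML; the following is a minimal sketch of reading it and resolving the active dataset and embedding settings (the file name and keys match this diff, everything else is illustrative):

# Minimal sketch: load config.yml and pick the active dataset / embedding settings.
import yaml

with open("config.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

dataset_name = cfg["datasets"]["default"]            # e.g. "20newsgroup"
embedding_name = cfg["word_embeddings"]["default"]   # e.g. "word2vec", or "" to disable

if embedding_name:
    embedding_dimension = cfg["word_embeddings"][embedding_name]["dimension"]
else:
    embedding_dimension = None  # fall back to randomly initialised embeddings

print(dataset_name, embedding_name, embedding_dimension)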

150 changes: 128 additions & 22 deletions data_helpers.py
@@ -1,7 +1,7 @@
import numpy as np
import re
import itertools
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files


def clean_str(string):
@@ -25,26 +25,6 @@ def clean_str(string):
return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
"""
Loads MR polarity data from files, splits the data into words and generates labels.
Returns split sentences and labels.
"""
# Load data from files
positive_examples = list(open(positive_data_file, "r").readlines())
positive_examples = [s.strip() for s in positive_examples]
negative_examples = list(open(negative_data_file, "r").readlines())
negative_examples = [s.strip() for s in negative_examples]
# Split by words
x_text = positive_examples + negative_examples
x_text = [clean_str(sent) for sent in x_text]
# Generate labels
positive_labels = [[0, 1] for _ in positive_examples]
negative_labels = [[1, 0] for _ in negative_examples]
y = np.concatenate([positive_labels, negative_labels], 0)
return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
"""
Generates a batch iterator for a dataset.
@@ -63,3 +43,129 @@ def batch_iter(data, batch_size, num_epochs, shuffle=True):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size)
yield shuffled_data[start_index:end_index]
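
batch_iter is unchanged context from the base repository; a typical call (as in the base train.py, shown only as a reminder, with x_train/y_train assumed to be prepared elsewhere) looks like:

# Typical usage: iterate over shuffled mini-batches for a fixed number of epochs.
batches = data_helpers.batch_iter(list(zip(x_train, y_train)), batch_size=64, num_epochs=200)
for batch in batches:
    x_batch, y_batch = zip(*batch)
    # ... feed x_batch / y_batch to the model ...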


def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
"""
Retrieve data from 20 newsgroups
    :param subset: 'train', 'test' or 'all'
    :param categories: list of newsgroup names to load (all if None)
    :param shuffle: whether to shuffle the dataset
    :param random_state: seed used when shuffling the dataset
    :return: the 20 newsgroups data and labels
"""
datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
return datasets


def get_datasets_mrpolarity(positive_data_file, negative_data_file):
"""
    Loads MR polarity data from files.
    Returns a dataset dict with the raw sentences, integer targets and target names.
"""
# Load data from files
positive_examples = list(open(positive_data_file, "r").readlines())
positive_examples = [s.strip() for s in positive_examples]
negative_examples = list(open(negative_data_file, "r").readlines())
negative_examples = [s.strip() for s in negative_examples]

datasets = dict()
datasets['data'] = positive_examples + negative_examples
target = [0 for x in positive_examples] + [1 for x in negative_examples]
datasets['target'] = target
datasets['target_names'] = ['positive_examples', 'negative_examples']
return datasets


def get_datasets_localdata(container_path=None, categories=None, load_content=True,
encoding='utf-8', shuffle=True, random_state=42):
"""
Load text files with categories as subfolder names.
    Individual samples are assumed to be files stored in a two-level folder structure.
    :param container_path: path of the container folder
    :param categories: list of classes to load; all classes are loaded if empty or omitted
    :param shuffle: whether to shuffle the dataset
    :param random_state: seed used when shuffling the dataset
    :return: data and labels of the dataset
"""
datasets = load_files(container_path=container_path, categories=categories,
load_content=load_content, shuffle=shuffle, encoding=encoding,
random_state=random_state)
return datasets


def load_data_labels(datasets):
"""
    Split the dataset into cleaned sentences and one-hot labels.
    :param datasets: dataset dict with 'data', 'target' and 'target_names' keys
    :return: cleaned sentences and a one-hot label array
"""
# Split by words
x_text = datasets['data']
x_text = [clean_str(sent) for sent in x_text]
# Generate labels
labels = []
for i in range(len(x_text)):
label = [0 for j in datasets['target_names']]
label[datasets['target'][i]] = 1
labels.append(label)
y = np.array(labels)
return [x_text, y]
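
To make the one-hot encoding concrete, here is a toy call with illustrative values only:

# Toy example: three target names, two samples.
toy = {
    'data': ["God exists!", "OpenGL rendering is fast."],
    'target': [0, 1],
    'target_names': ['alt.atheism', 'comp.graphics', 'sci.med'],
}
x_text, y = load_data_labels(toy)
# x_text -> cleaned, lower-cased sentences
# y      -> array([[1, 0, 0],
#                  [0, 1, 0]])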


def load_embedding_vectors_word2vec(vocabulary, filename, binary):
    # Load embedding vectors from a word2vec-format file (binary or text).
encoding = 'utf-8'
with open(filename, "rb") as f:
header = f.readline()
vocab_size, vector_size = map(int, header.split())
# initial matrix with random uniform
embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
if binary:
binary_len = np.dtype('float32').itemsize * vector_size
for line_no in range(vocab_size):
word = []
while True:
ch = f.read(1)
if ch == b' ':
break
if ch == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
if ch != b'\n':
word.append(ch)
word = str(b''.join(word), encoding=encoding, errors='strict')
                idx = vocabulary.get(word)
                # Only copy vectors for words that are in the vocabulary
                # (index 0 is reserved by the VocabularyProcessor for unknown/padding).
                if idx is not None and idx != 0:
                    embedding_vectors[idx] = np.frombuffer(f.read(binary_len), dtype='float32')
                else:
                    f.seek(binary_len, 1)
else:
for line_no in range(vocab_size):
line = f.readline()
if line == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
parts = str(line.rstrip(), encoding=encoding, errors='strict').split(" ")
if len(parts) != vector_size + 1:
raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, vector = parts[0], list(map(np.float32, parts[1:]))
                idx = vocabulary.get(word)
                # Only copy vectors for words that are in the vocabulary.
                if idx is not None and idx != 0:
                    embedding_vectors[idx] = vector
return embedding_vectors


def load_embedding_vectors_glove(vocabulary, filename, vector_size):
    # Load embedding vectors from a GloVe text file.
    # Initialise the matrix with random uniform values; known words are overwritten below.
    embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            idx = vocabulary.get(word)
            # Only copy vectors for words that are in the vocabulary.
            if idx is not None and idx != 0:
                embedding_vectors[idx] = vector
return embedding_vectors
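
The embedding loaders expect a word-to-index mapping. train.py is not part of this excerpt, so the following is only a sketch of how they could be wired up, assuming the vocabulary comes from a tf.contrib.learn VocabularyProcessor (via its internal _mapping dict) and that the model exposes its embedding matrix as cnn.W, as in the base repository:

# Sketch only: initialise the model's embedding matrix from pre-trained vectors.
# Assumes `cfg` was loaded from config.yml, `vocab_processor` is a fitted
# VocabularyProcessor, `cnn` is a TextCNN instance and `sess` is the active session.
import data_helpers

vocabulary = vocab_processor.vocabulary_._mapping  # word -> index dict
embedding_name = cfg["word_embeddings"]["default"]

if embedding_name == "word2vec":
    init_w = data_helpers.load_embedding_vectors_word2vec(
        vocabulary,
        cfg["word_embeddings"]["word2vec"]["path"],
        cfg["word_embeddings"]["word2vec"]["binary"])
elif embedding_name == "glove":
    init_w = data_helpers.load_embedding_vectors_glove(
        vocabulary,
        cfg["word_embeddings"]["glove"]["path"],
        cfg["word_embeddings"]["glove"]["dimension"])

# The model's embedding_dim must match the pre-trained vector size.
sess.run(cnn.W.assign(init_w))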
65 changes: 53 additions & 12 deletions eval.py
@@ -3,19 +3,28 @@
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import csv
from sklearn import metrics
import yaml


def softmax(x):
"""Compute softmax values for each sets of scores in x."""
if x.ndim == 1:
x = x.reshape((1, -1))
max_x = np.max(x, axis=1).reshape((-1, 1))
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))

with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Parameters
# ==================================================

# Data Parameters
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.")

# Eval Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
@@ -34,13 +43,32 @@
print("{}={}".format(attr.upper(), value))
print("")

datasets = None

# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
if FLAGS.eval_train:
x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
if dataset_name == "mrpolarity":
datasets = data_helpers.get_datasets_mrpolarity(cfg["datasets"][dataset_name]["positive_data_file"]["path"],
cfg["datasets"][dataset_name]["negative_data_file"]["path"])
elif dataset_name == "20newsgroup":
datasets = data_helpers.get_datasets_20newsgroup(subset="test",
categories=cfg["datasets"][dataset_name]["categories"],
shuffle=cfg["datasets"][dataset_name]["shuffle"],
random_state=cfg["datasets"][dataset_name]["random_state"])
x_raw, y_test = data_helpers.load_data_labels(datasets)
y_test = np.argmax(y_test, axis=1)
print("Total number of test examples: {}".format(len(y_test)))
else:
x_raw = ["a masterpiece four years in the making", "everything is off."]
y_test = [1, 0]
if dataset_name == "mrpolarity":
datasets = {"target_names": ['positive_examples', 'negative_examples']}
x_raw = ["a masterpiece four years in the making", "everything is off."]
y_test = [1, 0]
else:
datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']}
x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
"I am in the market for a 24-bit graphics card for a PC"]
y_test = [2, 1]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
@@ -68,6 +96,9 @@
# input_y = graph.get_operation_by_name("input_y").outputs[0]
dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        scores = graph.get_operation_by_name("output/scores").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

@@ -76,20 +107,30 @@

# Collect the predictions here
all_predictions = []
all_probabilities = None

for x_test_batch in batches:
batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
all_predictions = np.concatenate([all_predictions, batch_predictions])
batch_predictions_scores = sess.run([predictions, scores], {input_x: x_test_batch, dropout_keep_prob: 1.0})
all_predictions = np.concatenate([all_predictions, batch_predictions_scores[0]])
probabilities = softmax(batch_predictions_scores[1])
if all_probabilities is not None:
all_probabilities = np.concatenate([all_probabilities, probabilities])
else:
all_probabilities = probabilities

# Print accuracy if y_test is defined
if y_test is not None:
correct_predictions = float(sum(all_predictions == y_test))
print("Total number of test examples: {}".format(len(y_test)))
print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
print(metrics.classification_report(y_test, all_predictions, target_names=datasets['target_names']))
print(metrics.confusion_matrix(y_test, all_predictions))

# Save the evaluation to a csv
predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
predictions_human_readable = np.column_stack((np.array(x_raw),
[int(prediction) for prediction in all_predictions],
[ "{}".format(probability) for probability in all_probabilities]))
out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
print("Saving evaluation to {0}".format(out_path))
with open(out_path, 'w') as f:
csv.writer(f).writerows(predictions_human_readable)
csv.writer(f).writerows(predictions_human_readable)
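
Each row of prediction.csv now carries three columns: the raw text, the predicted class index and the probability vector. A small sketch of reading it back (assuming the file was written as above):

# Sketch only: read back the three columns written by eval.py.
import csv

with open("prediction.csv") as f:
    for text, predicted_class, probabilities in csv.reader(f):
        print(predicted_class, probabilities, text[:60])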
1 change: 1 addition & 0 deletions text_cnn.py
@@ -15,6 +15,7 @@ def __init__(
self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")

# Keeping track of l2 regularization loss (optional)
l2_loss = tf.constant(0.0)
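
The diff only adds the learning_rate placeholder; the training script that consumes it is not shown. A minimal sketch of how it might be fed, assuming an Adam optimizer and a simple decay schedule (both assumptions, not part of this diff):

# Sketch only: consume cnn.learning_rate in the training loop (TensorFlow 1.x style).
# Assumes `cnn` is a TextCNN instance, `sess` the active session and `batches` a batch iterator.
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
train_op = optimizer.minimize(cnn.loss, global_step=global_step)

learning_rate = 1e-3
for step, batch in enumerate(batches):
    x_batch, y_batch = zip(*batch)
    feed_dict = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: 0.5,
        cnn.learning_rate: learning_rate,
    }
    _, loss = sess.run([train_op, cnn.loss], feed_dict)
    if step > 0 and step % 1000 == 0:
        learning_rate *= 0.95  # purely illustrative decay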