Update LSTM notebook
Ubuntu committed Sep 11, 2017
1 parent 6dad5aa commit 116e82a
Showing 1 changed file with 110 additions and 132 deletions.
242 changes: 110 additions & 132 deletions lstm_word2vec.ipynb
@@ -2,21 +2,13 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using CNTK backend\n"
]
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
@@ -35,132 +27,123 @@
"from keras.utils import to_categorical"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download the Amazon reviews training data from a public Azure blob"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 13,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": false
},
"outputs": [],
"source": [
"MAX_DOC_LENGTH = 300\n",
"MAX_NB_WORDS = 6000\n",
"EMBEDDING_DIM = 200\n",
"MAX_VOCAB_SIZE = 50000"
"CONTAINER_URL = \"https://anargyridsa.blob.core.windows.net/dlvm/\"\n",
"trainFile = \"amazon_reviews_train.csv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"trainFile = '/home/anargyri/.keras/datasets/amazon_reviews_train.csv'\n",
"\n",
"# read\n",
"train_data = pd.read_csv(trainFile, header=None, names=['rating', 'title', 'text'])"
"train_data = pd.read_csv(CONTAINER_URL + trainFile, header=None, names=['rating', 'title', 'text'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the dimensions of the input and the embedding. \n",
"\n",
"MAX_DOC_LENGTH : the size of the input i.e. the number of words in the document. Longer documents will be truncated, shorter ones will be padded with zeros.\n",
"\n",
"VOCAB_SIZE : the size of the word encoding (number of most frequent words to keep in the vocabulary)\n",
"\n",
"EMBEDDING_DIM : the dimensionality of the word embedding"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
"labels = []\n",
"texts = []\n",
"\n",
"for idx in range(train_data.shape[0]):\n",
" text = train_data['text'][idx]\n",
" texts.append(text)\n",
" labels.append(train_data['rating'][idx])"
"MAX_DOC_LENGTH = 300\n",
"VOCAB_SIZE = 6000\n",
"EMBEDDING_DIM = 200"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 16,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n",
"tokenizer.fit_on_texts(texts)"
"TEXT_COL = 'text'\n",
"LABEL_COL = 'rating'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fit a Keras tokenizer to the most frequent words using the entire training data set as the corpus."
]
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 34,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = np.zeros((len(texts), MAX_DOC_LENGTH), dtype='int32')\n",
"doc_lst = []\n",
"word_index = tokenizer.word_index\n",
"\n",
"# keep the MAX_NB_WORDS most frequent words and replace the rest with 'UNK'\n",
"# truncate to the first MAX_SENTS sentences per doc and MAX_SENT_LENGTH words per sentence\n",
"\n",
"for i, doc in enumerate(texts):\n",
" wordTokens = text_to_word_sequence(doc)\n",
" j = 0\n",
" words_in_doc = []\n",
" for _, word in enumerate(wordTokens):\n",
" if j < MAX_DOC_LENGTH: \n",
" if (word in word_index) and (word_index[word] < MAX_NB_WORDS):\n",
" data[i, j] = word_index[word]\n",
" words_in_doc.append(word)\n",
" else:\n",
" data[i, j] = MAX_NB_WORDS\n",
" words_in_doc.append('UNK')\n",
" j = j + 1\n",
" doc_lst.append(words_in_doc)"
"# tokenize, create seqs, pad\n",
"tok = Tokenizer(num_words=VOCAB_SIZE, lower=True, split=\" \")\n",
"tok.fit_on_texts(train_data[TEXT_COL])\n",
"train_seq = tok.texts_to_sequences(train_data[TEXT_COL])\n",
"train_seq = sequence.pad_sequences(train_seq, maxlen=MAX_DOC_LEN)\n",
"test_seq = tok.texts_to_sequences(test_data[TEXT_COL])\n",
"test_seq = sequence.pad_sequences(test_seq, maxlen=MAX_DOC_LEN)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert the ratings to one-hot categorical labels."
]
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total 951656 unique tokens.\n",
"Shape of data tensor: (3000000, 300)\n",
"Shape of label tensor: (3000000, 5)\n"
]
}
],
"outputs": [],
"source": [
"print('Total %s unique tokens.' % len(word_index))\n",
"\n",
"labels = to_categorical(np.asarray(labels))\n",
"x_train = data\n",
"y_train = labels[:,1:]\n",
"\n",
"print('Shape of data tensor:', x_train.shape)\n",
"print('Shape of label tensor:', y_train.shape)"
"labels = to_categorical(np.asarray(train_data[LABEL_COL]))\n",
"labels = labels[:,1:]\n",
"labels = labels.astype('float32')"
]
},
{
@@ -187,6 +170,32 @@
"n_classes = y_train.shape[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train word2vec on all the documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import nltk \n",
"\n",
"nltk.download('punkt')\n",
"\n",
"sent_lst = []\n",
"\n",
"for doc in train_data[TEXT_COL]:\n",
" sentences = nltk.tokenize.sent_tokenize(doc)\n",
" sent_lst.extend(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 57,
@@ -1243,13 +1252,18 @@
}
],
"source": [
"# train word2vec on the sentences to initialize the word embedding \n",
"import gensim, logging\n",
"\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"# use skip-gram\n",
"word2vec_model = gensim.models.Word2Vec(doc_lst, min_count=6, size=EMBEDDING_DIM, max_vocab_size=MAX_VOCAB_SIZE, sg=1, \n",
" workers=os.cpu_count())"
"word2vec_model = gensim.models.Word2Vec(sentences=sent_lst, min_count=6, size=EMBEDDING_DIM, sg=1, workers=os.cpu_count())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the initial embedding matrix from the output of word2vec."
]
},
{
@@ -1279,30 +1293,20 @@
"print('Total %s word vectors.' % len(embeddings_index))\n",
"\n",
"# Initial embedding\n",
"embedding_matrix = np.zeros((MAX_NB_WORDS + 1, EMBEDDING_DIM))\n",
"embedding_matrix = np.zeros((VOCAB_SIZE + 1, EMBEDDING_DIM))\n",
"\n",
"for word, i in word_index.items():\n",
"for word, i in tok.word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None and i < MAX_NB_WORDS:\n",
" embedding_matrix[i] = embedding_vector\n",
" elif i == MAX_NB_WORDS:\n",
" # index MAX_NB_WORDS in data corresponds to 'UNK'\n",
" embedding_matrix[i] = embeddings_index['UNK']"
" if embedding_vector is not None and i < VOCAB_SIZE:\n",
" embedding_matrix[i] = embedding_vector"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"# Order documents by the number of words \n",
"doc_lengths = [len(doc) for doc in doc_lst]\n",
"ind = np.argsort(doc_lengths)"
"LSTM_DIM is the dimensionality of each LSTM output (the number of LSTM units).\n",
"The mask_zero option determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents. CNTK / Keras does not support masking yet."
]
},
{
@@ -1335,7 +1339,7 @@
" l2_reg = regularizers.l2(reg_param)\n",
"\n",
" # model init\n",
" embedding_layer = Embedding(MAX_NB_WORDS + 1,\n",
" embedding_layer = Embedding(VOCAB_SIZE,\n",
" EMBEDDING_DIM,\n",
" input_length=MAX_DOC_LENGTH,\n",
" trainable=True,\n",
@@ -1367,15 +1371,14 @@
" batch_size=BATCH_SIZE,\n",
" epochs=NUM_EPOCHS,\n",
" callbacks=[history, csv_logger],\n",
" shuffle=False,\n",
" verbose=2)\n",
" t2 = time.time()\n",
"\n",
" # save model\n",
" model.save('./model_wvec_{}.h5'.format(reg_param))\n",
" np.savetxt('./model_wvec_{}_time.txt'.format(reg_param), \n",
" model.save('./lstm_wvec_{}_model.h5'.format(reg_param))\n",
" np.savetxt('./lstm_wvec_{}_time.txt'.format(reg_param), \n",
" [reg_param, (t2-t1) / 3600])\n",
" with open('./model_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n",
" with open('./lstm_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n",
" res_file.write(str(history.history))"
]
},
@@ -1399,31 +1402,6 @@
"source": [
"lstm_create_train(1e-10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# If you get ResourceExhaustedError, try decreasing BATCH_SIZE. This error could arise due to GPU memory limitations.\n",
"# Tensorflow automatically uses several cores and one GPU of the DSVM. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
}
],
"metadata": {
