From 116e82a7d55617332f47af0b18fa46f64ef94ed3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 11 Sep 2017 18:28:16 +0000 Subject: [PATCH] Update LSTM notebook --- lstm_word2vec.ipynb | 242 ++++++++++++++++++++------------------------ 1 file changed, 110 insertions(+), 132 deletions(-) diff --git a/lstm_word2vec.ipynb b/lstm_word2vec.ipynb index 4cb4b1e..938072f 100644 --- a/lstm_word2vec.ipynb +++ b/lstm_word2vec.ipynb @@ -2,21 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, "editable": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using CNTK backend\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "import os\n", @@ -35,25 +27,28 @@ "from keras.utils import to_categorical" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the Amazon reviews training data from a public Azure blob" + ] + }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": false }, "outputs": [], "source": [ - "MAX_DOC_LENGTH = 300\n", - "MAX_NB_WORDS = 6000\n", - "EMBEDDING_DIM = 200\n", - "MAX_VOCAB_SIZE = 50000" + "CONTAINER_URL = \"https://anargyridsa.blob.core.windows.net/dlvm/\"\n", + "trainFile = \"amazon_reviews_train.csv\"" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": { "collapsed": false, "deletable": true, @@ -61,34 +56,39 @@ }, "outputs": [], "source": [ - "trainFile = '/home/anargyri/.keras/datasets/amazon_reviews_train.csv'\n", - "\n", "# read\n", - "train_data = pd.read_csv(trainFile, header=None, names=['rating', 'title', 'text'])" + "train_data = pd.read_csv(CONTAINER_URL + trainFile, header=None, names=['rating', 'title', 'text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the dimensions of the input and the embedding. \n", + "\n", + "MAX_DOC_LENGTH : the size of the input i.e. the number of words in the document. Longer documents will be truncated, shorter ones will be padded with zeros.\n", + "\n", + "VOCAB_SIZE : the size of the word encoding (number of most frequent words to keep in the vocabulary)\n", + "\n", + "EMBEDDING_DIM : the dimensionality of the word embedding" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ - "labels = []\n", - "texts = []\n", - "\n", - "for idx in range(train_data.shape[0]):\n", - " text = train_data['text'][idx]\n", - " texts.append(text)\n", - " labels.append(train_data['rating'][idx])" + "MAX_DOC_LENGTH = 300\n", + "VOCAB_SIZE = 6000\n", + "EMBEDDING_DIM = 200" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 16, "metadata": { "collapsed": true, "deletable": true, @@ -96,13 +96,20 @@ }, "outputs": [], "source": [ - "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n", - "tokenizer.fit_on_texts(texts)" + "TEXT_COL = 'text'\n", + "LABEL_COL = 'rating'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fit a Keras tokenizer to the most frequent words using the entire training data set as the corpus." 
] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 34, "metadata": { "collapsed": true, "deletable": true, @@ -110,57 +117,33 @@ }, "outputs": [], "source": [ - "data = np.zeros((len(texts), MAX_DOC_LENGTH), dtype='int32')\n", - "doc_lst = []\n", - "word_index = tokenizer.word_index\n", - "\n", - "# keep the MAX_NB_WORDS most frequent words and replace the rest with 'UNK'\n", - "# truncate to the first MAX_SENTS sentences per doc and MAX_SENT_LENGTH words per sentence\n", - "\n", - "for i, doc in enumerate(texts):\n", - " wordTokens = text_to_word_sequence(doc)\n", - " j = 0\n", - " words_in_doc = []\n", - " for _, word in enumerate(wordTokens):\n", - " if j < MAX_DOC_LENGTH: \n", - " if (word in word_index) and (word_index[word] < MAX_NB_WORDS):\n", - " data[i, j] = word_index[word]\n", - " words_in_doc.append(word)\n", - " else:\n", - " data[i, j] = MAX_NB_WORDS\n", - " words_in_doc.append('UNK')\n", - " j = j + 1\n", - " doc_lst.append(words_in_doc)" + "# tokenize, create sequences, pad to MAX_DOC_LENGTH\n", + "tok = Tokenizer(num_words=VOCAB_SIZE, lower=True, split=\" \")\n", + "tok.fit_on_texts(train_data[TEXT_COL])\n", + "train_seq = tok.texts_to_sequences(train_data[TEXT_COL])\n", + "train_seq = sequence.pad_sequences(train_seq, maxlen=MAX_DOC_LENGTH)\n", + "test_seq = tok.texts_to_sequences(test_data[TEXT_COL])\n", + "test_seq = sequence.pad_sequences(test_seq, maxlen=MAX_DOC_LENGTH)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the ratings to one-hot categorical labels." ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "collapsed": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total 951656 unique tokens.\n", - "Shape of data tensor: (3000000, 300)\n", - "Shape of label tensor: (3000000, 5)\n" - ] - } - ], + "outputs": [], "source": [ - "print('Total %s unique tokens.' % len(word_index))\n", - "\n", - "labels = to_categorical(np.asarray(labels))\n", - "x_train = data\n", - "y_train = labels[:,1:]\n", - "\n", - "print('Shape of data tensor:', x_train.shape)\n", - "print('Shape of label tensor:', y_train.shape)" + "labels = to_categorical(np.asarray(train_data[LABEL_COL]))\n", + "labels = labels[:,1:]\n", + "labels = labels.astype('float32')" ] }, { @@ -187,6 +170,32 @@ "n_classes = y_train.shape[1]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train word2vec on all the documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import nltk \n", + "\n", + "nltk.download('punkt')\n", + "\n", + "sent_lst = []\n", + "\n", + "for doc in train_data[TEXT_COL]:\n", + " sentences = nltk.tokenize.sent_tokenize(doc)\n", + " sent_lst.extend([text_to_word_sequence(s) for s in sentences]) # gensim expects each sentence as a list of tokens" + ] + }, { "cell_type": "code", "execution_count": 57, @@ -1243,13 +1252,18 @@ } ], "source": [ - "# train word2vec on the sentences to initialize the word embedding \n", "import gensim, logging\n", "\n", "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", "# use skip-gram\n", - "word2vec_model = gensim.models.Word2Vec(doc_lst, min_count=6, size=EMBEDDING_DIM, max_vocab_size=MAX_VOCAB_SIZE, sg=1, \n", - " workers=os.cpu_count())" + "word2vec_model = gensim.models.Word2Vec(sentences=sent_lst, min_count=6, size=EMBEDDING_DIM, sg=1, workers=os.cpu_count())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the initial embedding matrix from the output of word2vec." ] }, { "cell_type": "code", @@ -1279,30 +1293,20 @@ "print('Total %s word vectors.' % len(embeddings_index))\n", "\n", "# Initial embedding\n", - "embedding_matrix = np.zeros((MAX_NB_WORDS + 1, EMBEDDING_DIM))\n", + "embedding_matrix = np.zeros((VOCAB_SIZE + 1, EMBEDDING_DIM))\n", "\n", - "for word, i in word_index.items():\n", + "for word, i in tok.word_index.items():\n", " embedding_vector = embeddings_index.get(word)\n", - " if embedding_vector is not None and i < MAX_NB_WORDS:\n", - " embedding_matrix[i] = embedding_vector\n", - " elif i == MAX_NB_WORDS:\n", - " # index MAX_NB_WORDS in data corresponds to 'UNK'\n", - " embedding_matrix[i] = embeddings_index['UNK']" + " if embedding_vector is not None and i < VOCAB_SIZE:\n", + " embedding_matrix[i] = embedding_vector" ] }, { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "# Order documents by the number of words \n", - "doc_lengths = [len(doc) for doc in doc_lst]\n", - "ind = np.argsort(doc_lengths)" + "LSTM_DIM is the dimensionality of each LSTM output (the number of LSTM units).\n", + "The mask_zero option determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents. The CNTK backend of Keras does not support masking yet."
] }, { @@ -1335,7 +1339,7 @@ " l2_reg = regularizers.l2(reg_param)\n", "\n", " # model init\n", - " embedding_layer = Embedding(MAX_NB_WORDS + 1,\n", + " embedding_layer = Embedding(VOCAB_SIZE + 1,\n", " EMBEDDING_DIM,\n", " input_length=MAX_DOC_LENGTH,\n", " trainable=True,\n", @@ -1367,15 +1371,14 @@ " batch_size=BATCH_SIZE,\n", " epochs=NUM_EPOCHS,\n", " callbacks=[history, csv_logger],\n", - " shuffle=False,\n", " verbose=2)\n", " t2 = time.time()\n", "\n", " # save model\n", - " model.save('./model_wvec_{}.h5'.format(reg_param))\n", - " np.savetxt('./model_wvec_{}_time.txt'.format(reg_param), \n", + " model.save('./lstm_wvec_{}_model.h5'.format(reg_param))\n", + " np.savetxt('./lstm_wvec_{}_time.txt'.format(reg_param), \n", " [reg_param, (t2-t1) / 3600])\n", - " with open('./model_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n", + " with open('./lstm_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n", " res_file.write(str(history.history))" ] }, @@ -1399,31 +1402,6 @@ "source": [ "lstm_create_train(1e-10)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# If you get ResourceExhaustedError, try decreasing BATCH_SIZE. This error could arise due to GPU memory limitations.\n", - "# Tensorflow automatically uses several cores and one GPU of the DSVM. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] } ], "metadata": {