diff --git a/hatt_train_small.ipynb b/hatt_train_small.ipynb deleted file mode 100644 index adc0ee4..0000000 --- a/hatt_train_small.ipynb +++ /dev/null @@ -1,954 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "The code in this notebook is based on [Richard Liao's implementation of hierarchical attention networks](https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py) and a related [Google group discussion](https://groups.google.com/forum/#!topic/keras-users/IWK9opMFavQ). The notebook also includes code from [Keras documentation](https://keras.io/) and [blog](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) as well as this [word2vec tutorial](http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "To enable Theano to run on a single GPU: \n", - "\n", - "* check the following dependencies: \n", - "\n", - " `conda install pygpu`\n", - " \n", - "\n", - "* Replace $HOME/.theanorc with this:\n", - "```\n", - "[global]\n", - "floatX = float32\n", - "device = gpu0\n", - "[lib]\n", - "gpuarray.preallocate=1\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, - "scrolled": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:\n", - " https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gpu0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5110)\n" - ] - } - ], - "source": [ - "import os \n", - "os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu0'\n", - "os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda-8.0/bin'\n", - "import theano\n", - "print(theano.config.device) " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using Theano backend.\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from collections import defaultdict\n", - "import os \n", - "os.environ['KERAS_BACKEND'] = 'theano'\n", - "import subprocess\n", - "import time\n", - "\n", - "from keras.preprocessing.text import Tokenizer, text_to_word_sequence\n", - "from keras.preprocessing.sequence import pad_sequences\n", - "from keras.utils.np_utils import to_categorical\n", - "from keras.optimizers import SGD\n", - "\n", - "from keras.layers import Embedding\n", - "from keras.layers import Dense, Input, Flatten\n", - "from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed\n", - "from keras.models import Model, load_model\n", - "\n", - "from keras import backend as K\n", - "from keras.engine.topology import Layer, InputSpec\n", - "from keras import initializers, regularizers, optimizers\n", - "from keras.callbacks import History, CSVLogger" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Download the book reviews data from Azure Machine Learning" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "from azureml import Workspace\n", - "ws = Workspace(\n", - " workspace_id='817780d9ee0d4a878e25f8c9deb3b866',\n", - " authorization_token='6df8a52943bd49eba6e57446bc73f5fc',\n", - " endpoint='https://studioapi.azureml.net'\n", - ")\n", - "ds = ws.datasets['Book Reviews from Amazon']\n", - "all_data = ds.to_dataframe()\n", - "all_data.rename(columns={0: 'rating', 1: 'text'}, inplace=True)\n", - "all_data.loc[:, 'rating'] = all_data['rating'] - 1 # reindex ratings to start from 0\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "from azureml import Workspace\n", - "ws = Workspace(\n", - " workspace_id='817780d9ee0d4a878e25f8c9deb3b866',\n", - " authorization_token='6df8a52943bd49eba6e57446bc73f5fc',\n", - " endpoint='https://studioapi.azureml.net'\n", - ")\n", - "ds = ws.datasets['dfe_happysad_utf.csv']\n", - "all_data = ds.to_dataframe()\n", - "all_data.rename(columns={'features': 'text', 'label': 'rating'}, inplace=True)\n", - "all_data.replace({'rating': {'sadness': 0, 'happiness': 1}}, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Split data into a training and a test set. " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "n_tr = 7500\n", - "\n", - "ind_range = np.arange(all_data.shape[0])\n", - "tr_ind = np.random.choice(ind_range, n_tr, replace=False)\n", - "\n", - "train_data = all_data.iloc[tr_ind, :]\n", - "test_data = all_data.iloc[np.setdiff1d(ind_range, tr_ind), :]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Set the dimensions of the input and the embedding. Because of the hierarchical nature of the network, the input has to be a 3-dimensional tensor of fixed size (sample_size x n_sentences x n_words). \n", - "\n", - "MAX_SENT_LEN : the number of words in each sentence. \n", - "\n", - "MAX_SENTS : the number of sentences in each document.\n", - "\n", - "Longer documents and sentences will be truncated, shorter ones will be padded with zeros.\n", - "\n", - "MAX_NB_WORDS : the size of the word encoding (number of most frequent words to keep in the vocabulary)\n", - "\n", - "EMBEDDING_DIM : the dimensionality of the word embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "MAX_SENT_LENGTH = 100\n", - "MAX_SENTS = 30\n", - "MAX_NB_WORDS = 20000\n", - "EMBEDDING_DIM = 200" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Fit a Keras tokenizer to the most frequent words using the entire training data set as the corpus.\n", - "Create the training data in the 3d format required. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt to /home/anargyri/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - } - ], - "source": [ - "import nltk \n", - "\n", - "nltk.download('punkt')\n", - "\n", - "reviews = []\n", - "labels = []\n", - "texts = []\n", - "\n", - "for idx in range(train_data.shape[0]):\n", - " text = train_data['text'].iloc[idx]\n", - " texts.append(text)\n", - " sentences = nltk.tokenize.sent_tokenize(text)\n", - " reviews.append(sentences)\n", - " labels.append(train_data['rating'].iloc[idx])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n", - "tokenizer.fit_on_texts(texts)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')\n", - "doc_lst = []\n", - "\n", - "# keep the MAX_NB_WORDS most frequent words and replace the rest with 'UNK'\n", - "# truncate to the first MAX_SENTS sentences per doc and MAX_SENT_LENGTH words per sentence\n", - "\n", - "for i, sentences in enumerate(reviews):\n", - " for j, sent in enumerate(sentences):\n", - " if j < MAX_SENTS:\n", - " wordTokens = text_to_word_sequence(sent)\n", - " k = 0\n", - " words_in_sent = []\n", - " for _, word in enumerate(wordTokens):\n", - " if k < MAX_SENT_LENGTH: \n", - " if (word in tokenizer.word_index) and (tokenizer.word_index[word] < MAX_NB_WORDS):\n", - " data[i, j, k] = tokenizer.word_index[word]\n", - " words_in_sent.append(word)\n", - " else:\n", - " data[i, j, k] = MAX_NB_WORDS\n", - " words_in_sent.append('UNK')\n", - " k = k + 1\n", - " doc_lst.append(words_in_sent)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Convert the ratings to one-hot categorical labels." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total 14725 unique tokens.\n", - "Shape of data tensor: (7500, 30, 100)\n", - "Shape of label tensor: (7500, 2)\n" - ] - } - ], - "source": [ - "word_index = tokenizer.word_index\n", - "print('Total %s unique tokens.' % len(word_index))\n", - "\n", - "y_train = to_categorical(np.asarray(labels))\n", - "x_train = data\n", - "\n", - "print('Shape of data tensor:', x_train.shape)\n", - "print('Shape of label tensor:', y_train.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "n_classes = y_train.shape[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Train word2vec on the training documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-09-15 17:37:26,241 : INFO : collecting all words and their counts\n", - "2017-09-15 17:37:26,242 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", - "2017-09-15 17:37:26,262 : INFO : PROGRESS: at sentence #10000, processed 78426 words, keeping 12124 word types\n", - "2017-09-15 17:37:26,270 : INFO : collected 14725 word types from a corpus of 104038 raw words and 13287 sentences\n", - "2017-09-15 17:37:26,270 : INFO : Loading a fresh vocabulary\n", - "2017-09-15 17:37:26,281 : INFO : min_count=6 retains 1585 unique words (10% of original 14725, drops 13140)\n", - "2017-09-15 17:37:26,282 : INFO : min_count=6 leaves 85487 word corpus (82% of original 104038, drops 18551)\n", - "2017-09-15 17:37:26,287 : INFO : deleting the raw counts dictionary of 14725 items\n", - "2017-09-15 17:37:26,288 : INFO : sample=0.001 downsamples 73 most-common words\n", - "2017-09-15 17:37:26,289 : INFO : downsampling leaves estimated 62674 word corpus (73.3% of prior 85487)\n", - "2017-09-15 17:37:26,290 : INFO : estimated required memory for 1585 words and 200 dimensions: 3328500 bytes\n", - "2017-09-15 17:37:26,294 : INFO : resetting layer weights\n", - "2017-09-15 17:37:26,317 : INFO : training model with 24 workers on 1585 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=5\n", - "2017-09-15 17:37:26,616 : INFO : worker thread finished; awaiting finish of 23 more threads\n", - "2017-09-15 17:37:26,631 : INFO : worker thread finished; awaiting finish of 22 more threads\n", - "2017-09-15 17:37:26,632 : INFO : worker thread finished; awaiting finish of 21 more threads\n", - "2017-09-15 17:37:26,633 : INFO : worker thread finished; awaiting finish of 20 more threads\n", - "2017-09-15 17:37:26,634 : INFO : worker thread finished; awaiting finish of 19 more threads\n", - "2017-09-15 17:37:26,656 : INFO : worker thread finished; awaiting finish of 18 more threads\n", - "2017-09-15 17:37:26,657 : INFO : worker thread finished; awaiting finish of 17 more threads\n", - "2017-09-15 17:37:26,658 : INFO : worker thread finished; awaiting finish of 16 more threads\n", - "2017-09-15 17:37:26,659 : INFO : worker thread finished; awaiting finish of 15 more threads\n", - "2017-09-15 17:37:26,659 : INFO : worker thread finished; awaiting finish of 14 more threads\n", - "2017-09-15 17:37:26,660 : INFO : worker thread finished; awaiting finish of 13 more threads\n", - "2017-09-15 17:37:26,667 : INFO : worker thread finished; awaiting finish of 12 more threads\n", - "2017-09-15 17:37:26,669 : INFO : worker thread finished; awaiting finish of 11 more threads\n", - "2017-09-15 17:37:26,675 : INFO : worker thread finished; awaiting finish of 10 more threads\n", - "2017-09-15 17:37:26,676 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2017-09-15 17:37:26,678 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2017-09-15 17:37:26,691 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2017-09-15 17:37:26,692 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2017-09-15 17:37:26,698 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2017-09-15 17:37:26,701 : INFO : worker thread finished; awaiting finish of 4 more 
threads\n", - "2017-09-15 17:37:26,702 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2017-09-15 17:37:26,705 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2017-09-15 17:37:26,708 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2017-09-15 17:37:26,709 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2017-09-15 17:37:26,710 : INFO : training on 520190 raw words (313251 effective words) took 0.4s, 836410 effective words/s\n", - "2017-09-15 17:37:26,711 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" - ] - } - ], - "source": [ - "# train word2vec on the sentences to initialize the word embedding \n", - "import gensim, logging\n", - "\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", - "# use skip-gram\n", - "word2vec_model = gensim.models.Word2Vec(doc_lst, min_count=6, size=EMBEDDING_DIM, sg=1, workers=os.cpu_count())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Create the initial embedding matrix from the output of word2vec." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total 1585 word vectors.\n" - ] - } - ], - "source": [ - "embeddings_index = {}\n", - "\n", - "for word in word2vec_model.wv.vocab:\n", - " coefs = np.asarray(word2vec_model.wv[word], dtype='float32')\n", - " embeddings_index[word] = coefs\n", - "\n", - "print('Total %s word vectors.' % len(embeddings_index))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# Initial embedding\n", - "embedding_matrix = np.zeros((MAX_NB_WORDS + 1, EMBEDDING_DIM))\n", - "\n", - "for word, i in word_index.items():\n", - " embedding_vector = embeddings_index.get(word)\n", - " if embedding_vector is not None and i < MAX_NB_WORDS:\n", - " embedding_matrix[i] = embedding_vector\n", - " elif i == MAX_NB_WORDS:\n", - " # index MAX_NB_WORDS in data corresponds to 'UNK'\n", - " embedding_matrix[i] = embeddings_index['UNK']" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "source": [ - "Define the network.\n", - "The mask_zero option determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents." - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# building Hierachical Attention network\n", - "\n", - "REG_PARAM = 1e2\n", - "l2_reg = regularizers.l2(REG_PARAM)\n", - "\n", - "embedding_layer = Embedding(MAX_NB_WORDS + 1,\n", - " EMBEDDING_DIM,\n", - " input_length=MAX_SENT_LENGTH,\n", - " trainable=True,\n", - " mask_zero=True,\n", - " embeddings_regularizer=l2_reg,\n", - " weights=[embedding_matrix])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Define a custom layer implementing the attention mechanism." 
- ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "CONTEXT_DIM = 100\n", - "\n", - "class AttLayer(Layer):\n", - " def __init__(self, regularizer=None, **kwargs):\n", - " self.regularizer = regularizer\n", - " self.supports_masking = True\n", - " super(AttLayer, self).__init__(**kwargs)\n", - "\n", - " def build(self, input_shape):\n", - " assert len(input_shape) == 3 \n", - " self.W = self.add_weight(name='W', shape=(input_shape[-1], CONTEXT_DIM), initializer='normal', trainable=True, \n", - " regularizer=self.regularizer)\n", - " self.b = self.add_weight(name='b', shape=(CONTEXT_DIM,), initializer='normal', trainable=True, \n", - " regularizer=self.regularizer)\n", - " self.u = self.add_weight(name='u', shape=(CONTEXT_DIM,), initializer='normal', trainable=True, \n", - " regularizer=self.regularizer) \n", - " super(AttLayer, self).build(input_shape) # be sure you call this somewhere!\n", - "\n", - " def call(self, x, mask=None):\n", - " eij = K.dot(K.tanh(K.dot(x, self.W) + self.b), self.u)\n", - " ai = K.exp(eij)\n", - " alphas = ai / K.sum(ai, axis=1).dimshuffle(0, 'x')\n", - " if mask is not None:\n", - " # use only the inputs specified by the mask\n", - " alphas *= mask\n", - " weighted_input = x * alphas.dimshuffle(0, 1, 'x')\n", - " return weighted_input.sum(axis=1)\n", - "\n", - " def compute_output_shape(self, input_shape):\n", - " return (input_shape[0], input_shape[-1])\n", - " \n", - " def get_config(self):\n", - " config = {}\n", - " base_config = super(AttLayer, self).get_config()\n", - " return dict(list(base_config.items()) + list(config.items()))\n", - "\n", - " def compute_mask(self, inputs, mask):\n", - " return None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "GRU_UNITS is the dimensionality of each GRU output (the number of GRU units). GPU_IMPL = 2 selects the matricized RNN implementation (the `implementation` argument of GRU), which is better suited to training on a GPU. \n", - "\n", - "There are two levels of models in the definition. The sentence model `sentEncoder` is shared across all sentences in the input document. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "GPU_IMPL = 2 \n", - "GRU_UNITS = 50 \n", - "\n", - "sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')\n", - "embedded_sequences = embedding_layer(sentence_input)\n", - "l_lstm = Bidirectional(GRU(GRU_UNITS, return_sequences=True, kernel_regularizer=l2_reg, implementation=GPU_IMPL))(embedded_sequences)\n", - "l_att = AttLayer(regularizer=l2_reg)(l_lstm) \n", - "sentEncoder = Model(sentence_input, l_att)\n", - "\n", - "review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')\n", - "review_encoder = TimeDistributed(sentEncoder)(review_input)\n", - "l_lstm_sent = Bidirectional(GRU(GRU_UNITS, return_sequences=True, kernel_regularizer=l2_reg, implementation=GPU_IMPL))(review_encoder)\n", - "l_att_sent = AttLayer(regularizer=l2_reg)(l_lstm_sent) \n", - "preds = Dense(n_classes, activation='softmax', kernel_regularizer=l2_reg)(l_att_sent)\n", - "model = Model(review_input, preds)" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "model.compile(loss='categorical_crossentropy',\n", - " optimizer=optimizers.SGD(lr=0.01, momentum=0.9),\n", - " metrics=['acc'])" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "input_10 (InputLayer) (None, 30, 100) 0 \n", - "_________________________________________________________________\n", - "time_distributed_5 (TimeDist (None, 30, 100) 4085700 \n", - "_________________________________________________________________\n", - "bidirectional_10 (Bidirectio (None, 30, 100) 45300 \n", - "_________________________________________________________________\n", - "att_layer_10 (AttLayer) (None, 100) 10200 \n", - "_________________________________________________________________\n", - "dense_5 (Dense) (None, 2) 202 \n", - "=================================================================\n", - "Total params: 4,141,402\n", - "Trainable params: 4,141,402\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "ref_str = 'tweets'\n", - "history = History()\n", - "csv_logger = CSVLogger('./hatt_model_' + str(REG_PARAM) + '_' + ref_str + '.log',\n", - " separator=',',\n", - " append=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Order training data by the number of sentences in document (as suggested in the [Yang et al.] paper)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "doc_lengths = [len(r) for r in reviews]\n", - "ind = np.argsort(doc_lengths)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model fitting - Hierachical attention network\n", - "Epoch 1/10\n", - "98s - loss: 26514.2522 - acc: 0.5324\n", - "Epoch 2/10\n", - "97s - loss: 0.8006 - acc: 0.5104\n", - "Epoch 3/10\n", - "97s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 4/10\n", - "98s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 5/10\n", - "97s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 6/10\n", - "97s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 7/10\n", - "97s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 8/10\n", - "97s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 9/10\n", - "98s - loss: 0.6942 - acc: 0.5104\n", - "Epoch 10/10\n", - "97s - loss: 0.6942 - acc: 0.5104\n" - ] - } - ], - "source": [ - "t1 = time.time()\n", - "\n", - "print(\"model fitting - Hierachical attention network\")\n", - "model.fit(x_train[ind,:,:], y_train[ind,:], epochs=10, batch_size=64, shuffle=False, \n", - " callbacks=[history, csv_logger], verbose=2)\n", - "\n", - "t2 = time.time()" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# save model\n", - "model.save('./hatt_model_{0}_{1}.h5'.format(REG_PARAM, ref_str))" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "np.savetxt('./hatt_model_{0}_{1}_time.txt'.format(REG_PARAM, ref_str), [REG_PARAM, (t2-t1) / 3600])\n", - "with open('./hatt_model_{0}_{1}_history.txt'.format(REG_PARAM, ref_str), \"w\") as res_file:\n", - " res_file.write(str(history.history))" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "test_reviews = []\n", - "test_labels = []\n", - "test_texts = []\n", - "\n", - "for idx in range(test_data.shape[0]):\n", - " text = test_data['text'].iloc[idx]\n", - " test_texts.append(text)\n", - " sentences = nltk.tokenize.sent_tokenize(text)\n", - " test_reviews.append(sentences)\n", - " test_labels.append(test_data['rating'].iloc[idx])" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "data2 = np.zeros((len(test_texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')\n", - "\n", - "for i, sentences in enumerate(test_reviews):\n", - " for j, sent in enumerate(sentences):\n", - " if j < MAX_SENTS:\n", - " wordTokens = text_to_word_sequence(sent)\n", - " k = 0\n", - " words_in_sent = []\n", - " for _, word in enumerate(wordTokens):\n", - " if k < MAX_SENT_LENGTH: \n", - " if (word in tokenizer.word_index) and (tokenizer.word_index[word] < MAX_NB_WORDS):\n", - " data2[i, j, k] = tokenizer.word_index[word]\n", - " words_in_sent.append(word)\n", - " else:\n", - " data2[i, j, k] = MAX_NB_WORDS\n", - " words_in_sent.append('UNK')\n", - " k = k + 1" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": { - 
"collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "y_test = to_categorical(np.asarray(test_labels))\n", - "x_test = data2" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.51397624039133472" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds = model.predict(x_test)\n", - "accuracy_score(test_labels, preds.argmax(axis=1))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}