# The code in this notebook is based on the Keras documentation
# (https://keras.io/) and blog
# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
# as well as this word2vec tutorial
# (http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/).

import numpy as np
import os
import pandas as pd
import pickle
import time

# Select the CNTK backend; this is set before the keras imports below.
os.environ['KERAS_BACKEND']='cntk'
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical

# ---------------------------------------------------------------------------
# Download the book reviews data from Azure Machine Learning
# ---------------------------------------------------------------------------
from azureml import Workspace


def _require_env(name):
    """Return the value of environment variable `name`; raise if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            'Environment variable {} must be set to access the Azure ML workspace'.format(name))
    return value


# Credentials are read from the environment instead of being stored in the
# notebook. NOTE(review): the workspace id / token that used to be hardcoded
# here have been committed to version control and should be revoked.
ws = Workspace(
    workspace_id=_require_env('AZUREML_WORKSPACE_ID'),
    authorization_token=_require_env('AZUREML_AUTHORIZATION_TOKEN'),
    endpoint=os.environ.get('AZUREML_ENDPOINT', 'https://studioapi.azureml.net')
)
ds = ws.datasets['Book Reviews from Amazon']

# Columns 0 and 1 become 'rating' and 'text'; ratings are reindexed to start from 0.
all_data = (
    ds.to_dataframe()
    .rename(columns={0: 'rating', 1: 'text'})
    .assign(rating=lambda frame: frame['rating'] - 1)
)

# Alternative data set that was previously kept here as a commented-out cell:
#   ds = ws.datasets['dfe_happysad_utf.csv']
#   all_data = ds.to_dataframe().rename(columns={'features': 'text', 'label': 'rating'})
#   all_data = all_data.replace({'rating': {'sadness': 0, 'happiness': 1}})
# NOTE(review): the saved outputs further down (two classes, 'tweets' file tag)
# look like they were produced with this alternative data set - confirm.

# ---------------------------------------------------------------------------
# Split data into a training and a test set.
# ---------------------------------------------------------------------------
n_tr = 7500
SEED = 42  # fixed seed so that the split is reproducible across kernel restarts

if n_tr > all_data.shape[0]:
    raise ValueError(
        'n_tr={} exceeds the number of available rows ({})'.format(n_tr, all_data.shape[0]))

rng = np.random.RandomState(SEED)
ind_range = np.arange(all_data.shape[0])
tr_ind = rng.choice(ind_range, n_tr, replace=False)

train_data = all_data.iloc[tr_ind, :]
# Every row that was not sampled for training goes to the test set.
test_data = all_data.iloc[np.setdiff1d(ind_range, tr_ind), :]
# Set the dimensions of the input and the embedding.
#
# MAX_DOC_LEN   : the size of the input, i.e. the number of words in the
#                 document. Longer documents will be truncated, shorter ones
#                 will be padded with zeros.
# VOCAB_SIZE    : the size of the word encoding (number of most frequent
#                 words to keep in the vocabulary)
# EMBEDDING_DIM : the dimensionality of the word embedding
MAX_DOC_LEN = 300
VOCAB_SIZE = 6000
EMBEDDING_DIM = 200

# Names of the text and label columns in the train / test frames.
TEXT_COL = 'text'
LABEL_COL = 'rating'

# Fit a Keras tokenizer to the most frequent words, using the entire training
# data set as the corpus.
tok = Tokenizer(num_words=VOCAB_SIZE, lower=True, split=" ")
tok.fit_on_texts(train_data[TEXT_COL])


def _encode_texts(texts):
    """Map `texts` to integer sequences with `tok` and pad them to MAX_DOC_LEN."""
    return sequence.pad_sequences(tok.texts_to_sequences(texts), maxlen=MAX_DOC_LEN)


train_seq = _encode_texts(train_data[TEXT_COL])
test_seq = _encode_texts(test_data[TEXT_COL])
# Convert the ratings to one-hot categorical labels.
labels = to_categorical(np.asarray(train_data[LABEL_COL])).astype('float32')

# Number of classes = number of columns of the one-hot label matrix.
n_classes = labels.shape[1]

# ---------------------------------------------------------------------------
# Build the sentence corpus on which word2vec is trained in order to
# initialize the word embedding.
# ---------------------------------------------------------------------------
import nltk

nltk.download('punkt')

# One list of tokens per sentence of every training document. Only
# alphanumeric tokens are kept. Tokens are lower-cased because the Keras
# tokenizer is created with lower=True: the embedding matrix is filled by
# looking up the (lower-case) keys of tok.word_index in the word2vec
# vocabulary, so a corpus in original casing would leave capitalised words
# without a matching vector and split the counts of e.g. "The" / "the".
sent_lst = [
    [w.lower() for w in nltk.tokenize.word_tokenize(sent) if w.isalnum()]
    for doc in train_data[TEXT_COL]
    for sent in nltk.tokenize.sent_tokenize(doc)
]
keeping 13063 word types\n", - "2017-09-15 11:21:16,459 : INFO : collected 15977 word types from a corpus of 100883 raw words and 13257 sentences\n", - "2017-09-15 11:21:16,460 : INFO : Loading a fresh vocabulary\n", - "2017-09-15 11:21:16,470 : INFO : min_count=6 retains 1613 unique words (10% of original 15977, drops 14364)\n", - "2017-09-15 11:21:16,471 : INFO : min_count=6 leaves 80706 word corpus (79% of original 100883, drops 20177)\n", - "2017-09-15 11:21:16,476 : INFO : deleting the raw counts dictionary of 15977 items\n", - "2017-09-15 11:21:16,478 : INFO : sample=0.001 downsamples 65 most-common words\n", - "2017-09-15 11:21:16,479 : INFO : downsampling leaves estimated 59121 word corpus (73.3% of prior 80706)\n", - "2017-09-15 11:21:16,479 : INFO : estimated required memory for 1613 words and 200 dimensions: 3387300 bytes\n", - "2017-09-15 11:21:16,484 : INFO : resetting layer weights\n", - "2017-09-15 11:21:16,520 : INFO : training model with 24 workers on 1613 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=5\n", - "2017-09-15 11:21:16,831 : INFO : worker thread finished; awaiting finish of 23 more threads\n", - "2017-09-15 11:21:16,833 : INFO : worker thread finished; awaiting finish of 22 more threads\n", - "2017-09-15 11:21:16,844 : INFO : worker thread finished; awaiting finish of 21 more threads\n", - "2017-09-15 11:21:16,846 : INFO : worker thread finished; awaiting finish of 20 more threads\n", - "2017-09-15 11:21:16,848 : INFO : worker thread finished; awaiting finish of 19 more threads\n", - "2017-09-15 11:21:16,854 : INFO : worker thread finished; awaiting finish of 18 more threads\n", - "2017-09-15 11:21:16,858 : INFO : worker thread finished; awaiting finish of 17 more threads\n", - "2017-09-15 11:21:16,861 : INFO : worker thread finished; awaiting finish of 16 more threads\n", - "2017-09-15 11:21:16,865 : INFO : worker thread finished; awaiting finish of 15 more threads\n", - "2017-09-15 11:21:16,880 : INFO : 
# Train word2vec on the training documents in order to initialize the word
# embedding. Ignore rare words (min_count=6). Use skip-gram as the training
# algorithm (sg=1).
import gensim, logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single worker instead of handing None to gensim.
n_workers = os.cpu_count() or 1

# use skip-gram
word2vec_model = gensim.models.Word2Vec(sentences=sent_lst,
                                        min_count=6,
                                        size=EMBEDDING_DIM,
                                        sg=1,
                                        workers=n_workers)

# ---------------------------------------------------------------------------
# Create the initial embedding matrix from the output of word2vec.
# ---------------------------------------------------------------------------
# word -> float32 vector, for every word retained by word2vec
embeddings_index = {
    word: np.asarray(word2vec_model.wv[word], dtype='float32')
    for word in word2vec_model.wv.vocab
}

print('Total %s word vectors.' % len(embeddings_index))

# Initial embedding: row i holds the vector of the word with tokenizer index i.
# Rows of words unknown to word2vec stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

n_initialized = 0
for word, i in tok.word_index.items():
    if i >= VOCAB_SIZE:
        # index outside of the embedding matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        n_initialized += 1

# Coverage report: a low number here means most embedding rows start at zero.
print('Initialized %s of %s embedding rows from word2vec.' % (n_initialized, VOCAB_SIZE))
# LSTM_DIM is the dimensionality of each LSTM output (the number of LSTM units).
# The mask_zero option of the embedding layer determines whether masking is
# performed, i.e. whether the layers ignore the padded zeros in shorter
# documents.
BATCH_SIZE = 100
NUM_EPOCHS = 10
LSTM_DIM = 100
LEARNING_RATE = 0.01
SECONDS_PER_HOUR = 3600


def _make_optimizer():
    """Return a new SGD optimizer configured with LEARNING_RATE.

    NOTE(review): nesterov=True is combined with the default momentum; confirm
    whether a non-zero momentum was intended, since the Nesterov term is
    scaled by the momentum - TODO check against the Keras SGD documentation.
    """
    return SGD(lr=LEARNING_RATE, nesterov=True)


# Kept so that code referring to the module-level instance keeps working;
# lstm_create_train no longer shares it between models.
OPTIMIZER = _make_optimizer()


def lstm_create_train(reg_param, ref_str):
    """Build, train and save a bidirectional LSTM classifier.

    The network is Embedding (initialized from `embedding_matrix`, trainable)
    -> Bidirectional(LSTM) -> Dense softmax over `n_classes`. The same L2
    penalty is applied to the embedding weights and to the LSTM and Dense
    kernels. The model is fitted on the module-level `train_seq` / `labels`.

    Files written to the current directory (names contain both arguments):
      * lstm_model_wvec_<reg_param>_<ref_str>.log    - per-epoch CSV log (appended to)
      * lstm_wvec_<reg_param>_<ref_str>_model.h5     - the trained model
      * lstm_wvec_<reg_param>_<ref_str>_time.txt     - reg_param and training time in hours
      * lstm_wvec_<reg_param>_<ref_str>_history.txt  - str() of the training history dict

    Parameters
    ----------
    reg_param : float
        L2 regularization strength.
    ref_str : str
        Free-form tag used in the output file names.

    Returns
    -------
    tuple
        (trained model, training history dict). Earlier versions returned
        nothing, so existing callers that ignore the result are unaffected.
    """
    l2_reg = regularizers.l2(reg_param)

    # model init
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE,
                        EMBEDDING_DIM,
                        input_length=MAX_DOC_LEN,
                        trainable=True,
                        mask_zero=False,
                        embeddings_regularizer=l2_reg,
                        weights=[embedding_matrix]))
    model.add(Bidirectional(LSTM(units=LSTM_DIM, kernel_regularizer=l2_reg)))
    model.add(Dense(n_classes, activation='softmax', kernel_regularizer=l2_reg))

    # A fresh optimizer per model: optimizer instances carry state and should
    # not be shared between the models of a hyper-parameter sweep.
    model.compile(loss='categorical_crossentropy',
                  optimizer=_make_optimizer(),
                  metrics=['acc'])

    csv_logger = CSVLogger('./lstm_model_wvec_{0}_{1}.log'.format(reg_param, ref_str),
                           separator=',',
                           append=True)

    print("Training model with regularization parameter = {}".format(reg_param))
    t1 = time.time()
    # model fit; fit() returns the History object, so no explicit History
    # callback is needed
    history = model.fit(train_seq,
                        labels.astype('float32'),
                        batch_size=BATCH_SIZE,
                        epochs=NUM_EPOCHS,
                        callbacks=[csv_logger],
                        verbose=2)
    t2 = time.time()
    print("\n")

    # save model, timing and history
    model.save('./lstm_wvec_{0}_{1}_model.h5'.format(reg_param, ref_str))
    np.savetxt('./lstm_wvec_{0}_{1}_time.txt'.format(reg_param, ref_str),
               [reg_param, (t2 - t1) / SECONDS_PER_HOUR])
    with open('./lstm_wvec_{0}_{1}_history.txt'.format(reg_param, ref_str), "w") as res_file:
        res_file.write(str(history.history))

    return model, history.history
# Regularization strengths explored in the sweep, and the tag that is embedded
# in the names of all files written / read below.
REG_PARAMS = [1e-10, 1e-7, 1e-4, 1e-1, 1e2]
REF_STR = 'tweets'

# Train and save one model per regularization strength.
for reg_param in REG_PARAMS:
    lstm_create_train(reg_param, REF_STR)

from sklearn.metrics import accuracy_score

# Reload every saved model and print (regularization strength, test accuracy).
true_labels = test_data[LABEL_COL]
for reg_param in REG_PARAMS:
    model_path = './lstm_wvec_{0}_{1}_model.h5'.format(reg_param, REF_STR)
    model = load_model(model_path)
    preds = model.predict_classes(test_seq, verbose=0)
    print((reg_param, accuracy_score(true_labels, preds)))