diff --git a/hatt_train.ipynb b/hatt_train.ipynb index 0232aa0..70c9e3e 100644 --- a/hatt_train.ipynb +++ b/hatt_train.ipynb @@ -2,7 +2,10 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The code in this notebook is based on [Richard Liao's implementation of hierarchical attention networks](https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py) and a related [Google group discussion](https://groups.google.com/forum/#!topic/keras-users/IWK9opMFavQ). The notebook also includes code from [Keras documentation](https://keras.io/) and [blog](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) as well as this [word2vec tutorial](http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/)." ] @@ -171,11 +174,11 @@ "texts = []\n", "\n", "for idx in range(train_data.shape[0]):\n", - " text = train_data['text'][idx]\n", + " text = train_data['text'].iloc[idx]\n", " texts.append(text)\n", " sentences = nltk.tokenize.sent_tokenize(text)\n", " reviews.append(sentences)\n", - " labels.append(train_data['rating'][idx])" + " labels.append(train_data['rating'].iloc[idx])" ] }, { diff --git a/lstm_word2vec_small.ipynb b/lstm_word2vec_small.ipynb index 4cc87a7..bdbec87 100644 --- a/lstm_word2vec_small.ipynb +++ b/lstm_word2vec_small.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": false, "deletable": true, @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "collapsed": false, "deletable": true, @@ -65,6 +65,7 @@ }, "outputs": [], "source": [ + "\"\"\"\n", "from azureml import Workspace\n", "ws = Workspace(\n", " workspace_id='817780d9ee0d4a878e25f8c9deb3b866',\n", @@ -73,7 +74,31 @@ ")\n", "ds = ws.datasets['Book Reviews from Amazon']\n", "all_data = ds.to_dataframe()\n", - "all_data.rename(columns={0: 'rating', 1: 'text'}, inplace=True)" + "all_data.rename(columns={0: 'rating', 1: 'text'}, inplace=True)\n", + "all_data.loc[:, 'rating'] = all_data['rating'] - 1 # reindex ratings to start from 0\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from azureml import Workspace\n", + "ws = Workspace(\n", + " workspace_id='817780d9ee0d4a878e25f8c9deb3b866',\n", + " authorization_token='6df8a52943bd49eba6e57446bc73f5fc',\n", + " endpoint='https://studioapi.azureml.net'\n", + ")\n", + "ds = ws.datasets['dfe_happysad_utf.csv']\n", + "all_data = ds.to_dataframe()\n", + "all_data.rename(columns={'features': 'text', 'label': 'rating'}, inplace=True)\n", + "all_data.replace({'rating': {'sadness': 0, 'happiness': 1}}, inplace=True)" ] }, { @@ -88,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 39, "metadata": { "collapsed": false, "deletable": true, @@ -123,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 42, "metadata": { "collapsed": true, "deletable": true, @@ -138,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 43, "metadata": { "collapsed": true, "deletable": true, @@ -162,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 44, "metadata": { "collapsed": false, "deletable": true, @@ -191,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 49, "metadata": { "collapsed": false, "deletable": true, @@ -200,13 +225,12 @@ "outputs": [], "source": [ "labels = to_categorical(np.asarray(train_data[LABEL_COL]))\n", - "labels = labels[:,1:] # rating 0 does not exist, so remove \n", "labels = labels.astype('float32')" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 50, "metadata": { "collapsed": false, "deletable": true, @@ -224,12 +248,12 @@ "editable": true }, "source": [ - "Train word2vec on all the documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)." + "Train word2vec on the training documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 53, "metadata": { "collapsed": false, "deletable": true, @@ -261,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 54, "metadata": { "collapsed": false, "deletable": true, @@ -273,53 +297,45 @@ "name": "stderr", "output_type": "stream", "text": [ - "2017-09-14 16:47:12,691 : INFO : collecting all words and their counts\n", - "2017-09-14 16:47:12,692 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", - "2017-09-14 16:47:12,740 : INFO : PROGRESS: at sentence #10000, processed 179872 words, keeping 17751 word types\n", - "2017-09-14 16:47:12,788 : INFO : PROGRESS: at sentence #20000, processed 362578 words, keeping 26298 word types\n", - "2017-09-14 16:47:12,834 : INFO : PROGRESS: at sentence #30000, processed 544135 words, keeping 32158 word types\n", - "2017-09-14 16:47:12,882 : INFO : PROGRESS: at sentence #40000, processed 727386 words, keeping 37250 word types\n", - "2017-09-14 16:47:12,931 : INFO : PROGRESS: at sentence #50000, processed 911012 words, keeping 41503 word types\n", - "2017-09-14 16:47:12,982 : INFO : PROGRESS: at sentence #60000, processed 1093093 words, keeping 45119 word types\n", - "2017-09-14 16:47:13,022 : INFO : collected 47857 word types from a corpus of 1245083 raw words and 68283 sentences\n", - "2017-09-14 16:47:13,022 : INFO : Loading a fresh vocabulary\n", - "2017-09-14 16:47:13,072 : INFO : min_count=6 retains 12670 unique words (26% of original 47857, drops 35187)\n", - "2017-09-14 16:47:13,073 : INFO : min_count=6 leaves 1180885 word corpus (94% of original 1245083, drops 64198)\n", - "2017-09-14 16:47:13,106 : INFO : deleting the raw counts dictionary of 47857 items\n", - "2017-09-14 16:47:13,109 : INFO : sample=0.001 downsamples 44 most-common words\n", - "2017-09-14 16:47:13,109 : INFO : downsampling leaves estimated 886078 word corpus (75.0% of prior 1180885)\n", - "2017-09-14 16:47:13,110 : INFO : estimated required memory for 12670 words and 200 dimensions: 26607000 bytes\n", - "2017-09-14 16:47:13,144 : INFO : resetting layer weights\n", - "2017-09-14 16:47:13,341 : INFO : training model with 24 workers on 12670 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=5\n", - "2017-09-14 16:47:14,357 : INFO : PROGRESS: at 21.53% examples, 950506 words/s, in_qsize 47, out_qsize 0\n", - "2017-09-14 16:47:15,365 : INFO : PROGRESS: at 47.88% examples, 1054121 words/s, in_qsize 47, out_qsize 0\n", - "2017-09-14 16:47:16,367 : INFO : PROGRESS: at 72.71% examples, 1069305 words/s, in_qsize 48, out_qsize 0\n", - "2017-09-14 16:47:17,329 : INFO : worker thread finished; awaiting finish of 23 more threads\n", - "2017-09-14 16:47:17,335 : INFO : worker thread finished; awaiting finish of 22 more threads\n", - "2017-09-14 16:47:17,335 : INFO : worker thread finished; awaiting finish of 21 more threads\n", - "2017-09-14 16:47:17,336 : INFO : worker thread finished; awaiting finish of 20 more threads\n", - "2017-09-14 16:47:17,347 : INFO : worker thread finished; awaiting finish of 19 more threads\n", - "2017-09-14 16:47:17,353 : INFO : worker thread finished; awaiting finish of 18 more threads\n", - "2017-09-14 16:47:17,355 : INFO : worker thread finished; awaiting finish of 17 more threads\n", - "2017-09-14 16:47:17,371 : INFO : PROGRESS: at 97.55% examples, 1076295 words/s, in_qsize 16, out_qsize 1\n", - "2017-09-14 16:47:17,372 : INFO : worker thread finished; awaiting finish of 16 more threads\n", - "2017-09-14 16:47:17,374 : INFO : worker thread finished; awaiting finish of 15 more threads\n", - "2017-09-14 16:47:17,378 : INFO : worker thread finished; awaiting finish of 14 more threads\n", - "2017-09-14 16:47:17,383 : INFO : worker thread finished; awaiting finish of 13 more threads\n", - "2017-09-14 16:47:17,391 : INFO : worker thread finished; awaiting finish of 12 more threads\n", - "2017-09-14 16:47:17,398 : INFO : worker thread finished; awaiting finish of 11 more threads\n", - "2017-09-14 16:47:17,403 : INFO : worker thread finished; awaiting finish of 10 more threads\n", - "2017-09-14 16:47:17,408 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2017-09-14 16:47:17,411 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2017-09-14 16:47:17,417 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2017-09-14 16:47:17,426 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2017-09-14 16:47:17,427 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2017-09-14 16:47:17,428 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2017-09-14 16:47:17,429 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2017-09-14 16:47:17,434 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2017-09-14 16:47:17,440 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2017-09-14 16:47:17,445 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2017-09-14 16:47:17,446 : INFO : training on 6225415 raw words (4431895 effective words) took 4.1s, 1083352 effective words/s\n" + "2017-09-15 11:21:16,427 : INFO : collecting all words and their counts\n", + "2017-09-15 11:21:16,428 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-09-15 11:21:16,451 : INFO : PROGRESS: at sentence #10000, processed 75804 words, keeping 13063 word types\n", + "2017-09-15 11:21:16,459 : INFO : collected 15977 word types from a corpus of 100883 raw words and 13257 sentences\n", + "2017-09-15 11:21:16,460 : INFO : Loading a fresh vocabulary\n", + "2017-09-15 11:21:16,470 : INFO : min_count=6 retains 1613 unique words (10% of original 15977, drops 14364)\n", + "2017-09-15 11:21:16,471 : INFO : min_count=6 leaves 80706 word corpus (79% of original 100883, drops 20177)\n", + "2017-09-15 11:21:16,476 : INFO : deleting the raw counts dictionary of 15977 items\n", + "2017-09-15 11:21:16,478 : INFO : sample=0.001 downsamples 65 most-common words\n", + "2017-09-15 11:21:16,479 : INFO : downsampling leaves estimated 59121 word corpus (73.3% of prior 80706)\n", + "2017-09-15 11:21:16,479 : INFO : estimated required memory for 1613 words and 200 dimensions: 3387300 bytes\n", + "2017-09-15 11:21:16,484 : INFO : resetting layer weights\n", + "2017-09-15 11:21:16,520 : INFO : training model with 24 workers on 1613 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=5\n", + "2017-09-15 11:21:16,831 : INFO : worker thread finished; awaiting finish of 23 more threads\n", + "2017-09-15 11:21:16,833 : INFO : worker thread finished; awaiting finish of 22 more threads\n", + "2017-09-15 11:21:16,844 : INFO : worker thread finished; awaiting finish of 21 more threads\n", + "2017-09-15 11:21:16,846 : INFO : worker thread finished; awaiting finish of 20 more threads\n", + "2017-09-15 11:21:16,848 : INFO : worker thread finished; awaiting finish of 19 more threads\n", + "2017-09-15 11:21:16,854 : INFO : worker thread finished; awaiting finish of 18 more threads\n", + "2017-09-15 11:21:16,858 : INFO : worker thread finished; awaiting finish of 17 more threads\n", + "2017-09-15 11:21:16,861 : INFO : worker thread finished; awaiting finish of 16 more threads\n", + "2017-09-15 11:21:16,865 : INFO : worker thread finished; awaiting finish of 15 more threads\n", + "2017-09-15 11:21:16,880 : INFO : worker thread finished; awaiting finish of 14 more threads\n", + "2017-09-15 11:21:16,882 : INFO : worker thread finished; awaiting finish of 13 more threads\n", + "2017-09-15 11:21:16,889 : INFO : worker thread finished; awaiting finish of 12 more threads\n", + "2017-09-15 11:21:16,891 : INFO : worker thread finished; awaiting finish of 11 more threads\n", + "2017-09-15 11:21:16,895 : INFO : worker thread finished; awaiting finish of 10 more threads\n", + "2017-09-15 11:21:16,897 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2017-09-15 11:21:16,898 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2017-09-15 11:21:16,904 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2017-09-15 11:21:16,907 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2017-09-15 11:21:16,908 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2017-09-15 11:21:16,909 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2017-09-15 11:21:16,910 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2017-09-15 11:21:16,914 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-09-15 11:21:16,920 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-09-15 11:21:16,921 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-09-15 11:21:16,921 : INFO : training on 504415 raw words (295266 effective words) took 0.4s, 758339 effective words/s\n", + "2017-09-15 11:21:16,922 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" ] } ], @@ -343,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 55, "metadata": { "collapsed": false, "deletable": true, @@ -354,7 +370,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Total 12670 word vectors.\n" + "Total 1613 word vectors.\n" ] } ], @@ -389,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 56, "metadata": { "collapsed": true, "deletable": true, @@ -405,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 57, "metadata": { "collapsed": false, "deletable": true, @@ -413,7 +429,7 @@ }, "outputs": [], "source": [ - "def lstm_create_train(reg_param):\n", + "def lstm_create_train(reg_param, ref_str):\n", " l2_reg = regularizers.l2(reg_param)\n", "\n", " # model init\n", @@ -438,7 +454,7 @@ " metrics=['acc'])\n", "\n", " history = History()\n", - " csv_logger = CSVLogger('./lstm_model_wvec_{}_log'.format(reg_param),\n", + " csv_logger = CSVLogger('./lstm_model_wvec_{0}_{1}.log'.format(reg_param, ref_str),\n", " separator=',',\n", " append=True)\n", "\n", @@ -455,10 +471,10 @@ " print(\"\\n\")\n", " \n", " # save model\n", - " model.save('./lstm_wvec_{}_model.h5'.format(reg_param))\n", - " np.savetxt('./lstm_wvec_{}_time.txt'.format(reg_param), \n", + " model.save('./lstm_wvec_{0}_{1}_model.h5'.format(reg_param, ref_str))\n", + " np.savetxt('./lstm_wvec_{0}_{1}_time.txt'.format(reg_param, ref_str), \n", " [reg_param, (t2-t1) / 3600])\n", - " with open('./lstm_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n", + " with open('./lstm_wvec_{0}_{1}_history.txt'.format(reg_param, ref_str), \"w\") as res_file:\n", " res_file.write(str(history.history))" ] }, @@ -537,7 +553,7 @@ } ], "source": [ - "preds = model.predict_classes(test_seq) + 1 # add 1 since we removed the 0 class" + "preds = model.predict_classes(test_seq) " ] }, { @@ -565,6 +581,181 @@ "\n", "accuracy_score(test_data[LABEL_COL], preds)" ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training model with regularization parameter = 1e-10\n", + "Epoch 1/10\n", + "33s - loss: 0.6922 - acc: 0.5447\n", + "Epoch 2/10\n", + "32s - loss: 0.6916 - acc: 0.5557\n", + "Epoch 3/10\n", + "32s - loss: 0.6910 - acc: 0.5652\n", + "Epoch 4/10\n", + "32s - loss: 0.6905 - acc: 0.5893\n", + "Epoch 5/10\n", + "32s - loss: 0.6899 - acc: 0.5828\n", + "Epoch 6/10\n", + "32s - loss: 0.6893 - acc: 0.5827\n", + "Epoch 7/10\n", + "32s - loss: 0.6887 - acc: 0.5943\n", + "Epoch 8/10\n", + "32s - loss: 0.6882 - acc: 0.5901\n", + "Epoch 9/10\n", + "32s - loss: 0.6876 - acc: 0.5933\n", + "Epoch 10/10\n", + "32s - loss: 0.6870 - acc: 0.5905\n", + "\n", + "\n", + "Training model with regularization parameter = 1e-07\n", + "Epoch 1/10\n", + "32s - loss: 0.6944 - acc: 0.4816\n", + "Epoch 2/10\n", + "32s - loss: 0.6936 - acc: 0.4899\n", + "Epoch 3/10\n", + "32s - loss: 0.6929 - acc: 0.5145\n", + "Epoch 4/10\n", + "32s - loss: 0.6924 - acc: 0.5299\n", + "Epoch 5/10\n", + "32s - loss: 0.6917 - acc: 0.5463\n", + "Epoch 6/10\n", + "32s - loss: 0.6911 - acc: 0.5524\n", + "Epoch 7/10\n", + "32s - loss: 0.6905 - acc: 0.5656\n", + "Epoch 8/10\n", + "32s - loss: 0.6899 - acc: 0.5616\n", + "Epoch 9/10\n", + "32s - loss: 0.6893 - acc: 0.5647\n", + "Epoch 10/10\n", + "32s - loss: 0.6888 - acc: 0.5716\n", + "\n", + "\n", + "Training model with regularization parameter = 0.0001\n", + "Epoch 1/10\n", + "33s - loss: 0.9469 - acc: 0.4776\n", + "Epoch 2/10\n", + "32s - loss: 0.9460 - acc: 0.4563\n", + "Epoch 3/10\n", + "32s - loss: 0.9453 - acc: 0.4888\n", + "Epoch 4/10\n", + "32s - loss: 0.9447 - acc: 0.5072\n", + "Epoch 5/10\n", + "32s - loss: 0.9441 - acc: 0.5355\n", + "Epoch 6/10\n", + "32s - loss: 0.9435 - acc: 0.5539\n", + "Epoch 7/10\n", + "32s - loss: 0.9429 - acc: 0.5832\n", + "Epoch 8/10\n", + "32s - loss: 0.9423 - acc: 0.5837\n", + "Epoch 9/10\n", + "32s - loss: 0.9418 - acc: 0.5871\n", + "Epoch 10/10\n", + "32s - loss: 0.9412 - acc: 0.5881\n", + "\n", + "\n", + "Training model with regularization parameter = 0.1\n", + "Epoch 1/10\n", + "33s - loss: 218.8508 - acc: 0.4709\n", + "Epoch 2/10\n", + "32s - loss: 162.2584 - acc: 0.4561\n", + "Epoch 3/10\n", + "32s - loss: 120.3472 - acc: 0.4695\n", + "Epoch 4/10\n", + "32s - loss: 89.3082 - acc: 0.4863\n", + "Epoch 5/10\n", + "32s - loss: 66.3210 - acc: 0.4891\n", + "Epoch 6/10\n", + "32s - loss: 49.2968 - acc: 0.5004\n", + "Epoch 7/10\n", + "32s - loss: 36.6888 - acc: 0.5081\n", + "Epoch 8/10\n", + "32s - loss: 27.3514 - acc: 0.5031\n", + "Epoch 9/10\n", + "32s - loss: 20.4361 - acc: 0.5056\n", + "Epoch 10/10\n", + "32s - loss: 15.3147 - acc: 0.5019\n", + "\n", + "\n", + "Training model with regularization parameter = 100.0\n", + "Epoch 1/10\n", + "32s - loss: 252060.6371 - acc: 0.4981\n", + "Epoch 2/10\n", + "32s - loss: 252059.9525 - acc: 0.5048\n", + "Epoch 3/10\n", + "32s - loss: 252059.1579 - acc: 0.5069\n", + "Epoch 4/10\n", + "32s - loss: 252058.5729 - acc: 0.5048\n", + "Epoch 5/10\n", + "32s - loss: 252057.9460 - acc: 0.5047\n", + "Epoch 6/10\n", + "32s - loss: 252057.1785 - acc: 0.5205\n", + "Epoch 7/10\n", + "32s - loss: 252056.3502 - acc: 0.5075\n", + "Epoch 8/10\n", + "32s - loss: 252055.6600 - acc: 0.5020\n", + "Epoch 9/10\n", + "32s - loss: 252054.9252 - acc: 0.5036\n", + "Epoch 10/10\n", + "32s - loss: 252054.3204 - acc: 0.5081\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for rp in [1e-10, 1e-7, 1e-4, 1e-1, 1e2]:\n", + " lstm_create_train(rp, 'tweets')" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1e-10, 0.59364081062194274)\n", + "(1e-07, 0.57092941998602376)\n", + "(0.0001, 0.57477288609364086)\n", + "(0.1, 0.50454227812718377)\n", + "(100.0, 0.56533892382948991)\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "for rp in [1e-10, 1e-7, 1e-4, 1e-1, 1e2]:\n", + " model = load_model('./lstm_wvec_{0}_{1}_model.h5'.format(rp, 'tweets'))\n", + " preds = model.predict_classes(test_seq, verbose=0)\n", + " print((rp, accuracy_score(test_data[LABEL_COL], preds)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] } ], "metadata": {