diff --git a/lstm_word2vec.ipynb b/lstm_word2vec.ipynb index 938072f..d839e7e 100644 --- a/lstm_word2vec.ipynb +++ b/lstm_word2vec.ipynb @@ -29,7 +29,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Download the Amazon reviews training data from a public Azure blob" ] @@ -38,7 +41,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -62,7 +67,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Set the dimensions of the input and the embedding. \n", "\n", @@ -77,7 +85,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -102,7 +112,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Fit a Keras tokenizer to the most frequent words using the entire training data set as the corpus." ] @@ -126,9 +139,24 @@ "test_seq = sequence.pad_sequences(test_seq, maxlen=MAX_DOC_LEN)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "np.save('train_seq', train_seq)\n", + "np.save('test_seq', test_seq)" + ] + }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Convert the ratings to one-hot categorical labels." ] @@ -137,7 +165,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -172,7 +202,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Train word2vec on all the documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)." ] @@ -181,7 +214,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1261,7 +1296,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Create the initial embedding matrix from the output of word2vec." ] @@ -1303,7 +1341,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "LSTM_DIM is the dimensionality of each LSTM output (the number of LSTM units).\n", "The mask_zero option determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents. CNTK / Keras does not support masking yet." @@ -1366,8 +1407,8 @@ "\n", " t1 = time.time()\n", " # model fit\n", - " model.fit(train_seq[ind,:],\n", - " y_train[ind,:].astype('float32'),\n", + " model.fit(train_seq,\n", + " labels.astype('float32'),\n", " batch_size=BATCH_SIZE,\n", " epochs=NUM_EPOCHS,\n", " callbacks=[history, csv_logger],\n",