From 12c186e1e2bf036436b7fab0ac923df441133232 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <anargyri@anargyrigpu2.4jaf42zu4v0erje5lkulqf23gh.bx.internal.cloudapp.net>
Date: Tue, 12 Sep 2017 13:28:08 +0000
Subject: [PATCH] Edit preprocessing

---
 lstm_word2vec.ipynb | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lstm_word2vec.ipynb b/lstm_word2vec.ipynb
index 376c64b..abf8a74 100644
--- a/lstm_word2vec.ipynb
+++ b/lstm_word2vec.ipynb
@@ -200,8 +200,8 @@
    ],
    "source": [
     "print('Number of reviews by class in training set')\n",
-    "print(y_train.sum(axis=0))\n",
-    "n_classes = y_train.shape[1]"
+    "print(labels.sum(axis=0))\n",
+    "n_classes = labels.shape[1]"
    ]
   },
   {
@@ -232,7 +232,9 @@
     "\n",
     "for doc in train_data[TEXT_COL]:\n",
     "    sentences = nltk.tokenize.sent_tokenize(doc)\n",
-    "    sent_lst.extend(sentences)"
+    "    for sent in sentences:\n",
+    "        word_lst = [w for w in nltk.tokenize.word_tokenize(sent) if w.isalnum()]\n",
+    "        sent_lst.append(word_lst)"
    ]
   },
   {