From 12c186e1e2bf036436b7fab0ac923df441133232 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 12 Sep 2017 13:28:08 +0000 Subject: [PATCH] Edit preprocessing --- lstm_word2vec.ipynb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lstm_word2vec.ipynb b/lstm_word2vec.ipynb index 376c64b..abf8a74 100644 --- a/lstm_word2vec.ipynb +++ b/lstm_word2vec.ipynb @@ -200,8 +200,8 @@ ], "source": [ "print('Number of reviews by class in training set')\n", - "print(y_train.sum(axis=0))\n", - "n_classes = y_train.shape[1]" + "print(labels.sum(axis=0))\n", + "n_classes = labels.shape[1]" ] }, { @@ -232,7 +232,9 @@ "\n", "for doc in train_data[TEXT_COL]:\n", " sentences = nltk.tokenize.sent_tokenize(doc)\n", - " sent_lst.extend(sentences)" + " for sent in sentences:\n", + " word_lst = [w for w in nltk.tokenize.word_tokenize(sent) if w.isalnum()]\n", + " sent_lst.append(word_lst)" ] }, {