Update LSTM notebook
Ubuntu committed Sep 11, 2017
1 parent 6dad5aa commit 116e82a
Showing 1 changed file with 110 additions and 132 deletions.
242 changes: 110 additions & 132 deletions lstm_word2vec.ipynb
@@ -2,21 +2,13 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using CNTK backend\n"
]
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
@@ -35,132 +27,123 @@
"from keras.utils import to_categorical"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download the Amazon reviews training data from a public Azure blob"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 13,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": false
},
"outputs": [],
"source": [
"MAX_DOC_LENGTH = 300\n",
"MAX_NB_WORDS = 6000\n",
"EMBEDDING_DIM = 200\n",
"MAX_VOCAB_SIZE = 50000"
"CONTAINER_URL = \"https://anargyridsa.blob.core.windows.net/dlvm/\"\n",
"trainFile = \"amazon_reviews_train.csv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"trainFile = '/home/anargyri/.keras/datasets/amazon_reviews_train.csv'\n",
"\n",
"# read\n",
"train_data = pd.read_csv(trainFile, header=None, names=['rating', 'title', 'text'])"
"train_data = pd.read_csv(CONTAINER_URL + trainFile, header=None, names=['rating', 'title', 'text'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the dimensions of the input and the embedding. \n",
"\n",
"MAX_DOC_LENGTH : the size of the input i.e. the number of words in the document. Longer documents will be truncated, shorter ones will be padded with zeros.\n",
"\n",
"VOCAB_SIZE : the size of the word encoding (number of most frequent words to keep in the vocabulary)\n",
"\n",
"EMBEDDING_DIM : the dimensionality of the word embedding"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
"labels = []\n",
"texts = []\n",
"\n",
"for idx in range(train_data.shape[0]):\n",
" text = train_data['text'][idx]\n",
" texts.append(text)\n",
" labels.append(train_data['rating'][idx])"
"MAX_DOC_LENGTH = 300\n",
"VOCAB_SIZE = 6000\n",
"EMBEDDING_DIM = 200"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 16,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n",
"tokenizer.fit_on_texts(texts)"
"TEXT_COL = 'text'\n",
"LABEL_COL = 'rating'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fit a Keras tokenizer to the most frequent words using the entire training data set as the corpus."
]
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 34,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = np.zeros((len(texts), MAX_DOC_LENGTH), dtype='int32')\n",
"doc_lst = []\n",
"word_index = tokenizer.word_index\n",
"\n",
"# keep the MAX_NB_WORDS most frequent words and replace the rest with 'UNK'\n",
"# truncate to the first MAX_SENTS sentences per doc and MAX_SENT_LENGTH words per sentence\n",
"\n",
"for i, doc in enumerate(texts):\n",
" wordTokens = text_to_word_sequence(doc)\n",
" j = 0\n",
" words_in_doc = []\n",
" for _, word in enumerate(wordTokens):\n",
" if j < MAX_DOC_LENGTH: \n",
" if (word in word_index) and (word_index[word] < MAX_NB_WORDS):\n",
" data[i, j] = word_index[word]\n",
" words_in_doc.append(word)\n",
" else:\n",
" data[i, j] = MAX_NB_WORDS\n",
" words_in_doc.append('UNK')\n",
" j = j + 1\n",
" doc_lst.append(words_in_doc)"
"# tokenize, create seqs, pad\n",
"tok = Tokenizer(num_words=VOCAB_SIZE, lower=True, split=\" \")\n",
"tok.fit_on_texts(train_data[TEXT_COL])\n",
"train_seq = tok.texts_to_sequences(train_data[TEXT_COL])\n",
"train_seq = sequence.pad_sequences(train_seq, maxlen=MAX_DOC_LEN)\n",
"test_seq = tok.texts_to_sequences(test_data[TEXT_COL])\n",
"test_seq = sequence.pad_sequences(test_seq, maxlen=MAX_DOC_LEN)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert the ratings to one-hot categorical labels."
]
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total 951656 unique tokens.\n",
"Shape of data tensor: (3000000, 300)\n",
"Shape of label tensor: (3000000, 5)\n"
]
}
],
"outputs": [],
"source": [
"print('Total %s unique tokens.' % len(word_index))\n",
"\n",
"labels = to_categorical(np.asarray(labels))\n",
"x_train = data\n",
"y_train = labels[:,1:]\n",
"\n",
"print('Shape of data tensor:', x_train.shape)\n",
"print('Shape of label tensor:', y_train.shape)"
"labels = to_categorical(np.asarray(train_data[LABEL_COL]))\n",
"labels = labels[:,1:]\n",
"labels = labels.astype('float32')"
]
},
{
@@ -187,6 +170,32 @@
"n_classes = y_train.shape[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train word2vec on all the documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import nltk \n",
"\n",
"nltk.download('punkt')\n",
"\n",
"sent_lst = []\n",
"\n",
"for doc in train_data[TEXT_COL]:\n",
" sentences = nltk.tokenize.sent_tokenize(doc)\n",
" sent_lst.extend(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 57,
@@ -1243,13 +1252,18 @@
}
],
"source": [
"# train word2vec on the sentences to initialize the word embedding \n",
"import gensim, logging\n",
"\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"# use skip-gram\n",
"word2vec_model = gensim.models.Word2Vec(doc_lst, min_count=6, size=EMBEDDING_DIM, max_vocab_size=MAX_VOCAB_SIZE, sg=1, \n",
" workers=os.cpu_count())"
"word2vec_model = gensim.models.Word2Vec(sentences=sent_lst, min_count=6, size=EMBEDDING_DIM, sg=1, workers=os.cpu_count())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the initial embedding matrix from the output of word2vec."
]
},
{
@@ -1279,30 +1293,20 @@
"print('Total %s word vectors.' % len(embeddings_index))\n",
"\n",
"# Initial embedding\n",
"embedding_matrix = np.zeros((MAX_NB_WORDS + 1, EMBEDDING_DIM))\n",
"embedding_matrix = np.zeros((VOCAB_SIZE + 1, EMBEDDING_DIM))\n",
"\n",
"for word, i in word_index.items():\n",
"for word, i in tok.word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None and i < MAX_NB_WORDS:\n",
" embedding_matrix[i] = embedding_vector\n",
" elif i == MAX_NB_WORDS:\n",
" # index MAX_NB_WORDS in data corresponds to 'UNK'\n",
" embedding_matrix[i] = embeddings_index['UNK']"
" if embedding_vector is not None and i < VOCAB_SIZE:\n",
" embedding_matrix[i] = embedding_vector"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"# Order documents by the number of words \n",
"doc_lengths = [len(doc) for doc in doc_lst]\n",
"ind = np.argsort(doc_lengths)"
"LSTM_DIM is the dimensionality of each LSTM output (the number of LSTM units).\n",
"The mask_zero option determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents. CNTK / Keras does not support masking yet."
]
},
{
@@ -1335,7 +1339,7 @@
" l2_reg = regularizers.l2(reg_param)\n",
"\n",
" # model init\n",
" embedding_layer = Embedding(MAX_NB_WORDS + 1,\n",
" embedding_layer = Embedding(VOCAB_SIZE,\n",
" EMBEDDING_DIM,\n",
" input_length=MAX_DOC_LENGTH,\n",
" trainable=True,\n",
@@ -1367,15 +1371,14 @@
" batch_size=BATCH_SIZE,\n",
" epochs=NUM_EPOCHS,\n",
" callbacks=[history, csv_logger],\n",
" shuffle=False,\n",
" verbose=2)\n",
" t2 = time.time()\n",
"\n",
" # save model\n",
" model.save('./model_wvec_{}.h5'.format(reg_param))\n",
" np.savetxt('./model_wvec_{}_time.txt'.format(reg_param), \n",
" model.save('./lstm_wvec_{}_model.h5'.format(reg_param))\n",
" np.savetxt('./lstm_wvec_{}_time.txt'.format(reg_param), \n",
" [reg_param, (t2-t1) / 3600])\n",
" with open('./model_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n",
" with open('./lstm_wvec_{}_history.txt'.format(reg_param), \"w\") as res_file:\n",
" res_file.write(str(history.history))"
]
},
@@ -1399,31 +1402,6 @@
"source": [
"lstm_create_train(1e-10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# If you get ResourceExhaustedError, try decreasing BATCH_SIZE. This error could arise due to GPU memory limitations.\n",
"# Tensorflow automatically uses several cores and one GPU of the DSVM. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
}
],
"metadata": {
