{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook was run using my default environment.\n", "\n", "This notebook plays with amazon review data - http://jmcauley.ucsd.edu/data/amazon/. I have only downloaded the electronics reviews and I have not downloaded the metadata. Maybe in the future. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gzip\n", "\n", "import json \n", "def parse(path): \n", "    # Yield each review of the gzipped dataset re-serialized as strict JSON.\n", "    g = gzip.open(path, 'r') \n", "    for l in g: \n", "        yield 
json.dumps(eval(l))  # NOTE: eval() on raw data -- dataset lines are Python dict literals, not strict JSON\n", "\n", "path = './reviews_Electronics_5.json.gz'\n", "with open(\"output.strict\", 'w') as f:  # with-block ensures the output file is closed\n", "    for l in parse(path): \n", "        f.write(l + '\\n')\n", "\n", "from nltk import word_tokenize\n", "\n", "review_dict = json.loads(l)  # l is already a JSON string; json.loads(eval(l)) raised TypeError (eval returns a dict)\n", "review_text = review_dict['reviewText']\n", "tok_review = word_tokenize(review_text)\n", "len(tok_review)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using Theano backend.\n" ] } ], "source": [ "from keras.preprocessing.text import Tokenizer\n", "import gzip\n", "\n", "MAX_SEQUENCE_LENGTH = 1000\n", "MAX_NB_WORDS = 40000\n", "EMBEDDING_DIM = 100\n", "VALIDATION_SPLIT = 0.2\n", "\n", "def generator_review_parse(path):\n", "    \"\"\"Yield the raw reviewText of every review in the gzipped dataset.\"\"\"\n", "    g = gzip.open(path, 'r')\n", "    for l in g:\n", "        review_dict = eval(l)  # FIXME: eval on external data -- see parse() above\n", "        yield review_dict['reviewText']\n", "\n", "tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)\n", "\n", "path = './reviews_Electronics_5.json.gz'\n", "tokenizer.fit_on_texts(generator_review_parse(path))\n", "sequences = tokenizer.texts_to_sequences_generator(generator_review_parse(path))\n", "word_index = tokenizer.word_index\n", "print('Found %s unique tokens.' % len(word_index))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "GLOVE_DIR = '/home/dan-laptop/github/ulysses/glove.6B/'\n", "\n", "# Map word -> pre-trained 100-d GloVe vector.\n", "embeddings_index = {}\n", "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:\n", "    for line in f:\n", "        values = line.split()\n", "        word = values[0]\n", "        coefs = np.asarray(values[1:], dtype='float32')\n", "        embeddings_index[word] = coefs\n", "\n", "print('Found %s word vectors.' 
% len(embeddings_index))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Rows of the embedding matrix are GloVe vectors indexed by tokenizer id;\n", "# words missing from the GloVe index (and row 0, the padding id) stay all-zeros.\n", "embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))\n", "for word, i in word_index.items():\n", "    embedding_vector = embeddings_index.get(word)\n", "    if embedding_vector is not None:\n", "        embedding_matrix[i] = embedding_vector" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from keras.layers import Embedding\n", "\n", "# Frozen embedding layer initialised with the pre-trained GloVe weights.\n", "embedding_layer = Embedding(len(word_index) + 1,\n", "                            EMBEDDING_DIM,\n", "                            weights=[embedding_matrix],\n", "                            input_length=MAX_SEQUENCE_LENGTH,\n", "                            trainable=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def generator_modelData(path, batch_size=1, token_model=tokenizer):\n", "    \"\"\"Yield (reviews, scores) batches forever as stacked numpy arrays.\n", "\n", "    reviews: (batch_size, MAX_SEQUENCE_LENGTH) zero-left-padded word ids\n", "    scores:  (batch_size, 5) one-hot encoding of the 1-5 star rating\n", "    \"\"\"\n", "    while True:  # restart the file so fit_generator can run multiple epochs\n", "        g = gzip.open(path, 'r')\n", "        count = 0\n", "        for l in g:\n", "            if count == 0: reviews, scores = [], []\n", "\n", "            review_dict = eval(l)  # FIXME: eval on external data\n", "\n", "            # texts_to_sequences expects a *list* of texts; passing the bare\n", "            # string made Keras tokenize the review character by character.\n", "            temp_r = token_model.texts_to_sequences([review_dict['reviewText']])[0]\n", "            if len(temp_r) == 0:\n", "                continue  # review has no in-vocabulary words\n", "\n", "            temp_review = np.zeros((MAX_SEQUENCE_LENGTH,))\n", "            if len(temp_r) > MAX_SEQUENCE_LENGTH:\n", "                temp_review = np.asarray(temp_r[:MAX_SEQUENCE_LENGTH])\n", "            else:\n", "                temp_review[-len(temp_r):] = temp_r  # left-pad with zeros\n", "            temp_review = np.reshape(temp_review, (1, MAX_SEQUENCE_LENGTH))\n", "\n", "            temp_score = np.zeros((5,))\n", "            temp_score[int(review_dict['overall']) - 1] = 1\n", "\n", "            scores.append(np.reshape(temp_score, (1, 5)))\n", "            reviews.append(temp_review)\n", "            count += 1\n", "\n", "            if count == batch_size:\n", "                # stack into (batch_size, ...) arrays as fit_generator expects\n", "                yield (np.vstack(reviews), np.vstack(scores))\n", "                count = 0\n", "        g.close()\n", "\n", "#test = 
next(generator_modelData(path))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "from keras.layers import Dense, Input, Flatten\n", "from keras.layers import Conv1D, MaxPooling1D\n", "from keras.models import Model\n", "\n", "# 1D convnet over padded word-id sequences on top of the frozen GloVe embeddings.\n", "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", "embedded_sequences = embedding_layer(sequence_input)\n", "x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n", "x = MaxPooling1D(5)(x)\n", "x = Conv1D(128, 5, activation='relu')(x)\n", "x = MaxPooling1D(5)(x)\n", "x = Conv1D(128, 5, activation='relu')(x)\n", "x = MaxPooling1D(35)(x)  # global max pooling\n", "x = Flatten()(x)\n", "x = Dense(128, activation='relu')(x)\n", "preds = Dense(5, activation='softmax')(x)\n", "\n", "model = Model(sequence_input, preds)\n", "model.compile(loss='categorical_crossentropy',\n", "              optimizer='rmsprop',\n", "              metrics=['acc'])\n", "\n", "# FIXME: validation_data re-reads the *same* reviews as training, so val_acc\n", "# measures training fit, not generalization -- hold out a split instead.\n", "trials_per_epoch = 5000\n", "model.fit_generator(generator_modelData(path), trials_per_epoch, nb_epoch=5, \n", "                    validation_data=generator_modelData(path), nb_val_samples=1280)\n", "model.save_weights('./amazon_ratings_convnet.h5')" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }