{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook was run using my default environment.\n", "\n", "This notebook plays with amazon review data - http://jmcauley.ucsd.edu/data/amazon/. I have only downloaded the electronics reviews and I have not downloaded the metadata. Maybe in the future. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gzip\n", "\n", "import json \n", "def parse(path): \n", "    # Yield each review of the gzipped dataset re-serialized as strict JSON.\n", "    g = gzip.open(path, 'r') \n", "    for l in g: \n", "        yield 
json.dumps(eval(l))  # NOTE: eval() on raw data -- dataset lines are Python dict literals, not strict JSON\n", "\n", "path = './reviews_Electronics_5.json.gz'\n", "with open(\"output.strict\", 'w') as f:  # with-block ensures the output file is closed\n", "    for l in parse(path): \n", "        f.write(l + '\\n')\n", "\n", "from nltk import word_tokenize\n", "\n", "review_dict = json.loads(l)  # l is already a JSON string; json.loads(eval(l)) raised TypeError (eval returns a dict)\n", "review_text = review_dict['reviewText']\n", "tok_review = word_tokenize(review_text)\n", "len(tok_review)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using Theano backend.\n" ] } ], "source": [ "from keras.preprocessing.text import Tokenizer\n", "import gzip\n", "\n", "MAX_SEQUENCE_LENGTH = 1000\n", "MAX_NB_WORDS = 40000\n", "EMBEDDING_DIM = 100\n", "VALIDATION_SPLIT = 0.2\n", "\n", "def generator_review_parse(path):\n", "    \"\"\"Yield the raw reviewText of every review in the gzipped dataset.\"\"\"\n", "    g = gzip.open(path, 'r')\n", "    for l in g:\n", "        review_dict = eval(l)  # FIXME: eval on external data -- see parse() above\n", "        yield review_dict['reviewText']\n", "\n", "tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)\n", "\n", "path = './reviews_Electronics_5.json.gz'\n", "tokenizer.fit_on_texts(generator_review_parse(path))\n", "sequences = tokenizer.texts_to_sequences_generator(generator_review_parse(path))\n", "word_index = tokenizer.word_index\n", "print('Found %s unique tokens.' % len(word_index))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "GLOVE_DIR = '/home/dan-laptop/github/ulysses/glove.6B/'\n", "\n", "# Map word -> pre-trained 100-d GloVe vector.\n", "embeddings_index = {}\n", "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:\n", "    for line in f:\n", "        values = line.split()\n", "        word = values[0]\n", "        coefs = np.asarray(values[1:], dtype='float32')\n", "        embeddings_index[word] = coefs\n", "\n", "print('Found %s word vectors.' 
% len(embeddings_index))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Rows of the embedding matrix are GloVe vectors indexed by tokenizer id;\n", "# words missing from the GloVe index (and row 0, the padding id) stay all-zeros.\n", "embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))\n", "for word, i in word_index.items():\n", "    embedding_vector = embeddings_index.get(word)\n", "    if embedding_vector is not None:\n", "        embedding_matrix[i] = embedding_vector" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from keras.layers import Embedding\n", "\n", "# Frozen embedding layer initialised with the pre-trained GloVe weights.\n", "embedding_layer = Embedding(len(word_index) + 1,\n", "                            EMBEDDING_DIM,\n", "                            weights=[embedding_matrix],\n", "                            input_length=MAX_SEQUENCE_LENGTH,\n", "                            trainable=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def generator_modelData(path, batch_size=1, token_model=tokenizer):\n", "    \"\"\"Yield (reviews, scores) batches forever as stacked numpy arrays.\n", "\n", "    reviews: (batch_size, MAX_SEQUENCE_LENGTH) zero-left-padded word ids\n", "    scores:  (batch_size, 5) one-hot encoding of the 1-5 star rating\n", "    \"\"\"\n", "    while True:  # restart the file so fit_generator can run multiple epochs\n", "        g = gzip.open(path, 'r')\n", "        count = 0\n", "        for l in g:\n", "            if count == 0: reviews, scores = [], []\n", "\n", "            review_dict = eval(l)  # FIXME: eval on external data\n", "\n", "            # texts_to_sequences expects a *list* of texts; passing the bare\n", "            # string made Keras tokenize the review character by character.\n", "            temp_r = token_model.texts_to_sequences([review_dict['reviewText']])[0]\n", "            if len(temp_r) == 0:\n", "                continue  # review has no in-vocabulary words\n", "\n", "            temp_review = np.zeros((MAX_SEQUENCE_LENGTH,))\n", "            if len(temp_r) > MAX_SEQUENCE_LENGTH:\n", "                temp_review = np.asarray(temp_r[:MAX_SEQUENCE_LENGTH])\n", "            else:\n", "                temp_review[-len(temp_r):] = temp_r  # left-pad with zeros\n", "            temp_review = np.reshape(temp_review, (1, MAX_SEQUENCE_LENGTH))\n", "\n", "            temp_score = np.zeros((5,))\n", "            temp_score[int(review_dict['overall']) - 1] = 1\n", "\n", "            scores.append(np.reshape(temp_score, (1, 5)))\n", "            reviews.append(temp_review)\n", "            count += 1\n", "\n", "            if count == batch_size:\n", "                # stack into (batch_size, ...) arrays as fit_generator expects\n", "                yield (np.vstack(reviews), np.vstack(scores))\n", "                count = 0\n", "        g.close()\n", "\n", "#test = 
next(generator_modelData(path))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "from keras.layers import Dense, Input, Flatten\n", "from keras.layers import Conv1D, MaxPooling1D\n", "from keras.models import Model\n", "\n", "# 1D convnet over padded word-id sequences on top of the frozen GloVe embeddings.\n", "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", "embedded_sequences = embedding_layer(sequence_input)\n", "x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n", "x = MaxPooling1D(5)(x)\n", "x = Conv1D(128, 5, activation='relu')(x)\n", "x = MaxPooling1D(5)(x)\n", "x = Conv1D(128, 5, activation='relu')(x)\n", "x = MaxPooling1D(35)(x)  # global max pooling\n", "x = Flatten()(x)\n", "x = Dense(128, activation='relu')(x)\n", "preds = Dense(5, activation='softmax')(x)\n", "\n", "model = Model(sequence_input, preds)\n", "model.compile(loss='categorical_crossentropy',\n", "              optimizer='rmsprop',\n", "              metrics=['acc'])\n", "\n", "# FIXME: validation_data re-reads the *same* reviews as training, so val_acc\n", "# measures training fit, not generalization -- hold out a split instead.\n", "trials_per_epoch = 5000\n", "model.fit_generator(generator_modelData(path), trials_per_epoch, nb_epoch=5, \n", "                    validation_data=generator_modelData(path), nb_val_samples=1280)\n", "model.save_weights('./amazon_ratings_convnet.h5')" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }