{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import os\n", "import sys\n", "import numpy as np\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.preprocessing.sequence import pad_sequences\n", "from keras.utils import to_categorical\n", "from keras.layers import Dense, Input, Flatten\n", "from keras.layers import Conv1D, MaxPooling1D, Embedding\n", "from keras.models import Model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def read_20_newgroup_files(path_to_data_directory):\n", " texts = []\n", " labels_index = {}\n", " labels = []\n", " \n", " for name in sorted(os.listdir(path_to_data_directory)):\n", " path = os.path.join(path_to_data_directory,name)\n", " \n", " if os.path.isdir(path):\n", " label_id = len(labels_index)\n", " labels_index[name] = label_id\n", " \n", " for fname in sorted(os.listdir(path)):\n", " if fname.isdigit():\n", " fpath = os.path.join(path, fname)\n", " if sys.version_info < (3,):\n", " f = open(fpath)\n", " else:\n", " f = open(fpath, encoding='latin-1')\n", " t = f.read()\n", " i = t.find('\\n\\n') +2 # skip header\n", " \n", " if i > 0:\n", " t = t[i:]\n", " \n", " texts.append(t)\n", " f.close()\n", " labels.append(label_id) \n", " \n", " \n", " return (texts,labels_index,labels)\n", "\n", "path = \"/home/felipe/data/20_newsgroup/20_newsgroup/\"\n", " \n", "texts,labels_index,labels = read_20_newgroup_files(path) " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "MAX_NB_WORDS = 20000\n", "MAX_SEQUENCE_LENGTH = 1000\n", "VALIDATION_SPLIT = 0.2\n", "\n", "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n", "tokenizer.fit_on_texts(texts)\n", "sequences = tokenizer.texts_to_sequences(texts)\n", "word_index = tokenizer.word_index\n", "\n", "data = pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)\n", "\n", "labels = to_categorical(np.asarray(labels))\n", "\n", "indices = np.arange(data.shape[0])\n", "np.random.shuffle(indices)\n", "\n", "data = data[indices]\n", "labels = labels[indices]\n", "num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])\n", "\n", "X_train = data[:-num_validation_samples]\n", "y_train = labels[:-num_validation_samples]\n", "X_val = data[-num_validation_samples:]\n", "y_val = labels[-num_validation_samples:]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "GLOVE_DIR = \"/media/felipe/SAMSUNG/GloVe\"\n", "EMBEDDING_DIM = 100\n", "embeddings_index = {}\n", "\n", "with open(os.path.join(GLOVE_DIR,\"glove.6B.{0}d.txt\".format(EMBEDDING_DIM)),'r') as f:\n", " for line in f:\n", " values = line.split()\n", " word = values[0]\n", " coefs = np.asarray(values[1:],dtype='float32')\n", "\n", " embeddings_index[word] = coefs" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "174074" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(word_index)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "embedding_matrix = np.zeros((len(word_index)+1,EMBEDDING_DIM))\n", "\n", "for word,i in word_index.items():\n", " \n", " if i >= MAX_NB_WORDS:\n", " continue\n", " \n", " embedding_vector = embeddings_index.get(word)\n", " \n", " if embedding_vector is not None:\n", " embedding_matrix[i] = embedding_vector" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "embedding_layer = Embedding(len(word_index)+1,\n", " EMBEDDING_DIM,\n", " weights=[embedding_matrix],\n", " input_length=MAX_SEQUENCE_LENGTH,\n", " trainable = False)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "\n", "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')\n", "embedded_sequences = embedding_layer(sequence_input)\n", "\n", "x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n", "x = MaxPooling1D(5)(x)\n", "x = Conv1D(128, 5, activation='relu')(x)\n", "x = MaxPooling1D(5)(x)\n", "x = Conv1D(128, 5, activation='relu')(x)\n", "x = MaxPooling1D(35)(x)\n", "x = Flatten()(x)\n", "x = Dense(128, activation='relu')(x)\n", "preds = Dense(len(labels_index), activation='softmax')(x)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "model = Model(sequence_input, preds)\n", "model.compile(loss ='categorical_crossentropy',\n", " optimizer='rmsprop',\n", " metrics=['acc'])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 15998 samples, validate on 3999 samples\n", "Epoch 1/20\n", " 4352/15998 [=======>......................] - ETA: 94s - loss: 2.8506 - acc: 0.0843" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m model.fit(X_train,y_train, validation_data=(X_val, y_val),\n\u001b[0;32m----> 2\u001b[0;31m epochs=20, batch_size=128)\n\u001b[0m", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)\u001b[0m\n\u001b[1;32m 1484\u001b[0m \u001b[0mval_f\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_f\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_ins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_ins\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1485\u001b[0m \u001b[0mcallback_metrics\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_metrics\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1486\u001b[0;31m initial_epoch=initial_epoch)\n\u001b[0m\u001b[1;32m 1487\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1488\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_fit_loop\u001b[0;34m(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch)\u001b[0m\n\u001b[1;32m 1139\u001b[0m \u001b[0mbatch_logs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'size'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1140\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_batch_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_logs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1141\u001b[0;31m \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1142\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1143\u001b[0m \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 2101\u001b[0m \u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2102\u001b[0m updated = session.run(self.outputs + [self.updates_op],\n\u001b[0;32m-> 2103\u001b[0;31m feed_dict=feed_dict)\n\u001b[0m\u001b[1;32m 2104\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mupdated\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 776\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 777\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 778\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 779\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 780\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 980\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 981\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m--> 982\u001b[0;31m feed_dict_string, options, run_metadata)\n\u001b[0m\u001b[1;32m 983\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 984\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1030\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1031\u001b[0m return self._do_call(_run_fn, self._session, feed_dict, fetch_list,\n\u001b[0;32m-> 1032\u001b[0;31m target_list, options, run_metadata)\n\u001b[0m\u001b[1;32m 1033\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1034\u001b[0m return self._do_call(_prun_fn, self._session, handle, feed_dict,\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1037\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1038\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1039\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1040\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1041\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1019\u001b[0m return tf_session.TF_Run(session, options,\n\u001b[1;32m 1020\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1021\u001b[0;31m status, run_metadata)\n\u001b[0m\u001b[1;32m 1022\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "model.fit(X_train,y_train, validation_data=(X_val, y_val),\n", " epochs=20, batch_size=128)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Global TF Kernel (Python 3)", "language": "python", "name": "global-tf-python-3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }