{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Modelando Textos Probabilisticamente\n", "====================================\n", "\n", "Nesta prática, vamos usar redes neurais para estimar as probabilidades condicionais de textos, caractere a caractere.\n", "Para uma discussão interessante sobre o assunto, veja o seguinte blog: http://karpathy.github.io/2015/05/21/rnn-effectiveness/.\n", "\n", "Vamos usar a biblioteca Keras adaptando um de seus exemplos." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T16:27:54.823674Z", "start_time": "2020-11-16T16:27:52.082111Z" } }, "outputs": [], "source": [ "from __future__ import print_function\n", "from keras.models import Sequential\n", "from keras.layers.core import Dense, Activation, Dropout\n", "from keras.layers.recurrent import LSTM\n", "from keras.utils.data_utils import get_file\n", "import numpy as np\n", "import scipy.stats as st\n", "import random\n", "import sys" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Primeiro vamos utilizar o mesmo texto usado no exemplo original" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T16:29:11.408514Z", "start_time": "2020-11-16T16:29:11.398686Z" } }, "outputs": [], "source": [ "path = get_file('nietzsche.txt', origin=\"https://s3.amazonaws.com/text-datasets/nietzsche.txt\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T16:29:27.696605Z", "start_time": "2020-11-16T16:29:27.671537Z" } }, "outputs": [], "source": [ "try: \n", " text = open(path).read().lower()\n", "except UnicodeDecodeError:\n", " import codecs\n", " text = codecs.open(path, encoding='utf-8').read().lower()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T18:17:28.797326Z", "start_time": "2020-11-16T18:17:28.791360Z" } }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Comprimento do corpus: 600893\n" ] } ], "source": [ "print('Comprimento do corpus:', len(text))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T18:18:02.231772Z", "start_time": "2020-11-16T18:18:02.224895Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "preface\n", "\n", "\n", "supposing that truth is a woman--what then? is there not ground\n", "for suspecting that all philosophers, in so far as they have been\n", "dogmatists\n" ] } ], "source": [ "print(text[:150])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Como o modelo vai se basear em caracteres, precisamos definir o conjunto de caracteres do texto:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T18:19:17.049779Z", "start_time": "2020-11-16T18:19:17.029599Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total chars: 57\n" ] } ], "source": [ "chars = set(text)\n", "print('total chars:', len(chars))\n", "char_indices = dict((c, i) for i, c in enumerate(chars))\n", "indices_char = dict((i, c) for i, c in enumerate(chars))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T18:19:25.815497Z", "start_time": "2020-11-16T18:19:25.809010Z" } }, "outputs": [ { "data": { "text/plain": [ "{'\\n',\n", " ' ',\n", " '!',\n", " '\"',\n", " \"'\",\n", " '(',\n", " ')',\n", " ',',\n", " '-',\n", " '.',\n", " '0',\n", " '1',\n", " '2',\n", " '3',\n", " '4',\n", " '5',\n", " '6',\n", " '7',\n", " '8',\n", " '9',\n", " ':',\n", " ';',\n", " '=',\n", " '?',\n", " '[',\n", " ']',\n", " '_',\n", " 'a',\n", " 'b',\n", " 'c',\n", " 'd',\n", " 'e',\n", " 'f',\n", " 'g',\n", " 'h',\n", " 'i',\n", " 'j',\n", " 'k',\n", " 'l',\n", " 'm',\n", " 'n',\n", " 'o',\n", " 'p',\n", " 'q',\n", " 'r',\n", " 's',\n", " 't',\n", " 'u',\n", " 'v',\n", " 'w',\n", " 
# Cut the corpus into overlapping windows of `maxlen` characters, advancing
# `step` characters each time; the character right after each window is the
# prediction target.
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('num sequences:', len(sentences))

print('Vetorizando...')
# One-hot encode inputs (sequence, position, character) and targets
# (sequence, character). The builtin `bool` is used as dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24, where it
# raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
"model.add(Dropout(0.2))\n", "model.add(LSTM(512, return_sequences=False))\n", "model.add(Dropout(0.2))\n", "model.add(Dense(len(chars)))\n", "model.add(Activation('softmax'))\n", "\n", "model.compile(loss='categorical_crossentropy', optimizer='rmsprop')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T16:29:38.295210Z", "start_time": "2020-11-16T16:29:38.282892Z" } }, "outputs": [], "source": [ "def sample(a, temperature=1.0):\n", " # helper function to sample an index from a probability array\n", " a = np.exp(np.log(a) / temperature)\n", " a /= a.sum() +.001\n", " try:\n", " sp = np.argmax(st.multinomial.rvs(1,a,1))\n", " \n", "# sp = np.argmax(np.random.multinomial(1, a, 1))\n", " except ValueError as e:\n", " print(a[:-1].sum(), len(a),a)\n", " raise(e)\n", " return sp" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T17:45:42.714540Z", "start_time": "2020-11-16T16:29:39.558393Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "196/196 [==============================] - 817s 4s/step - loss: 2.8547\n", "Epoch 2/5\n", "196/196 [==============================] - 902s 5s/step - loss: 2.2621\n", "Epoch 3/5\n", "196/196 [==============================] - 880s 4s/step - loss: 2.0057\n", "Epoch 4/5\n", "196/196 [==============================] - 958s 5s/step - loss: 1.8301\n", "Epoch 5/5\n", "196/196 [==============================] - 980s 5s/step - loss: 1.6950\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "model.fit(X, y, batch_size=1024, epochs=5)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2020-11-16T18:44:39.262757Z", "start_time": "2020-11-16T18:41:26.466155Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "----- diversity: 0.2\n", "----- 
Generating with seed: \"ave piped to him far\"\n", "ave piped to him far44 44 4ne4s the 444--4ne 44444444 the 4ar44 of 4ne4s 4now 4ne44s 4nou4s 44 4n44 4444 4ne 44444 4now 4ne4s the 444--the 44444 4f the 44ti4n of the 444--4noth44g 4444 44 4ne 444--the 444--4n4444 4ne4s 4ne 44ough 4ne4s and 44444 4ne 4ut444\n", "4n44 4ne4s 4ne4s and 444--the 4a4t of 4444444444444 4n44 the 44444 4444 44 44 4ne 4one44 44 4n4444444444 4ne4s 444 4444444444444 4ne 4444--the 444 44t of 4ne4s and\n", "\n", "----- diversity: 0.5\n", "----- Generating with seed: \"ave piped to him far\"\n", "ave piped to him far many his prowed and the bart of other is the sance of the same the same of presist strunged the sciunce of the grom of the stranges and regarding to other and strong have and pleass and the 4man his one an will would the not one strong of the can of the strung the can of the post of the sensition, of the post of the and intellect and consting to the manning and presention of the great the onding \n", "\n", "----- diversity: 1.0\n", "----- Generating with seed: \"ave piped to him far\"\n", "ave piped to him farte, hence and mlowure treins craricble dees 4o the grest in the reast the\n", "modere to be loge on\n", "utbuts, (in freed be my of accorspales \"out impreberons not itseln\n", "and\n", "naces.;\n", "and there is obst-and cansing, of perhaps they cermand willd; with\n", "who or the obdict\n", "of whines, in other tod of ourmel senk in unhers- i kas the oness har obbercance sides of its obn oneple\n", "senvition of pood.\n", "7] decn: vood=--t\n", "\n", "----- diversity: 1.2\n", "----- Generating with seed: \"ave piped to him far\"\n", "ave piped to him farwing of denind:--withligging in\"ither elep.\" enmortus someridicas. they than good and slywitht as intever, longuryers liktem, which he _eppecemang: at even qhivan. \"up,--is gourd wordde are lay--ars and spinboud not le evenince recepsation\n", "to same by leng! 
# Pick one random window of the corpus and use it as the seed for every
# sampling temperature, so the outputs can be compared side by side.
start_index = random.randint(0, len(text) - maxlen - 1)
seed_stop = start_index + maxlen

for diversity in (0.2, 0.5, 1.0, 1.2):
    print()
    print('----- diversity:', diversity)

    # `sentence` is the sliding input window; `generated` accumulates the
    # seed plus everything produced so far.
    sentence = text[start_index:seed_stop]
    generated = sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for _ in range(400):
        # One-hot encode the current window as a batch of size one.
        x = np.zeros((1, maxlen, len(chars)))
        positions = np.arange(len(sentence))
        columns = [char_indices[c] for c in sentence]
        x[0, positions, columns] = 1.

        # Predict the next-character distribution and sample from it.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        # Slide the window forward by one character.
        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()
"apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "nbTranslate": { "displayLangs": [ "*" ], "hotkey": "alt-t", "langInMainMenu": true, "sourceLang": "en", "targetLang": "fr", "useGoogleTranslate": true }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }