{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "%pylab inline\n", "\n", "from nltk.corpus import stopwords\n", "import wikipedia\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.decomposition import TruncatedSVD" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "figsize(10, 8)\n", "plt.style.use(['dark_background'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "stopw = stopwords.words(\"portuguese\") +\\\n", " stopwords.words(\"english\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## tf-idf" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "wikipedia.set_lang(\"pt\")\n", "text = wikipedia.page(\"Alan_Turing\").content" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(61, 664)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf = TfidfVectorizer(stop_words=stopw)\n", "\n", "X = tfidf.fit_transform(text.splitlines())\n", "X.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<61x664 sparse matrix of type ''\n", "\twith 862 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "computação\n", "cheshire\n", "junho\n", "ciência\n", "influente\n", "algoritmo\n", "east\n", "lógico\n", "desenvolvimento\n", "desempenhando\n" ] } ], "source": [ "ft_name = tfidf.get_feature_names()\n", "\n", "top_10_tfidf_sklearn = X[0].transpose().toarray().argsort(axis=0)[::-1]\n", "for i in top_10_tfidf_sklearn[:10]:\n", " print(ft_name[i[0]])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlMAAABYCAYAAADRGXICAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAGE9JREFUeJzt3XtQFFe+B/AvM4MPfABiZBJAGFcxGI2OKJCwbqKSRGJtzN7SlaprSa1erOSa7E12q4Syarf8Yzdrbt0qsSpVppZQCbdKC9kkrMRklfisu7sBx2V4BUZA2QRWeexFSYy5Pphz/9AZB+iZ7pnumZ6B76eqC+g+3efXvz7dc5jpOR0FQICIiIiIAmLQOwAiIiKiSMbOFBEREZEK7EwRERERqcDOFBEREZEK7EwRERERqcDOFBEREZEKIetMvfDCC3A4HOjs7ERxcXGoqo1o5eXl6O/vR0tLi3tefHw8amtr0dHRgdraWsTFxbmXHTx4EJ2dnWhqaoLVatUj5LCWnJyMM2fOoK2tDa2trfj5z38OgDkN1NSpU1FfX4/Gxka0trZi3759AIC0tDTU1dWho6MDlZWViI6OBgBMmTIFlZWV6OzsRF1dHVJTU3WMPjwZDAY0NDTgk08+AcBcqtXd3Y3m5mbY7XbYbDYAPN/ViI2NxR/+8Ae0t7ejra0NOTk5zKcHEezJYDCIrq4uYbFYRHR0tGhsbBQZGRlBrzfSpzVr1gir1SpaWlrc895++21RXFwsAIji4mKxf/9+AUDk5+eLzz77TAAQ2dnZoq6uTvf4w20ym83CarUKAGLmzJni0qVLIiMjgzlVMc2YMUMAECaTSdTV1Yns7Gxx9OhRsXXrVgFAHDp0SLzyyisCgHj11VfFoUOHBACxdetWUVlZqXv84Ta9+eab4vDhw+KTTz4RAJhLlVN3d7dISEgYNY/ne+DTBx98IHbu3CkAiOjoaBEbG8t8PpyCX0lOTo44ceKE+++SkhJRUlKi945HxJSamjqqM+VwOITZbBbA/c6Bw+EQAMS7774rCgoKJMtxkp7++Mc/iry8POZUg2n69Onib3/7m8jKyhKDg4PCaDQKYPS5f+LECZGTkyMACKPRKAYHB3WPO5ympKQkcerUKbF27Vp3Z4q5VDdJdaZ4vgc2zZo1S1y5cmXcfObz/hSSj/mSkpLQ09Pj/ru3txdJSUmhqHrCSUxMRF9fHwCgr68P8+bNA8Ac+ys1NRVWqxX19fXMqQoGgwF2ux0DAwP4/PPPcfnyZdy4cQMjIyMARufMM58jIyMYHh5GQkKCbrGHm9LSUuzZswdOpxMAkJCQwFyqJIRAbW0tLl68iKKiIgC8hgZqwYIFGBwcxPvvv4+GhgaUlZUhJiaG+XwgJJ2pqKiocfOEEKGoetJgjpWbMWMGPvroI7zxxhv49ttvvZZjTuU5nU5YrVYkJycjKysLGRkZ48q4csZ8erdx40YMDAygoaHBPc9XvphLZXJzc5GZmYn8/Hzs3r0ba9as8VqWOfXNZDJh5cqVOHToEFauXInvvvsOJSUlXstPtnyGpDPV29uLlJQU99/Jycm4evVqKKqecPr7+2E2mwEAZrMZAwMDAJhjpUwmEz766CMcPnwY1dXVAJhTLQwPD+PcuXPIyclBXFwcjEYjgNE588yn0WhEbGwshoaGdIs5nOTm5uKll15Cd3c3KisrsW7dOpSWljKXKl27dg0AMDg4iOrqamRlZfF8D1Bvby96e3tx4cIFAMCHH36IlStXMp8PhKQzZbPZsGjRIqSlpSE6OhoFBQWoqakJRdUTTk1NDQoLCwEAhYWFOHbsmHv+9u3bAQDZ2dkYHh52v/VKD5WXl6O9vR0HDhxwz2NOAzN37lzExsYCAKZNm4a8vDy0t7fj7Nmz2Lx5M4Dx+XTlefPmzThz5ow+gYehvXv3IiUlBRaLBQUFBThz5gy2bdvGXKoQExODmTNnun9//vnn0drayvM9QP39/ejp6UF6ejoAYP369Whra2M+PYTk5qz8/Hxx6dIl0dXVJfbu3av7zWKRMB05ckRcvXpV3LlzR/T09IgdO3aIOXPmiFOnTomOjg5x6tQpER8f7y7/zjvviK6uLtHc3CwyMzN1jz/cptzcXCGEEE1NTcJutwu73S7y8/OZ0wCnZcuWiYaGBtHU1CRaWlrEr371KwFAWCwWUV9fLzo7O0VVVZWYMmWKACCmTp0qqqqqRGdnp6ivrxcWi0X3fQjH6ZlnnnHfgM5cBj5ZLBbR2NgoGhsbRWtrq/t1h+d74NPy5cuFzWYTTU1Norq6WsTFxTGfD6aoB78QERERUQA4AjoRERGRCuxMEREREanAzhQRERGRCuxMEREREanAzhQRERGRCqo6Uy+88AIcDgc6OztRXFwsW941nD9pg/nUFvOpLeZTW8yntphPbTGfAY6pYDAYRFdXl7BYLCI6Olo0NjaKjIwMn+vYbDbdx4KYSBPzyXyG88R8Mp/hPDGfzKeWU8DvTGVlZaGrqwvd3d24e/cuKisrsWnTpkA3R0RERBSRTIGuKPVE6Ozs7HHlioqKsGvXLgDAk0ufxCVbV6BVho8Z04Hvvtc7CsQ+Mtt3PmOmAbf+7+HPsaZPA76XmK9UmORBK+58xkwHbt3fr5GEGTD+73c6Rxa4kTkzYBwaE7+/xy3A4yzbPoNRv9o2rYUgxaBZPkMhZjqcUwww3FDZ9vyu18u1ToLPfPqxHV3MmH7/px7XXy/HMKD2GUh78Lg+u02JBu7cDUr7ik+djXnz5smWC7gzpfSJ0GVlZSgrKwMAXLJ1YXeW96dMKyGeWo6oL5pUbSMY2wqE4cnH4Wx2uOPwN56oVUshLrbqvh8TRaTmMZhxh0tOlMZhfGIxRr68FIKIlNEqf65zPRz4u0+B5GDsOq5rpdrtakVJ3Uqu657LAt2fW/+SjZiP633GJ5U/rYWqjUrVY0pOwr3efwBQtq9yufZc/rsL8veDA0DAj5PJycnBvn37sGHDBgBAScn9TtL+/fu9rqNFZ4qIiIgoFH53oRirV6+WLRfwPVM2mw2LFi1CWloaoqOjUVBQgJqamkA3R0RERBSRAu5MjYyM4LXXXsPJkyfR3t6OqqoqtLW1aRkbhQHj2cf0DoFo0uL5RxQZZDtT5eXl6O/vR0tLi3tefHw8amtrcfDgQXz11VdYtWoV3nrrrYCDiLI+gSjrE16XhQOTJVXvEBRTmjPj4oWyZUbWXg24TsPSxyWXhcsxnYj8ya2v887btlzzXD9dx9iT1Dyp9X2VU8Of7SotG8o26xmT0vMPAIxL0sfNUxp3sI6FKya5tuYvf9t5JNIjbn+vB74M/2uO2nDG8faaZVieoXrbvl7DlJDtTH3wwQfu+6JcSkpKcPr0aaSnp+P06dPu+6UCYVy8EML+JYT9S8nl3uaH2r3ur/xex1dnRc0y1+RtPaU5i7p9R1E5uZi81ena/thlao+pt/0PJ2Pj0zJeX9vyJ7eu8864eCGMixYo2pZrnuvn2DZkXLwQtx+bKRufsH8JZ2vgN8T6yoGv+sdSeg74yuvtjfL3U4w1Nn7PbfgTv6eRto5x85S2BzXHwkXqmLhi8nWNV8KUNn/U3/6280jjel1UUk5Lvur0XGZakCZbf+zhOk3i89yG1PlqXLwQzunRsuvKXZOl9t2vawkU3ICempqK48ePY9myZQAAh8OBZ599Fn19fTCbzTh37hwef1y+B8cb0ImIiChSBPUG9MTERPT19QEA+vr6FI3BQDQZnLzaqHcIREQUYkF/0HFRURFsNhtsNhtiH5kd7OomnDsb/P8IIRJ9vykLwOj9vbc+U69wAvL9piy88NgKvcMIK/4ew0g75p5cbZgin6sdTsRjGshritw6avMktX6kvfbxYz4iIiIiCUH9mK+mpgaFhYUAgMLCQhw7diyQzRARERFFPNnO1JEjR/DFF19g8eLF6OnpwY4dO7B//34899xz6OjowHPPPedz1HOiUOC9SkREpBfZztSePXvgcDjQ1dWF4eFhzJw5E0NDQ9iyZQv+/ve/Y/78+Th69Cji4uJCES+RJN6rREREepHtTN27dw+//OUvsWTJEuTk5GD37t3IyMjQdKwpIiIiokgl25nq6+uD3W4HANy8eRPt7e1ISkrCpk2bUFFRAQCoqKjAyy+/HNxIyW98FAURTSa85pFe/LoBPTU1FVarFfX19UEba8qUNn/UpLexcWgRk2sbWjwuwJSS7HWZc0/C/TLJSaP2w/3zUbPq+sfF4yM/oTymauoxpSTDucbq/t3bdoPdVpW0O6n5o9Z7cOz94dr3UPEnf3Kx3c3Tf2gFqfMqXNu91PXDW46VbNt9bUlJxrcFyh8novQaO3bZ2L/d1zw/zhd/4wvG+q7joOTcCyQWuWt9OLzW+iJ3jbid//Dbdq59DfWxVjQ0AgDMmDED58+fx29/+1tUV1fj+vXriI+Pdy8fGhrCnDlzxq1XVFSEXbt2AQBSHknFNsu/BxQokZ4GXnsa8975q95hhIVg5uLG9qcQ999fBGXbROHqf//tKSS8F17tPpyveQOvPQ0A7vjE08sR9demUcu1il3p0AiKOlMmkwnHjx/HyZMnceDAAQCBjTXFcaaIiIgoUmg6zlR5eTna29vdHSmAY00REU1WHIqEaDSTXIHc3Fxs374dzc3N7hvR9+7di/3796Oqqgo7d+7E119/jS1btgQ9WCIi0h+HIiEaTfadqb/85S+IiorC8uXLYbVaYbVa8ac//QlDQ0PIy8tDeno68vLycP369aAFOfLsyqBtO9h1RK1aGpTtAvc/J56olBwPNcdM7fEORZvUk2b5z1qmQTQq6pcoO/LsSp/rji2rNaV1E43i5VzS4otMngJt/1q8HunV/rWoV7YzNXXqVNTX16OxsRGtra3Yt28fACAtLQ11dXXo6OhAZWUloqOj/a/8Sfnn+QGA8VwDAOD2i8F78KGrDq2Ji62K99NfnjfcaclXvMYl6bLzlBwnVx3e6lJyPPw9ZoYnH3dPrnUDPTbBai9yeRlbTm09LsYl6aOOo9T+jVvHRw7d8y60qIrTF3+OgWdZ47kGn+uOLeuNz/1WGIu3ZVIPfg3WdcTFn4fVmhakKW6rE41u++vlXBL2LxVvwt/26a2tSl3j5V6PjIsXysbkqk/qdcbnthPVjSag9vUAUNCZun37NtatW4cVK1ZgxYoV2LBhA7Kzs/H222/jwIEDSE9Px/Xr17Fz506/K3c2O0b97etr/gAw9TOb33Uo4fnukSkl2WscUvPHfXX+wfqmpIfjnTibHX5t0xvJ+pN8j6siFZ9ceV/HJWr4JkwpyRBPPfwvZKStY1T5addu+awDeHjsfeVGC+7j8WC/XNPYOLSu0/Pn2Fh8lffMi1Q591eoFcTtq/6x64+0dYw7jmPX9Van1DFUclxNSY+Na79aDtchd27Irv8gfs+27rlPzmbHqGWueVJlvc3zlqPpxy6MKhO1aqnstr2Ra3cuMf+45XWZK5fu9hMzzWdbdZF618QzHqX7YTInel+m8jh7xiX3SYLneeB5bfFcrrQura57xsR5irflGbtUTErNsPdIzpc6rq59jfrmps+YPHMfNSxd1ms7SPD+BBZ/2pua1wPFQyMAwPTp0/HnP/8Zr776Kj799FOYzWaMjIwgJycH+/btw4YNG3yuz2/zERERUaTQ9Nt8BoMBdrsdAwMD+Pzzz3H58mXcuHEDIyMjAIDe3l4kJSWpi5hokuis4H0xREpoea7wvKNgUtSZcjqdsFqtSE5ORlZWFjIyMsaVEUL6Da6ioiLYbDbYbDbEPvpwUE9j+g8CDFmacaEFxoWWUfNMltRx9WhdrxyR6/1bL6bUlHHzXPG5f47ZJ62ZLKle6/A3V8HMbTC2rXSbWte9qLBhXM496zBZUjWvX2k70jPPgZR37de99ZmqYvf3PAvFdUSujmDHcG+999HkfX1covi8ksm5Mf0HWFSozb2J99ZnurfluV+eI2fL8bVfxvQfSC7X8vo99rogVZe3+ny9Dkluy8u+qtkfz/i1zpVx0QL37/7uq1b8+pgPAH7961/j1q1bKC4u5sd8RERENGFp9jHf3LlzERsbCwCYNm0a8vLy0N7ejrNnz2Lz5s0AOGgnERERTV6yg3Y++uijqKiogNFohMFgQFVVFT799FO0tbWhsrISv/nNb2C321FeXh6KeImIiIjCiuw7Uy0tLVi5ciWsVivu3r2LrKz7Y5EIIdyT0+n0es9UsBiWj79vy9f8UNQfqrql+KpbanyPUMeqZ27kGJ9YLDlfqzam5757G9slXHjmXu6cUpLHu3ne7/NRIhjH6tutOZrVGW7nkRbxKNnGnQ3yH7O42pJre3rlKhivTT6v7yruNTIsz9A8T0q3F0i93u7l+m5zdkDb03LfFd8z9eabb2LVqlWYPXs2fvzjH+Po0aP4+OOPcfToURw6dAhNTU149913fW6D90wRERFRpNB0aISkpCRs3LgR7733nnveunXr8OGHHwIAKioq8PLLLwcYKhEREVHwBesh3Yo6U6WlpdizZw+cTicAICEhgeNMEVFECNbFk4giT7Ae0i3bmdq4cSMGBgbQ0PBwvI+oqKhx5RSNM/XIbBWhEhH5L1gXT/LtJ22DeodAk0THu8qfKxksst/my83NxUsvvYQXX3wR06ZNw+zZs1FaWoq4uDgYjUaMjIwgOTkZV69elVy/rKwMZWVlAO7fM0VERBNf9ZJH9A6BPBjPPoaRtdKv05Eu/ZUL8oWCTPadqb179yIlJQUWiwUFBQU4c+YMtm3bNunGmeo86PsbOUREkW7xxWi9Q6AgmagdKQDoe+NpvUNQds+UlOLiYvziF79AZ2cnEhISJvw4U4v+o07vEIiIgurSqrt6h0DkN3PpX/UOQf5jPk/nz5/H+fPnAQDd3d3Izs4OSlBEREREkULRO1Pd3d1obm6G3W6HzWYDAMTHx6O2thYdHR2ora1FXFxcUAMlIiIi5fhN1tBR/DHf2rVrYbVa3YNXlZSU4PTp00hPT8fp06dRUsLBOImIiMIFv8kaOgHfM7Vp0yZUVFQA4KCdRERENHkp6kwJIVBbW4uLFy+iqKgIAJCYmIi+vj4AQF9fH+bNmye5rtQ4U9cLn1IVdCDrq63z3jr/nvn1bUFkfftPbX6Iwo2e56CS8+n7TdqNjcPzlyazKOsTeoegrDOVm5uLzMxM5OfnY/fu3VizZo3iCsrKyrB69WqsXr0aw4PfAADiK74ILNoHxq4/9qIpdRH1XOfWT/y/cd5w1+lX+VmVkfXtP7XHJFgCOVbBEmkd5MlOz3NQ6nwa236MtwN7OLzc9c2f9Yh5mQiiHjyNRU+KOlPXrl0DAAwODqK6uhpZWVno7++H2WwGAJjNZgwMDCiqUOsXx1s/yR530ZS7iBpvK+sYecZq+B+7/8FFsNsvyj/YMRSUHqtQCPcOcrgcs1CTuqaEOhdK3rke234Md0a3baXXRiXtUGpbnuspzc9kaFPBOK8nQ97CibPZoXcI8p2pmJgYzJw50/37888/j9bWVtTU1KCwsBCAf4N2xlTXqwh3tJs/zfFrezd/ev8/kKmf2RSV1zLWSOOMHv/IICmunAbL1M9sQa9jolB6zCYaqfM01LkQJv/rG7uO2uuN5wu43LaU5ifS25Re145Izxv5LwqAz/eaLRYLqqurAQAmkwlHjhzBW2+9hTlz5qCqqgrz58/H119/jS1btuD69es+K7tk68LuLH7rj4iIiMLf7y4Uu0cx8EX2nanu7m4888wz6OzshNFoxLZt25CTkwMhBJzO+29TO51Orw86JiIiIprIFN0zdfDgQZw4cQIZGRlYvnw52tvbOc4UERERERR0pmbNmoUf/ehH7mfv3b17F8PDwxxnKsx9VbVM7xCIJo3L/xV+9/XxGkAUOrKdqQULFmBwcBDvv/8+GhoaUFZWhpiYGMXjTE0UQzsiaxyX1J+26B0CUVBc+c/wOxdjrgY8/nHQKL0GhGM+ifxxe6P+356UvQE9MzMTdXV1yM3NxYULF1BaWopvvvkGr7/+OuLj493lhoaGMGfOnHHrFxUVYdeuXQCApUuXorW1Vds9mMTmzp2Lf/7zn3qHMWEwn9piPrXFfGqL+dTWRM1namqq4jeLhK8pMTFRdHd3u//+4Q9/KI4fPy4cDocwm80CgDCbzcLhcPjcDgBhs9lky3BSPjGfzGc4T8wn8xnOE/PJfGo5yb433d/fj56eHqSnpwMA1q9fj7a2toDHmSIiIiKaSExKCr3++us4fPgwpkyZgitXruBnP/sZDAYDqqqqsHPnTvc4U0RERESTjaLOVFNTk+SgVXl5eX5V9vvf/96v8uQb86kt5lNbzKe2mE9tMZ/amuz5lL0BnYiIiIi8C7/v8xIRERFFEHamiIiIiFRgZ4qIiIhIBXamiIiIiFRgZ4qIiIhIBXamiIiIiFRgZ4qIiIhIhf8HD2i6GLIOc/wAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, ax = subplots(figsize=(10, 8))\n", "ax.matshow(X.todense())\n", "savefig(\"../images/lsa.png\", transparent=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(61, 664)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "TruncatedSVD(algorithm='randomized', n_components=61, n_iter=1000,\n", " random_state=None, tol=0.0)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lsa = TruncatedSVD(n_components=61, n_iter=1000)\n", "\n", "lsa.fit(X)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(61, 664)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lsa.components_.shape" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'desenvolveu'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ft_name = tfidf.get_feature_names()\n", "ft_name[234]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "paragrafo: 0\n", "turing\n", "máquina\n", "alan\n", "prêmio\n", "memorial\n", "guerra\n", "enigma\n", "bletchley\n", "park\n", "computação\n", "--------------------\n", "paragrafo: 1\n", "inglês\n", "isbn\n", "cambridge\n", "ed\n", "history\n", "massachusetts\n", "mit\n", "press\n", "262\n", "alan\n", "--------------------\n", "paragrafo: 2\n", "morte\n", "governo\n", "ministro\n", "pedido\n", "perdão\n", "anos\n", "cianeto\n", "britânico\n", "devido\n", "reino\n", "--------------------\n", "paragrafo: 3\n", "alan\n", "turing\n", "memorial\n", "prêmio\n", "pai\n", "genealogy\n", "mathematics\n", "project\n", "família\n", "havia\n", "--------------------\n", "paragrafo: 4\n", "computação\n", "ser\n", "poderia\n", "artificial\n", "sistema\n", "computabilidade\n", "respostas\n", "teórica\n", "fazer\n", "ciência\n", "--------------------\n", "paragrafo: 5\n", "morte\n", "pai\n", "índia\n", "família\n", "havia\n", "julius\n", "sido\n", "stoney\n", "trabalho\n", "maçã\n", "--------------------\n", "paragrafo: 6\n", "morte\n", "maçã\n", "cianeto\n", "alan\n", "bletchley\n", "park\n", "junho\n", "memorial\n", "1954\n", "meio\n", "--------------------\n", "paragrafo: 7\n", "inglês\n", "biografia\n", "mactutor\n", "genealogy\n", "mathematics\n", "project\n", "bletchley\n", "park\n", "pai\n", "família\n", "--------------------\n", "paragrafo: 8\n", "química\n", "onde\n", "inglês\n", "biografia\n", "mactutor\n", "laboratório\n", "base\n", "sobre\n", "prêmio\n", "interpretado\n", "--------------------\n", "paragrafo: 9\n", "prêmio\n", "máquina\n", "turing\n", "dados\n", "estados\n", "segura\n", "transatlânticos\n", "transmissão\n", "unidos\n", "projeto\n", "--------------------\n", "paragrafo: 10\n", "forma\n", "dados\n", "estados\n", "segura\n", "transatlânticos\n", "transmissão\n", "unidos\n", "projeto\n", "bletchley\n", "park\n", "--------------------\n", "paragrafo: 11\n", "referências\n", "ver\n", "homofobia\n", "bibliografia\n", "cinebiografias\n", "marian\n", "rejewski\n", "1947\n", "morte\n", "memorial\n", "--------------------\n", "paragrafo: 12\n", "homofobia\n", "dedicação\n", "externas\n", "ligações\n", "ver\n", "desculpa\n", "oficiais\n", "pedidos\n", "referências\n", "marian\n", "--------------------\n", "paragrafo: 13\n", "ver\n", "dedicação\n", "externas\n", "ligações\n", "cinebiografias\n", "bibliografia\n", "consagração\n", "marian\n", "rejewski\n", "alan\n", "--------------------\n", "paragrafo: 14\n", "cinebiografias\n", "ver\n", "carreira\n", "juventude\n", "homofobia\n", "marian\n", "rejewski\n", "externas\n", "ligações\n", "desculpa\n", "--------------------\n", "paragrafo: 15\n", "consagração\n", "homofobia\n", "ver\n", "marian\n", "rejewski\n", "cinebiografias\n", "bibliografia\n", "biografia\n", "mactutor\n", "inglês\n", "--------------------\n", "paragrafo: 16\n", "dedicação\n", "carreira\n", "juventude\n", "homofobia\n", "referências\n", "bibliografia\n", "externas\n", "ligações\n", "consagração\n", "máquina\n", "--------------------\n", "paragrafo: 17\n", "bibliografia\n", "desculpa\n", "oficiais\n", "pedidos\n", "marian\n", "rejewski\n", "homofobia\n", "externas\n", "ligações\n", "ver\n", "--------------------\n", "paragrafo: 18\n", "referências\n", "marian\n", "rejewski\n", "cinebiografias\n", "consagração\n", "externas\n", "ligações\n", "desculpa\n", "oficiais\n", "pedidos\n", "--------------------\n", "paragrafo: 19\n", "marian\n", "rejewski\n", "dedicação\n", "ver\n", "carreira\n", "juventude\n", "desculpa\n", "oficiais\n", "pedidos\n", "prêmio\n", "--------------------\n", "paragrafo: 20\n", "cinebiografias\n", "dedicação\n", "desculpa\n", "oficiais\n", "pedidos\n", "homofobia\n", "bibliografia\n", "forma\n", "dados\n", "estados\n", "--------------------\n", "paragrafo: 21\n", "desculpa\n", "oficiais\n", "pedidos\n", "referências\n", "consagração\n", "ver\n", "carreira\n", "juventude\n", "dedicação\n", "1954\n", "--------------------\n", "paragrafo: 22\n", "junho\n", "bletchley\n", "park\n", "computação\n", "1954\n", "cheshire\n", "meio\n", "dados\n", "estados\n", "segura\n", "--------------------\n", "paragrafo: 23\n", "prêmio\n", "morte\n", "máquina\n", "1950\n", "computadores\n", "condenado\n", "crescer\n", "declarado\n", "equivalia\n", "estrogênio\n", "--------------------\n", "paragrafo: 24\n", "morte\n", "7538\n", "2200\n", "david\n", "invention\n", "knew\n", "leavitt\n", "man\n", "much\n", "phoenix\n", "--------------------\n", "paragrafo: 25\n", "interpretado\n", "prêmio\n", "castração\n", "1996\n", "2014\n", "alex\n", "além\n", "ator\n", "benedict\n", "breaking\n", "--------------------\n", "paragrafo: 26\n", "prêmio\n", "bletchley\n", "park\n", "memorial\n", "laboratório\n", "turing\n", "ser\n", "comprovados\n", "dedicava\n", "formado\n", "--------------------\n", "paragrafo: 27\n", "morte\n", "memorial\n", "computação\n", "alan\n", "londres\n", "genealogy\n", "mathematics\n", "project\n", "ciência\n", "mathison\n", "--------------------\n", "paragrafo: 28\n", "memorial\n", "alan\n", "genealogy\n", "mathematics\n", "project\n", "pode\n", "mãe\n", "acidental\n", "cena\n", "conduzida\n", "--------------------\n", "paragrafo: 29\n", "computação\n", "biografia\n", "mactutor\n", "prêmio\n", "ciência\n", "mathison\n", "bletchley\n", "park\n", "londres\n", "stoney\n", "--------------------\n", "paragrafo: 30\n", "bletchley\n", "park\n", "sistema\n", "sistemas\n", "artificial\n", "filhos\n", "inglaterra\n", "john\n", "pode\n", "computador\n", "--------------------\n", "paragrafo: 31\n", "sistema\n", "sistemas\n", "stoney\n", "ser\n", "prêmio\n", "formal\n", "operador\n", "símbolos\n", "meio\n", "artificial\n", "--------------------\n", "paragrafo: 32\n", "genealogy\n", "mathematics\n", "project\n", "prêmio\n", "inglês\n", "códigos\n", "eletromecânica\n", "inteligência\n", "chefe\n", "centro\n", "--------------------\n", "paragrafo: 33\n", "memorial\n", "códigos\n", "eletromecânica\n", "prêmio\n", "alemães\n", "centro\n", "definições\n", "encontrar\n", "especializado\n", "frota\n", "--------------------\n", "paragrafo: 34\n", "homossexualidade\n", "aceitou\n", "1952\n", "algumas\n", "alguns\n", "alternativa\n", "aniversário\n", "antes\n", "aparente\n", "apesar\n", "--------------------\n", "paragrafo: 35\n", "máquina\n", "bletchley\n", "park\n", "turing\n", "química\n", "04\n", "1983\n", "books\n", "burnett\n", "intelligence\n", "--------------------\n", "paragrafo: 36\n", "01202\n", "agar\n", "government\n", "jon\n", "machine\n", "revolutionary\n", "2003\n", "978\n", "computer\n", "computação\n", "--------------------\n", "paragrafo: 37\n", "ainda\n", "acidental\n", "55\n", "alemães\n", "53169\n", "afirmado\n", "42\n", "510060\n", "01202\n", "2003\n", "--------------------\n", "paragrafo: 38\n", "53169\n", "afirma\n", "510060\n", "2003\n", "42\n", "acompanhar\n", "1996\n", "alemã\n", "abstração\n", "12º\n", "--------------------\n", "paragrafo: 39\n", "acidental\n", "55\n", "53169\n", "aceitou\n", "agar\n", "afirmado\n", "afirma\n", "510060\n", "04\n", "7538\n", "--------------------\n", "paragrafo: 40\n", "afirma\n", "ace\n", "acidentes\n", "55\n", "01202\n", "978\n", "memorial\n", "000\n", "1912\n", "1881\n", "--------------------\n", "paragrafo: 41\n", "ainda\n", "agar\n", "55\n", "aceitou\n", "1912\n", "alemanha\n", "24\n", "7538\n", "memorial\n", "000\n", "--------------------\n", "paragrafo: 42\n", "acidentes\n", "acidental\n", "55\n", "alegre\n", "978\n", "alan\n", "12\n", "ajudou\n", "12º\n", "01202\n", "--------------------\n", "paragrafo: 43\n", "aceitou\n", "ace\n", "acidental\n", "alan\n", "262\n", "acompanhar\n", "42\n", "alemães\n", "2009\n", "12\n", "--------------------\n", "paragrafo: 44\n", "55\n", "978\n", "alemães\n", "7538\n", "alan\n", "alemã\n", "afirmativo\n", "1998\n", "alemanha\n", "acompanhar\n", "--------------------\n", "paragrafo: 45\n", "55\n", "alemães\n", "alan\n", "alegre\n", "agar\n", "afirma\n", "abstração\n", "ajudaria\n", "2009\n", "2007\n", "--------------------\n", "paragrafo: 46\n", "afirmado\n", "55\n", "alemã\n", "acompanhar\n", "01202\n", "afirmativo\n", "acordo\n", "isbn\n", "alan\n", "2003\n", "--------------------\n", "paragrafo: 47\n", "alegre\n", "aceitou\n", "ajudaria\n", "978\n", "afirmado\n", "alemanha\n", "1998\n", "abstração\n", "55\n", "42\n", "--------------------\n", "paragrafo: 48\n", "aceitou\n", "acidentes\n", "ainda\n", "alemães\n", "55\n", "alemanha\n", "afirmado\n", "7538\n", "01202\n", "afirmativo\n", "--------------------\n", "paragrafo: 49\n", "978\n", "aceitou\n", "ainda\n", "53169\n", "agar\n", "acompanhar\n", "ajudou\n", "2007\n", "1912\n", "55\n", "--------------------\n", "paragrafo: 50\n", "aceitou\n", "ajudaria\n", "acompanhar\n", "alemã\n", "978\n", "37\n", "afirmado\n", "ainda\n", "7538\n", "000\n", "--------------------\n", "paragrafo: 51\n", "acordo\n", "acompanhar\n", "ajudou\n", "alemanha\n", "24\n", "ajudaria\n", "7538\n", "1983\n", "1996\n", "2200\n", "--------------------\n", "paragrafo: 52\n", "abstração\n", "acordo\n", "alemã\n", "ajudaria\n", "aceitou\n", "afirmado\n", "978\n", "55\n", "11\n", "ajudou\n", "--------------------\n", "paragrafo: 53\n", "alemã\n", "alemanha\n", "acompanhar\n", "ajudaria\n", "alegre\n", "acidentes\n", "2014\n", "2200\n", "510060\n", "aceitou\n", "--------------------\n", "paragrafo: 54\n", "ajudaria\n", "acidentes\n", "alemã\n", "ajudou\n", "510060\n", "afirma\n", "1912\n", "2200\n", "12º\n", "aceitou\n", "--------------------\n", "paragrafo: 55\n", "alegre\n", "ajudou\n", "alemã\n", "acordo\n", "afirmativo\n", "ajudaria\n", "aceitou\n", "acompanhar\n", "510060\n", "ace\n", "--------------------\n", "paragrafo: 56\n", "alemã\n", "acordo\n", "alegre\n", "alemanha\n", "acidentes\n", "510060\n", "afirma\n", "37\n", "12º\n", "2007\n", "--------------------\n", "paragrafo: 57\n", "ajudaria\n", "afirmativo\n", "acompanhar\n", "alemanha\n", "acordo\n", "alegre\n", "acidentes\n", "1996\n", "24\n", "510060\n", "--------------------\n", "paragrafo: 58\n", "alan\n", "ainda\n", "510060\n", "afirma\n", "10\n", "978\n", "afirmado\n", "acidentes\n", "12\n", "12º\n", "--------------------\n", "paragrafo: 59\n", "510060\n", "ace\n", "55\n", "53169\n", "memorial\n", "01202\n", "alemanha\n", "7538\n", "000\n", "24\n", "--------------------\n", "paragrafo: 60\n", "510060\n", "alemães\n", "acidental\n", "afirma\n", "aceitou\n", "978\n", "55\n", "ceruzzi\n", "computing\n", "modern\n", "--------------------\n" ] } ], "source": [ "for i, comp in enumerate(lsa.components_):\n", " terms_in_comp = zip(ft_name, comp)\n", " sorted_terms = sorted(terms_in_comp, key=lambda x: x[1], reverse=True)[:10]\n", " print(f\"paragrafo: {i}\")\n", " for t in sorted_terms:\n", " print(t[0])\n", " print(\"-\"*20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }