{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Topic modeling\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Getting text\n", "In this sample, texts are three ebooks with different topics from Guternburg:\n", "1. Adrift in New York (children fiction)\n", "2. Beethoven (music)\n", "3. Sandwiches (cook)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3\n" ] } ], "source": [ "\n", "# define a function to get .txt files in a folder\n", "import codecs\n", "from os import listdir\n", "def list_textfiles(directory):\n", " \"Return a list of filenames ending in '.txt' in DIRECTORY.\"\n", " textfiles = []\n", " for filename in listdir(directory):\n", " if filename.endswith(\".txt\"):\n", " textfiles.append(directory + \"/\" + filename)\n", " return textfiles \n", "\n", "# define a function to read the text in a .txt file\n", "\n", "def read_txt(filename):\n", " try:\n", " f = codecs.open(filename,'r','utf-8') #open(filename,'r')\n", " text = f.read()\n", " finally:\n", " if f:\n", " f.close()\n", " return text\n", "\n", "#import harry potter textfiles\n", "filenames = list_textfiles('Plaintexts')\n", "raw_texts = []\n", "for n in filenames:\n", " raw_texts.append(read_txt(n))\n", "print len(raw_texts)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing\n", "- remove stopwords\n", "- remove puctuation\n", "- lemmatize" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "\n", "from nltk.corpus import stopwords \n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "from nltk.tokenize import word_tokenize\n", "import string\n", "\n", "\n", "clean_texts = []\n", "for text in raw_texts:\n", " # tokenize\n", " tok = \" \".join(word_tokenize(text))\n", " \n", " #remove punctuation\n", " punctuation = set(string.punctuation) \n", " re_punc = \"\".join(i for i in tok if i not in punctuation)\n", " \n", " #remove stopwords\n", " re_sw = \" \".join([i for i in re_punc.lower().split() if i not in stopwords.words('english')])\n", " \n", " #lemmatization\n", " lemmatize = WordNetLemmatizer()\n", " le = \" \".join(lemmatize.lemmatize(i) for i in re_sw.split())\n", " \n", " clean_texts.append(le)\n", "\n" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "#vectorize text\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer \n", "n_features = 1000\n", "tf_vectorizer = CountVectorizer(min_df = 2,\n", " strip_accents = 'unicode',\n", " max_features=n_features,\n", " stop_words='english')\n", "tf = tf_vectorizer.fit_transform(clean_texts)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LDA topic modeling" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 2.78220006e-04 9.99694288e-01 2.74919906e-05]\n", " [ 9.99641689e-01 3.31086127e-04 2.72251925e-05]\n", " [ 7.62868577e-05 7.44416846e-05 9.99849271e-01]]\n" ] } ], "source": [ "from sklearn.decomposition import LatentDirichletAllocation\n", "n_topic = 3\n", "lda = LatentDirichletAllocation(n_components = n_topic, \n", " learning_method='online',\n", " max_iter=50,\n", " random_state=0)\n", "doctopic = lda.fit(tf)\n", "\n", "# topic_distribution is a distribution of the topics in each text\n", "topic_distribution = lda.transform(tf)\n", "\n", "print topic_distribution # not normalized (sum of each row is not 1)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- The list above is a probability of topic distrubition in the three texts.\n", "\n", "- Next we will try to visualize the topic distribution in a heatmap." ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAbkAAAFpCAYAAAABalYHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAExdJREFUeJzt3HuwXXV5h/HnS5K2WKgUAQeUIdBaRBEjCYmoUBFrteP0\nMoamFuq9eBtbtWK9YE1brSId26naCqVWbUWpFmac2hY0lIsKgRBDCCpeEKulKsFLsWrE+PaPvQK7\nMTk55yQnO3l9PjN7zjpr/fZav3XWZJ6stU+SqkKSpI72mfQEJEmaK0ZOktSWkZMktWXkJEltGTlJ\nUltGTpLUlpGTJLVl5CRJbRk5SVJbRk6S1Nb8SU9AUzvooINq4RELJz0NzcAta2+f9BQ0Q0cff9ik\np6AZuO2Lt7Fx48ZMZ6yR28MtPGIhq1dfN+lpaAZOXbBy0lPQDK1avXLSU9AMLFu2dNpjfVwpSWrL\nyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrL\nyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrL\nyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrL\nyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSprWlFLsmv\nJ6kkD55izDuTLN/Otj9J8vhh+aQkNydZl+SYJL+9nfccluQD05nf2HuuSLJm7PslSa6YyT52sP/3\nJHn+2PfLkqxPsmAG+/hykgN21ZwkSds33Tu5pwIfHb7OSJJ5VfVHVfWRYdXpwBuqahFwf2Cbkauq\n26tqm9HcgUOSPGkW75uOlwJnJTk4yT7AW4EXVNXd03lzknlzNC9J0jbsMHJJ9gMeAzwb+K2x9Uny\n1iS3JPkIcMjYttuSnJNkLXDalru8JM8BfhP40yTvAd4InDTc1b1kq+MuTLJhWH5GkouT/HuSzyZ5\n0xRTPhd49TbOY16Sc5NcP9x9PXdY/7YkvzosX5LkHcPys5K8fnwfVfVV4M+BNwHPA9ZX1UeH8Wck\nuSnJhiR/Nqybn+SbSf4yyXpg6dh87pPksiTPmuJcJEk7YTp3cr8G/HtVfQa4M8niYf1vAEcDDwGe\nBjxqq/fdWVXHV9X7tqyoqguADwJnVdXpwCuAq6tqUVX9xQ7msQhYATwMWJHk8O2Muwb4fpJTtlr/\nbOBbVXUCcALwu0mOBK4GThrGPGA4H4Z1V21j/28fxpwFvBwgyQOB1wGnAI8AHp3kycP4+wJXVdVx\nVXXNsG5/4F+Ad1bVO3Zw3pKkWZpO5J4KbAnV+7j3keXJwHuranNV3Q5cvtX7Lto1U7zHqqr6VlV9\nD/gkcMQUY18HnL3VuicAT0uyDlgN3A94EEPkkjxk2O9XkxwKnAh8fOsdV9UPgfOAf6uqO4fVy4DL\nq2rj8OjyQkY/H4DvA5dstZt/Ac6rqgu3NfkkZyZZk2TNHRvvmOI0JUlTmT/VxiQHAo8DHpakgHlA\nJTlrGvv+310wv3GbxpY3M8Xcq+ryJK8DHjm2OsCLqurSrccPvwjyREZ3bgcyeqT67aq6azuH+OHw\nmo7vVlVtte5jwJOS/NM2tlFV5wPnAyxZvORHtkuSpmdHd3LLgX+oqiOqamFVHQ58gXsf5a0YPus6\nlNGjupm6i9Gju7nwOobHiYNLgedv+U3IJL+Q5KeHbdcCL2Z0TlcDLxu+Ttdq4JQk90syn9Fnl1dO\nMf5VwHeAv5rBMSRJM7SjyD2VH33U9s9j6z/L6BHfuxl9FjZT64HNSW7c+hdPdlZV/Ssw/qzvAkZz\nXTv8Qst53Hs3eDUwv6o+B6xldDc37chV1ZeB1wBXAOuAa6vqQzt42wuBA7b8kookadfLNp6WaQ+y\nZPGSWr36uklPQzNw6oKVk56CZmjV3SsnPQXNwLJlS1lzw5pMZ6z/44kkqS0jJ0lqy8hJktoycpKk\ntoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKk\ntoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKk\ntoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKk\ntoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqa35k56ApnbL2ts5dcHK\nSU9DM7Dq7pWTnoKkgXdykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaM\nnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaM\nnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaM\nnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaM\nnCSpLSMnSWrLyEmS2jJykqS2jJwkqa29LnJJNidZl+TGJGuTPGon9vWqseWFSTbsmllKkvYEe13k\ngO9W1aKqejjwSuANO7GvV+14iCRpb7U3Rm7czwDf2PJNkrOSXJ9kfZI/Hlt/RpLrhjvA85LMS/JG\nYN9h3XuGofOS/G2Sm5NclmTf4f2Lklw77PeSJD+b5MFJrhs7xsIkNw3Li5NcmeSGJJcmOXRYf0WS\nc4a5fCbJSbvhZyRJP7b2xshtCdOngQuAPwVI8gTgQcBSYBGwOMnJSY4BVgCPrqpFwGbg9Kp6Bffe\nFZ4+7PtBwNuq6qHAN4GnDOvfDfxhVR0H3AS8tqo+DfxEkiOHMSuAi5IsAN4CLK+qxcA7gNePzX9+\nVS0FXgy8dhf/bCRJY+ZPegKz8N0hViQ5EXh3kmOBJwyvTwzj9mMUreOAxcD1SQD2Bb62nX1/oarW\nDcs3AAuT3Bc4oKquHNa/C3j/sPxPjOL2xuHrCuBo4Fjgw8Px5gH/PXaMi8f3v61JJDkTOBPgJ7nv\n9n8SkqQp7Y2Ru0dVXZPkIOBgIMAbquq88TFJXgS8q6peOY1dbhpb3swoiFO5CHh/kotH06nPJnkY\ncHNVnbiDY2xmOz//qjofOB9g/xxW05i3JGkb9sbHlfdI8mBGd0p3ApcCz0qy37DtAUkOAVYBy4dl\nkhyY5IhhF3cPjxe3q6q+BXxj7POz3wGuHLZ9nlGsXsMoeAC3AAcPd5kkWZDkobvkhCVJM7I33snt\nm2TLI8UAT6+qzcBlw+dv1wyPCb8NnFFVn0xy9rB9H+Bu4IXAFxndLa1PshZ49RTHfDrw9iT3AW4F\nnjm27SLgXOBIgKr6fpLlwF8NjzrnA38J3LwLzl2SNAOp8mnYnmz/HFaLec6kp6EZWHX3yklPQWpt\n2bKlrLlhTaYzdq9+XClJ0lSMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoyc\nJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoyc\nJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoyc\nJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoyc\nJKktIydJasvISZLaMnKSpLbmT3oCmtrRxx/GqtUrJz0NqbVTF6yc9BQ0A7dw+7THeicnSWrLyEmS\n2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS\n2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS\n2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWrLyEmS\n2jJykqS2jJwkqS0jJ0lqy8hJktoycpKktoycJKktIydJasvISZLaMnKSpLaMnCSpLSMnSWprj45c\nklcnuTnJ+iTrkizbBfu8IsmSaY69IMlDpti+MsnLdnZOkqS5MX/SE9ieJCcCTwaOr6pNSQ4CfmJ3\nzqGqnrM7jydJ2rX25Du5Q4GNVbUJoKo2VtXtSf4oyfVJNiQ5P0ngnju0c5Jcl+QzSU4a1u+b5H1J\nPpXkEmDfYf1pSd48LP9+kluH5aOSfGxsn0uG5ScmWZvkxiSrxub5kGHcrUl+b8vKJGcMc1mX5Lwk\n84bXO4e535TkJXP+U5SkH2N7cuQuAw4fgvXXSX5xWP/Wqjqhqo5lFKwnj71nflUtBV4MvHZY93zg\nO1V1zLBu8bD+auCkYfkk4M4kDxiWrxqfSJKDgb8FnlJVDwdOG9v8YOCXgaXAa5MsSHIMsAJ4dFUt\nAjYDpwOLgAdU1bFV9TDg72f905Ek7dAeG7mq+jajIJ0J3AFclOQZwClJVie5CXgc8NCxt108fL0B\nWDgsnwz847DP9cD6YfkrwH5J9gcOBy4cxp7EKIDjHglcVVVfGN779bFtH6qqTVW1EfgacH/g1GHu\n1ydZN3x/FHArcFSStyR5IvA/2zr3JGcmWZNkzR0b75jOj0uStA177GdyAFW1GbgCuGKI2nOB44Al\nVfWlJCuBnxp7y6bh62amd24fB54J3MIobM8CTgT+YAbT3DS2vOW4Ad5VVa/cenCShzO683se8JvD\nMf+fqjofOB9gyeIlNYO5SJLG7LF3ckmOTvKgsVWLGMUIYGOS/YDl09jVVcBvD/s8llEkt7gaeNkw\n5hPAKcCmqvrWVvu4Fjg5yZHDfg7cwTFXAcuTHLJlfJIjhl+e2aeq/hk4Gzh+GvOXJM3Snnwntx/w\nliQHAD8APsfo0eU3gQ3AV4Drp7GfvwH+PsmngE8xepS5xdWMHlVeVVWbk3wJ+PTWO6iqO5KcCVyc\nZB9GjyV/aXsHrKpPJjkbuGwYfzfwQuC7w1y2/OXiR+70JEm7Tqp8GrYnW7J4Sa1efd2kpyG1duqC\nlZOegmbgBi7grro90xm7xz6ulCRpZxk5SVJbRk6S1JaRkyS1ZeQkSW0ZOUlSW0ZOktSWkZMktWXk\nJEltGTlJUltGTpLUlpGTJLVl5CRJbRk5SVJbRk6S1JaRkyS1ZeQkSW0ZOUlSW0ZOktSWkZMktWXk\nJEltGTlJUltGTpLUlpGTJLVl5CRJbRk5SVJbRk6S1JaRkyS1ZeQkSW0ZOUlSW0ZOktSWkZMktWXk\nJEltGTlJUltGTpLUlpGTJLVl5CRJbRk5SVJbRk6S1JaRkyS1ZeQkSW0ZOUlSW0ZOktSWkZMktWXk\nJEltGTlJUltGTpLUlpGTJLVl5CRJbRk5SVJbqapJz0FTSHIH8MVJz2OOHARsnPQkNG1er71P12t2\nRFUdPJ2BRk4Tk2RNVS2Z9Dw0PV6vvY/XzMeVkqTGjJwkqS0jp0k6f9IT0Ix4vfY+P/bXzM/kJElt\neScnSWrLyGnWkhyQ5AU78f5/TXLAFNt/MslFST6XZHWShbM9lkZ2wzU7OcnaJD9Isny2x9G9dsM1\ne2mSTyZZn2RVkiNme6w9kZHTzjgAmPUfvqr6lar65hRDng18o6p+HvgL4JzZHkv3mOtr9p/AM4AL\nZ3sM/Yi5vmafAJZU1XHAB4A3zfZYeyIjp53xRuDnkqxLcu7w2pDkpiQrAJI8NslVST6U5JYkb0+y\nz7DttiQHDctPG/4meWOSfxj2/2vAu4blDwCnJsluPsdu5vSaVdVtVbUe+OGkTrChub5m/1FV3xmO\ndS3wwAmc49ypKl++ZvUCFgIbhuWnAB8G5gH3Z/Q3+kOBxwLfA44atn0YWD685zZG/yPDQ4HPAAcN\n6w8cvm4AHjh2vM9vGeNrz7xmY8d555b3+No7rtmw7q3A2ZM+51358k5Ou8pjgPdW1eaq+ipwJXDC\nsO26qrq1qjYD7x3Gjnsc8P6q2ghQVV/fXZP+Mec12/vM2TVLcgawBDh3Lk9gdzNy2h22/ncq0/13\nK/8FHA6QZD5wX+DOXTgvbd9sr5kmZ9bXLMnjgVcDv1pVm3bprCbMyGln3AXsPyxfDaxIMi/JwcDJ\nwHXDtqVJjhw+I1gBfHSr/VwOnJbkfgBJDhzWfxB4+rC8HLi8hmcqmrW5vmba9eb0miV5BHAeo8B9\nbW5PZfczcpq1qroT+FiSDcCJwHrgRkZ/mF5eVV8Zhl7P6Fn/p4AvAJdstZ+bgdcDVya5EXjzsOnv\ngPsl+RzwUuAVc3tG/c31NUtyQpIvA6cB5yW5ee7Pqrfd8OfsXGA/4P3DL7d8cI5PabfyfzzRnEry\nWOBlVfXkSc9F0+M12/t4zbbPOzlJUlveyUmS2vJOTpLUlpGTJLVl5CRJbRk5SVJbRk6S1JaRkyS1\n9X/WAtmQuL86sgAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.ticker as ticker\n", "import matplotlib.cm as cm\n", "import matplotlib as mpl\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "flight_matrix = topic_distribution \n", "\n", "yLabel = ['Adrift in New York','Beethoven','Sandwiches']\n", "xLabel = ['topic0','topic1','topic2']\n", "\n", "fig = plt.figure()\n", "fig, ax = plt.subplots(1,1, figsize=(6,6))\n", "\n", "\n", "ax.set_xticks(np.arange(len(xLabel)))\n", "ax.set_yticks(np.arange(len(yLabel)))\n", "\n", "ax.set_xticklabels(xLabel)\n", "ax.set_yticklabels(yLabel)\n", "\n", "heatplot = ax.imshow(flight_matrix, cmap='Purples')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 2 }