{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# FILE READER" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following code will read several books in .pdf format to later compute the frequency of the words by language." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "de10e45fb50044019eab08c096b3e7cb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=4), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "-----------------------------\n", " lang author book_title \\\n", "0 english shakespeare A Midsummer Night's Dream \n", "1 english shakespeare Hamlet \n", "2 english shakespeare Macbeth \n", "3 english shakespeare Othello \n", "4 english shakespeare Richard III \n", "5 english shakespeare Romeo and Juliet \n", "6 english shakespeare The Merchant of Venice \n", "7 french chevalier L'ale de sable \n", "8 french chevalier L'enfer et le paradis de l'autre monde \n", "9 french chevalier La capitaine \n", "10 french chevalier La fille des indiens rouges \n", "11 french chevalier La fille du pirate \n", "12 french chevalier Le chasseur noir \n", "13 french chevalier Les derniers Iroquois \n", "14 french de Maupassant Boule de Suif \n", "15 french de Maupassant Claire de Lune \n", "16 french de Maupassant Contes de la Becasse \n", "17 french de Maupassant Euvres completes de Guy de Maupassant \n", "18 french de Maupassant L'inutile beaut \n", "19 french de Maupassant La Main Gauche \n", "20 french de Maupassant La Maison Tellier \n", "21 french de Maupassant La petite roque \n", "22 french de Maupassant Le Horla \n", "23 french diderot Ceci n'est pas un conte \n", "24 french diderot Entretien d'un pare avec ses enfants \n", "25 french diderot L'oiseau blanc \n", "26 french diderot Les deux amis de Bourbonne \n", "27 french diderot Regrets sur ma vieille robe de chambre \n", "28 french sand cora \n", "29 french sand Jacques le fataliste et son maatre \n", ".. ... ... ... \n", "72 german shakespeare Romeo und Julia \n", "73 portuguese branco A Filha do Arcediago \n", "74 portuguese branco A Neta do Arcediago \n", "75 portuguese branco A Queda d'um Anjo \n", "76 portuguese branco Agulha em Palheiro \n", "77 portuguese branco Amor de Perdicao \n", "78 portuguese branco Amor de Salvacao \n", "79 portuguese branco Annos de Prosa \n", "80 portuguese branco Carlota Angela \n", "81 portuguese branco Estrellas Funestas \n", "82 portuguese branco Estrellas Propicias \n", "83 portuguese branco Lagrimas Abenaoadas \n", "84 portuguese branco Livro de Consolacao \n", "85 portuguese branco O Olho de Vidro \n", "86 portuguese branco O que fazem mulheres \n", "87 portuguese branco O Regicida \n", "88 portuguese branco Scenas Contemporaneas \n", "89 portuguese dinis A Morgadinha dos Cannaviaes \n", "90 portuguese dinis Os fidalgos da Casa Mourisca \n", "91 portuguese dinis Uma familia ingleza \n", "92 portuguese Queiros A Cidade e as Serras \n", "93 portuguese Queiros A correspondancia de Fradique Mendes \n", "94 portuguese Queiros A Illustre Casa de Ramires \n", "95 portuguese Queiros A Reliquia \n", "96 portuguese Queiros Cartas de Inglaterra \n", "97 portuguese Queiros O crime do padre Amaro \n", "98 portuguese Queiros O Mandarim \n", "99 portuguese Queiros O Primo Bazilio \n", "100 portuguese Queiros Os Maias \n", "101 portuguese shakespeare Hamlet \n", "\n", " process_time uniq_words total_words \n", "0 81.30 3226 16972 \n", "1 145.04 4794 29575 \n", "2 86.92 3552 17646 \n", "3 131.33 4032 27379 \n", "4 164.85 4705 34665 \n", "5 136.36 4325 28920 \n", "6 99.99 3529 21951 \n", "7 498.27 13420 77237 \n", "8 238.54 7549 44085 \n", "9 281.60 9421 49153 \n", "10 468.99 12677 72098 \n", "11 325.74 11148 52969 \n", "12 317.44 8860 55197 \n", "13 368.96 11370 59296 \n", "14 216.56 8322 39201 \n", "15 142.55 6434 28054 \n", "16 200.75 7948 37309 \n", "17 353.40 11747 57768 \n", "18 222.65 7967 41145 \n", "19 197.98 7431 37089 \n", "20 274.03 9308 47994 \n", "21 241.70 8027 44309 \n", "22 244.25 8129 44560 \n", "23 50.01 3111 11379 \n", "24 51.34 3027 11679 \n", "25 111.02 4992 23468 \n", "26 37.41 2545 8804 \n", "27 21.43 1636 5264 \n", "28 59.10 3742 13224 \n", "29 587.25 12076 95090 \n", ".. ... ... ... \n", "72 111.66 5365 23293 \n", "73 488.96 12740 78149 \n", "74 342.99 10690 58000 \n", "75 333.35 12274 52895 \n", "76 275.29 10048 46938 \n", "77 296.79 9580 51352 \n", "78 322.33 11600 52100 \n", "79 436.01 12960 67263 \n", "80 299.18 10465 50429 \n", "81 305.61 10431 51228 \n", "82 271.82 9644 47073 \n", "83 284.45 9699 49025 \n", "84 369.52 12429 58501 \n", "85 283.07 10423 48083 \n", "86 254.62 9819 43882 \n", "87 335.32 11662 55239 \n", "88 320.58 11097 53202 \n", "89 1200.59 19715 148738 \n", "90 1126.08 17492 144454 \n", "91 969.08 17612 121923 \n", "92 508.40 14453 71227 \n", "93 388.24 13465 56881 \n", "94 834.74 17577 107379 \n", "95 616.67 15747 84947 \n", "96 300.57 11205 48481 \n", "97 1129.89 18832 141629 \n", "98 116.44 6831 22486 \n", "99 900.40 17980 118417 \n", "100 1989.46 24453 215271 \n", "101 183.22 7206 34327 \n", "\n", "[102 rows x 6 columns]\n", "\n", "-----------------------------\n", " lang author book_title words freq\n", "0 english shakespeare A Midsummer Night's Dream the 579\n", "1 english shakespeare A Midsummer Night's Dream and 562\n", "2 english shakespeare A Midsummer Night's Dream i 443\n", "3 english shakespeare A Midsummer Night's Dream to 337\n", "4 english shakespeare A Midsummer Night's Dream you 273\n", "5 english shakespeare A Midsummer Night's Dream of 269\n", "6 english shakespeare A Midsummer Night's Dream a 264\n", "7 english shakespeare A Midsummer Night's Dream in 239\n", "8 english shakespeare A Midsummer Night's Dream my 204\n", "9 english shakespeare A Midsummer Night's Dream is 190\n", "10 english shakespeare A Midsummer Night's Dream that 184\n", "11 english shakespeare A Midsummer Night's Dream with 175\n", "12 english shakespeare A Midsummer Night's Dream me 174\n", "13 english shakespeare A Midsummer Night's Dream not 171\n", "14 english shakespeare A Midsummer Night's Dream this 162\n", "15 english shakespeare A Midsummer Night's Dream her 148\n", "16 english shakespeare A Midsummer Night's Dream for 143\n", "17 english shakespeare A Midsummer Night's Dream it 132\n", "18 english shakespeare A Midsummer Night's Dream your 128\n", "19 english shakespeare A Midsummer Night's Dream but 121\n", "20 english shakespeare A Midsummer Night's Dream thou 118\n", "21 english shakespeare A Midsummer Night's Dream as 115\n", "22 english shakespeare A Midsummer Night's Dream so 113\n", "23 english shakespeare A Midsummer Night's Dream will 111\n", "24 english shakespeare A Midsummer Night's Dream loue 105\n", "25 english shakespeare A Midsummer Night's Dream be 104\n", "26 english shakespeare A Midsummer Night's Dream haue 95\n", "27 english shakespeare A Midsummer Night's Dream his 93\n", "28 english shakespeare A Midsummer Night's Dream all 91\n", "29 english shakespeare A Midsummer Night's Dream no 85\n", "... ... ... ... ... ...\n", "825219 portuguese shakespeare Hamlet lançarmas 1\n", "825220 portuguese shakespeare Hamlet arrancarme 1\n", "825221 portuguese shakespeare Hamlet representem 1\n", "825222 portuguese shakespeare Hamlet esbofetearme 1\n", "825223 portuguese shakespeare Hamlet attentado 1\n", "825224 portuguese shakespeare Hamlet inacção 1\n", "825225 portuguese shakespeare Hamlet fico 1\n", "825226 portuguese shakespeare Hamlet confusa 1\n", "825227 portuguese shakespeare Hamlet tibia 1\n", "825228 portuguese shakespeare Hamlet ficavam 1\n", "825229 portuguese shakespeare Hamlet vilipendios 1\n", "825230 portuguese shakespeare Hamlet possiveis 1\n", "825231 portuguese shakespeare Hamlet inoffensivo 1\n", "825232 portuguese shakespeare Hamlet fel 1\n", "825233 portuguese shakespeare Hamlet trahese 1\n", "825234 portuguese shakespeare Hamlet espontanea 1\n", "825235 portuguese shakespeare Hamlet perturbaram 1\n", "825236 portuguese shakespeare Hamlet dramaticas 1\n", "825237 portuguese shakespeare Hamlet assistindo 1\n", "825238 portuguese shakespeare Hamlet eila 1\n", "825239 portuguese shakespeare Hamlet pausa 1\n", "825240 portuguese shakespeare Hamlet procuremos 1\n", "825241 portuguese shakespeare Hamlet imprecações 1\n", "825242 portuguese shakespeare Hamlet vãs 1\n", "825243 portuguese shakespeare Hamlet gastar 1\n", "825244 portuguese shakespeare Hamlet instigam 1\n", "825245 portuguese shakespeare Hamlet adulterio 1\n", "825246 portuguese shakespeare Hamlet impudico 1\n", "825247 portuguese shakespeare Hamlet abutres 1\n", "825248 portuguese shakespeare Hamlet hear 1\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[825249 rows x 5 columns]\n" ] } ], "source": [ "import os\n", "import pandas as pd\n", "import time\n", "from tqdm import tqdm_notebook as tqdm\n", "\n", "# specify the folder's directory where the book files are located\n", "book_dir = './Books'\n", "\n", "# create two empty Dataframes to later store the info computed from every book file\n", "count_result = pd.DataFrame(columns=['lang', 'author', 'book_title', 'words', 'freq'])\n", "stat_result = pd.DataFrame(columns=['lang', 'author', 'book_title', 'process_time',\n", " 'uniq_words','total_words'])\n", "\n", "time_start = time.time()\n", "time_accum = 0\n", "\n", "# iterate and read every file by language, author, and title\n", "for language in tqdm(os.listdir(book_dir)):\n", " for author in os.listdir(book_dir + '/' + language):\n", " for title in os.listdir(book_dir + '/' + language + '/' + author):\n", " \n", " # this is the resulting path...\n", " title_path = book_dir + '/' + language + '/' + author + '/' + title\n", " \n", " # now it will read on every file\n", " with open(title_path, 'r', encoding='utf8') as current_file:\n", " text = current_file.read()\n", " \n", " # the following lines clean the book's content for the further analysis\n", " text = text.replace('\\n', ' ').replace('\\r', ' ') \n", " text = text.lower() # turn every letter into lower case\n", " \n", " # remove the most common symbols, marks, and numbers\n", " skip_list = [',', '.', ':', ';', '¿', '?', '¡', '!', '#' '\"', \"'\", '-', '(', ')', '{', '}',\n", " '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']\n", " \n", " for ch in skip_list:\n", " text = text.replace(ch, '')\n", " \n", " # create a temporary dataframe for every book title to store and isolate the stats collected\n", " temp_df = pd.DataFrame(columns=['lang', 'author', 'book_title', 'words', 'freq'])\n", " \n", " # this loop will count the frequency for every unique word\n", " for word in list(filter(None, text.split(' '))):\n", " if word in temp_df['words'].values:\n", " temp_df.loc[temp_df.words == word, 'freq'] += 1\n", " else:\n", " temp_df.loc[len(temp_df)] = language, author, title.replace('.txt', ''), word, 1\n", " \n", " temp_df = temp_df.sort_values('freq', ascending=False) # sort the dataframe in descending order\n", " \n", " # collect the data from the current file before moving to the next one\n", " process_time = round(time.time() - time_accum - time_start, 2)\n", " time_accum += process_time\n", " stat_result.loc[len(stat_result)] = language, author, title.replace('.txt', ''), process_time, len(temp_df), sum(temp_df['freq'].values)\n", " \n", " # this will save and accumulate the info collected from the temporary dataframe into a different table\n", " count_result = pd.concat([count_result, temp_df], axis=0, ignore_index=True)\n", "\n", "# output the results as a .csv file\n", "stat_result.to_csv('stat_summary.csv', sep=',')\n", "count_result.to_csv('word_frequencies.csv', sep=',')\n", "\n", "print('\\n-----------------------------')\n", "print(stat_result)\n", "print('\\n-----------------------------')\n", "print(count_result)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### VISUALIZE THE RESULTS" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize = (12,12))\n", "\n", "# plot the stat from every book by language\n", "for language in os.listdir(book_dir):\n", " subset = stat_result[stat_result.lang == language] # filter the stat data by language\n", " plt.loglog(subset.total_words, subset.uniq_words, \"o\", label = language)\n", "\n", "plt.legend()\n", "plt.xlabel(\"Total Number of Words\")\n", "plt.ylabel(\"Number of unique words\")\n", "plt.savefig(\"total_vs_unique_words.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }