{
 "metadata": {
  "name": "",
  "signature": "sha256:7f04e8becbaa17ada9a62765fc89afb2f6b64b69b42337ce1c8a53e4f86939ff"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Top words by average frequency per decade in which they appeared"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "import pickle\n",
      "import time\n",
      "from math import ceil\n",
      "\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# load pickle of all words and decades and remove those that appear in more than 15 decades\n",
      "# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source\n",
      "\n",
      "df = pd.read_pickle(\"../data_user_pickle_csv/coha_1.pickle\")\n",
      "origlen = len(df)\n",
      "origwds = len(df.word.unique())\n",
      "df = df[df.nonalpha == False] # remove words with nonalphanumeric characters\n",
      "wordcount = pd.DataFrame(df.groupby('word').decade.count())\n",
      "wordcount = wordcount[wordcount.decade <= 15]\n",
      "df = df[df.word.isin(wordcount.index)]\n",
      "df = df[['word', 'decade', 'pct']]\n",
      "print \"{0} records reduced to {1} ({2:0.1f} %)\".format(origlen, len(df), len(df)*100.0/origlen)\n",
      "print \"{0} words reduced to {1} ({2:0.1f} %)\".format(origwds, len(df.word.unique()),\n",
      "                                                   len(df.word.unique())*100.0/origwds)\n",
      "print df.head(10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2539728 records reduced to 1214911 (47.8 %)\n",
        "436103 words reduced to 289826 (66.5 %)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        " word decade pct\n",
        "102 aaa 1850 0.000006\n",
        "103 aaa 1910 0.000009\n",
        "104 aaa 1920 0.000008\n",
        "105 aaa 1930 0.001382\n",
        "106 aaa 1940 0.000170\n",
        "107 aaa 1950 0.000110\n",
        "108 aaa 1960 0.000035\n",
        "109 aaa 1970 0.000052\n",
        "110 aaa 1980 0.000070\n",
        "111 aaa 1990 0.000319\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# keep words in crossword dictionary, e.g. not proper nouns\n",
      "\n",
      "origlen = len(df)\n",
      "origwds = len(df.word.unique())\n",
      "\n",
      "# the with-block closes the file handle once the word list has been parsed\n",
      "with open('../data_user_pickle_csv/coha_and_xword.json', 'r') as xword_file:\n",
      "    xwords = json.load(xword_file)\n",
      "df = df[df.word.isin(xwords)]\n",
      "\n",
      "print \"{0} records reduced to {1} ({2:0.1f} %)\".format(origlen, len(df), len(df)*100.0/origlen)\n",
      "print \"{0} words reduced to {1} ({2:0.1f} %)\".format(origwds, len(df.word.unique()),\n",
      "                                                   len(df.word.unique())*100.0/origwds)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1214911 records reduced to 313863 (25.8 %)\n",
        "289826 words reduced to 41068 (14.2 %)\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# keep top 10000 words in terms of max and sum\n",
      "\n",
      "origlen = len(df)\n",
      "origwds = len(df.word.unique())\n",
      "\n",
      "dfsum = pd.DataFrame(df.groupby('word').pct.sum())\n",
      "dfsum.sort('pct', ascending=False, inplace=True)\n",
      "dfsum = dfsum[:10000]\n",
      "dfmax = pd.DataFrame(df.groupby('word').pct.max())\n",
      "dfmax.sort('pct', ascending=False, inplace=True)\n",
      "dfmax = dfmax[:10000]\n",
      "\n",
      "# a word survives if it is in the top 10000 by either measure\n",
      "df = df[(df.word.isin(dfsum.index)) | (df.word.isin(dfmax.index))]\n",
      "\n",
      "print \"{0} records reduced to {1} ({2:0.1f} %)\".format(origlen, len(df), len(df)*100.0/origlen)\n",
      "print \"{0} words reduced to {1} ({2:0.1f} %)\".format(origwds, len(df.word.unique()),\n",
      "                                                   len(df.word.unique())*100.0/origwds)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "313863 records reduced to 128688 (41.0 %)\n",
        "41068 words reduced to 11740 (28.6 %)\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# add sum per decade to dfsum\n",
      "\n",
      "series_count = df.groupby('word').decade.count()\n",
      "\n",
      "# number of decade records per word, aligned to dfsum's index; whole-column\n",
      "# assignment replaces the row-by-row chained-indexing writes (dfsum.col[i] = ...)\n",
      "decades_per_word = series_count.reindex(dfsum.index)\n",
      "total_decades = 20 # len(range(1810, 2010, 10)), the decades used for the pivot below\n",
      "\n",
      "dfsum['pct_per_decade'] = dfsum.pct / decades_per_word\n",
      "dfsum['decades'] = decades_per_word\n",
      "# kept as float, matching the column's original 0.0 initialisation\n",
      "dfsum['decade_specificity'] = (total_decades - decades_per_word).astype(float)\n",
      "\n",
      "dfsum.sort('pct_per_decade', ascending=False, inplace=True)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print dfsum.head(50)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " pct pct_per_decade decades decade_specificity\n",
        "word \n",
        "soviet 0.133485 0.011124 12 8\n",
        "radio 0.102161 0.007859 13 7\n",
        "phone 0.106324 0.007595 14 6\n",
        "television 0.071525 0.006502 11 9\n",
        "okay 0.063832 0.006383 10 10\n",
        "telephone 0.092873 0.006192 15 5\n",
        "movie 0.056559 0.005142 11 9\n",
        "programs 0.062321 0.004155 15 5\n",
        "nuclear 0.047605 0.003967 12 8\n",
        "computer 0.045346 0.003779 12 8\n",
        "cigarette 0.053111 0.003541 15 5\n",
        "airport 0.031493 0.003499 9 11\n",
        "automobile 0.038150 0.002935 13 7\n",
        "photo 0.043835 0.002922 15 5\n",
        "sutta 0.008483 0.002828 3 17\n",
        "movies 0.030464 0.002769 11 9\n",
        "baseball 0.037568 0.002505 15 5\n",
        "shit 0.027538 0.002295 12 8\n",
        "weekend 0.024362 0.002215 11 9\n",
        "unemployment 0.026051 0.002171 12 8\n",
        "concept 0.032165 0.002144 15 5\n",
        "aircraft 0.027237 0.002095 13 7\n",
        "scheduled 0.027750 0.001982 14 6\n",
        "fucking 0.013840 0.001977 7 13\n",
        "parking 0.023615 0.001968 12 8\n",
        "golf 0.029198 0.001947 15 5\n",
        "global 0.019391 0.001939 10 10\n",
        "environmental 0.023204 0.001934 12 8\n",
        "garage 0.022876 0.001906 12 8\n",
        "brittles 0.001743 0.001743 1 19\n",
        "soviets 0.016589 0.001659 10 10\n",
        "computers 0.014902 0.001656 9 11\n",
        "fizgig 0.001648 0.001648 1 19\n",
        "almah 0.001642 0.001642 1 19\n",
        "buddy 0.019513 0.001626 12 8\n",
        "fuck 0.014576 0.001620 9 11\n",
        "cloddy 0.009557 0.001593 6 14\n",
        "gasoline 0.022212 0.001587 14 6\n",
        "output 0.023681 0.001579 15 5\n",
        "electronic 0.015746 0.001575 10 10\n",
        "racial 0.023483 0.001566 15 5\n",
        "airplane 0.017098 0.001554 11 9\n",
        "nazi 0.018378 0.001532 12 8\n",
        "regional 0.021399 0.001528 14 6\n",
        "airlines 0.013531 0.001503 9 11\n",
        "skills 0.022295 0.001486 15 5\n",
        "basketball 0.017710 0.001476 12 8\n",
        "techniques 0.018977 0.001460 13 7\n",
        "taxi 0.018859 0.001451 13 7\n",
        "video 0.017279 0.001440 12 8\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# for contrast, let's see proper nouns\n",
      "# (same pipeline as above but without the crossword-dictionary filter;\n",
      "#  words already present in dfsum are dropped at the end)\n",
      "\n",
      "df_proper = pd.read_pickle(\"../data_user_pickle_csv/coha_1.pickle\")\n",
      "df_proper = df_proper[df_proper.nonalpha == False] # remove words with nonalphanumeric characters\n",
      "wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())\n",
      "wordcount = wordcount[wordcount.decade <= 15]\n",
      "df_proper = df_proper[df_proper.word.isin(wordcount.index)]\n",
      "df_proper = df_proper[['word', 'decade', 'pct']]\n",
      "df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())\n",
      "df_propersum.sort('pct', ascending=False, inplace=True)\n",
      "df_propersum = df_propersum[:10000]\n",
      "df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())\n",
      "df_propermax.sort('pct', ascending=False, inplace=True)\n",
      "df_propermax = df_propermax[:10000]\n",
      "df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]\n",
      "proper_series_count = df_proper.groupby('word').decade.count()\n",
      "# whole-column division aligned on the word index (no chained-indexing writes)\n",
      "df_propersum['pct_per_decade'] = (df_propersum.pct /\n",
      "                                  proper_series_count.reindex(df_propersum.index))\n",
      "df_propersum.sort('pct_per_decade', ascending=False, inplace=True)\n",
      "df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]\n",
      "print df_propersum.head(50)\n",
      "df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " pct pct_per_decade\n",
        "word \n",
        "dorriville 0.033207 0.033207\n",
        "altorf 0.042033 0.021016\n",
        "madiboo 0.018765 0.018765\n",
        "selico 0.018074 0.018074\n",
        "pacomo 0.016863 0.016863\n",
        "pufpace 0.016171 0.016171\n",
        "brazzo 0.015393 0.015393\n",
        "lescourt 0.013923 0.013923\n",
        "rossberg 0.027246 0.013623\n",
        "rheinthal 0.011415 0.011415\n",
        "plotwell 0.011242 0.011242\n",
        "fourbin 0.010983 0.010983\n",
        "immorina 0.010983 0.010983\n",
        "ridolpho 0.010810 0.010810\n",
        "bertocci 0.010118 0.010118\n",
        "demba 0.010118 0.010118\n",
        "torribal 0.009858 0.009858\n",
        "devalmore 0.009512 0.009512\n",
        "erlach 0.037984 0.009496\n",
        "lesc 0.009426 0.009426\n",
        "ploughby 0.009253 0.009253\n",
        "eberard 0.018079 0.009040\n",
        "makesafe 0.008994 0.008994\n",
        "ksenia 0.008994 0.008994\n",
        "joblin 0.017905 0.008952\n",
        "mentzikoff 0.008648 0.008648\n",
        "usaldo 0.008561 0.008561\n",
        "ubal 0.008475 0.008475\n",
        "almeyda 0.016175 0.008088\n",
        "hippolito 0.015397 0.007699\n",
        "barogo 0.007610 0.007610\n",
        "beraldo 0.015061 0.007531\n",
        "hardrun 0.007523 0.007523\n",
        "arandez 0.007351 0.007351\n",
        "maillac 0.007091 0.007091\n",
        "mahadi 0.007091 0.007091\n",
        "bloomville 0.028237 0.007059\n",
        "spendall 0.007005 0.007005\n",
        "lanissa 0.006659 0.006659\n",
        "spicket 0.006287 0.006287\n",
        "ridol 0.006226 0.006226\n",
        "shenac 0.006137 0.006137\n",
        "rainouard 0.006053 0.006053\n",
        "flaurence 0.005967 0.005967\n",
        "wildenhain 0.017735 0.005912\n",
        "cerval 0.011765 0.005883\n",
        "oresca 0.005794 0.005794\n",
        "quicksite 0.005448 0.005448\n",
        "darina 0.005362 0.005362\n",
        "chetwynde 0.005292 0.005292\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# make pivot table showing in which decades words occurred\n",
      "\n",
      "decades = range(1810, 2010, 10)\n",
      "# .copy() so the decade columns are added to an independent frame, not to a slice of dfsum\n",
      "dftop = dfsum[:50].copy()\n",
      "\n",
      "# pct of every top word in every decade, built in one pass:\n",
      "#  - drop_duplicates keeps the first record if a word/decade pair repeats\n",
      "#  - reindex puts rows in dftop order and guarantees one column per decade\n",
      "#  - decades in which a word has no record are filled with 0.0\n",
      "pct_by_decade = (df[df.word.isin(dftop.index)]\n",
      "                 .drop_duplicates(['word', 'decade'])\n",
      "                 .pivot(index='word', columns='decade', values='pct')\n",
      "                 .reindex(index=dftop.index, columns=decades)\n",
      "                 .fillna(0.0))\n",
      "for decade in decades:\n",
      "    dftop[decade] = pct_by_decade[decade]\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print dftop.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " pct pct_per_decade decades decade_specificity 1810 1820 \\\n",
        "word \n",
        "soviet 0.133485 0.011124 12 8 0 0 \n",
        "radio 0.102161 0.007859 13 7 0 0 \n",
        "phone 0.106324 0.007595 14 6 0 0 \n",
        "television 0.071525 0.006502 11 9 0 0 \n",
        "okay 0.063832 0.006383 10 10 0 0 \n",
        "\n",
        " 1830 1840 1850 1860 ... 1910 1920 1930 \\\n",
        "word ... \n",
        "soviet 0 0 0 0 ... 0.000208 0.004427 0.006941 \n",
        "radio 0 0 0 0 ... 0.000253 0.005014 0.012146 \n",
        "phone 0 0 0 0 ... 0.001941 0.001890 0.005214 \n",
        "television 0 0 0 0 ... 0.000000 0.000499 0.000588 \n",
        "okay 0 0 0 0 ... 0.000000 0.000008 0.001353 \n",
        "\n",
        " 1940 1950 1960 1970 1980 1990 \\\n",
        "word \n",
        "soviet 0.011937 0.030687 0.022103 0.019477 0.027580 0.007039 \n",
        "radio 0.017642 0.014509 0.012027 0.009458 0.010498 0.010467 \n",
        "phone 0.006885 0.008053 0.012009 0.013640 0.014318 0.018203 \n",
        "television 0.001981 0.008791 0.011224 0.014161 0.014006 0.010957 \n",
        "okay 0.003058 0.004406 0.006313 0.008459 0.009622 0.014044 \n",
        "\n",
        " 2000 \n",
        "word \n",
        "soviet 0.003077 \n",
        "radio 0.010118 \n",
        "phone 0.023380 \n",
        "television 0.009295 \n",
        "okay 0.016564 \n",
        "\n",
        "[5 rows x 24 columns]\n"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "dftop.to_csv('coha_top_decades.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}