{
 "metadata": {
  "name": "",
  "signature": "sha256:7f04e8becbaa17ada9a62765fc89afb2f6b64b69b42337ce1c8a53e4f86939ff"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Top words by frequency per decade in which they appeared"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import time\n",
      "from math import ceil\n",
      "import pickle"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load the pickle of all word/decade records, then drop words that appear in\n",
      "# more than MAX_DECADES decades.\n",
      "# NOTE(review): read_pickle can execute arbitrary code while unpickling; only\n",
      "# load files from a trusted source.\n",
      "\n",
      "MAX_DECADES = 15  # maximum number of decade rows a word may have and still be kept\n",
      "\n",
      "df = pd.read_pickle(\"../data_user_pickle_csv/coha_1.pickle\")\n",
      "# sizes before filtering, used for the reduction report below\n",
      "origlen = len(df)\n",
      "origwds = len(df.word.unique())\n",
      "df = df[df.nonalpha == False] # remove words with nonalphanumeric characters\n",
      "# rows per word (count of non-null decade values); keep words at or under the cap\n",
      "wordcount = pd.DataFrame(df.groupby('word').decade.count())\n",
      "wordcount = wordcount[wordcount.decade <= MAX_DECADES]\n",
      "df = df[df.word.isin(wordcount.index)]\n",
      "df = df[['word', 'decade', 'pct']]\n",
      "print \"{0} records reduced to {1} ({2:0.1f} %)\".format(origlen, len(df), len(df)*100.0/origlen)\n",
      "print \"{0} words reduced to {1} ({2:0.1f} %)\".format(origwds, len(df.word.unique()), \n",
      "                                                          len(df.word.unique())*100.0/origwds)\n",
      "print df.head(10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2539728 records reduced to 1214911 (47.8 %)\n",
        "436103 words reduced to 289826 (66.5 %)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "    word  decade       pct\n",
        "102  aaa    1850  0.000006\n",
        "103  aaa    1910  0.000009\n",
        "104  aaa    1920  0.000008\n",
        "105  aaa    1930  0.001382\n",
        "106  aaa    1940  0.000170\n",
        "107  aaa    1950  0.000110\n",
        "108  aaa    1960  0.000035\n",
        "109  aaa    1970  0.000052\n",
        "110  aaa    1980  0.000070\n",
        "111  aaa    1990  0.000319\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# keep words in crossword dictionary, e.g. not proper nouns\n",
      "\n",
      "# sizes before filtering, used for the reduction report below\n",
      "origlen = len(df)\n",
      "origwds = len(df.word.unique())\n",
      "\n",
      "import json\n",
      "# context manager closes the file handle as soon as the JSON has been parsed\n",
      "with open('../data_user_pickle_csv/coha_and_xword.json', 'r') as xword_file:\n",
      "    xwords = json.load(xword_file)\n",
      "df = df[df.word.isin(xwords)]\n",
      "\n",
      "print \"{0} records reduced to {1} ({2:0.1f} %)\".format(origlen, len(df), len(df)*100.0/origlen)\n",
      "print \"{0} words reduced to {1} ({2:0.1f} %)\".format(origwds, len(df.word.unique()), \n",
      "                                                          len(df.word.unique())*100.0/origwds)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1214911 records reduced to 313863 (25.8 %)\n",
        "289826 words reduced to 41068 (14.2 %)\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Restrict df to words that rank in the top 10000 by summed pct or by peak pct.\n",
      "\n",
      "origlen = len(df)\n",
      "origwds = len(df.word.unique())\n",
      "\n",
      "# one groupby object feeds both rankings\n",
      "pct_by_word = df.groupby('word').pct\n",
      "\n",
      "dfsum = pd.DataFrame(pct_by_word.sum())\n",
      "dfsum = dfsum.sort('pct', ascending=False)[:10000]\n",
      "\n",
      "dfmax = pd.DataFrame(pct_by_word.max())\n",
      "dfmax = dfmax.sort('pct', ascending=False)[:10000]\n",
      "\n",
      "# a word survives if it is in either ranking\n",
      "in_top_sum = df.word.isin(dfsum.index)\n",
      "in_top_max = df.word.isin(dfmax.index)\n",
      "df = df[in_top_sum | in_top_max]\n",
      "\n",
      "kept_len = len(df)\n",
      "kept_wds = len(df.word.unique())\n",
      "print \"{0} records reduced to {1} ({2:0.1f} %)\".format(origlen, kept_len, kept_len*100.0/origlen)\n",
      "print \"{0} words reduced to {1} ({2:0.1f} %)\".format(origwds, kept_wds, kept_wds*100.0/origwds)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "313863 records reduced to 128688 (41.0 %)\n",
        "41068 words reduced to 11740 (28.6 %)\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# add sum per decade to dfsum\n",
      "#\n",
      "# Columns are computed for the whole frame at once and assigned by index\n",
      "# alignment. Row-by-row chained assignment (frame.col.iloc[i] = value) is not\n",
      "# guaranteed by pandas to write into the frame itself, so it is avoided here.\n",
      "\n",
      "# number of decade rows per word in df\n",
      "series_count = df.groupby('word').decade.count()\n",
      "\n",
      "# the same counts, ordered to match dfsum's word index\n",
      "decade_counts = series_count.reindex(dfsum.index)\n",
      "\n",
      "# dfsum was produced by slicing; copy so the new columns belong to this frame\n",
      "dfsum = dfsum.copy()\n",
      "dfsum['pct_per_decade'] = dfsum.pct / decade_counts\n",
      "dfsum['decades'] = decade_counts\n",
      "# NOTE(review): 20 presumably is the total number of decades covered\n",
      "# (range(1810, 2010, 10) has 20 entries) -- confirm\n",
      "dfsum['decade_specificity'] = (20 - decade_counts).astype(float)\n",
      "\n",
      "dfsum.sort('pct_per_decade', ascending=False, inplace=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# show the first 50 rows of dfsum\n",
      "print dfsum[:50]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "                    pct  pct_per_decade  decades  decade_specificity\n",
        "word                                                                \n",
        "soviet         0.133485        0.011124       12                   8\n",
        "radio          0.102161        0.007859       13                   7\n",
        "phone          0.106324        0.007595       14                   6\n",
        "television     0.071525        0.006502       11                   9\n",
        "okay           0.063832        0.006383       10                  10\n",
        "telephone      0.092873        0.006192       15                   5\n",
        "movie          0.056559        0.005142       11                   9\n",
        "programs       0.062321        0.004155       15                   5\n",
        "nuclear        0.047605        0.003967       12                   8\n",
        "computer       0.045346        0.003779       12                   8\n",
        "cigarette      0.053111        0.003541       15                   5\n",
        "airport        0.031493        0.003499        9                  11\n",
        "automobile     0.038150        0.002935       13                   7\n",
        "photo          0.043835        0.002922       15                   5\n",
        "sutta          0.008483        0.002828        3                  17\n",
        "movies         0.030464        0.002769       11                   9\n",
        "baseball       0.037568        0.002505       15                   5\n",
        "shit           0.027538        0.002295       12                   8\n",
        "weekend        0.024362        0.002215       11                   9\n",
        "unemployment   0.026051        0.002171       12                   8\n",
        "concept        0.032165        0.002144       15                   5\n",
        "aircraft       0.027237        0.002095       13                   7\n",
        "scheduled      0.027750        0.001982       14                   6\n",
        "fucking        0.013840        0.001977        7                  13\n",
        "parking        0.023615        0.001968       12                   8\n",
        "golf           0.029198        0.001947       15                   5\n",
        "global         0.019391        0.001939       10                  10\n",
        "environmental  0.023204        0.001934       12                   8\n",
        "garage         0.022876        0.001906       12                   8\n",
        "brittles       0.001743        0.001743        1                  19\n",
        "soviets        0.016589        0.001659       10                  10\n",
        "computers      0.014902        0.001656        9                  11\n",
        "fizgig         0.001648        0.001648        1                  19\n",
        "almah          0.001642        0.001642        1                  19\n",
        "buddy          0.019513        0.001626       12                   8\n",
        "fuck           0.014576        0.001620        9                  11\n",
        "cloddy         0.009557        0.001593        6                  14\n",
        "gasoline       0.022212        0.001587       14                   6\n",
        "output         0.023681        0.001579       15                   5\n",
        "electronic     0.015746        0.001575       10                  10\n",
        "racial         0.023483        0.001566       15                   5\n",
        "airplane       0.017098        0.001554       11                   9\n",
        "nazi           0.018378        0.001532       12                   8\n",
        "regional       0.021399        0.001528       14                   6\n",
        "airlines       0.013531        0.001503        9                  11\n",
        "skills         0.022295        0.001486       15                   5\n",
        "basketball     0.017710        0.001476       12                   8\n",
        "techniques     0.018977        0.001460       13                   7\n",
        "taxi           0.018859        0.001451       13                   7\n",
        "video          0.017279        0.001440       12                   8\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# for contrast, let's see proper nouns\n",
      "#\n",
      "# Re-runs the load / filter / top-10000 steps without the crossword-dictionary\n",
      "# filter, then drops every word that is already in dfsum, so what is printed\n",
      "# and saved are the top words that dfsum does not contain.\n",
      "# NOTE(review): read_pickle can execute arbitrary code while unpickling; only\n",
      "# load files from a trusted source.\n",
      "\n",
      "df_proper = pd.read_pickle(\"../data_user_pickle_csv/coha_1.pickle\")\n",
      "df_proper = df_proper[df_proper.nonalpha == False] # remove words with nonalphanumeric characters\n",
      "wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())\n",
      "wordcount = wordcount[wordcount.decade <= 15]\n",
      "df_proper = df_proper[df_proper.word.isin(wordcount.index)]\n",
      "df_proper = df_proper[['word', 'decade', 'pct']]\n",
      "df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())\n",
      "df_propersum.sort('pct', ascending=False, inplace=True)\n",
      "# copy the slice so the column added below is written to this frame\n",
      "df_propersum = df_propersum[:10000].copy()\n",
      "df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())\n",
      "df_propermax.sort('pct', ascending=False, inplace=True)\n",
      "df_propermax = df_propermax[:10000]\n",
      "df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]\n",
      "proper_series_count = df_proper.groupby('word').decade.count()\n",
      "# whole-column division aligned on the word index; avoids per-row chained\n",
      "# assignment, which pandas does not guarantee to write into the frame\n",
      "df_propersum['pct_per_decade'] = (df_propersum.pct /\n",
      "                                  proper_series_count.reindex(df_propersum.index))\n",
      "df_propersum.sort('pct_per_decade', ascending=False, inplace=True)\n",
      "df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]\n",
      "print df_propersum.head(50)\n",
      "df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "                 pct  pct_per_decade\n",
        "word                                \n",
        "dorriville  0.033207        0.033207\n",
        "altorf      0.042033        0.021016\n",
        "madiboo     0.018765        0.018765\n",
        "selico      0.018074        0.018074\n",
        "pacomo      0.016863        0.016863\n",
        "pufpace     0.016171        0.016171\n",
        "brazzo      0.015393        0.015393\n",
        "lescourt    0.013923        0.013923\n",
        "rossberg    0.027246        0.013623\n",
        "rheinthal   0.011415        0.011415\n",
        "plotwell    0.011242        0.011242\n",
        "fourbin     0.010983        0.010983\n",
        "immorina    0.010983        0.010983\n",
        "ridolpho    0.010810        0.010810\n",
        "bertocci    0.010118        0.010118\n",
        "demba       0.010118        0.010118\n",
        "torribal    0.009858        0.009858\n",
        "devalmore   0.009512        0.009512\n",
        "erlach      0.037984        0.009496\n",
        "lesc        0.009426        0.009426\n",
        "ploughby    0.009253        0.009253\n",
        "eberard     0.018079        0.009040\n",
        "makesafe    0.008994        0.008994\n",
        "ksenia      0.008994        0.008994\n",
        "joblin      0.017905        0.008952\n",
        "mentzikoff  0.008648        0.008648\n",
        "usaldo      0.008561        0.008561\n",
        "ubal        0.008475        0.008475\n",
        "almeyda     0.016175        0.008088\n",
        "hippolito   0.015397        0.007699\n",
        "barogo      0.007610        0.007610\n",
        "beraldo     0.015061        0.007531\n",
        "hardrun     0.007523        0.007523\n",
        "arandez     0.007351        0.007351\n",
        "maillac     0.007091        0.007091\n",
        "mahadi      0.007091        0.007091\n",
        "bloomville  0.028237        0.007059\n",
        "spendall    0.007005        0.007005\n",
        "lanissa     0.006659        0.006659\n",
        "spicket     0.006287        0.006287\n",
        "ridol       0.006226        0.006226\n",
        "shenac      0.006137        0.006137\n",
        "rainouard   0.006053        0.006053\n",
        "flaurence   0.005967        0.005967\n",
        "wildenhain  0.017735        0.005912\n",
        "cerval      0.011765        0.005883\n",
        "oresca      0.005794        0.005794\n",
        "quicksite   0.005448        0.005448\n",
        "darina      0.005362        0.005362\n",
        "chetwynde   0.005292        0.005292\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# make pivot table showing in which decades words occurred\n",
      "#\n",
      "# Adds one column per decade to the first 50 rows of dfsum, holding the word's\n",
      "# pct in that decade, or 0.0 when df has no row for that word and decade.\n",
      "\n",
      "decades = range(1810, 2010, 10)\n",
      "# copy the slice so the decade columns are written to dftop itself\n",
      "dftop = dfsum[:50].copy()\n",
      "dftoplookup = df.copy()\n",
      "\n",
      "# Reshape once instead of boolean-filtering the whole frame for every\n",
      "# (word, decade) pair: first pct per pair, with decades spread into columns.\n",
      "top_rows = dftoplookup[dftoplookup.word.isin(dftop.index)]\n",
      "pct_by_decade = top_rows.groupby(['word', 'decade']).pct.first().unstack('decade')\n",
      "# match dftop's row order and the full decade list; missing cells become 0.0\n",
      "pct_by_decade = pct_by_decade.reindex(index=dftop.index, columns=decades).fillna(0.0)\n",
      "\n",
      "for decade in decades:\n",
      "    dftop[decade] = pct_by_decade[decade]\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# show the first five rows of dftop\n",
      "print dftop[:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "                 pct  pct_per_decade  decades  decade_specificity  1810  1820  \\\n",
        "word                                                                            \n",
        "soviet      0.133485        0.011124       12                   8     0     0   \n",
        "radio       0.102161        0.007859       13                   7     0     0   \n",
        "phone       0.106324        0.007595       14                   6     0     0   \n",
        "television  0.071525        0.006502       11                   9     0     0   \n",
        "okay        0.063832        0.006383       10                  10     0     0   \n",
        "\n",
        "            1830  1840  1850  1860  ...       1910      1920      1930  \\\n",
        "word                                ...                                  \n",
        "soviet         0     0     0     0  ...   0.000208  0.004427  0.006941   \n",
        "radio          0     0     0     0  ...   0.000253  0.005014  0.012146   \n",
        "phone          0     0     0     0  ...   0.001941  0.001890  0.005214   \n",
        "television     0     0     0     0  ...   0.000000  0.000499  0.000588   \n",
        "okay           0     0     0     0  ...   0.000000  0.000008  0.001353   \n",
        "\n",
        "                1940      1950      1960      1970      1980      1990  \\\n",
        "word                                                                     \n",
        "soviet      0.011937  0.030687  0.022103  0.019477  0.027580  0.007039   \n",
        "radio       0.017642  0.014509  0.012027  0.009458  0.010498  0.010467   \n",
        "phone       0.006885  0.008053  0.012009  0.013640  0.014318  0.018203   \n",
        "television  0.001981  0.008791  0.011224  0.014161  0.014006  0.010957   \n",
        "okay        0.003058  0.004406  0.006313  0.008459  0.009622  0.014044   \n",
        "\n",
        "                2000  \n",
        "word                  \n",
        "soviet      0.003077  \n",
        "radio       0.010118  \n",
        "phone       0.023380  \n",
        "television  0.009295  \n",
        "okay        0.016564  \n",
        "\n",
        "[5 rows x 24 columns]\n"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# write dftop to a CSV file in the working directory\n",
      "dftop.to_csv(path_or_buf='coha_top_decades.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}