{
 "metadata": {
  "name": "",
  "signature": "sha256:590abb69089ae760b48b14eb4e5e171be027c87e3fd271a49cfd5585b37f354b"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# word_list_tools repo\n",
      "\n",
      "by David Taylor, www.prooffreader.com, prooffreader@gmail.com\n",
      "\n",
      "a collection of tools to create and analyze lists of words using python with pandas and matplotlib"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## letter proximity ###\n",
      "\n",
      "determine frequency of bigrams and conditional letter frequency in a corpus before and after a given letter\n",
      "\n",
      "source word list is pandas dataframe with columns 'word' and 'freq'. Any other columns will be ignored.\n",
      "\n",
      "** initial_data_munge must be run first to create pickled dataframes of word lists **"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "dataframe_base = 'coha_words' # change as needed to point to pickle\n",
      "\n",
      "data_path = 'data'\n",
      "nb_path = 'letter_proximity'\n",
      "\n",
      "save_filename = '' #used for .pickle and .png, leave as '' to use a default filename\n",
      "\n",
      "import pandas as pd\n",
      "import os\n",
      "import time\n",
      "\n",
      "plotly_apikey = open(\"../../plotly_yekipa.txt\", \"r\").read()\n",
      "plotly_username = open(\"../../plotly_emanresu.txt\", \"r\").read()\n",
      "\n",
      "words = pd.read_pickle(data_path + '/' + dataframe_base + '.pickle')\n",
      "\n",
      "alphabet = '_abcdefghijklmnopqrstuvwxyz'\n",
      "# note: underscore(_) is a null value, i.e. '_a' signifies a word that starts with the letter a\n",
      "\n",
      "def increment_dict(d, key, increment=1):\n",
      "    if key in d.keys():\n",
      "        d[key] += increment\n",
      "    else:\n",
      "        d[key] = increment\n",
      "    return d\n",
      "\n",
      "possible_bigrams = []\n",
      "for let1 in alphabet:\n",
      "    for let2 in alphabet:\n",
      "        if not (let1 == '_' and let2=='_'):\n",
      "            possible_bigrams.append(let1+let2)\n",
      "            \n",
      "possible_trigrams = [] # for later implementation\n",
      "for bigram in possible_bigrams:\n",
      "    for letter in alphabet:\n",
      "        if bigram[-1] != '_':\n",
      "            possible_trigrams.append(bigram+letter)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "class progress_bar: \n",
      "    def __init__(self, loop_length):\n",
      "        import time\n",
      "        self.start = time.time()\n",
      "        self.increment_size = 100.0/loop_length\n",
      "        self.curr_count = 0\n",
      "        self.curr_pct = 0\n",
      "        self.finished = False\n",
      "        self.overflow = False\n",
      "        print '% complete:',\n",
      "    \n",
      "    def increment(self):\n",
      "        self.curr_count += self.increment_size\n",
      "        if int(self.curr_count) > self.curr_pct:\n",
      "            self.curr_pct = int(self.curr_count)\n",
      "            if self.finished == False:\n",
      "                if self.curr_pct <= 99:\n",
      "                    print self.curr_pct,\n",
      "                elif self.curr_pct == 100:\n",
      "                    print \"100\"\n",
      "                    self.finished = True  \n",
      "                elif self.overflow == False:\n",
      "                    print \"***** Count has gone over 100%; likely either due to an error in the loop_length specified when \" + \\\n",
      "                          \"progress_bar was instantiated or in the placement of the increment function *****\"\n",
      "                    self.overflow = True\n",
      "                else:\n",
      "                    self.finished = True\n",
      "            if self.overflow == False and self.finished == True:\n",
      "                print 'Elapsed time: %0.1f seconds.' % (time.time() - self.start)\n",
      "                self.overflow = True\n",
      "                \n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes\n",
      "\n",
      "if not os.path.isdir(nb_path):    \n",
      "    os.mkdir(nb_path)\n",
      "    \n",
      "if not os.path.isfile(nb_path+'/'+dataframe_base+'_bigrams.pickle') or redo_pickle == True:    \n",
      "    \n",
      "    start = time.time()\n",
      "    \n",
      "    bigrams = {}\n",
      "    \n",
      "    progbar = progress_bar(len(words))\n",
      "    \n",
      "    for i in range(len(words)):\n",
      "        progbar.increment()\n",
      "        wd = '_' + words.word.iloc[i] + '_'\n",
      "        freq = words.freq.iloc[i]\n",
      "        stop_bigrams = len(wd) -2\n",
      "        for pos in range(len(wd)):\n",
      "            if pos <= stop_bigrams:\n",
      "                increment_dict(bigrams, wd[pos:pos+2], freq)            \n",
      "                    \n",
      "    df_bigrams = pd.DataFrame()\n",
      "    for key in bigrams.keys():\n",
      "        df_bigrams = df_bigrams.append(pd.DataFrame({'letter_1':[key[0]],\n",
      "            'letter_2':[key[1]], 'freq':[bigrams[key]]}, index=[key]))\n",
      "    df_trigrams = pd.DataFrame()\n",
      "    bigrams_total = df_bigrams.freq.sum()\n",
      "\n",
      "    def calc_pct_freq(df):\n",
      "        return pd.Series({'pct_freq': df.freq * 100.0 / \n",
      "                          bigrams_total})\n",
      "\n",
      "    df_bigrams = df_bigrams.merge(df_bigrams.apply(calc_pct_freq, axis=1), left_index=True, right_index=True)\n",
      " \n",
      "    df_bigrams.to_pickle(nb_path+'/'+dataframe_base+'_bigrams.pickle')\n",
      "    \n",
      "else:\n",
      "    \n",
      "    df_bigrams=pd.read_pickle(nb_path+'/'+dataframe_base+'_bigrams.pickle')\n",
      "    print \"Pickle loaded.\"\n",
      "    \n",
      "print df_bigrams.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "% complete: "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "3 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "4 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "5 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "6 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "7 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "8 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "9 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "10 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "11 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "12 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "13 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "14 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "15 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "16 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "17 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "18 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "19 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "20 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "21 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "22 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "23 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "24 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "25 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "26 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "27 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "28 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "29 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "30 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "31 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "32 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "33 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "34 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "35 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "36 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "37 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "38 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "39 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "40 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "41 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "42 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "43 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "44 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "45 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "46 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "47 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "48 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "49 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "50 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "51 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "52 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "53 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "54 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "55 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "56 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "57 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "58 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "59 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "60 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "61 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "62 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "63 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "64 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "65 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "66 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "67 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "68 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "69 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "70 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "71 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "72 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "73 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "74 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "75 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "76 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "77 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "78 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "79 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "80 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "81 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "82 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "83 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "84 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "85 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "86 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "87 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "88 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "89 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "90 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "91 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "92 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "93 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "94 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "95 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "96 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "97 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "98 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "99 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "        freq letter_1 letter_2  pct_freq\n",
        "gw     17983        g        w  0.000845\n",
        "t_  39024960        t        _  1.833252\n",
        "gu   1028478        g        u  0.048314\n",
        "gt    213703        g        t  0.010039\n",
        "gs    791136        g        s  0.037165\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# calculate conditional probabilities\n",
      "\n",
      "df_bi_cond = pd.DataFrame()\n",
      "for bigram in possible_bigrams:\n",
      "    let1, let2 = (bigram[0], bigram[1])\n",
      "    if bigram in df_bigrams.index:\n",
      "        prob_1_given_2 = df_bigrams.ix[bigram].freq * 100.0 / df_bigrams[df_bigrams.letter_2 == let2].freq.sum()\n",
      "        prob_2_given_1 = df_bigrams.ix[bigram].freq * 100.0 / df_bigrams[df_bigrams.letter_1 == let1].freq.sum()\n",
      "        df_bi_cond = df_bi_cond.append(pd.DataFrame({'let1':[let1], 'let2':[let2], 'pct_1_given_2':[prob_1_given_2], \n",
      "                                                     'pct_2_given_1':[prob_2_given_1]}, index=[bigram]))\n",
      "    else:\n",
      "        df_bi_cond = df_bi_cond.append(pd.DataFrame({'let1':[let1], 'let2':[let2], 'pct_1_given_2':[0.0], \n",
      "                                                     'pct_2_given_1':[0.0]}, index=[bigram]))\n",
      "        \n",
      "print df_bi_cond.head()\n",
      "\n",
      "df_bi_cond.to_csv(nb_path + '/bi_cond' + dataframe_base + '.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "   let1 let2  pct_1_given_2  pct_2_given_1\n",
        "_a    _    a      31.379124      11.291626\n",
        "_b    _    b      68.037261       4.682944\n",
        "_c    _    c      35.925454       4.386648\n",
        "_d    _    d      17.032289       3.163017\n",
        "_e    _    e       3.946567       2.200153\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# create pivot table - no longer necessary\n",
      "\n",
      "df_pivot_2g1 = pd.pivot_table(data=df_bi_cond, values='pct_2_given_1', index='let1', columns='let2').fillna(0)\n",
      "print df_pivot_2g1.head()\n",
      "df_pivot_1g2 = pd.pivot_table(data=df_bi_cond, values='pct_1_given_2', index='let1', columns='let2').fillna(0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "let2          _          a         b         c         d          e         f  \\\n",
        "let1                                                                            \n",
        "_      0.000000  11.819304  4.651243  4.805972  2.928036   2.448977  4.103325   \n",
        "a      7.624876   0.013944  2.131876  4.320334  4.275015   0.086080  0.960819   \n",
        "b      1.396656   8.525391  0.775292  0.038129  0.056488  31.217873  0.001412   \n",
        "c      3.487082  12.994243  0.016849  1.689834  0.025274  15.614996  0.010531   \n",
        "d     57.425042   3.132064  0.056374  0.052505  0.961671  14.826872  0.038688   \n",
        "\n",
        "let2         g          h         i    ...            q          r         s  \\\n",
        "let1                                   ...                                     \n",
        "_     1.704057   5.447910  6.829877    ...     0.199549   2.624486  6.902913   \n",
        "a     1.986265   0.117454  3.466779    ...     0.015821  10.338392  9.232230   \n",
        "b     0.016946   0.012710  5.240637    ...     0.000000   5.726431  1.957295   \n",
        "c     0.004914  14.851867  6.103623    ...     0.101095   3.648554  0.496349   \n",
        "d     0.602979   0.083455  8.830243    ...     0.037030   1.872496  2.464421   \n",
        "\n",
        "let2          t          u         v         w         x         y         z  \n",
        "let1                                                                          \n",
        "_     16.208252   1.157056  0.653244  6.130388  0.003463  0.822234  0.022002  \n",
        "a     13.989397   1.089268  2.066981  0.666647  0.197098  2.526340  0.161433  \n",
        "b      0.704683  11.321527  0.302209  0.021183  0.000000  8.053720  0.000000  \n",
        "c      9.802724   3.235748  0.000702  0.002106  0.000000  0.815782  0.010531  \n",
        "d      0.040899   2.461105  0.335480  0.166358  0.000000  1.226407  0.001658  \n",
        "\n",
        "[5 rows x 27 columns]\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}