{ "metadata": { "name": "", "signature": "sha256:590abb69089ae760b48b14eb4e5e171be027c87e3fd271a49cfd5585b37f354b" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# word_list_tools repo\n", "\n", "by David Taylor, www.prooffreader.com, prooffreader@gmail.com\n", "\n", "A collection of tools to create and analyze lists of words using Python with pandas and matplotlib." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## letter proximity\n", "\n", "Determine the frequency of bigrams in a corpus, and the conditional frequency of the letters that come before and after a given letter.\n", "\n", "The source word list is a pandas DataFrame with columns 'word' and 'freq'. Any other columns will be ignored.\n", "\n", "**initial_data_munge must be run first to create pickled dataframes of word lists**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "dataframe_base = 'coha_words' # change as needed to point to pickle\n", "\n", "data_path = 'data'\n", "nb_path = 'letter_proximity'\n", "\n", "save_filename = '' #used for .pickle and .png, leave as '' to use a default filename\n", "\n", "import pandas as pd\n", "import os\n", "import time\n", "\n", "plotly_apikey = open(\"../../plotly_yekipa.txt\", \"r\").read()\n", "plotly_username = open(\"../../plotly_emanresu.txt\", \"r\").read()\n", "\n", "words = pd.read_pickle(data_path + '/' + dataframe_base + '.pickle')\n", "\n", "alphabet = '_abcdefghijklmnopqrstuvwxyz'\n", "# note: underscore(_) is a null value, i.e. 
'_a' signifies a word that starts with the letter a\n",
"\n",
"def increment_dict(d, key, increment=1):\n",
"    \"\"\"Add `increment` to d[key] in place (a missing key starts at 0) and return d.\"\"\"\n",
"    # dict.get is one hash lookup; `key in d.keys()` scans a list in Python 2\n",
"    d[key] = d.get(key, 0) + increment\n",
"    return d\n",
"\n",
"# every ordered pair of alphabet symbols except '__' (null followed by null)\n",
"possible_bigrams = [let1 + let2 for let1 in alphabet for let2 in alphabet\n",
"                    if not (let1 == '_' and let2 == '_')]\n",
"\n",
"# for later implementation: each bigram whose second symbol is a letter,\n",
"# extended by one more alphabet symbol\n",
"possible_trigrams = [bigram + letter for bigram in possible_bigrams\n",
"                     if bigram[-1] != '_' for letter in alphabet]"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [
"class progress_bar:\n",
"    \"\"\"Print whole-percent progress for a loop of known length.\n",
"\n",
"    Create it with the number of iterations, then call increment() once per\n",
"    iteration. Each newly reached whole percent is printed once; reaching 100%\n",
"    also prints the elapsed time, and going past 100% prints a single warning.\n",
"    Uses the `time` module imported in the first cell of the notebook.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, loop_length):\n",
"        \"\"\"loop_length: number of increment() calls expected (must be non-zero).\"\"\"\n",
"        self.start = time.time()\n",
"        self.loop_length = loop_length\n",
"        self.increment_size = 100.0/loop_length  # percent represented by one iteration\n",
"        self.iterations = 0  # exact number of increment() calls so far\n",
"        self.curr_count = 0  # percent complete, as a float\n",
"        self.curr_pct = 0  # last whole percent that was reported\n",
"        self.finished = False  # True once 100% has been reported\n",
"        self.overflow = False  # True once the over-100% warning has been printed\n",
"        print '% complete:',\n",
"\n",
"    def increment(self):\n",
"        \"\"\"Record one finished iteration and print any newly reached whole percent.\"\"\"\n",
"        self.iterations += 1\n",
"        # Derive the percentage from the exact iteration count. Adding\n",
"        # increment_size repeatedly accumulates rounding error and can end just\n",
"        # below 100, in which case \"100\" and the elapsed time are never printed.\n",
"        self.curr_count = self.iterations * 100.0 / self.loop_length\n",
"        new_pct = int(self.curr_count)\n",
"        if new_pct <= self.curr_pct:\n",
"            return\n",
"        self.curr_pct = new_pct\n",
"        if new_pct < 100:\n",
"            print new_pct,\n",
"            return\n",
"        if not self.finished:\n",
"            print \"100\"\n",
"            print 'Elapsed time: %0.1f seconds.' % (time.time() - self.start)\n",
"            self.finished = True\n",
"        if new_pct > 100 and not self.overflow:\n",
"            print (\"***** Count has gone over 100%; likely either due to an error in the loop_length specified when \"\n",
"                   \"progress_bar was instantiated or in the placement of the increment function *****\")\n",
"            self.overflow = True\n"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
"redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes\n",
"\n",
"bigrams_pickle = nb_path+'/'+dataframe_base+'_bigrams.pickle'\n",
"\n",
"if not os.path.isdir(nb_path):\n",
"    os.mkdir(nb_path)\n",
"\n",
"if redo_pickle or not os.path.isfile(bigrams_pickle):\n",
"\n",
"    # frequency-weighted count of every adjacent pair of symbols; each word is\n",
"    # padded with '_' so its first and last letters form bigrams too\n",
"    bigrams = {}\n",
"\n",
"    progbar = progress_bar(len(words))\n",
"\n",
"    for word, freq in zip(words.word, words.freq):\n",
"        progbar.increment()\n",
"        wd = '_' + word + '_'\n",
"        for pos in range(len(wd) - 1):\n",
"            increment_dict(bigrams, wd[pos:pos+2], freq)\n",
"\n",
"    # build the frame in one call; appending one row at a time copies the whole\n",
"    # frame on every iteration\n",
"    bigram_keys = list(bigrams.keys())\n",
"    df_bigrams = pd.DataFrame({'freq': [bigrams[key] for key in bigram_keys],\n",
"                               'letter_1': [key[0] for key in bigram_keys],\n",
"                               'letter_2': [key[1] for key in bigram_keys]},\n",
"                              index=bigram_keys,\n",
"                              columns=['freq', 'letter_1', 'letter_2'])\n",
"    df_trigrams = pd.DataFrame()\n",
"    bigrams_total = df_bigrams.freq.sum()\n",
"\n",
"    # share of all bigram occurrences, in percent (vectorized over the column)\n",
"    df_bigrams['pct_freq'] = df_bigrams.freq * 100.0 / bigrams_total\n",
"\n",
"    df_bigrams.to_pickle(bigrams_pickle)\n",
"\n",
"else:\n",
"\n",
"    df_bigrams=pd.read_pickle(bigrams_pickle)\n",
"    print \"Pickle loaded.\"\n",
"\n",
"print df_bigrams.head()"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "% complete: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "1 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "2 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "3 " ] }, { "output_type": "stream", 
"stream": "stdout", "text": [ "4 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "5 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "6 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "7 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "8 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "9 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "10 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "11 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "12 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "13 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "14 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "15 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "16 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "17 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "18 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "19 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "20 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "21 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "22 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "23 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "24 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "25 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "26 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "27 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "28 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "29 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "30 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "31 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "32 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "33 
" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "34 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "35 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "36 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "37 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "38 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "39 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "40 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "41 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "42 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "43 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "44 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "45 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "46 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "47 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "48 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "49 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "50 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "51 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "52 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "53 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "54 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "55 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "56 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "57 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "58 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "59 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "60 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "61 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "62 " ] }, { "output_type": 
"stream", "stream": "stdout", "text": [ "63 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "64 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "65 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "66 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "67 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "68 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "69 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "70 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "71 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "72 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "73 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "74 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "75 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "76 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "77 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "78 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "79 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "80 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "81 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "82 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "83 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "84 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "85 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "86 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "87 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "88 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "89 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "90 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "91 " ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "92 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "93 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "94 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "95 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "96 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "97 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "98 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "99 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " freq letter_1 letter_2 pct_freq\n", "gw 17983 g w 0.000845\n", "t_ 39024960 t _ 1.833252\n", "gu 1028478 g u 0.048314\n", "gt 213703 g t 0.010039\n", "gs 791136 g s 0.037165\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# calculate conditional probabilities\n", "\n", "df_bi_cond = pd.DataFrame()\n", "for bigram in possible_bigrams:\n", " let1, let2 = (bigram[0], bigram[1])\n", " if bigram in df_bigrams.index:\n", " prob_1_given_2 = df_bigrams.ix[bigram].freq * 100.0 / df_bigrams[df_bigrams.letter_2 == let2].freq.sum()\n", " prob_2_given_1 = df_bigrams.ix[bigram].freq * 100.0 / df_bigrams[df_bigrams.letter_1 == let1].freq.sum()\n", " df_bi_cond = df_bi_cond.append(pd.DataFrame({'let1':[let1], 'let2':[let2], 'pct_1_given_2':[prob_1_given_2], \n", " 'pct_2_given_1':[prob_2_given_1]}, index=[bigram]))\n", " else:\n", " df_bi_cond = df_bi_cond.append(pd.DataFrame({'let1':[let1], 'let2':[let2], 'pct_1_given_2':[0.0], \n", " 'pct_2_given_1':[0.0]}, index=[bigram]))\n", " \n", "print df_bi_cond.head()\n", "\n", "df_bi_cond.to_csv(nb_path + '/bi_cond' + dataframe_base + '.csv')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " let1 let2 pct_1_given_2 pct_2_given_1\n", "_a _ a 31.379124 11.291626\n", "_b _ b 68.037261 4.682944\n", "_c _ c 35.925454 4.386648\n", "_d _ d 17.032289 3.163017\n", "_e _ e 3.946567 
2.200153\n" ] } ], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [
"# create pivot table - no longer necessary\n",
"# rows are the first symbol (let1), columns are the second symbol (let2)\n",
"\n",
"pivot_axes = dict(index='let1', columns='let2')\n",
"df_pivot_2g1 = df_bi_cond.pivot_table(values='pct_2_given_1', **pivot_axes).fillna(0)\n",
"df_pivot_1g2 = df_bi_cond.pivot_table(values='pct_1_given_2', **pivot_axes).fillna(0)\n",
"print df_pivot_2g1.head()"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "let2 _ a b c d e f \\\n", "let1 \n", "_ 0.000000 11.819304 4.651243 4.805972 2.928036 2.448977 4.103325 \n", "a 7.624876 0.013944 2.131876 4.320334 4.275015 0.086080 0.960819 \n", "b 1.396656 8.525391 0.775292 0.038129 0.056488 31.217873 0.001412 \n", "c 3.487082 12.994243 0.016849 1.689834 0.025274 15.614996 0.010531 \n", "d 57.425042 3.132064 0.056374 0.052505 0.961671 14.826872 0.038688 \n", "\n", "let2 g h i ... q r s \\\n", "let1 ... \n", "_ 1.704057 5.447910 6.829877 ... 0.199549 2.624486 6.902913 \n", "a 1.986265 0.117454 3.466779 ... 0.015821 10.338392 9.232230 \n", "b 0.016946 0.012710 5.240637 ... 0.000000 5.726431 1.957295 \n", "c 0.004914 14.851867 6.103623 ... 0.101095 3.648554 0.496349 \n", "d 0.602979 0.083455 8.830243 ... 0.037030 1.872496 2.464421 \n", "\n", "let2 t u v w x y z \n", "let1 \n", "_ 16.208252 1.157056 0.653244 6.130388 0.003463 0.822234 0.022002 \n", "a 13.989397 1.089268 2.066981 0.666647 0.197098 2.526340 0.161433 \n", "b 0.704683 11.321527 0.302209 0.021183 0.000000 8.053720 0.000000 \n", "c 9.802724 3.235748 0.000702 0.002106 0.000000 0.815782 0.010531 \n", "d 0.040899 2.461105 0.335480 0.166358 0.000000 1.226407 0.001658 \n", "\n", "[5 rows x 27 columns]\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }