{ "metadata": { "name": "", "signature": "sha256:726fde94518e38b897eee4faacd6133a0406643155946d3eec2b176647ac291a" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Single letter frequency comparison of Gadsby and Brown\n", "\n", "by David Taylor, www.prooffreader.com, prooffreader@gmail.com\n", "\n", "from a collection of tools to create and analyze lists of words using python with pandas and matplotlib" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import os\n", "import time\n", "\n", "words = pd.read_pickle('gadsby_analysis.pickle')\n", "\n", "alphabet = 'abcdefghijklmnopqrstuvwxyz'\n", "\n", "def increment_dict(d, key, increment=1): # from before I found collections.Counter\n", " if key in d.keys():\n", " d[key] += increment\n", " else:\n", " d[key] = increment\n", " return d\n", "\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 31 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Single letter frequencies" ] }, { "cell_type": "code", "collapsed": false, "input": [ "redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes\n", "\n", "if not os.path.isfile('gadsby_analysis_letfreqs.pickle') or redo_pickle == True: \n", " \n", " start = time.time()\n", " \n", " letfreq_dict = {}\n", " \n", " for ltr in alphabet:\n", " letfreq_dict[ltr] = 0\n", " \n", " for i in range(len(words)):\n", " wd = words.word.iloc[i]\n", " for ltr in wd:\n", " letfreq_dict[ltr] += words.freq.iloc[i]\n", " \n", " letfreqs_df = pd.DataFrame()\n", " for letter in alphabet:\n", " temp = pd.DataFrame({'letter':[letter],\n", " 'freq':[letfreq_dict[letter]]})\n", " letfreqs_df = letfreqs_df.append(temp, ignore_index=True)\n", "\n", " \n", " \n", " letfreqs_df['normal'] = 0.0\n", "\n", " letfreqsum = letfreqs_df.freq.sum()\n", "\n", " for i in range(len(letfreqs_df)):\n", " letter = 
letfreqs_df.letter.iloc[i]\n", " freq = letfreqs_df.freq.iloc[i]\n", " letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum\n", "\n", " letfreqs_df.set_index('letter', drop=False, inplace=True)\n", " \n", " print \"%d seconds elapsed.\" % (time.time() - start)\n", " \n", " letfreqs_df.to_pickle('gadsby_analysis_letfreqs.pickle')\n", "\n", "else:\n", " print 'Reading from pickle.'\n", " letfreqs_df = pd.read_pickle('gadsby_analysis_letfreqs.pickle')\n", "\n", " \n", "df = letfreqs_df.copy()\n", "df.columns = ['gadsby_freq', 'letter', 'gadsby_pct']\n", "df = df[['letter', 'gadsby_freq', 'gadsby_pct']]\n", "print df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Reading from pickle.\n", " letter gadsby_freq gadsby_pct\n", "letter \n", "a a 22105 11.139219\n", "b b 4270 2.151751\n", "c c 5217 2.628967\n", "d d 8234 4.149302\n", "e e 0 0.000000\n" ] } ], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes\n", "\n", "brown = pd.read_pickle('brown_non_e.pickle')\n", "\n", "if not os.path.isfile('brown_non_e_letfreqs.pickle') or redo_pickle == True: \n", " \n", " start = time.time()\n", " \n", " letfreq_dict = {}\n", " \n", " for ltr in alphabet:\n", " letfreq_dict[ltr] = 0\n", " \n", " for i in range(len(brown)):\n", " wd = brown.word.iloc[i]\n", " for ltr in wd:\n", " letfreq_dict[ltr] += brown.freq.iloc[i]\n", " \n", " letfreqs_df = pd.DataFrame()\n", " for letter in alphabet:\n", " temp = pd.DataFrame({'letter':[letter],\n", " 'freq':[letfreq_dict[letter]]})\n", " letfreqs_df = letfreqs_df.append(temp, ignore_index=True)\n", "\n", " letfreqs_df['normal'] = 0.0\n", "\n", " letfreqsum = letfreqs_df.freq.sum()\n", "\n", " for i in range(len(letfreqs_df)):\n", " letter = letfreqs_df.letter.iloc[i]\n", " freq = letfreqs_df.freq.iloc[i]\n", " letfreqs_df.normal.iloc[i] = 100.0 * 
freq / letfreqsum\n", "\n", " letfreqs_df.set_index('letter', drop=False, inplace=True)\n", " \n", " print \"%d seconds elapsed.\" % (time.time() - start)\n", " \n", " letfreqs_df.to_pickle('brown_non_e_letfreqs.pickle')\n", "\n", "else:\n", " print 'Reading from pickle.'\n", " letfreqs_df = pd.read_pickle('brown_non_e_letfreqs.pickle')\n", " \n", "brown = letfreqs_df.copy()\n", "brown.columns = ['brown_freq', 'letter', 'brown_pct']\n", "brown = brown[['letter', 'brown_freq', 'brown_pct']]\n", "print brown.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1 seconds elapsed.\n", " letter brown_freq brown_pct\n", "letter \n", "a a 208195 11.319178\n", "b b 27813 1.512141\n", "c c 48343 2.628320\n", "d d 78828 4.285733\n", "e e 0 0.000000\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.merge(df, brown, how='inner', on='letter', left_index=True, right_index = False)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "df['brown_normalized'] = (df.brown_freq * df.gadsby_freq.sum()/df.brown_freq.sum()).astype(int)\n", "df['diff'] = df.gadsby_pct - df.brown_pct\n", "df['ratio'] = df.gadsby_pct / df.brown_pct" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "def loglike(n1, t1, n2, t2):\n", " \"\"\"Calculates Dunning log likelihood of an observation of \n", " frequency n1 in a corpus of size t1, compared to a frequency n2 \n", " in a corpus of size t2. 
If result is positive, it is more \n", " likely to occur in corpus 1, otherwise in corpus 2.\"\"\"\n", " e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values\n", " e2 = t2*1.0*(n1+n2)/(t1+t2)\n", " # treat n*log(n/e) as 0 when n == 0 (standard Dunning LL convention);\n", " # otherwise a letter absent from a corpus yields log(0) -> NaN, as seen for 'e'\n", " term1 = n1 * log(n1/e1) if n1 > 0 else 0.0\n", " term2 = n2 * log(n2/e2) if n2 > 0 else 0.0\n", " LL = 2 * (term1 + term2)\n", " if n2*1.0/t2 > n1*1.0/t1:\n", " LL = -LL\n", " return LL" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "from numpy import log\n", "\n", "t1 = df.gadsby_freq.sum()\n", "t2 = df.brown_freq.sum()\n", "df['log_likelihood'] = 0.0\n", "\n", "for i in range(len(df)):\n", " n1 = df.gadsby_freq.iloc[i]\n", " n2 = df.brown_freq.iloc[i]\n", " df.log_likelihood.iloc[i] = loglike(n1, t1, n2, t2)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | letter | \n", "gadsby_freq | \n", "gadsby_pct | \n", "brown_freq | \n", "brown_pct | \n", "brown_normalized | \n", "diff | \n", "ratio | \n", "log_likelihood | \n", "
---|---|---|---|---|---|---|---|---|---|
letter | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
a | \n", "a | \n", "22105 | \n", "11.139219 | \n", "208195 | \n", "11.319178 | \n", "22462 | \n", "-0.179959 | \n", "0.984101 | \n", "-5.154793 | \n", "
b | \n", "b | \n", "4270 | \n", "2.151751 | \n", "27813 | \n", "1.512141 | \n", "3000 | \n", "0.639610 | \n", "1.422983 | \n", "422.428550 | \n", "
c | \n", "c | \n", "5217 | \n", "2.628967 | \n", "48343 | \n", "2.628320 | \n", "5215 | \n", "0.000647 | \n", "1.000246 | \n", "0.000285 | \n", "
d | \n", "d | \n", "8234 | \n", "4.149302 | \n", "78828 | \n", "4.285733 | \n", "8504 | \n", "-0.136431 | \n", "0.968166 | \n", "-7.871319 | \n", "
e | \n", "e | \n", "0 | \n", "0.000000 | \n", "0 | \n", "0.000000 | \n", "0 | \n", "0.000000 | \n", "NaN | \n", "NaN | \n", "
f | \n", "f | \n", "4368 | \n", "2.201136 | \n", "71896 | \n", "3.908853 | \n", "7756 | \n", "-1.707717 | \n", "0.563116 | \n", "-1614.594975 | \n", "
g | \n", "g | \n", "7175 | \n", "3.615648 | \n", "38652 | \n", "2.101438 | \n", "4170 | \n", "1.514210 | \n", "1.720559 | \n", "1571.243993 | \n", "
h | \n", "h | \n", "9836 | \n", "4.956587 | \n", "93001 | \n", "5.056293 | \n", "10033 | \n", "-0.099706 | \n", "0.980281 | \n", "-3.547318 | \n", "
i | \n", "i | \n", "17550 | \n", "8.843849 | \n", "178746 | \n", "9.718090 | \n", "19284 | \n", "-0.874241 | \n", "0.910040 | \n", "-145.728668 | \n", "
j | \n", "j | \n", "466 | \n", "0.234828 | \n", "3361 | \n", "0.182731 | \n", "362 | \n", "0.052097 | \n", "1.285100 | \n", "24.170986 | \n", "
k | \n", "k | \n", "2310 | \n", "1.164062 | \n", "12785 | \n", "0.695097 | \n", "1379 | \n", "0.468965 | \n", "1.674676 | \n", "461.058467 | \n", "
l | \n", "l | \n", "10449 | \n", "5.265492 | \n", "77191 | \n", "4.196732 | \n", "8328 | \n", "1.068760 | \n", "1.254665 | \n", "447.169631 | \n", "
m | \n", "m | \n", "4099 | \n", "2.065581 | \n", "41491 | \n", "2.255789 | \n", "4476 | \n", "-0.190209 | \n", "0.915680 | \n", "-29.653479 | \n", "
n | \n", "n | \n", "16926 | \n", "8.529401 | \n", "159318 | \n", "8.661826 | \n", "17188 | \n", "-0.132424 | \n", "0.984712 | \n", "-3.646751 | \n", "
o | \n", "o | \n", "20823 | \n", "10.493189 | \n", "224504 | \n", "12.205868 | \n", "24221 | \n", "-1.712679 | \n", "0.859684 | \n", "-454.252165 | \n", "
p | \n", "p | \n", "3710 | \n", "1.869554 | \n", "27267 | \n", "1.482456 | \n", "2941 | \n", "0.387098 | \n", "1.261119 | \n", "165.729733 | \n", "
q | \n", "q | \n", "103 | \n", "0.051904 | \n", "684 | \n", "0.037188 | \n", "73 | \n", "0.014716 | \n", "1.395728 | \n", "9.165714 | \n", "
r | \n", "r | \n", "9426 | \n", "4.749979 | \n", "81398 | \n", "4.425459 | \n", "8782 | \n", "0.324519 | \n", "1.073330 | \n", "41.521963 | \n", "
s | \n", "s | \n", "13389 | \n", "6.747026 | \n", "120158 | \n", "6.532769 | \n", "12963 | \n", "0.214257 | \n", "1.032797 | \n", "12.438128 | \n", "
t | \n", "t | \n", "16770 | \n", "8.450789 | \n", "177992 | \n", "9.677097 | \n", "19203 | \n", "-1.226307 | \n", "0.873277 | \n", "-292.146957 | \n", "
u | \n", "u | \n", "8338 | \n", "4.201710 | \n", "64735 | \n", "3.519523 | \n", "6984 | \n", "0.682188 | \n", "1.193830 | \n", "221.523997 | \n", "
v | \n", "v | \n", "621 | \n", "0.312936 | \n", "5171 | \n", "0.281138 | \n", "557 | \n", "0.031798 | \n", "1.113106 | \n", "6.189885 | \n", "
w | \n", "w | \n", "5633 | \n", "2.838598 | \n", "53064 | \n", "2.884992 | \n", "5725 | \n", "-0.046393 | \n", "0.983919 | \n", "-1.344228 | \n", "
x | \n", "x | \n", "87 | \n", "0.043841 | \n", "1117 | \n", "0.060729 | \n", "120 | \n", "-0.016888 | \n", "0.721914 | \n", "-9.412268 | \n", "
y | \n", "y | \n", "6333 | \n", "3.191345 | \n", "42703 | \n", "2.321683 | \n", "4607 | \n", "0.869661 | \n", "1.374582 | \n", "515.920749 | \n", "
z | \n", "z | \n", "205 | \n", "0.103304 | \n", "899 | \n", "0.048877 | \n", "96 | \n", "0.054427 | \n", "2.113556 | \n", "79.507689 | \n", "