# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
    """Round ``x`` up to the next step of its second-most-significant digit.

    The result is always strictly greater than ``x``: 1234 -> 1300 and
    1200 -> 1300 (not 1200), because one full step is added after flooring.

    Parameters
    ----------
    x : int or float
        Value to round up; must be greater than zero.

    Returns
    -------
    float
        ``x`` rounded up as described above.

    Raises
    ------
    ValueError
        If ``x`` is not greater than zero (its base-10 logarithm is undefined).
    """
    # floor and log10 are not imported by the notebook's visible setup code, so
    # the bare names only resolve if something else injects them (e.g. a pylab
    # session). Import them explicitly so the helper works in a plain kernel.
    import math

    # `not x > 0` also rejects NaN, which would otherwise slip past `x <= 0`.
    if not x > 0:
        raise ValueError("determine_y_limit requires x > 0, got %r" % (x,))

    # Position of the most significant digit (e.g. 3 for 1234).
    significance = int(math.floor(math.log10(x)))
    # Size of one unit of the second-most-significant digit (e.g. 100 for 1234).
    step = 10.0 ** (significance - 1)
    return (math.floor(x / step) + 1) * step
# Group yob dataframe to just have birth counts; separate into M and F sexes.
#
# NOTE(review): the markdown note near the top of this notebook says the
# analysis "should be done only for 2013", but nothing here filters on
# yob.year, so the totals cover every year present in yob. Confirm which
# behaviour is intended before relying on the numbers.

def _total_births_by_name(records, sex):
    """Return total births per name for one sex.

    records -- dataframe with at least `name`, `sex` and `births` columns
               (the `yob` frame loaded by the setup cell).
    sex     -- value of the `sex` column to keep ('M' or 'F').

    Returns a dataframe indexed by name with a `births` column (sum over all
    matching rows) and a constant `sex` column.
    """
    one_sex = records[records.sex == sex]
    # Select the births column *before* aggregating: summing the whole frame
    # would also add up year/pct/ranked (and the sex strings) only to discard
    # them immediately afterwards.
    totals = pd.DataFrame(one_sex.groupby('name')['births'].sum())
    totals['sex'] = sex
    return totals


dfm = _total_births_by_name(yob, 'M')
dff = _total_births_by_name(yob, 'F')
# Tally how often every letter and every bigram (pair of adjacent letters)
# occurs in names, weighting each name by its number of births, then convert
# the tallies into proportions. Females (dff) and males (dfm) are handled
# separately and end up in lettersf/bigramsf and lettersm/bigramsm.

import time  # also imported by the setup cell; repeated so the timing below is self-contained
start = time.time()

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def _tally_letters_and_bigrams(births_by_name, alphabet):
    """Count birth-weighted letters and bigrams for one sex.

    births_by_name -- dataframe indexed by name with a `births` column.
    alphabet       -- characters to count. Every letter and every ordered pair
                      of letters is initialised to 0, so combinations that
                      never occur are still present in the result.

    Returns (letters, bigrams): dicts mapping a letter / two-letter string to
    the summed births of the names containing it (once per occurrence).
    A lower-cased character that is not in `alphabet` raises KeyError.
    """
    letters = dict((letter, 0) for letter in alphabet)
    bigrams = dict((a + b, 0) for a in alphabet for b in alphabet)

    for raw_name, count in zip(births_by_name.index, births_by_name.births.values):
        name = raw_name.lower()
        for letter in name:
            letters[letter] += count
        # zip(name, name[1:]) yields each adjacent pair exactly once and
        # nothing at all for one-letter names.
        for first, second in zip(name, name[1:]):
            bigrams[first + second] += count

    return letters, bigrams


def _as_proportions(counts, total):
    """Return a new dict with every count divided by `total` (zeros if total is 0)."""
    if not total:
        return dict((key, 0.0) for key in counts)
    # `* 1.0` forces true division under Python 2 as well.
    return dict((key, value * 1.0 / total) for key, value in counts.items())


lettersf, bigramsf = _tally_letters_and_bigrams(dff, alphabet)
lettersm, bigramsm = _tally_letters_and_bigrams(dfm, alphabet)

# count totals

letsumf = sum(lettersf.values())
letsumm = sum(lettersm.values())
bigrsumf = sum(bigramsf.values())
bigrsumm = sum(bigramsm.values())

# recalculate dicts as proportions -- each dict against the total of its OWN
# sex. In particular the male bigrams must be bigramsm / bigrsumm; dividing the
# female values by the female total here would silently corrupt the male data.

lettersf = _as_proportions(lettersf, letsumf)
lettersm = _as_proportions(lettersm, letsumm)
bigramsf = _as_proportions(bigramsf, bigrsumf)
bigramsm = _as_proportions(bigramsm, bigrsumm)

# Parenthesised form prints the same text under Python 2 and Python 3.
print("%0.1f seconds elapsed." % (time.time() - start))
# make dataframe of results, then save it.
#
# One row per (letter, sex) with secondlet == '' for single-letter
# frequencies, and one row per (bigram, sex) for bigram frequencies.

# Collect plain dicts and build the frame once: appending to a DataFrame
# inside the loop creates a brand-new frame on every iteration, and
# DataFrame.append no longer exists in current pandas.
records = []
for letter in alphabet:
    for sex, letter_pcts in (('F', lettersf), ('M', lettersm)):
        records.append({'firstlet': letter, 'secondlet': '',
                        'sex': sex, 'pct': letter_pcts[letter]})
    for second_letter in alphabet:
        bigram = letter + second_letter
        for sex, bigram_pcts in (('F', bigramsf), ('M', bigramsm)):
            records.append({'firstlet': letter, 'secondlet': second_letter,
                            'sex': sex, 'pct': bigram_pcts[bigram]})

# Sort in plain Python instead of with a pandas sort method, so the cell does
# not depend on which pandas version is installed. list.sort is stable, so F
# stays ahead of M within each (firstlet, secondlet) pair, and '' sorts before
# any letter, which keeps each single-letter row ahead of its bigrams.
records.sort(key=lambda row: (row['firstlet'], row['secondlet']))

# Explicit column order keeps the layout independent of dict ordering; rows
# get a fresh 0..n-1 index, so no reset_index is needed.
df = pd.DataFrame(records, columns=['firstlet', 'pct', 'secondlet', 'sex'])

# Join with os.path.join (os is imported by the setup cell): plain string
# concatenation drops the separator and writes
# 'user_chartsbaby_name_letter_bigram_freqs.pickle' next to the directory
# instead of inside it.
df.to_pickle(os.path.join(save_path, 'baby_name_letter_bigram_freqs.pickle'))