{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Snapchat Leak\n",
      "Snapchat let 4.6M usernames and phone numbers get out. This notebook takes that data and explores what usernames are most prevelant in each area code. Usernames with \"love\" in them for example, are more common in California and Boston. \"lynn\" is more common in the South. \"5280\" is very common in Denver, which is in an interesting one to investigate (it's an elevation).\n",
      "\n",
      "## Algorithm Shop\n",
      "This notebook goes along with this algorithmshop.com [post](http://algorithmshop.com/20140102-snapchat-leak.html). Check out the post for animated visualizations of this data on a map of the US.\n",
      "\n",
      "## Getting Data\n",
      "The leaked usernames and phone numbers came from [here](http://snapchatdb.info/)\n",
      "\n",
      "The geocoding came from taking the area code names from that file and running them through this [Geocode](http://www.gpsvisualizer.com/geocoder/). You will have to geocode these places yourself, since there are licensing issues."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "\n",
      "locations = pd.read_csv('locations.tsv', delimiter='\\t')\n",
      "all_users = pd.read_csv('schat.csv',\n",
      "                        names=['numbers', 'username', 'location'],\n",
      "                        index_col='location',\n",
      "                        header=None)\n",
      "\n",
      "# We just need the location names\n",
      "del all_users['numbers']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "all_users[0:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>username</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>      slthornton</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>  strict_daddy4u</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>       whoknew69</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>  testingtesting</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>       s.fullb13</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>     gavan_smith</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>  thismyusername</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>    erinspickles</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td>    flyinghorses</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Manhattan</th>\n",
        "      <td> saraelizabeth98</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "                  username\n",
        "location                  \n",
        "Manhattan       slthornton\n",
        "Manhattan   strict_daddy4u\n",
        "Manhattan        whoknew69\n",
        "Manhattan   testingtesting\n",
        "Manhattan        s.fullb13\n",
        "Manhattan      gavan_smith\n",
        "Manhattan   thismyusername\n",
        "Manhattan     erinspickles\n",
        "Manhattan     flyinghorses\n",
        "Manhattan  saraelizabeth98"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from collections import defaultdict\n",
      "\n",
      "\n",
      "def get_subs(s, n):\n",
      "    \"\"\"Get all substrings of s of length n\"\"\"\n",
      "    for i in range(len(s) - n + 1):\n",
      "        yield s[i:i+n]\n",
      "        \n",
      "\n",
      "def get_len_n_dict(n, lower_bound):\n",
      "    sub_counts = defaultdict(int)\n",
      "    for name in all_users.username:\n",
      "        for s in get_subs(name, n):\n",
      "            sub_counts[s] += 1\n",
      "    return dict((k,v) for (k,v) in sub_counts.items() if v >= lower_bound)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Find substrings of a certain lenght that occur a minimal\n",
      "# number of times:\n",
      "\n",
      "SUBSTRING_LENGTH = 4\n",
      "MIN_OCCURENCE = 60\n",
      "substring_counts = get_len_n_dict(SUBSTRING_LENGTH, MIN_OCCURENCE)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "'5820' in substring_counts"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 24,
       "text": [
        "False"
       ]
      }
     ],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Example of the most common substrings\n",
      "from heapq import nlargest\n",
      "\n",
      "nlargest(10, substring_counts.items(), key=lambda x: x[1])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "[('love', 51034),\n",
        " ('mari', 48834),\n",
        " ('anna', 33225),\n",
        " ('elle', 31469),\n",
        " ('alex', 28967),\n",
        " ('chri', 28713),\n",
        " ('hris', 28330),\n",
        " ('arie', 26568),\n",
        " ('stin', 26260),\n",
        " ('chel', 26169)]"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "counts_per_area = all_users.groupby(level=0).agg(len)\n",
      "counts_per_area.rename(columns={'username': 'users'}, inplace=True)\n",
      "counts_per_area[0:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>users</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Arkansas</th>\n",
        "      <td>  28940</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Boston</th>\n",
        "      <td>  41857</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Boulder-Denver</th>\n",
        "      <td> 139265</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Bronx, Queens, Brooklyn</th>\n",
        "      <td>  51086</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Buffalo</th>\n",
        "      <td> 144939</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Canadian territories in the Arctic far north</th>\n",
        "      <td>     31</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Central Arizona</th>\n",
        "      <td>  35631</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Central Florida</th>\n",
        "      <td>   3258</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Central Georgia</th>\n",
        "      <td>   1396</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Central Texas</th>\n",
        "      <td>   1542</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "                                               users\n",
        "location                                            \n",
        "Arkansas                                       28940\n",
        "Boston                                         41857\n",
        "Boulder-Denver                                139265\n",
        "Bronx, Queens, Brooklyn                        51086\n",
        "Buffalo                                       144939\n",
        "Canadian territories in the Arctic far north      31\n",
        "Central Arizona                                35631\n",
        "Central Florida                                 3258\n",
        "Central Georgia                                 1396\n",
        "Central Texas                                   1542"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now get counts of each substring by state\n",
      "def counts_in_group(g, n, total_counts):\n",
      "    output_counts = defaultdict(int)\n",
      "    for name in g.username:\n",
      "        for s in get_subs(name, n):\n",
      "            if s in total_counts:\n",
      "                output_counts[s] += 1\n",
      "    return pd.DataFrame([output_counts])\n",
      "\n",
      "groupby_fn = lambda x: counts_in_group(x, SUBSTRING_LENGTH, substring_counts)\n",
      "\n",
      "by_state = all_users.groupby(level=0).apply(groupby_fn)\n",
      "\n",
      "# I always end up doing this with pandas \"apply\". Halp me. I don't want to do it like this...\n",
      "by_state = by_state.reset_index(level=1, drop=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "by_state[0:3][['.mar', 'love', 'zzzz']]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>.mar</th>\n",
        "      <th>love</th>\n",
        "      <th>zzzz</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Arkansas</th>\n",
        "      <td> 11</td>\n",
        "      <td>  286</td>\n",
        "      <td>  4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Boston</th>\n",
        "      <td> 18</td>\n",
        "      <td>  670</td>\n",
        "      <td> 17</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Boulder-Denver</th>\n",
        "      <td> 63</td>\n",
        "      <td> 1092</td>\n",
        "      <td> 74</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "                .mar  love  zzzz\n",
        "location                        \n",
        "Arkansas          11   286     4\n",
        "Boston            18   670    17\n",
        "Boulder-Denver    63  1092    74"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Filter out the  small data\n",
      "by_state_normalized = by_state / counts_per_area\n",
      "normalized = by_state.div(counts_per_area.users, axis='index')\n",
      "drop_small = normalized[counts_per_area.users > 2000].T"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# We get regional strings by looking for high variance\n",
      "#in the ratio of users with a given substring for a state.\n",
      "VARIANCE_QUANTILE = .933\n",
      "\n",
      "variances = drop_small.var(axis=1)\n",
      "large_variance = drop_small[variances > variances.quantile(q=VARIANCE_QUANTILE)]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from random import randint\n",
      "\n",
      "\n",
      "class StreamSampler(object):\n",
      "    def __init__(self, num_samples=1):\n",
      "        self.num_samples = num_samples\n",
      "        self.saved = []\n",
      "        self.num_seen = 0\n",
      "\n",
      "    def present(self, item):\n",
      "        if len(self.saved) < self.num_samples:\n",
      "            self.saved.append(item)\n",
      "            self.num_seen += 1\n",
      "            return\n",
      "        else:\n",
      "            v = random.randint(0, self.num_seen)\n",
      "            if v < self.num_samples:\n",
      "                self.saved[v] = item\n",
      "            self.num_seen += 1\n",
      "    \n",
      "    def samples(self):\n",
      "        return self.saved"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# For each substring, get 2 example usernames per area code.\n",
      "\n",
      "desired_words = large_variance.index\n",
      "example_users = defaultdict(lambda: defaultdict(lambda: StreamSampler(2)))\n",
      "\n",
      "for (area_code, r) in all_users.iterrows():\n",
      "    name = r['username']\n",
      "    for s in get_subs(name, 4):\n",
      "        if s in desired_words:\n",
      "            example_users[s][area_code].present(name)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def print_report(word):\n",
      "    print('\\nSUBSTRING {}'.format(word))\n",
      "    c = large_variance.ix[word].copy()\n",
      "    c.sort(ascending=False)\n",
      "    print(c)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for s in ('love', 'girl', 'baby', 'lynn', 'ngel', '1234', '5280'):\n",
      "    print_report(s)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "SUBSTRING love\n",
        "location\n",
        "Southeastern California                  0.022592\n",
        "Downtown Los Angeles                     0.022140\n",
        "Eastern Los Angeles                      0.019226\n",
        "Eastern San Francisco                    0.018228\n",
        "Southern California                      0.017534\n",
        "Boston                                   0.016007\n",
        "Oakland                                  0.015008\n",
        "San Fernando Valley                      0.014406\n",
        "Denver-Boulder                           0.014058\n",
        "Central Florida                          0.012584\n",
        "Los Angeles                              0.012254\n",
        "Southeastern Virginia                    0.011904\n",
        "Florida                                  0.011734\n",
        "New York City                            0.011320\n",
        "Southeastern Colorado                    0.011114\n",
        "Southeastern Michigan incl. Ann Arbor    0.010739\n",
        "Central Arizona                          0.010553\n",
        "Miami                                    0.010417\n",
        "Fort Lauderdale                          0.010315\n",
        "South Carolina                           0.010262\n",
        "Pennsylvania                             0.010250\n",
        "Chicago                                  0.010140\n",
        "Arkansas                                 0.009883\n",
        "Eastern part of Southern New Jersey      0.009638\n",
        "Manhattan                                0.009591\n",
        "Western and Northern Colorado            0.009569\n",
        "Idaho                                    0.009468\n",
        "Buffalo                                  0.009452\n",
        "Northwestern Arkansas                    0.009041\n",
        "San Francisco                            0.008973\n",
        "Bronx, Queens, Brooklyn                  0.008965\n",
        "Southern New York State                  0.008729\n",
        "Eastern Ohio                             0.008557\n",
        "Northern New York                        0.008301\n",
        "Mountain View                            0.008281\n",
        "Southwestern Wisconsin                   0.008119\n",
        "Indianapolis                             0.008097\n",
        "Chicago Suburbs                          0.008048\n",
        "Southeastern Ohio                        0.007933\n",
        "Boulder-Denver                           0.007841\n",
        "Southwest Connecticut                    0.007752\n",
        "Northeastern New York State              0.007708\n",
        "Southern Illinois                        0.007680\n",
        "Northern Louisiana                       0.007620\n",
        "Seattle                                  0.007531\n",
        "Champaign-Urbana                         0.006832\n",
        "Westchester County, NY                   0.006559\n",
        "Northern Chicago Suburbs                 0.006492\n",
        "Southern Michigan                        0.006390\n",
        "Maine                                    0.005827\n",
        "Minnesota                                0.004887\n",
        "Manitoba                                 0.002774\n",
        "Name: love, Length: 52, dtype: float64\n",
        "\n",
        "SUBSTRING girl\n",
        "location\n",
        "South Carolina                           0.008779\n",
        "Northwestern Arkansas                    0.008767\n",
        "Arkansas                                 0.008086\n",
        "Southeastern Virginia                    0.007936\n",
        "Eastern San Francisco                    0.007661\n",
        "Central Florida                          0.007366\n",
        "Florida                                  0.007192\n",
        "Central Arizona                          0.007157\n",
        "Southeastern Colorado                    0.007073\n",
        "Western and Northern Colorado            0.007003\n",
        "Idaho                                    0.006747\n",
        "Maine                                    0.006617\n",
        "Eastern Ohio                             0.006540\n",
        "Southern Illinois                        0.006474\n",
        "Southern California                      0.006425\n",
        "Southeastern Ohio                        0.006381\n",
        "Southwestern Wisconsin                   0.006315\n",
        "Denver-Boulder                           0.006145\n",
        "Champaign-Urbana                         0.006081\n",
        "Chicago Suburbs                          0.005955\n",
        "Pennsylvania                             0.005857\n",
        "Northeastern New York State              0.005737\n",
        "Northern Louisiana                       0.005487\n",
        "Eastern Los Angeles                      0.005342\n",
        "Northern New York                        0.005317\n",
        "Eastern part of Southern New Jersey      0.005178\n",
        "Seattle                                  0.005177\n",
        "Southeastern California                  0.005153\n",
        "Southeastern Michigan incl. Ann Arbor    0.005087\n",
        "Boulder-Denver                           0.005055\n",
        "Fort Lauderdale                          0.004992\n",
        "Southern New York State                  0.004946\n",
        "Minnesota                                0.004887\n",
        "Buffalo                                  0.004754\n",
        "Chicago                                  0.004384\n",
        "Oakland                                  0.004374\n",
        "Northern Chicago Suburbs                 0.004206\n",
        "Boston                                   0.004205\n",
        "Westchester County, NY                   0.004176\n",
        "Indianapolis                             0.004171\n",
        "Los Angeles                              0.004169\n",
        "Southwest Connecticut                    0.004156\n",
        "New York City                            0.004081\n",
        "Downtown Los Angeles                     0.003945\n",
        "San Fernando Valley                      0.003916\n",
        "Bronx, Queens, Brooklyn                  0.003856\n",
        "Mountain View                            0.003823\n",
        "Manhattan                                0.003548\n",
        "San Francisco                            0.003490\n",
        "Miami                                    0.003302\n",
        "Southern Michigan                        0.002925\n",
        "Manitoba                                 0.001664\n",
        "Name: girl, Length: 52, dtype: float64\n",
        "\n",
        "SUBSTRING baby\n",
        "location\n",
        "Eastern San Francisco                    0.008718\n",
        "Florida                                  0.007949\n",
        "Oakland                                  0.007761\n",
        "Central Florida                          0.007060\n",
        "Southern California                      0.006170\n",
        "Boston                                   0.006068\n",
        "Southeastern Colorado                    0.005586\n",
        "Southeastern California                  0.005549\n",
        "Indianapolis                             0.005521\n",
        "Eastern Los Angeles                      0.005355\n",
        "Eastern Ohio                             0.005195\n",
        "Denver-Boulder                           0.005194\n",
        "Buffalo                                  0.005140\n",
        "Northern New York                        0.005032\n",
        "Fort Lauderdale                          0.005004\n",
        "Southeastern Michigan incl. Ann Arbor    0.004946\n",
        "South Carolina                           0.004934\n",
        "Downtown Los Angeles                     0.004912\n",
        "Chicago                                  0.004879\n",
        "Northern Louisiana                       0.004877\n",
        "Central Arizona                          0.004631\n",
        "Pennsylvania                             0.004588\n",
        "Southeastern Virginia                    0.004535\n",
        "Northeastern New York State              0.004368\n",
        "Idaho                                    0.004324\n",
        "San Fernando Valley                      0.004320\n",
        "Eastern part of Southern New Jersey      0.004315\n",
        "Chicago Suburbs                          0.004256\n",
        "Champaign-Urbana                         0.004152\n",
        "Arkansas                                 0.004077\n",
        "Southern New York State                  0.004073\n",
        "Southern Illinois                        0.004048\n",
        "Western and Northern Colorado            0.004048\n",
        "Southeastern Ohio                        0.003880\n",
        "Northwestern Arkansas                    0.003836\n",
        "Miami                                    0.003792\n",
        "New York City                            0.003758\n",
        "San Francisco                            0.003720\n",
        "Mountain View                            0.003622\n",
        "Manhattan                                0.003558\n",
        "Bronx, Queens, Brooklyn                  0.003445\n",
        "Los Angeles                              0.003426\n",
        "Boulder-Denver                           0.003325\n",
        "Southwest Connecticut                    0.003282\n",
        "Southwestern Wisconsin                   0.003157\n",
        "Northern Chicago Suburbs                 0.002914\n",
        "Southern Michigan                        0.002889\n",
        "Minnesota                                0.002793\n",
        "Westchester County, NY                   0.002727\n",
        "Maine                                    0.002666\n",
        "Seattle                                  0.002636\n",
        "Manitoba                                 0.001664\n",
        "Name: baby, Length: 52, dtype: float64\n",
        "\n",
        "SUBSTRING lynn\n",
        "location\n",
        "Southern New York State                  0.010474\n",
        "Northern Louisiana                       0.009144\n",
        "Florida                                  0.009084\n",
        "Maine                                    0.008888\n",
        "Southeastern Ohio                        0.008364\n",
        "Southeastern Michigan incl. Ann Arbor    0.007348\n",
        "Northern New York                        0.007304\n",
        "Southern Illinois                        0.007083\n",
        "Arkansas                                 0.007049\n",
        "Northwestern Arkansas                    0.006712\n",
        "Buffalo                                  0.006327\n",
        "Champaign-Urbana                         0.006184\n",
        "Northeastern New York State              0.006165\n",
        "Eastern Ohio                             0.005990\n",
        "Idaho                                    0.005815\n",
        "Minnesota                                0.005585\n",
        "Southeastern Colorado                    0.005363\n",
        "Eastern part of Southern New Jersey      0.005178\n",
        "Chicago Suburbs                          0.005103\n",
        "Southwestern Wisconsin                   0.004962\n",
        "Western and Northern Colorado            0.004888\n",
        "Manitoba                                 0.004576\n",
        "Indianapolis                             0.004417\n",
        "Pennsylvania                             0.004393\n",
        "Central Arizona                          0.004322\n",
        "Southern California                      0.004145\n",
        "Southeastern Virginia                    0.003921\n",
        "Eastern San Francisco                    0.003875\n",
        "Southwest Connecticut                    0.003761\n",
        "Central Florida                          0.003683\n",
        "Denver-Boulder                           0.003643\n",
        "South Carolina                           0.003633\n",
        "Southeastern California                  0.003567\n",
        "Boulder-Denver                           0.003547\n",
        "Eastern Los Angeles                      0.002988\n",
        "Fort Lauderdale                          0.002847\n",
        "Northern Chicago Suburbs                 0.002777\n",
        "Seattle                                  0.002730\n",
        "Westchester County, NY                   0.002401\n",
        "Bronx, Queens, Brooklyn                  0.002192\n",
        "Southern Michigan                        0.001945\n",
        "San Fernando Valley                      0.001868\n",
        "New York City                            0.001758\n",
        "Los Angeles                              0.001753\n",
        "Chicago                                  0.001669\n",
        "Oakland                                  0.001647\n",
        "Boston                                   0.001625\n",
        "Mountain View                            0.001525\n",
        "Miami                                    0.001453\n",
        "Manhattan                                0.001431\n",
        "Downtown Los Angeles                     0.001376\n",
        "San Francisco                            0.001212\n",
        "Name: lynn, Length: 52, dtype: float64\n",
        "\n",
        "SUBSTRING ngel\n",
        "location\n",
        "Eastern San Francisco                    0.005988\n",
        "Southeastern California                  0.005945\n",
        "Downtown Los Angeles                     0.004805\n",
        "Central Florida                          0.004604\n",
        "Eastern Los Angeles                      0.004355\n",
        "Denver-Boulder                           0.004281\n",
        "Southern California                      0.004215\n",
        "Oakland                                  0.004007\n",
        "Bronx, Queens, Brooklyn                  0.003739\n",
        "New York City                            0.003702\n",
        "Southwestern Wisconsin                   0.003608\n",
        "Southeastern Colorado                    0.003517\n",
        "San Fernando Valley                      0.003483\n",
        "Florida                                  0.003407\n",
        "Fort Lauderdale                          0.003342\n",
        "Miami                                    0.003342\n",
        "Central Arizona                          0.003340\n",
        "Boston                                   0.003321\n",
        "Eastern part of Southern New Jersey      0.003308\n",
        "Los Angeles                              0.003206\n",
        "Southern New York State                  0.003200\n",
        "Manhattan                                0.003200\n",
        "Mountain View                            0.003029\n",
        "Southeastern Virginia                    0.003023\n",
        "South Carolina                           0.002906\n",
        "San Francisco                            0.002875\n",
        "Minnesota                                0.002793\n",
        "Chicago                                  0.002772\n",
        "Chicago Suburbs                          0.002760\n",
        "Western and Northern Colorado            0.002756\n",
        "Pennsylvania                             0.002733\n",
        "Buffalo                                  0.002732\n",
        "Eastern Ohio                             0.002659\n",
        "Idaho                                    0.002647\n",
        "Manitoba                                 0.002635\n",
        "Southeastern Michigan incl. Ann Arbor    0.002543\n",
        "Arkansas                                 0.002522\n",
        "Westchester County, NY                   0.002469\n",
        "Northeastern New York State              0.002456\n",
        "Northern New York                        0.002292\n",
        "Boulder-Denver                           0.002269\n",
        "Southwest Connecticut                    0.002260\n",
        "Northern Chicago Suburbs                 0.002251\n",
        "Southern Michigan                        0.002139\n",
        "Southern Illinois                        0.002114\n",
        "Indianapolis                             0.002086\n",
        "Southeastern Ohio                        0.002070\n",
        "Champaign-Urbana                         0.001958\n",
        "Northwestern Arkansas                    0.001781\n",
        "Northern Louisiana                       0.001422\n",
        "Seattle                                  0.001412\n",
        "Maine                                    0.001284\n",
        "Name: ngel, Length: 52, dtype: float64\n",
        "\n",
        "SUBSTRING 1234\n",
        "location\n",
        "South Carolina                           0.003421\n",
        "Northwestern Arkansas                    0.003288\n",
        "Southeastern Ohio                        0.003190\n",
        "Southeastern Michigan incl. Ann Arbor    0.002967\n",
        "Southern New York State                  0.002910\n",
        "Western and Northern Colorado            0.002877\n",
        "Eastern Ohio                             0.002567\n",
        "Northern Louisiana                       0.002540\n",
        "Arkansas                                 0.002522\n",
        "Denver-Boulder                           0.002480\n",
        "Northern New York                        0.002414\n",
        "Minnesota                                0.002374\n",
        "Maine                                    0.002370\n",
        "Northeastern New York State              0.002325\n",
        "Northern Chicago Suburbs                 0.002302\n",
        "Boston                                   0.002246\n",
        "Buffalo                                  0.002208\n",
        "Southwest Connecticut                    0.002177\n",
        "Seattle                                  0.002165\n",
        "Eastern part of Southern New Jersey      0.002158\n",
        "Westchester County, NY                   0.002143\n",
        "Chicago Suburbs                          0.002139\n",
        "Boulder-Denver                           0.002133\n",
        "Southern Illinois                        0.002114\n",
        "Southern Michigan                        0.002046\n",
        "Southeastern Colorado                    0.002001\n",
        "Idaho                                    0.001976\n",
        "Central Arizona                          0.001965\n",
        "Chicago                                  0.001938\n",
        "Champaign-Urbana                         0.001929\n",
        "Mountain View                            0.001874\n",
        "Indianapolis                             0.001840\n",
        "New York City                            0.001785\n",
        "Pennsylvania                             0.001757\n",
        "Fort Lauderdale                          0.001686\n",
        "Eastern San Francisco                    0.001673\n",
        "Miami                                    0.001637\n",
        "Southeastern California                  0.001585\n",
        "Central Florida                          0.001535\n",
        "Southern California                      0.001515\n",
        "Manhattan                                0.001511\n",
        "San Fernando Valley                      0.001508\n",
        "Bronx, Queens, Brooklyn                  0.001507\n",
        "Southeastern Virginia                    0.001464\n",
        "Los Angeles                              0.001425\n",
        "Eastern Los Angeles                      0.001413\n",
        "Manitoba                                 0.001387\n",
        "San Francisco                            0.001286\n",
        "Downtown Los Angeles                     0.001240\n",
        "Oakland                                  0.001126\n",
        "Florida                                  0.000757\n",
        "Southwestern Wisconsin                   0.000451\n",
        "Name: 1234, Length: 52, dtype: float64\n",
        "\n",
        "SUBSTRING 5280\n",
        "location\n",
        "Denver-Boulder                           0.003272\n",
        "Boulder-Denver                           0.002599\n",
        "Western and Northern Colorado            0.000173\n",
        "Southeastern Colorado                    0.000087\n",
        "Southern Illinois                        0.000035\n",
        "Arkansas                                 0.000035\n",
        "Southwest Connecticut                    0.000033\n",
        "Eastern Ohio                             0.000031\n",
        "Southern Michigan                        0.000029\n",
        "Champaign-Urbana                         0.000022\n",
        "Westchester County, NY                   0.000017\n",
        "Northern Chicago Suburbs                 0.000015\n",
        "Los Angeles                              0.000014\n",
        "Chicago                                  0.000014\n",
        "Downtown Los Angeles                     0.000012\n",
        "San Francisco                            0.000009\n",
        "New York City                            0.000009\n",
        "Oakland                                  0.000008\n",
        "Northeastern New York State              0.000007\n",
        "Buffalo                                  0.000007\n",
        "Northern New York                        0.000007\n",
        "Fort Lauderdale                          0.000006\n",
        "Southern California                      0.000005\n",
        "San Fernando Valley                      0.000005\n",
        "Boston                                        NaN\n",
        "Bronx, Queens, Brooklyn                       NaN\n",
        "Central Arizona                               NaN\n",
        "Central Florida                               NaN\n",
        "Chicago Suburbs                               NaN\n",
        "Eastern Los Angeles                           NaN\n",
        "Eastern San Francisco                         NaN\n",
        "Eastern part of Southern New Jersey           NaN\n",
        "Florida                                       NaN\n",
        "Idaho                                         NaN\n",
        "Indianapolis                                  NaN\n",
        "Maine                                         NaN\n",
        "Manhattan                                     NaN\n",
        "Manitoba                                      NaN\n",
        "Miami                                         NaN\n",
        "Minnesota                                     NaN\n",
        "Mountain View                                 NaN\n",
        "Northern Louisiana                            NaN\n",
        "Northwestern Arkansas                         NaN\n",
        "Pennsylvania                                  NaN\n",
        "Seattle                                       NaN\n",
        "South Carolina                                NaN\n",
        "Southeastern California                       NaN\n",
        "Southeastern Michigan incl. Ann Arbor         NaN\n",
        "Southeastern Ohio                             NaN\n",
        "Southeastern Virginia                         NaN\n",
        "Southern New York State                       NaN\n",
        "Southwestern Wisconsin                        NaN\n",
        "Name: 5280, Length: 52, dtype: float64\n"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Generate the JSON consumed by algorithmshop.com's visualization:\n",
      "# http://algorithmshop.com/20140102-snapchat-leak.html\n",
      "#\n",
      "# Reads large_variance, example_users and locations from earlier cells.\n",
      "# Writes, all under BLOB_DIR:\n",
      "#   blob-<substring>.json  per-location frequency plus sample usernames\n",
      "#   all_blobs.json         shuffled index of {fragment, path} entries\n",
      "#   locations.json         name / lat / lon records taken from locations\n",
      "\n",
      "import json\n",
      "import math\n",
      "import os\n",
      "from random import shuffle\n",
      "\n",
      "PATH_PREFIX = '/post-files/20140202-snapchat'\n",
      "BLOB_DIR = 'blobs'\n",
      "\n",
      "# Some canadian things sneak in...\n",
      "BLACKLIST = {'Manitoba'}\n",
      "\n",
      "# open() does not create missing parent directories, so make sure the\n",
      "# output directory exists before any file is written.\n",
      "if not os.path.isdir(BLOB_DIR):\n",
      "    os.makedirs(BLOB_DIR)\n",
      "\n",
      "output_blobs = []\n",
      "for (sub, r) in large_variance.iterrows():\n",
      "    # Keep only locations with a real (non-NaN) frequency that are not\n",
      "    # blacklisted; NaN is also not valid JSON.\n",
      "    location_data = [{'location': location,\n",
      "                      'frequency': frequency,\n",
      "                      'example_users': example_users[sub][location].samples()}\n",
      "                     for (location, frequency) in r.iteritems()\n",
      "                     if (not math.isnan(frequency) and location not in BLACKLIST)]\n",
      "    blob = {'substring': sub, 'location_data': location_data}\n",
      "\n",
      "    # Built with '/' on purpose: the same string is embedded in a URL path below.\n",
      "    path = '{}/blob-{}.json'.format(BLOB_DIR, sub)\n",
      "    with open(path, 'wt') as f:\n",
      "        json.dump(blob, f)\n",
      "    # NOTE(review): hash() of a string is not guaranteed to be stable across\n",
      "    # interpreter versions or builds, so fragments may change on regeneration.\n",
      "    output_blobs.append({'fragment': str(abs(hash(sub))),\n",
      "                         'path': '{}/{}'.format(PATH_PREFIX, path)})\n",
      "\n",
      "# NOTE(review): the shuffle is unseeded, so the index order differs on every run.\n",
      "shuffle(output_blobs)\n",
      "with open('{}/all_blobs.json'.format(BLOB_DIR), 'wt') as f:\n",
      "    json.dump(output_blobs, f)\n",
      "\n",
      "# Build the payload before opening the file so that a failure while\n",
      "# iterating cannot leave a truncated locations.json behind.\n",
      "# NOTE(review): the last row of locations is dropped and BLACKLIST is not\n",
      "# applied here; the reason is not visible in this cell -- confirm it is intended.\n",
      "all_locations = []\n",
      "for (_, r) in locations[:-1].iterrows():\n",
      "    single_location = {'location': r['name'],\n",
      "                       'lat': r['latitude'],\n",
      "                       'lon': r['longitude']}\n",
      "    all_locations.append(single_location)\n",
      "\n",
      "with open('{}/locations.json'.format(BLOB_DIR), 'wt') as f:\n",
      "    json.dump(all_locations, f)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    }
   ],
   "metadata": {}
  }
 ]
}