{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "from numpy.random import randn\n",
    "import numpy as np\n",
    "np.random.seed(123)\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4)\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_colwidth = 80"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"datasets/bitly_usagov/example.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "with open(path) as f:\n",
    "    records = [json.loads(line) for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "time_zones = [rec[\"tz\"] for rec in records]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "time_zones = [rec[\"tz\"] for rec in records if \"tz\" in rec]\n",
    "time_zones[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_counts(sequence):\n",
    "    counts = {}\n",
    "    for x in sequence:\n",
    "        if x in counts:\n",
    "            counts[x] += 1\n",
    "        else:\n",
    "            counts[x] = 1\n",
    "    return counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "def get_counts2(sequence):\n",
    "    counts = defaultdict(int) # values will initialize to 0\n",
    "    for x in sequence:\n",
    "        counts[x] += 1\n",
    "    return counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "counts = get_counts(time_zones)\n",
    "counts[\"America/New_York\"]\n",
    "len(time_zones)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def top_counts(count_dict, n=10):\n",
    "    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]\n",
    "    value_key_pairs.sort()\n",
    "    return value_key_pairs[-n:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_counts(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "counts = Counter(time_zones)\n",
    "counts.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame(records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.info()\n",
    "frame[\"tz\"].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tz_counts = frame[\"tz\"].value_counts()\n",
    "tz_counts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_tz = frame[\"tz\"].fillna(\"Missing\")\n",
    "clean_tz[clean_tz == \"\"] = \"Unknown\"\n",
    "tz_counts = clean_tz.value_counts()\n",
    "tz_counts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "subset = tz_counts.head()\n",
    "sns.barplot(y=subset.index, x=subset.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame[\"a\"][1]\n",
    "frame[\"a\"][50]\n",
    "frame[\"a\"][51][:50]  # long line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = pd.Series([x.split()[0] for x in frame[\"a\"].dropna()])\n",
    "results.head(5)\n",
    "results.value_counts().head(8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "cframe = frame[frame[\"a\"].notna()].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "cframe[\"os\"] = np.where(cframe[\"a\"].str.contains(\"Windows\"),\n",
    "                        \"Windows\", \"Not Windows\")\n",
    "cframe[\"os\"].head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "by_tz_os = cframe.groupby([\"tz\", \"os\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "agg_counts = by_tz_os.size().unstack().fillna(0)\n",
    "agg_counts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "indexer = agg_counts.sum(\"columns\").argsort()\n",
    "indexer.values[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "count_subset = agg_counts.take(indexer[-10:])\n",
    "count_subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "agg_counts.sum(axis=\"columns\").nlargest(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "count_subset = count_subset.stack()\n",
    "count_subset.name = \"total\"\n",
    "count_subset = count_subset.reset_index()\n",
    "count_subset.head(10)\n",
    "sns.barplot(x=\"total\", y=\"tz\", hue=\"os\",  data=count_subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def norm_total(group):\n",
    "    group[\"normed_total\"] = group[\"total\"] / group[\"total\"].sum()\n",
    "    return group\n",
    "\n",
    "results = count_subset.groupby(\"tz\").apply(norm_total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.barplot(x=\"normed_total\", y=\"tz\", hue=\"os\",  data=results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = count_subset.groupby(\"tz\")\n",
    "results2 = count_subset[\"total\"] / g[\"total\"].transform(\"sum\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "unames = [\"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]\n",
    "users = pd.read_table(\"datasets/movielens/users.dat\", sep=\"::\",\n",
    "                      header=None, names=unames, engine=\"python\")\n",
    "\n",
    "rnames = [\"user_id\", \"movie_id\", \"rating\", \"timestamp\"]\n",
    "ratings = pd.read_table(\"datasets/movielens/ratings.dat\", sep=\"::\",\n",
    "                        header=None, names=rnames, engine=\"python\")\n",
    "\n",
    "mnames = [\"movie_id\", \"title\", \"genres\"]\n",
    "movies = pd.read_table(\"datasets/movielens/movies.dat\", sep=\"::\",\n",
    "                       header=None, names=mnames, engine=\"python\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "users.head(5)\n",
    "ratings.head(5)\n",
    "movies.head(5)\n",
    "ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.merge(pd.merge(ratings, users), movies)\n",
    "data\n",
    "data.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_ratings = data.pivot_table(\"rating\", index=\"title\",\n",
    "                                columns=\"gender\", aggfunc=\"mean\")\n",
    "mean_ratings.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings_by_title = data.groupby(\"title\").size()\n",
    "ratings_by_title.head()\n",
    "active_titles = ratings_by_title.index[ratings_by_title >= 250]\n",
    "active_titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_ratings = mean_ratings.loc[active_titles]\n",
    "mean_ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_ratings = mean_ratings.rename(index={\"Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)\":\n",
    "                           \"Seven Samurai (Shichinin no samurai) (1954)\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_female_ratings = mean_ratings.sort_values(\"F\", ascending=False)\n",
    "top_female_ratings.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_ratings[\"diff\"] = mean_ratings[\"M\"] - mean_ratings[\"F\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_by_diff = mean_ratings.sort_values(\"diff\")\n",
    "sorted_by_diff.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_by_diff[::-1].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_std_by_title = data.groupby(\"title\")[\"rating\"].std()\n",
    "rating_std_by_title = rating_std_by_title.loc[active_titles]\n",
    "rating_std_by_title.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_std_by_title.sort_values(ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies[\"genres\"].head()\n",
    "movies[\"genres\"].head().str.split(\"|\")\n",
    "movies[\"genre\"] = movies.pop(\"genres\").str.split(\"|\")\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_exploded = movies.explode(\"genre\")\n",
    "movies_exploded[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings_with_genre = pd.merge(pd.merge(movies_exploded, ratings), users)\n",
    "ratings_with_genre.iloc[0]\n",
    "genre_ratings = (ratings_with_genre.groupby([\"genre\", \"age\"])\n",
    "                 [\"rating\"].mean()\n",
    "                 .unstack(\"age\"))\n",
    "genre_ratings[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -n 10 datasets/babynames/yob1880.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "names1880 = pd.read_csv(\"datasets/babynames/yob1880.txt\",\n",
    "                        names=[\"name\", \"sex\", \"births\"])\n",
    "names1880"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "names1880.groupby(\"sex\")[\"births\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "pieces = []\n",
    "for year in range(1880, 2011):\n",
    "    path = f\"datasets/babynames/yob{year}.txt\"\n",
    "    frame = pd.read_csv(path, names=[\"name\", \"sex\", \"births\"])\n",
    "\n",
    "    # Add a column for the year\n",
    "    frame[\"year\"] = year\n",
    "    pieces.append(frame)\n",
    "\n",
    "# Concatenate everything into a single DataFrame\n",
    "names = pd.concat(pieces, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_births = names.pivot_table(\"births\", index=\"year\",\n",
    "                                 columns=\"sex\", aggfunc=sum)\n",
    "total_births.tail()\n",
    "total_births.plot(title=\"Total births by sex and year\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_prop(group):\n",
    "    group[\"prop\"] = group[\"births\"] / group[\"births\"].sum()\n",
    "    return group\n",
    "names = names.groupby([\"year\", \"sex\"], group_keys=False).apply(add_prop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "names.groupby([\"year\", \"sex\"])[\"prop\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top1000(group):\n",
    "    return group.sort_values(\"births\", ascending=False)[:1000]\n",
    "grouped = names.groupby([\"year\", \"sex\"])\n",
    "top1000 = grouped.apply(get_top1000)\n",
    "top1000.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "top1000 = top1000.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "top1000.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "boys = top1000[top1000[\"sex\"] == \"M\"]\n",
    "girls = top1000[top1000[\"sex\"] == \"F\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_births = top1000.pivot_table(\"births\", index=\"year\",\n",
    "                                   columns=\"name\",\n",
    "                                   aggfunc=sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_births.info()\n",
    "subset = total_births[[\"John\", \"Harry\", \"Mary\", \"Marilyn\"]]\n",
    "subset.plot(subplots=True, figsize=(12, 10),\n",
    "            title=\"Number of births per year\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "table = top1000.pivot_table(\"prop\", index=\"year\",\n",
    "                            columns=\"sex\", aggfunc=sum)\n",
    "table.plot(title=\"Sum of table1000.prop by year and sex\",\n",
    "           yticks=np.linspace(0, 1.2, 13))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = boys[boys[\"year\"] == 2010]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "prop_cumsum = df[\"prop\"].sort_values(ascending=False).cumsum()\n",
    "prop_cumsum[:10]\n",
    "prop_cumsum.searchsorted(0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = boys[boys.year == 1900]\n",
    "in1900 = df.sort_values(\"prop\", ascending=False).prop.cumsum()\n",
    "in1900.searchsorted(0.5) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_quantile_count(group, q=0.5):\n",
    "    group = group.sort_values(\"prop\", ascending=False)\n",
    "    return group.prop.cumsum().searchsorted(q) + 1\n",
    "\n",
    "diversity = top1000.groupby([\"year\", \"sex\"]).apply(get_quantile_count)\n",
    "diversity = diversity.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "diversity.head()\n",
    "diversity.plot(title=\"Number of popular names in top 50%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_last_letter(x):\n",
    "    return x[-1]\n",
    "\n",
    "last_letters = names[\"name\"].map(get_last_letter)\n",
    "last_letters.name = \"last_letter\"\n",
    "\n",
    "table = names.pivot_table(\"births\", index=last_letters,\n",
    "                          columns=[\"sex\", \"year\"], aggfunc=sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "subtable = table.reindex(columns=[1910, 1960, 2010], level=\"year\")\n",
    "subtable.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "subtable.sum()\n",
    "letter_prop = subtable / subtable.sum()\n",
    "letter_prop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n",
    "letter_prop[\"M\"].plot(kind=\"bar\", rot=0, ax=axes[0], title=\"Male\")\n",
    "letter_prop[\"F\"].plot(kind=\"bar\", rot=0, ax=axes[1], title=\"Female\",\n",
    "                      legend=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplots_adjust(hspace=0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "letter_prop = table / table.sum()\n",
    "\n",
    "dny_ts = letter_prop.loc[[\"d\", \"n\", \"y\"], \"M\"].T\n",
    "dny_ts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "dny_ts.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_names = pd.Series(top1000[\"name\"].unique())\n",
    "lesley_like = all_names[all_names.str.contains(\"Lesl\")]\n",
    "lesley_like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered = top1000[top1000[\"name\"].isin(lesley_like)]\n",
    "filtered.groupby(\"name\")[\"births\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "table = filtered.pivot_table(\"births\", index=\"year\",\n",
    "                             columns=\"sex\", aggfunc=\"sum\")\n",
    "table = table.div(table.sum(axis=\"columns\"), axis=\"index\")\n",
    "table.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "table.plot(style={\"M\": \"k-\", \"F\": \"k--\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "db = json.load(open(\"datasets/usda_food/database.json\"))\n",
    "len(db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "db[0].keys()\n",
    "db[0][\"nutrients\"][0]\n",
    "nutrients = pd.DataFrame(db[0][\"nutrients\"])\n",
    "nutrients.head(7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "info_keys = [\"description\", \"group\", \"id\", \"manufacturer\"]\n",
    "info = pd.DataFrame(db, columns=info_keys)\n",
    "info.head()\n",
    "info.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.value_counts(info[\"group\"])[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "nutrients = []\n",
    "\n",
    "for rec in db:\n",
    "    fnuts = pd.DataFrame(rec[\"nutrients\"])\n",
    "    fnuts[\"id\"] = rec[\"id\"]\n",
    "    nutrients.append(fnuts)\n",
    "\n",
    "nutrients = pd.concat(nutrients, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "nutrients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "nutrients.duplicated().sum()  # number of duplicates\n",
    "nutrients = nutrients.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "col_mapping = {\"description\" : \"food\",\n",
    "               \"group\"       : \"fgroup\"}\n",
    "info = info.rename(columns=col_mapping, copy=False)\n",
    "info.info()\n",
    "col_mapping = {\"description\" : \"nutrient\",\n",
    "               \"group\" : \"nutgroup\"}\n",
    "nutrients = nutrients.rename(columns=col_mapping, copy=False)\n",
    "nutrients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "ndata = pd.merge(nutrients, info, on=\"id\")\n",
    "ndata.info()\n",
    "ndata.iloc[30000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = ndata.groupby([\"nutrient\", \"fgroup\"])[\"value\"].quantile(0.5)\n",
    "result[\"Zinc, Zn\"].sort_values().plot(kind=\"barh\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "by_nutrient = ndata.groupby([\"nutgroup\", \"nutrient\"])\n",
    "\n",
    "def get_maximum(x):\n",
    "    return x.loc[x.value.idxmax()]\n",
    "\n",
    "max_foods = by_nutrient.apply(get_maximum)[[\"value\", \"food\"]]\n",
    "\n",
    "# make the food a little smaller\n",
    "max_foods[\"food\"] = max_foods[\"food\"].str[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_foods.loc[\"Amino Acids\"][\"food\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec = pd.read_csv(\"datasets/fec/P00000001-ALL.csv\", low_memory=False)\n",
    "fec.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec.iloc[123456]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_cands = fec[\"cand_nm\"].unique()\n",
    "unique_cands\n",
    "unique_cands[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "parties = {\"Bachmann, Michelle\": \"Republican\",\n",
    "           \"Cain, Herman\": \"Republican\",\n",
    "           \"Gingrich, Newt\": \"Republican\",\n",
    "           \"Huntsman, Jon\": \"Republican\",\n",
    "           \"Johnson, Gary Earl\": \"Republican\",\n",
    "           \"McCotter, Thaddeus G\": \"Republican\",\n",
    "           \"Obama, Barack\": \"Democrat\",\n",
    "           \"Paul, Ron\": \"Republican\",\n",
    "           \"Pawlenty, Timothy\": \"Republican\",\n",
    "           \"Perry, Rick\": \"Republican\",\n",
    "           \"Roemer, Charles E. 'Buddy' III\": \"Republican\",\n",
    "           \"Romney, Mitt\": \"Republican\",\n",
    "           \"Santorum, Rick\": \"Republican\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec[\"cand_nm\"][123456:123461]\n",
    "fec[\"cand_nm\"][123456:123461].map(parties)\n",
    "# Add it as a column\n",
    "fec[\"party\"] = fec[\"cand_nm\"].map(parties)\n",
    "fec[\"party\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "(fec[\"contb_receipt_amt\"] > 0).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec = fec[fec[\"contb_receipt_amt\"] > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec_mrbo = fec[fec[\"cand_nm\"].isin([\"Obama, Barack\", \"Romney, Mitt\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec[\"contbr_occupation\"].value_counts()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "occ_mapping = {\n",
    "   \"INFORMATION REQUESTED PER BEST EFFORTS\" : \"NOT PROVIDED\",\n",
    "   \"INFORMATION REQUESTED\" : \"NOT PROVIDED\",\n",
    "   \"INFORMATION REQUESTED (BEST EFFORTS)\" : \"NOT PROVIDED\",\n",
    "   \"C.E.O.\": \"CEO\"\n",
    "}\n",
    "\n",
    "def get_occ(x):\n",
    "    # If no mapping provided, return x\n",
    "    return occ_mapping.get(x, x)\n",
    "\n",
    "fec[\"contbr_occupation\"] = fec[\"contbr_occupation\"].map(get_occ)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "emp_mapping = {\n",
    "   \"INFORMATION REQUESTED PER BEST EFFORTS\" : \"NOT PROVIDED\",\n",
    "   \"INFORMATION REQUESTED\" : \"NOT PROVIDED\",\n",
    "   \"SELF\" : \"SELF-EMPLOYED\",\n",
    "   \"SELF EMPLOYED\" : \"SELF-EMPLOYED\",\n",
    "}\n",
    "\n",
    "def get_emp(x):\n",
    "    # If no mapping provided, return x\n",
    "    return emp_mapping.get(x, x)\n",
    "\n",
    "fec[\"contbr_employer\"] = fec[\"contbr_employer\"].map(get_emp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "by_occupation = fec.pivot_table(\"contb_receipt_amt\",\n",
    "                                index=\"contbr_occupation\",\n",
    "                                columns=\"party\", aggfunc=\"sum\")\n",
    "over_2mm = by_occupation[by_occupation.sum(axis=\"columns\") > 2000000]\n",
    "over_2mm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "over_2mm.plot(kind=\"barh\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_amounts(group, key, n=5):\n",
    "    totals = group.groupby(key)[\"contb_receipt_amt\"].sum()\n",
    "    return totals.nlargest(n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = fec_mrbo.groupby(\"cand_nm\")\n",
    "grouped.apply(get_top_amounts, \"contbr_occupation\", n=7)\n",
    "grouped.apply(get_top_amounts, \"contbr_employer\", n=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = np.array([0, 1, 10, 100, 1000, 10000,\n",
    "                 100_000, 1_000_000, 10_000_000])\n",
    "labels = pd.cut(fec_mrbo[\"contb_receipt_amt\"], bins)\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = fec_mrbo.groupby([\"cand_nm\", labels])\n",
    "grouped.size().unstack(level=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "bucket_sums = grouped[\"contb_receipt_amt\"].sum().unstack(level=0)\n",
    "normed_sums = bucket_sums.div(bucket_sums.sum(axis=\"columns\"),\n",
    "                              axis=\"index\")\n",
    "normed_sums\n",
    "normed_sums[:-2].plot(kind=\"barh\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = fec_mrbo.groupby([\"cand_nm\", \"contbr_st\"])\n",
    "totals = grouped[\"contb_receipt_amt\"].sum().unstack(level=0).fillna(0)\n",
    "totals = totals[totals.sum(axis=\"columns\") > 100000]\n",
    "totals.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "percent = totals.div(totals.sum(axis=\"columns\"), axis=\"index\")\n",
    "percent.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}