{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "\n", "from numpy.random import randn\n", "import numpy as np\n", "np.random.seed(123)\n", "import os\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "plt.rc(\"figure\", figsize=(10, 6))\n", "np.set_printoptions(precision=4)\n", "pd.options.display.max_columns = 20\n", "pd.options.display.max_rows = 20\n", "pd.options.display.max_colwidth = 80" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "path = \"datasets/bitly_usagov/example.txt\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import json\n", "with open(path) as f:\n", " records = [json.loads(line) for line in f]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "time_zones = [rec[\"tz\"] for rec in records]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "time_zones = [rec[\"tz\"] for rec in records if \"tz\" in rec]\n", "time_zones[:10]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_counts(sequence):\n", " counts = {}\n", " for x in sequence:\n", " if x in counts:\n", " counts[x] += 1\n", " else:\n", " counts[x] = 1\n", " return counts" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "def get_counts2(sequence):\n", " counts = defaultdict(int) # values will initialize to 0\n", " for x in sequence:\n", " counts[x] += 1\n", " return counts" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "counts = get_counts(time_zones)\n", "counts[\"America/New_York\"]\n", "len(time_zones)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def top_counts(count_dict, n=10):\n", " value_key_pairs = [(count, tz) for tz, count in count_dict.items()]\n", " value_key_pairs.sort()\n", " return value_key_pairs[-n:]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "top_counts(counts)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "from collections import Counter\n", "counts = Counter(time_zones)\n", "counts.most_common(10)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "frame = pd.DataFrame(records)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "frame.info()\n", "frame[\"tz\"].head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "tz_counts = frame[\"tz\"].value_counts()\n", "tz_counts.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "clean_tz = frame[\"tz\"].fillna(\"Missing\")\n", "clean_tz[clean_tz == \"\"] = \"Unknown\"\n", "tz_counts = clean_tz.value_counts()\n", "tz_counts.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(10, 4))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "subset = tz_counts.head()\n", "sns.barplot(y=subset.index, x=subset.to_numpy())" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "frame[\"a\"][1]\n", "frame[\"a\"][50]\n", "frame[\"a\"][51][:50] # long line" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "results = pd.Series([x.split()[0] for x in frame[\"a\"].dropna()])\n", "results.head(5)\n", "results.value_counts().head(8)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "cframe = frame[frame[\"a\"].notna()].copy()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "cframe[\"os\"] = np.where(cframe[\"a\"].str.contains(\"Windows\"),\n", " \"Windows\", \"Not Windows\")\n", "cframe[\"os\"].head(5)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "by_tz_os = cframe.groupby([\"tz\", \"os\"])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "agg_counts = by_tz_os.size().unstack().fillna(0)\n", "agg_counts.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "indexer = agg_counts.sum(\"columns\").argsort()\n", "indexer.values[:10]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "count_subset = agg_counts.take(indexer[-10:])\n", "count_subset" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "agg_counts.sum(axis=\"columns\").nlargest(10)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "count_subset = count_subset.stack()\n", "count_subset.name = \"total\"\n", "count_subset = count_subset.reset_index()\n", "count_subset.head(10)\n", "sns.barplot(x=\"total\", y=\"tz\", hue=\"os\", data=count_subset)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def norm_total(group):\n", " group[\"normed_total\"] = group[\"total\"] / group[\"total\"].sum()\n", " return group\n", "\n", "results = count_subset.groupby(\"tz\").apply(norm_total)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "sns.barplot(x=\"normed_total\", y=\"tz\", hue=\"os\", data=results)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "g = count_subset.groupby(\"tz\")\n", "results2 = count_subset[\"total\"] / g[\"total\"].transform(\"sum\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "unames = [\"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]\n", "users = pd.read_table(\"datasets/movielens/users.dat\", sep=\"::\",\n", " header=None, names=unames, engine=\"python\")\n", "\n", "rnames = [\"user_id\", \"movie_id\", \"rating\", \"timestamp\"]\n", "ratings = pd.read_table(\"datasets/movielens/ratings.dat\", sep=\"::\",\n", " header=None, names=rnames, engine=\"python\")\n", "\n", "mnames = [\"movie_id\", \"title\", \"genres\"]\n", "movies = pd.read_table(\"datasets/movielens/movies.dat\", sep=\"::\",\n", " header=None, names=mnames, engine=\"python\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "users.head(5)\n", "ratings.head(5)\n", "movies.head(5)\n", "ratings" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "data = pd.merge(pd.merge(ratings, users), movies)\n", "data\n", "data.iloc[0]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "mean_ratings = data.pivot_table(\"rating\", index=\"title\",\n", " columns=\"gender\", aggfunc=\"mean\")\n", "mean_ratings.head(5)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "ratings_by_title = data.groupby(\"title\").size()\n", "ratings_by_title.head()\n", "active_titles = ratings_by_title.index[ratings_by_title >= 250]\n", "active_titles" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "mean_ratings = mean_ratings.loc[active_titles]\n", "mean_ratings" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "mean_ratings = mean_ratings.rename(index={\"Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)\":\n", " \"Seven Samurai (Shichinin no samurai) (1954)\"})" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "top_female_ratings = mean_ratings.sort_values(\"F\", ascending=False)\n", "top_female_ratings.head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "mean_ratings[\"diff\"] = mean_ratings[\"M\"] - mean_ratings[\"F\"]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "sorted_by_diff = mean_ratings.sort_values(\"diff\")\n", "sorted_by_diff.head()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "sorted_by_diff[::-1].head()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "rating_std_by_title = data.groupby(\"title\")[\"rating\"].std()\n", "rating_std_by_title = rating_std_by_title.loc[active_titles]\n", "rating_std_by_title.head()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "rating_std_by_title.sort_values(ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "movies[\"genres\"].head()\n", "movies[\"genres\"].head().str.split(\"|\")\n", "movies[\"genre\"] = movies.pop(\"genres\").str.split(\"|\")\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "movies_exploded = movies.explode(\"genre\")\n", "movies_exploded[:10]" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "ratings_with_genre = pd.merge(pd.merge(movies_exploded, ratings), users)\n", "ratings_with_genre.iloc[0]\n", "genre_ratings = (ratings_with_genre.groupby([\"genre\", \"age\"])\n", " [\"rating\"].mean()\n", " .unstack(\"age\"))\n", "genre_ratings[:10]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "!head -n 10 datasets/babynames/yob1880.txt" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "names1880 = pd.read_csv(\"datasets/babynames/yob1880.txt\",\n", " names=[\"name\", \"sex\", \"births\"])\n", "names1880" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "names1880.groupby(\"sex\")[\"births\"].sum()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "pieces = []\n", "for year in range(1880, 2011):\n", " path = f\"datasets/babynames/yob{year}.txt\"\n", " frame = pd.read_csv(path, names=[\"name\", \"sex\", \"births\"])\n", "\n", " # Add a column for the year\n", " frame[\"year\"] = year\n", " pieces.append(frame)\n", "\n", "# Concatenate everything into a single DataFrame\n", "names = pd.concat(pieces, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "names" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "total_births = names.pivot_table(\"births\", index=\"year\",\n", " columns=\"sex\", aggfunc=sum)\n", "total_births.tail()\n", "total_births.plot(title=\"Total births by sex and year\")" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "def add_prop(group):\n", " group[\"prop\"] = group[\"births\"] / group[\"births\"].sum()\n", " return group\n", "names = names.groupby([\"year\", \"sex\"], group_keys=False).apply(add_prop)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "names" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "names.groupby([\"year\", \"sex\"])[\"prop\"].sum()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "def get_top1000(group):\n", " return group.sort_values(\"births\", ascending=False)[:1000]\n", "grouped = names.groupby([\"year\", \"sex\"])\n", "top1000 = grouped.apply(get_top1000)\n", "top1000.head()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "top1000 = top1000.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "top1000.head()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "boys = top1000[top1000[\"sex\"] == \"M\"]\n", "girls = top1000[top1000[\"sex\"] == \"F\"]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "total_births = top1000.pivot_table(\"births\", index=\"year\",\n", " columns=\"name\",\n", " aggfunc=sum)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "total_births.info()\n", "subset = total_births[[\"John\", \"Harry\", \"Mary\", \"Marilyn\"]]\n", "subset.plot(subplots=True, figsize=(12, 10),\n", " title=\"Number of births per year\")" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "table = top1000.pivot_table(\"prop\", index=\"year\",\n", " columns=\"sex\", aggfunc=sum)\n", "table.plot(title=\"Sum of table1000.prop by year and sex\",\n", " yticks=np.linspace(0, 1.2, 13))" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "df = boys[boys[\"year\"] == 2010]\n", "df" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "prop_cumsum = df[\"prop\"].sort_values(ascending=False).cumsum()\n", "prop_cumsum[:10]\n", "prop_cumsum.searchsorted(0.5)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "df = boys[boys.year == 1900]\n", "in1900 = df.sort_values(\"prop\", ascending=False).prop.cumsum()\n", "in1900.searchsorted(0.5) + 1" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "def get_quantile_count(group, q=0.5):\n", " group = group.sort_values(\"prop\", ascending=False)\n", " return group.prop.cumsum().searchsorted(q) + 1\n", "\n", "diversity = top1000.groupby([\"year\", \"sex\"]).apply(get_quantile_count)\n", "diversity = diversity.unstack()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "diversity.head()\n", "diversity.plot(title=\"Number of popular names in top 50%\")" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "def get_last_letter(x):\n", " return x[-1]\n", "\n", "last_letters = names[\"name\"].map(get_last_letter)\n", "last_letters.name = \"last_letter\"\n", "\n", "table = names.pivot_table(\"births\", index=last_letters,\n", " columns=[\"sex\", \"year\"], aggfunc=sum)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "subtable = table.reindex(columns=[1910, 1960, 2010], level=\"year\")\n", "subtable.head()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "subtable.sum()\n", "letter_prop = subtable / subtable.sum()\n", "letter_prop" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n", "letter_prop[\"M\"].plot(kind=\"bar\", rot=0, ax=axes[0], title=\"Male\")\n", "letter_prop[\"F\"].plot(kind=\"bar\", rot=0, ax=axes[1], title=\"Female\",\n", " legend=False)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "plt.subplots_adjust(hspace=0.25)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "letter_prop = table / table.sum()\n", "\n", "dny_ts = letter_prop.loc[[\"d\", \"n\", \"y\"], \"M\"].T\n", "dny_ts.head()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "plt.close(\"all\")" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "dny_ts.plot()" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "all_names = pd.Series(top1000[\"name\"].unique())\n", "lesley_like = all_names[all_names.str.contains(\"Lesl\")]\n", "lesley_like" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "filtered = top1000[top1000[\"name\"].isin(lesley_like)]\n", "filtered.groupby(\"name\")[\"births\"].sum()" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "table = filtered.pivot_table(\"births\", index=\"year\",\n", " columns=\"sex\", aggfunc=\"sum\")\n", "table = table.div(table.sum(axis=\"columns\"), axis=\"index\")\n", "table.tail()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "table.plot(style={\"M\": \"k-\", \"F\": \"k--\"})" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "import json\n", "db = json.load(open(\"datasets/usda_food/database.json\"))\n", "len(db)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "db[0].keys()\n", "db[0][\"nutrients\"][0]\n", "nutrients = pd.DataFrame(db[0][\"nutrients\"])\n", "nutrients.head(7)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "info_keys = [\"description\", \"group\", \"id\", \"manufacturer\"]\n", "info = pd.DataFrame(db, columns=info_keys)\n", "info.head()\n", "info.info()" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "pd.value_counts(info[\"group\"])[:10]" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "nutrients = []\n", "\n", "for rec in db:\n", " fnuts = pd.DataFrame(rec[\"nutrients\"])\n", " fnuts[\"id\"] = rec[\"id\"]\n", " nutrients.append(fnuts)\n", "\n", "nutrients = pd.concat(nutrients, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "nutrients" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "nutrients.duplicated().sum() # number of duplicates\n", "nutrients = nutrients.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "col_mapping = {\"description\" : \"food\",\n", " \"group\" : \"fgroup\"}\n", "info = info.rename(columns=col_mapping, copy=False)\n", "info.info()\n", "col_mapping = {\"description\" : \"nutrient\",\n", " \"group\" : \"nutgroup\"}\n", "nutrients = nutrients.rename(columns=col_mapping, copy=False)\n", "nutrients" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "ndata = pd.merge(nutrients, info, on=\"id\")\n", "ndata.info()\n", "ndata.iloc[30000]" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "result = ndata.groupby([\"nutrient\", \"fgroup\"])[\"value\"].quantile(0.5)\n", "result[\"Zinc, Zn\"].sort_values().plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "by_nutrient = ndata.groupby([\"nutgroup\", \"nutrient\"])\n", "\n", "def get_maximum(x):\n", " return x.loc[x.value.idxmax()]\n", "\n", "max_foods = by_nutrient.apply(get_maximum)[[\"value\", \"food\"]]\n", "\n", "# make the food a little smaller\n", "max_foods[\"food\"] = max_foods[\"food\"].str[:50]" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "max_foods.loc[\"Amino Acids\"][\"food\"]" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "fec = pd.read_csv(\"datasets/fec/P00000001-ALL.csv\", low_memory=False)\n", "fec.info()" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "fec.iloc[123456]" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "unique_cands = fec[\"cand_nm\"].unique()\n", "unique_cands\n", "unique_cands[2]" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "parties = {\"Bachmann, Michelle\": \"Republican\",\n", " \"Cain, Herman\": \"Republican\",\n", " \"Gingrich, Newt\": \"Republican\",\n", " \"Huntsman, Jon\": \"Republican\",\n", " \"Johnson, Gary Earl\": \"Republican\",\n", " \"McCotter, Thaddeus G\": \"Republican\",\n", " \"Obama, Barack\": \"Democrat\",\n", " \"Paul, Ron\": \"Republican\",\n", " \"Pawlenty, Timothy\": \"Republican\",\n", " \"Perry, Rick\": \"Republican\",\n", " \"Roemer, Charles E. 'Buddy' III\": \"Republican\",\n", " \"Romney, Mitt\": \"Republican\",\n", " \"Santorum, Rick\": \"Republican\"}" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "fec[\"cand_nm\"][123456:123461]\n", "fec[\"cand_nm\"][123456:123461].map(parties)\n", "# Add it as a column\n", "fec[\"party\"] = fec[\"cand_nm\"].map(parties)\n", "fec[\"party\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "(fec[\"contb_receipt_amt\"] > 0).value_counts()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "fec = fec[fec[\"contb_receipt_amt\"] > 0]" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "fec_mrbo = fec[fec[\"cand_nm\"].isin([\"Obama, Barack\", \"Romney, Mitt\"])]" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "fec[\"contbr_occupation\"].value_counts()[:10]" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "occ_mapping = {\n", " \"INFORMATION REQUESTED PER BEST EFFORTS\" : \"NOT PROVIDED\",\n", " \"INFORMATION REQUESTED\" : \"NOT PROVIDED\",\n", " \"INFORMATION REQUESTED (BEST EFFORTS)\" : \"NOT PROVIDED\",\n", " \"C.E.O.\": \"CEO\"\n", "}\n", "\n", "def get_occ(x):\n", " # If no mapping provided, return x\n", " return occ_mapping.get(x, x)\n", "\n", "fec[\"contbr_occupation\"] = fec[\"contbr_occupation\"].map(get_occ)" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "emp_mapping = {\n", " \"INFORMATION REQUESTED PER BEST EFFORTS\" : \"NOT PROVIDED\",\n", " \"INFORMATION REQUESTED\" : \"NOT PROVIDED\",\n", " \"SELF\" : \"SELF-EMPLOYED\",\n", " \"SELF EMPLOYED\" : \"SELF-EMPLOYED\",\n", "}\n", "\n", "def get_emp(x):\n", " # If no mapping provided, return x\n", " return emp_mapping.get(x, x)\n", "\n", "fec[\"contbr_employer\"] = fec[\"contbr_employer\"].map(get_emp)" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [], "source": [ "by_occupation = fec.pivot_table(\"contb_receipt_amt\",\n", " index=\"contbr_occupation\",\n", " columns=\"party\", aggfunc=\"sum\")\n", "over_2mm = by_occupation[by_occupation.sum(axis=\"columns\") > 2000000]\n", "over_2mm" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [], "source": [ "over_2mm.plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "def get_top_amounts(group, key, n=5):\n", " totals = group.groupby(key)[\"contb_receipt_amt\"].sum()\n", " return totals.nlargest(n)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [], "source": [ "grouped = fec_mrbo.groupby(\"cand_nm\")\n", "grouped.apply(get_top_amounts, \"contbr_occupation\", n=7)\n", "grouped.apply(get_top_amounts, \"contbr_employer\", n=10)" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "bins = np.array([0, 1, 10, 100, 1000, 10000,\n", " 100_000, 1_000_000, 10_000_000])\n", "labels = pd.cut(fec_mrbo[\"contb_receipt_amt\"], bins)\n", "labels" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "grouped = fec_mrbo.groupby([\"cand_nm\", labels])\n", "grouped.size().unstack(level=0)" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "bucket_sums = grouped[\"contb_receipt_amt\"].sum().unstack(level=0)\n", "normed_sums = bucket_sums.div(bucket_sums.sum(axis=\"columns\"),\n", " axis=\"index\")\n", "normed_sums\n", "normed_sums[:-2].plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "grouped = fec_mrbo.groupby([\"cand_nm\", \"contbr_st\"])\n", "totals = grouped[\"contb_receipt_amt\"].sum().unstack(level=0).fillna(0)\n", "totals = totals[totals.sum(axis=\"columns\") > 100000]\n", "totals.head(10)" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "percent = totals.div(totals.sum(axis=\"columns\"), axis=\"index\")\n", "percent.head(10)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }