{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Analysis Examples"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Plot defaults used by every figure below: 14x6 inch canvas, a four-colour\n",
"# HUSL palette and seaborn's \"whitegrid\" style.\n",
"my_palette = sns.color_palette(\"husl\", n_colors=4)\n",
"rc = {\"figure.figsize\": (14, 6)}\n",
"sns.set(rc=rc)\n",
"sns.set_palette(my_palette)\n",
"sns.set_style(style=\"whitegrid\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## USA.gov data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the dataset\n",
"The file holds one JSON record per line, so parse it line by line with the `json` module."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"path = \"datasets/bitly_usagov/example.txt\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',\n",
" 'al': 'en-US,en;q=0.8',\n",
" 'c': 'US',\n",
" 'cy': 'Danvers',\n",
" 'g': 'A6qOVH',\n",
" 'gr': 'MA',\n",
" 'h': 'wfLQtf',\n",
" 'hc': 1331822918,\n",
" 'hh': '1.usa.gov',\n",
" 'l': 'orofrog',\n",
" 'll': [42.576698, -70.954903],\n",
" 'nk': 1,\n",
" 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',\n",
" 't': 1331923247,\n",
" 'tz': 'America/New_York',\n",
" 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each line of the file is parsed as one JSON document. The context manager\n",
"# closes the file handle as soon as every line has been read.\n",
"with open(path) as usagov_file:\n",
"    records = [json.loads(line) for line in usagov_file]\n",
"records[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Count time zone occurrences with pandas"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" _heartbeat_ | \n",
" a | \n",
" al | \n",
" c | \n",
" cy | \n",
" g | \n",
" gr | \n",
" h | \n",
" hc | \n",
" hh | \n",
" kw | \n",
" l | \n",
" ll | \n",
" nk | \n",
" r | \n",
" t | \n",
" tz | \n",
" u | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" NaN | \n",
" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | \n",
" en-US,en;q=0.8 | \n",
" US | \n",
" Danvers | \n",
" A6qOVH | \n",
" MA | \n",
" wfLQtf | \n",
" 1.331823e+09 | \n",
" 1.usa.gov | \n",
" NaN | \n",
" orofrog | \n",
" [42.576698, -70.954903] | \n",
" 1.0 | \n",
" http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | \n",
" 1.331923e+09 | \n",
" America/New_York | \n",
" http://www.ncbi.nlm.nih.gov/pubmed/22415991 | \n",
"
\n",
" \n",
" 1 | \n",
" NaN | \n",
" GoogleMaps/RochesterNY | \n",
" NaN | \n",
" US | \n",
" Provo | \n",
" mwszkS | \n",
" UT | \n",
" mwszkS | \n",
" 1.308262e+09 | \n",
" j.mp | \n",
" NaN | \n",
" bitly | \n",
" [40.218102, -111.613297] | \n",
" 0.0 | \n",
" http://www.AwareMap.com/ | \n",
" 1.331923e+09 | \n",
" America/Denver | \n",
" http://www.monroecounty.gov/etc/911/rss.php | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | \n",
" en-US | \n",
" US | \n",
" Washington | \n",
" xxr3Qb | \n",
" DC | \n",
" xxr3Qb | \n",
" 1.331920e+09 | \n",
" 1.usa.gov | \n",
" NaN | \n",
" bitly | \n",
" [38.9007, -77.043098] | \n",
" 1.0 | \n",
" http://t.co/03elZC4Q | \n",
" 1.331923e+09 | \n",
" America/New_York | \n",
" http://boxer.senate.gov/en/press/releases/0316... | \n",
"
\n",
" \n",
" 3 | \n",
" NaN | \n",
" Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | \n",
" pt-br | \n",
" BR | \n",
" Braz | \n",
" zCaLwp | \n",
" 27 | \n",
" zUtuOu | \n",
" 1.331923e+09 | \n",
" 1.usa.gov | \n",
" NaN | \n",
" alelex88 | \n",
" [-23.549999, -46.616699] | \n",
" 0.0 | \n",
" direct | \n",
" 1.331923e+09 | \n",
" America/Sao_Paulo | \n",
" http://apod.nasa.gov/apod/ap120312.html | \n",
"
\n",
" \n",
" 4 | \n",
" NaN | \n",
" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | \n",
" en-US,en;q=0.8 | \n",
" US | \n",
" Shrewsbury | \n",
" 9b6kNl | \n",
" MA | \n",
" 9b6kNl | \n",
" 1.273672e+09 | \n",
" bit.ly | \n",
" NaN | \n",
" bitly | \n",
" [42.286499, -71.714699] | \n",
" 0.0 | \n",
" http://www.shrewsbury-ma.gov/selco/ | \n",
" 1.331923e+09 | \n",
" America/New_York | \n",
" http://www.shrewsbury-ma.gov/egov/gallery/1341... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" _heartbeat_ a \\\n",
"0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n",
"1 NaN GoogleMaps/RochesterNY \n",
"2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... \n",
"3 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... \n",
"4 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n",
"\n",
" al c cy g gr h hc \\\n",
"0 en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1.331823e+09 \n",
"1 NaN US Provo mwszkS UT mwszkS 1.308262e+09 \n",
"2 en-US US Washington xxr3Qb DC xxr3Qb 1.331920e+09 \n",
"3 pt-br BR Braz zCaLwp 27 zUtuOu 1.331923e+09 \n",
"4 en-US,en;q=0.8 US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 \n",
"\n",
" hh kw l ll nk \\\n",
"0 1.usa.gov NaN orofrog [42.576698, -70.954903] 1.0 \n",
"1 j.mp NaN bitly [40.218102, -111.613297] 0.0 \n",
"2 1.usa.gov NaN bitly [38.9007, -77.043098] 1.0 \n",
"3 1.usa.gov NaN alelex88 [-23.549999, -46.616699] 0.0 \n",
"4 bit.ly NaN bitly [42.286499, -71.714699] 0.0 \n",
"\n",
" r t \\\n",
"0 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1.331923e+09 \n",
"1 http://www.AwareMap.com/ 1.331923e+09 \n",
"2 http://t.co/03elZC4Q 1.331923e+09 \n",
"3 direct 1.331923e+09 \n",
"4 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 \n",
"\n",
" tz u \n",
"0 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991 \n",
"1 America/Denver http://www.monroecounty.gov/etc/911/rss.php \n",
"2 America/New_York http://boxer.senate.gov/en/press/releases/0316... \n",
"3 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html \n",
"4 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per parsed record; a key that a record lacks shows up as NaN in its row.\n",
"df = pd.DataFrame(data=records)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 3560 entries, 0 to 3559\n",
"Data columns (total 18 columns):\n",
"_heartbeat_ 120 non-null float64\n",
"a 3440 non-null object\n",
"al 3094 non-null object\n",
"c 2919 non-null object\n",
"cy 2919 non-null object\n",
"g 3440 non-null object\n",
"gr 2919 non-null object\n",
"h 3440 non-null object\n",
"hc 3440 non-null float64\n",
"hh 3440 non-null object\n",
"kw 93 non-null object\n",
"l 3440 non-null object\n",
"ll 2919 non-null object\n",
"nk 3440 non-null float64\n",
"r 3440 non-null object\n",
"t 3440 non-null float64\n",
"tz 3440 non-null object\n",
"u 3440 non-null object\n",
"dtypes: float64(4), object(14)\n",
"memory usage: 500.7+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"America/New_York 4\n",
" 3\n",
"Europe/Warsaw 1\n",
"America/Denver 1\n",
"America/Sao_Paulo 1\n",
"Name: tz, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tally of each time zone among the first ten records.\n",
"df[\"tz\"].head(10).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"clean_tz = df[\"tz\"].fillna(\"Missing\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Empty-string time zones get their own label so they stay distinct from the\n",
"# NaN entries that were filled with \"Missing\".\n",
"is_blank = clean_tz == \"\"\n",
"clean_tz.loc[is_blank] = \"Unknown\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 America/New_York\n",
"1 America/Denver\n",
"2 America/New_York\n",
"3 America/Sao_Paulo\n",
"4 America/New_York\n",
"5 America/New_York\n",
"6 Europe/Warsaw\n",
"7 Unknown\n",
"8 Unknown\n",
"9 Unknown\n",
"10 America/Los_Angeles\n",
"11 America/New_York\n",
"12 America/New_York\n",
"13 Missing\n",
"14 America/New_York\n",
"15 Asia/Hong_Kong\n",
"16 Asia/Hong_Kong\n",
"17 America/New_York\n",
"18 America/Denver\n",
"19 Europe/Rome\n",
"Name: tz, dtype: object"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_tz.head(n = 20)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"America/New_York 8\n",
"Unknown 3\n",
"America/Denver 2\n",
"Asia/Hong_Kong 2\n",
"Missing 1\n",
"America/Los_Angeles 1\n",
"Europe/Rome 1\n",
"America/Sao_Paulo 1\n",
"Europe/Warsaw 1\n",
"Name: tz, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tally of each label among the first twenty cleaned time zone values.\n",
"clean_tz.head(20).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA4EAAAFlCAYAAABV4O1xAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XlY1XXe//HXYRMMNxqX25BySXFBUdAM19sFFUXcyJrS\nLKdMU7zN7GLE3Ca08S7vUSdNp9HUcuEiJgIky6VgXEiPjIHLmGRJm8uYCEdFlvP7w1/nvr3VRG/x\ni36ej+vqujpwvt/zPud8u/TZ53u+x+Z0Op0CAAAAABjBzeoBAAAAAAB3DhEIAAAAAAYhAgEAAADA\nIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQD6sHwK2x2+1WjwAAAACgigsJCbnqZ0TgXexabyhgFbvd\nzjGJKoVjElURxyWqGo7Je9v1Fo44HRQAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG4cIwd7FT\ny96zegTAJUDSqS8OWT3GHVd3/FNWjwAAAHBTWAkEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACA\nQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQC\nAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAA\nDEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCAeN3PnLVu26MUXX9SmTZvUtGnT2z5MTk6O\nkpOTNWPGjFvaPi0tTcePH1f9+vU1ffp0ffjhhwoMDJQkDRo0SG+//bb8/f1v27wJCQnauXOn/vSn\nP0mSioqKNGTIEK1atUqNGjWq0D5efvll9e/fX3369Lltc6FiLpaWqKSszOoxcJfzKiiwegRch8Ph\nUMFd8P54eXnJx8fH6jEAAAa5qQhMTU1VSEiI0tLSFBMTc1sHKS0tVVBQkIKCgm55HxkZGRo1apSO\nHDmiBg0a6O2333YFWmWIjo5WUlKSdu7cqbCwMC1atEjDhw+vcACWlpZW2mz4dSuzd+nTvMNyWj0I\n7n4p662eAHc5Nzc3RUZG3vY/VwEAuJ4Knw7qcDhkt9sVHx+vtLQ0SVJWVpaeeuopjR8/Xr1799Yb\nb7yhjz76SCNGjFBkZKSOHz8uSTpz5owmTZqk4cOHa/jw4bLb7ZKkJUuWaNq0aXr88cf1yiuvKCsr\nS+PGjXM93u9//3tFRkYqMjJSmzdvliTNmjVLw4YN08CBA7V48WLXfE6nU4cOHVLr1q0lST179tTR\no0f19ddfX/Vc/v73v2vkyJEaOnSoYmJi5HA49OWXX2rixImSLq94tm3bVpcuXVJxcbF69+59zdfE\nZrNp9uzZmjdvnnJycrR7926NHTtWknTgwAFFR0crMjJSkyZNUmFhoSTpiSee0Lx58zRs2DC99957\nV+zvzTff1PTp01VeXl7RtwW36BMCEEAVUV5eruTkZKvHAAAYpMIRuHXrVnXr1k2NGzdWnTp1lJub\nK0k6fPiw5syZo/T0dCUnJ+ubb75RYmKiRowYobVr10qS4uPj9fTTT+uDDz7QkiVLrjjdMy8vT+++\n+64WLlx4xeMtXbpUvr6+SklJUUpKijp37ixJmjJlipKSkvTRRx9pz549Onz4sCTp4MGDCgwMlM1m\nu/zE3Nz0u9/9TsuXL79iv2fOnNGyZcu0atUq/e1vf1ObNm20atUqtWrVSocOHZIk2e12Pfzww8rJ\nydH+/fvVrl27674ugYGB6tq1q8aMGaMZM2bIy8tLkjRt2jTFxsYqJSVFjRs31tKlS13blJeXKykp\nSWPGjHH9bN68eSoqKlJ8fLzc3PioZmULbxoom2xWjwEAcnNzU1RUlNVjAAAMUuHTQdPS0jR69GhJ\nUkREhNLS0tSzZ08FBQWpXr16kqSAgAB16dJF
ktS8eXNlZWVJknbu3KmjR4+69lVUVCSHwyFJ6tWr\nl7y9va96vF27dl0RhrVq1ZIkpaenKyEhQaWlpTp16pTy8vIUGBiozMxMde/e/Yp9DBo0SMuWLVN+\nfr7rZ/v379fRo0f1xBNPSJJKSkoUHBwsDw8PBQQEKC8vT19++aWeeeYZ7d27V2VlZQoJCfnV1+bJ\nJ59URkaGHnnkEUnSzz//rOLiYtd2Q4YM0SuvvOK6/4ABA67YfvHixerQoYNmz579q4+D2+fZ9o/q\nt0GhfCYQ/2f3PxNt9Qi4jhv9T7yqgs8EAgDutApF4NmzZ7V7924dOXJENptNZWVlstls6tGjh2vl\nS7r8fzN/ue3m5qay//8X7PLyciUkJKhatWpX7ftm/uDLz8/XypUrlZiYqFq1aik2NlbFxcWSpB07\ndlxxeqgkeXh46Nlnn9Vf/vIX18+cTqe6dOly1cqjJIWGhiojI0MeHh4KCwtTbGysysrKrgi4a7HZ\nbDe1ele9evUrbrdt21Y5OTkqKChwxS4qn7eHp7w9PK0eA3c5/putuu677z7eHwAArqFC5bJ582ZF\nRUVp+/bt2rZtmz7//HP5+/tr7969FXqQrl27uk4NleQ67fLXhIWF6f3333fdLigokMPhkI+Pj2rU\nqKHTp08rIyNDklRYWKjS0lLVqVPnqv0MHTpUu3bt0pkzZyRJwcHB2rdvn7799ltJ0vnz53Xs2DFJ\nlyNw9erVCg4Olp+fn86ePatjx46pefPmFXqev6hTp468vb21b98+SVJycrI6dux43fv37NlTzz77\nrMaNG+daIQUAAACAylChCExNTb3qKwzCw8NdF4i5kbi4OOXm5ioyMlIRERFav/7GV9MbP368zp07\np0GDBmnw4MHKyspSYGCgWrVqpQEDBmjq1Knq0KGDpMurgGFhYdfcj5eXl0aNGqV//etfkiQ/Pz/N\nnz9fL730kiIjIzVy5EjXxWPatWun06dPu4KtRYsWat68uetzhjdjwYIFmj9/viIjI5WXl6cJEyb8\n6v0HDhyoYcOGacKECa7VTQAAAAC43WxOp/Ouv0hiXFycoqOjFRwcbPUod4zdblfAFzdeUQVQueqO\nf8rqEXAddrv9hp/pBu40jktUNRyT97brvb839T2BVVV8fLzVIwAAAADAXeGeiMA7Yc6cOa7P+P1i\n9OjRGj58uEUTAQAAAMDNIwIraNasWVaPAAAAAAD/Z3wrOQAAAAAYhAgEAAAAAIMQgQAAAABgECIQ\nAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAA\nYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIB5WD4BbV3f8U1aPALjY7XaF\nhIRYPQYAAABugJVAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAi\nEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABjEw+oBcOsSV/W3eoS7\n3ohnPrZ6BAAAAOCOYiUQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAA\nGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQg\nAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAA\nwCBEIAAAAAAYpNIjcMuWLWrRooXy8vIqZf85OTl67bXXbnn7tLQ0LVu2TElJSercubOGDBmi8PBw\njR07Vvv27buNkwIAAACA9So9AlNTUxUSEqK0tLTbvu/S0lIFBQVpxowZt7yPjIwMdevWTZIUERGh\nDz/8UJ98
8omee+45TZo0qdLi9deUlpbe8ccEAAAAYAaPyty5w+GQ3W7XmjVr9MILLygmJkZZWVla\nsmSJatSooSNHjmjAgAFq3ry51qxZo+LiYr311lsKCAjQmTNnNGvWLP3www+SpOnTpyskJERLlizR\n8ePHlZ+fr4YNG2rkyJFauXKlli9fLofDoddee025ubmSpIkTJ6pfv36aNWuWcnJyVFxcrH79+ikm\nJkaS5HQ6dejQIbVu3VpHjhy5YvbOnTvrscce08aNGzV9+nQdP35cc+bM0c8//yxvb2/94Q9/UNOm\nTRUbGytfX1/l5ubq1KlTmjZtmvr3768pU6YoKipKPXv2lCTFxsaqZ8+e6tu3r9544w198cUXunTp\nkp588kk9/vjjysrK0qJFi1SzZk0dO3ZMmzdvrsy3BgAAAIChKjUCt27dqm7duqlx48aqU6eOK84O\nHz6sTZs2qXbt2urdu7eio6OVmJio1atXa+3atYqLi1N8fLyefvpphYaG6ocfftDYsWOVnp4uScrL\ny9O6devk7e2trKws1+MtXbpUvr6+SklJkSQVFBRIkqZMmaLatWurrKxMY8aM0eHDhxUYGKiDBw8q\nMDBQNpvtmvO3bt1aGzZskCS9+uqrmjNnjh566CHt379fc+bM0Zo1ayRJJ0+e1Lp16/T1119r/Pjx\n6t+/vyIiIpSenq6ePXvq0qVL2rVrl2bPnq3ExETVqFFDH3zwgS5duqTHH39cXbp0kSQdPHhQKSkp\natSoUSW8GwAAAABQyRGYlpam0aNHS7p8qmVaWpp69uypoKAg1atXT5IUEBDgiqDmzZu7om7nzp06\nevSoa19FRUVyOBySpF69esnb2/uqx9u1a5cWLlzoul2rVi1JUnp6uhISElRaWqpTp04pLy9PgYGB\nyszMVPfu3a87v9PplHR5RTM7O1uTJ092/e7SpUuuf+/Tp4/c3NzUrFkznT59WpLUvXt3xcfH69Kl\nS8rIyFBoaKi8vb21Y8cO/fOf/3St9BUWFurbb7+Vp6engoKCCEAAAAAAlarSIvDs2bPavXu3jhw5\nIpvNprKyMtlsNvXo0UNeXl6u+7m5ubluu7m5qaysTJJUXl6uhIQEVatW7ap9+/j4VHiO/Px8rVy5\nUomJiapVq5ZiY2NVXFwsSdqxY4cWL1583W0PHjyopk2byul0qmbNmkpOTr7m/f7n8/lFtWrV1KlT\nJ2VmZio9PV0RERGSLofljBkzXJ9D/EVWVpaqV69e4ecFAAAAALei0i4Ms3nzZkVFRWn79u3atm2b\nPv/8c/n7+2vv3r0V2r5r165au3at6/ahQ4duuE1YWJjef/991+2CggI5HA75+PioRo0aOn36tDIy\nMiRdXoErLS1VnTp1rrmvL774QgkJCXrsscfk6+srf39/1+moTqdThw8fvuE8ERERSkpK0t69e13R\n17VrV61fv14lJSWSpGPHjun8+fM33BcAAAAA3A6VFoGpqanq06fPFT8LDw+v8FVC4+LilJubq8jI\nSEVERGj9+vU33Gb8+PE6d+6cBg0apMGDBysrK0uBgYFq1aqVBgwYoKlTp6pDhw6SLq8ChoWFXbH9\npk2bFBUVpX79+mn58uVavHixmjZtKkn6z//8TyUmJmrw4MEaOHCgtmzZcsN5unTpoj179igsLMy1\nWhgdHa1mzZpp2LBhGjRokGbOnOla/QQAAACAymZz/vLBN8PExcUpOjpawcHBVo9yS+x2u459GWf1\nGHe9Ec98bPUI9wy73a6QkBCrxwBcOCZRFXFcoqrhmLy3Xe/9rdQLw1Rl8fHxVo8AAAAAAHdcpX9Z\nPAAAAACg6iACAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAA\nAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIh
AAAAAADEIEAgAAAIBB\niEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIN4WD0Abt2IZz62egQAAAAAdxlW\nAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAA\nAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgEA+rB8Cti/jba1aPAFzpeLrVE9wTNg2d\nYfUIAADgHsZKIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgA\nAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAw\nCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAA\nAAAAMEiFInDLli1q0aKF8vLyKmWInJwcvfbaa7e8fVpampYtW6bTp09r3LhxGjx4sCIiIvTcc8/d\nximlli1bKioqSoMGDVJMTIwuXLhwS/tZsmSJ/vrXv97W2QAAAACgIioUgampqQoJCVFaWtptH6C0\ntFRBQUGaMWPGLe8jIyND3bp10+LFixUWFqaPPvpImzZt0tSpU2/jpJK3t7eSk5OVmpoqT09Pbdiw\n4bbuHwAAAAAqm8eN7uBwOGS327VmzRq98MILiomJUVZWlpYsWaIaNWroyJEjGjBggJo3b641a9ao\nuLhYb731lgICAnTmzBnNmjVLP/zwgyRp+vTpCgkJ0ZIlS3T8+HHl5+erYcOGGjlypFauXKnly5fL\n4XDotddeU25uriRp4sSJ6tevn2bNmqWcnBwVFxerX79+iomJkSQ5nU4dOnRIrVu31smTJ9WlSxfX\n7IGBga7nMGHCBJ07d06lpaWaPHmy+vTpI0latWqVPvjgA0nSiBEjNGbMmAq9cKGhofrnP/8pSZow\nYYJ++uknFRcXa/To0Ro5cqQkqX379srOzpYkffzxx/rss8/0+uuvX7GfQ4cOadasWbpw4YICAgI0\nb9481apVq0IzAAAAAMDNumEEbt26Vd26dVPjxo1Vp04dV5wdPnxYmzZtUu3atdW7d29FR0crMTFR\nq1ev1tq1axUXF6f4+Hg9/fTTCg0N1Q8//KCxY8cqPT1dkpSXl6d169bJ29tbWVlZrsdbunSpfH19\nlZKSIkkqKCiQJE2ZMkW1a9dWWVmZxowZo8OHDyswMFAHDx5UYGCgbDabnnzySU2ZMkXvvfeewsLC\nNGzYMNWvX1/VqlXTW2+9JV9fX505c0YjR45U7969deDAASUlJSkhIUFOp1OPPfaYOnXqpFatWv3q\na1JaWupafZSkefPmqXbt2rp48aJGjBih8PBw1alTp0JvwCuvvKJXX31VnTp10qJFi/TnP/9ZcXFx\nFdoWAAAAAG7WDSMwLS1No0ePliRFREQoLS1NPXv2VFBQkOrVqydJCggIcK3ANW/e3BV1O3fu1NGj\nR137KioqksPhkCT16tVL3t7eVz3erl27tHDhQtftX1bF0tPTlZCQoNLSUp06dUp5eXkKDAxUZmam\nunfvLknq1q2btmzZoszMTGVkZGjo0KFKTU1VjRo1tHDhQu3Zs0dubm46ceKETp8+Lbvdrj59+qh6\n9eqSpL59+2rv3r3XjcCLFy8qKipK0uWVwBEjRkiS1q5dq08//VSS9OOPP+rbb7+tUAQWFhaqsLBQ\nnTp1kiQNHTpUkydPvuF2AAAAAHCrfjUCz549q927d+vIkSOy2WwqKyuTzWZTjx495OXl5bqfm5ub\n67abm5vKysokSeXl5UpISFC1atWu2rePj0+Fh8zPz9fKlSuVmJioWrVqKTY2VsXFxZKkHTt2aPHi\nxa771q5dW5GRkYqMjNS4ceO0Z88eOR
wOnTlzRklJSfL09FSvXr1c29+MXz4T+D9lZWVp586d2rhx\no3x8fDRq1Khr7vtWHg8AAAAAbrdfvTDM5s2bFRUVpe3bt2vbtm36/PPP5e/vr71791Zo5127dtXa\ntWtdtw8dOnTDbcLCwvT++++7bhcUFMjhcMjHx0c1atTQ6dOnlZGRIenySlppaalr1W3Xrl2uK3YW\nFRXp+PHj+rd/+zcVFhbq/vvvl6enp3bv3q3vv/9e0uXVvC1btujChQs6f/68tmzZotDQ0Ao9t18U\nFhaqVq1a8vHxUV5env7xj3+4fveb3/xGeXl5Ki8v15YtW67atkaNGqpZs6br9UxOTlbHjh1v6vEB\nAAAA4Gb86kpgamrqVV+zEB4ervXr1ysgIOCGO4+Li9PcuXMVGRmpsrIyhYaGau7cub+6zfjx4zV3\n7lwNGjRIbm5umjhxosLDw9WqVSsNGDBADRo0UIcOHSRdXgUMCwtzbXvgwAH94Q9/kLu7u5xOp6Kj\no9W2bVv5+/tr/PjxioyMVJs2bdSkSRNJUuvWrTVs2DBFR0dLunxhmBt9HvB/6969uzZs2KABAwao\ncePGCg4Odv1u6tSpGjdunPz8/NSmTRudP3/+qu3/+Mc/ui4M06hRI82fP/+mHh8AAAAAbobN6XQ6\nrR7iVsXFxSk6OvqK8DKF3W7Xq8fTrR4DQCXYNPTWvzIH/81utyskJMTqMYArcFyiquGYvLdd7/29\n4YVhqrL4+HirRwAAAACAu8pdHYGV4eeff77mdwW+++67Ff7aBwAAAACoqojA/6VOnTpXXQEUAAAA\nAO4Vv3p1UAAAAADAvYUIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAA\nAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgR\nCAAAAAAGIQIBAAAAwCBEIAAAAAAYxMPqAXDrNg2dYfUIgIvdbldISIjVYwAAAOAGWAkEAAAAAIMQ\ngQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAA\nAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEE8rB4At27oB3+3egTgSt9wTKKK4ZhEFTPjIR+rRwAA\nVgIBAAAAwCREIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgA\nAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAw\nCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBB7kgE\ntmzZUlFRUa5/VqxYcSce9rpWrFihjz76SI888oicTqckKTs7Wy1atNBPP/0kSSosLFSnTp1UXl5u\n5agAAAAAcFt53IkH8fb2VnJy8i1tW1paKg+P2ztmZmamFi1apLp16yovL0/NmjVTdna2WrVqpX37\n9ikiIkL/+Mc/FBQUJDe3inVyZcwJAAAAALebpdXSq1cvJSYmys/PTzk5OVqwYIHWrl2rJUuW6Pjx\n48rPz1fDhg01f/58zZ49W7m5uXJ3d1dsbKw6d+6spKQkffrppyoqKtKJEyc0ePBgTZw4UZKUnJys\ntWvXqqSkRO3atdOsWbPk7u6uoqIilZSUyM/PT+3bt1d2drYrAp9++mllZ2crIiJC2dnZ6tChgyQp\nISFBGzduVElJiR588EEtWLBAPj4+io2NlZeXlw4dOqQOHTqod+/eio+PlyTZbDa99957stlsmjBh\ngs6dO6fS0lJNnjxZffr00TvvvCMvLy+NHj1a8+bN0+HDh7VmzRrt2rVLiYmJevPNNy17XwAAAADc\nu+
5IBF68eFFRUVGu2+PGjVNERMSvbpOXl6d169bJ29tbK1eulCSlpKQoLy9PY8eO1ebNmyVJOTk5\nSklJkY+Pj0aMGKEePXqoevXqSk9P1/r16+Xp6anZs2crJSVFQ4YM0c6dO/Xoo49Kkjp06KAvvvhC\n0dHRys/P14ABA7Rx40ZJl08Pff755yVJffv21WOPPSZJ+q//+i8lJiZq1KhRkqQTJ05ow4YNcnd3\n1wsvvKCZM2cqJCREDodD1apVkyS99dZb8vX11ZkzZzRy5Ej17t1boaGhWrlypUaPHq3c3FxdunRJ\nJSUlstvt6tix4+166QEAAADgClX2dNBevXrJ29tbkmS32/XUU09Jkpo2baqGDRvq2LFjkqSwsDDV\nqVNH0uVYs9vt8vDwUG5urkaMGCHpcoTef//9ki6fCjps2DBJUvv27bV8+XLl5+frgQceULVq1eR0\nOuVwOHTgwAG1bdtWkvTVV1/pT3/6kwoLC+VwONS1a1fXnP3795e7u7uky1H5+uuvKzIyUuHh4brv\nvvtUUlKihQsXas+ePXJzc9OJEyd0+vRptW7dWgcOHFBRUZG8vLzUqlUr5ebmau/evZoxY8Ytvc4A\nAAAAcCOWng7q7u7uujBLcXHxFb/z8fGp0D5sNttVt51Op4YOHaqpU6dedf8vv/xSs2fPliQ99NBD\nKiws1Pbt2xUcHCxJatOmjZKSkvTAAw/ovvvukyTFxsZq6dKlCgwMVFJSkr744otrzvn888+rR48e\n+vzzz/XEE0/onXfe0f79+3XmzBklJSXJ09NTvXr1UnFxsTw9PeXv76+kpCS1b99eLVq0UFZWlo4f\nP66mTZtW6LkDAAAAwM2y9CsiHnjgAeXm5kqSPvnkk+veLzQ0VCkpKZKkY8eO6ccff1STJk0kSTt2\n7NDZs2d18eJFbdmyRR06dNCjjz6qzZs361//+pck6ezZs/r+++/11VdfqUmTJq6VO0lq166d1qxZ\no/bt20uSgoODtXr1atfnASXJ4XCobt26Kikpcc1xLcePH1eLFi30/PPPKygoSMeOHVNhYaHuv/9+\neXp6avfu3fr++++veF4rV65Ux44dFRoaqg0bNqhly5ZXhS0AAAAA3C6WfCawW7duevnllzVx4kTF\nxcVp0aJFeuSRR667/W9/+1vNnj1bkZGRcnd31/z58+Xl5SVJatu2rSZNmuS6MExQUJAk6T/+4z/0\n7LPPqry8XJ6enpo5c6bsdru6det2xb47dOigjIwMtWnTRtLlCMzPz3dFoSRNnjxZ0dHR8vPzU7t2\n7eRwOK4Mob+8AAAK90lEQVQ55+rVq5WVlSWbzaaHH35Y3bt3V1FRkcaPH6/IyEi1adPGFa/S5Qh8\n++23FRwcrOrVq6tatWoKDQ29yVcXAAAAACrO5vzlfMy7UFJSknJzczVz5swK3f+ZZ57RH//4R9Wr\nV6+SJ6t8drtdr31zweoxAADATZjxkI9CQkKsHgNwsdvtHJP3sOu9v0Z9sd2qVausHgEAAAAALHVX\nR+CwYcNcV/oEAAAAANyYpReGAQAAAADcWUQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGMTD6gFw6/42vKvVIwAudrtd\nISEhVo8BuHBMoiqy2+1WjwAArAQCAAAAgEmIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiE\nCAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCAeVg+A\nW5f9zkmrRwBc3NRI2dkck6g6OCZRFXFcosppb/UAsAIrgQAAAABg
ECIQAAAAAAxCBAIAAACAQYhA\nAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAA\ngEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBGROB3332nQYMGXfGzJUuW6K9//et1t0lKStLc\nuXMrezQAAAAAuKM8rB4AAAAAuBnFJRdVWl5i9Rj3hHKHQwUFBVaPcU/w8vKSj4+P1WNUiPEROGrU\nKLVt21ZZWVkqLCxUfHy8QkNDr7jPZ599pmXLlmnZsmVasGCBfH19lZubq1OnTmnatGnq37+/nE6n\nFixYoMzMTNlsNo0fP14RERGaM2eOunbtqt69e+vFF19UzZo1NX/+fCUmJio/P1/R0dF67rnnFBIS\nouzsbNWvX19Lly6Vt7e3Ra8IAABA1ZW0d5V2ffWpnHJaPcq9IcnqAe4dbm5uioyMVExMjNWj3JAR\np4PeSFlZmRITEzV9+nT9+c9/vuJ3n376qVasWKEVK1bIz89PknTy5EmtW7dOy5cv15tvvilJ+uST\nT3T48GElJydr1apVWrBggU6ePKnQ0FDt3btXknTixAnl5eVJkux2uys2v/32Wz355JNKS0tTjRo1\ntHnz5jv11AEAAO4qO7/6hABElVReXq7k5GSrx6gQIyLQZrP96s/79u0rSWrdurW+//571+93796t\nv/zlL1qxYoVq1arl+nmfPn3k5uamZs2a6fTp05IuR93AgQPl7u6u3/zmN+rYsaNycnIUGhoqu92u\no0ePqlmzZrr//vt18uRJZWdnq3379pIkf39/tWzZ8pozAAAA4L+FPRx+3b/bAVZyc3NTVFSU1WNU\niBGng9auXfuqc50LCgrk7+8v6fL5u9LlN66srMx1n4CAAOXn5+vYsWMKCgpy/fyX+1dE/fr1de7c\nOWVmZio0NFQFBQVKT09X9erV5evrq7Nnz16xP3d3dxUXF9/S8wQAALjXDQt9RgPbPcFnAm+T8qDv\n1a5dO6vHuCfwmcAq5r777lPdunW1a9cuPfroozp79qwyMzM1evRoJSVd/0Tohg0batq0aZo0aZIW\nLVqkhx9++Lr3DQ0N1caNGzV06FAVFBRo7969euWVVyRJwcHBWr16tVavXq2zZ88qJiZG/fr1u+3P\nEwAAwATVPL1VTVw/4XYov+++K854gxmMiEBJWrBggebMmaPXX39dkvTiiy8qICDghts1bdpUb7zx\nhiZPnqy33377uvfr27evsrOzFRUVJZvNpmnTpqlu3bqSpJCQEP3973/Xgw8+qIYNG6qgoOCqi88A\nAAAAwJ1gczqdfLL2LmS32+WW3cjqMQAAAHAXK2+fr5CQEKvHQCWx2+3XfH+NuDAMAAAAAOAyIhAA\nAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABg\nECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEA\nAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwiIfVA+DWtf9dPatHAFzsdrtCQkKsHgNw4ZhEVcRx\niarGbs+3egRYgJVAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBCb\n0+l0Wj0Ebp7dbrd6BAAAAABV3LW+loYIBAAAAACDcDooAAAAABiECAQAAAAAgxCBAAAAAGAQIhAA\nAAAADEIEAgAAAIBBiMC7TEZGhvr166e+fftqxYoVVo8Dw/34448aNWqUIiIiNHDgQK1evdrqkQBJ\nUllZmYYMGaJx48ZZPQogSTp3
7pxiYmLUv39/DRgwQNnZ2VaPBMO9++67GjhwoAYNGqSXXnpJxcXF\nVo+EO4gIvIuUlZVp7ty5euedd5SWlqbU1FQdPXrU6rFgMHd3d8XGxmrTpk3auHGj1q1bxzGJKmHN\nmjVq2rSp1WMALvHx8erWrZs+/vhjJScnc3zCUidOnNCaNWv0wQcfKDU1VWVlZUpLS7N6LNxBROBd\n5Msvv9SDDz6oRo0aycvLSwMHDtTWrVutHgsGq1evnlq3bi1J8vX1VZMmTXTixAmLp4LpfvrpJ332\n2WcaMWKE1aMAkqTCwkLt2bPHdUx6eXmpZs2aFk8F05WVlenixYsqLS3VxYsXVa9ePatHwh1EBN5F\nTpw4oQYNGrhu169fn79wo8r47rvvdOjQIbVr187qUWC4efPmadq0aXJz4484VA3fffed/Pz89Pvf\n/15DhgxRXFyczp8/b/VYMFj9+vX17LPP6t///d/VtWtX+fr6qmvXrlaPhTuIPyEB/J85HA7FxMRo\n+vTp8vX1tXocGGz79u3y8/NTmzZtrB4FcCktLdXBgwf1xBNP6MMPP5SPjw+f64elCgoKtHXrVm3d\nulWZmZm6cOGCkpOTrR4LdxAReBepX7++fvrpJ9ftEydOqH79+hZOBEglJSWKiYlRZGSkwsPDrR4H\nhtu3b5+2bdumXr166aWXXtLu3bv18ssvWz0WDNegQQM1aNDAdaZE//79dfDgQYungsl27twpf39/\n+fn5ydPTU+Hh4VysyDBE4F0kKChI33zzjfLz83Xp0iWlpaWpV69eVo8FgzmdTsXFxalJkyZ65pln\nrB4H0NSpU5WRkaFt27Zp4cKF6ty5s9544w2rx4Lh6tatqwYNGujrr7+WJO3atYsLw8BSDRs21P79\n+3XhwgU5nU6OSQN5WD0AKs7Dw0MzZ87U7373O5WVlWn48OF6+OGHrR4LBrPb7UpOTlbz5s0VFRUl\nSXrppZfUo0cPiycDgKrl1Vdf1csvv6ySkhI1atRI8+fPt3okGKxdu3bq16+fhg4dKg8PD7Vs2VIj\nR460eizcQTan0+m0eggAAAAAwJ3B6aAAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAA\nAAYhAgEAAADAIEQgAAAAABiECAQAAAAAg/w/57T0QJ9j5NkAAAAASUVORK5CYII=\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plot how often each time zone occurs. value_counts() sorts by frequency,\n",
"# so its first ten rows are the ten most common time zones. (Plotting the raw\n",
"# values against the row index would make each bar's length a row number.)\n",
"tz_counts = clean_tz.value_counts()\n",
"top_tz = tz_counts.head(10)\n",
"ax = sns.barplot(x=top_tz.values, y=top_tz.index)\n",
"# Trailing semicolon keeps the return value of ax.set out of the cell output.\n",
"ax.set(xlabel=\"Number of records\", ylabel=\"Time zone\", title=\"Ten most frequent time zones\");"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## MovieLens data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" after removing the cwd from sys.path.\n",
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" \n",
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" \n"
]
}
],
"source": [
"# Make display smaller\n",
"pd.options.display.max_rows = 15\n",
"\n",
"# The MovieLens files use the two-character separator '::'. Only pandas'\n",
"# 'python' parser engine supports multi-character separators, so it is\n",
"# requested explicitly; otherwise each call falls back from the 'c' engine\n",
"# and emits a ParserWarning.\n",
"unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n",
"users = pd.read_table('datasets/movielens/users.dat', sep='::', header=None, names=unames, engine='python')\n",
"rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
"ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', header=None, names=rnames, engine='python')\n",
"mnames = ['movie_id', 'title', 'genres']\n",
"movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header=None, names=mnames, engine='python')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" gender | \n",
" age | \n",
" occupation | \n",
" zip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" M | \n",
" 56 | \n",
" 16 | \n",
" 70072 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" M | \n",
" 25 | \n",
" 15 | \n",
" 55117 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" M | \n",
" 45 | \n",
" 7 | \n",
" 02460 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" M | \n",
" 25 | \n",
" 20 | \n",
" 55455 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id gender age occupation zip\n",
"0 1 F 1 10 48067\n",
"1 2 M 56 16 70072\n",
"2 3 M 25 15 55117\n",
"3 4 M 45 7 02460\n",
"4 5 M 25 20 55455"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"users.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" movie_id | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 661 | \n",
" 3 | \n",
" 978302109 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 914 | \n",
" 3 | \n",
" 978301968 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 3408 | \n",
" 4 | \n",
" 978300275 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2355 | \n",
" 5 | \n",
" 978824291 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id movie_id rating timestamp\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movie_id | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Animation|Children's|Comedy | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children's|Fantasy | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Waiting to Exhale (1995) | \n",
" Comedy|Drama | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Father of the Bride Part II (1995) | \n",
" Comedy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movie_id title genres\n",
"0 1 Toy Story (1995) Animation|Children's|Comedy\n",
"1 2 Jumanji (1995) Adventure|Children's|Fantasy\n",
"2 3 Grumpier Old Men (1995) Comedy|Romance\n",
"3 4 Waiting to Exhale (1995) Comedy|Drama\n",
"4 5 Father of the Bride Part II (1995) Comedy"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" movie_id | \n",
" rating | \n",
" timestamp | \n",
" gender | \n",
" age | \n",
" occupation | \n",
" zip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 661 | \n",
" 3 | \n",
" 978302109 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 914 | \n",
" 3 | \n",
" 978301968 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 3408 | \n",
" 4 | \n",
" 978300275 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2355 | \n",
" 5 | \n",
" 978824291 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id movie_id rating timestamp gender age occupation zip\n",
"0 1 1193 5 978300760 F 1 10 48067\n",
"1 1 661 3 978302109 F 1 10 48067\n",
"2 1 914 3 978301968 F 1 10 48067\n",
"3 1 3408 4 978300275 F 1 10 48067\n",
"4 1 2355 5 978824291 F 1 10 48067"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach each rater's user columns to their ratings (join key: user_id).\n",
"first_merge = ratings.merge(users, on=\"user_id\")\n",
"first_merge.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" movie_id | \n",
" rating | \n",
" timestamp | \n",
" gender | \n",
" age | \n",
" occupation | \n",
" zip | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1193 | \n",
" 5 | \n",
" 978298413 | \n",
" M | \n",
" 56 | \n",
" 16 | \n",
" 70072 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 2 | \n",
" 12 | \n",
" 1193 | \n",
" 4 | \n",
" 978220179 | \n",
" M | \n",
" 25 | \n",
" 12 | \n",
" 32793 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 3 | \n",
" 15 | \n",
" 1193 | \n",
" 4 | \n",
" 978199279 | \n",
" M | \n",
" 25 | \n",
" 7 | \n",
" 22903 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 4 | \n",
" 17 | \n",
" 1193 | \n",
" 5 | \n",
" 978158471 | \n",
" M | \n",
" 50 | \n",
" 1 | \n",
" 95350 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id movie_id rating timestamp gender age occupation zip \\\n",
"0 1 1193 5 978300760 F 1 10 48067 \n",
"1 2 1193 5 978298413 M 56 16 70072 \n",
"2 12 1193 4 978220179 M 25 12 32793 \n",
"3 15 1193 4 978199279 M 25 7 22903 \n",
"4 17 1193 5 978158471 M 50 1 95350 \n",
"\n",
" title genres \n",
"0 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"1 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"2 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"3 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"4 One Flew Over the Cuckoo's Nest (1975) Drama "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"second_merge = pd.merge(first_merge, movies, on = \"movie_id\")\n",
"second_merge.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000209, 10)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"second_merge.shape"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"df = second_merge"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" rating | \n",
"
\n",
" \n",
" gender | \n",
" title | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" F | \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
"
\n",
" \n",
" 1-900 (1994) | \n",
" 2.000000 | \n",
"
\n",
" \n",
" 10 Things I Hate About You (1999) | \n",
" 3.646552 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" M | \n",
" Your Friends and Neighbors (1998) | \n",
" 3.536585 | \n",
"
\n",
" \n",
" Zachariah (1971) | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.380952 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.723140 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.357143 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.289086 | \n",
"
\n",
" \n",
"
\n",
"
7152 rows × 1 columns
\n",
"
"
],
"text/plain": [
" rating\n",
"gender title \n",
"F $1,000,000 Duck (1971) 3.375000\n",
" 'Night Mother (1986) 3.388889\n",
" 'Til There Was You (1997) 2.675676\n",
" 'burbs, The (1989) 2.793478\n",
" ...And Justice for All (1979) 3.828571\n",
" 1-900 (1994) 2.000000\n",
" 10 Things I Hate About You (1999) 3.646552\n",
"... ...\n",
"M Your Friends and Neighbors (1998) 3.536585\n",
" Zachariah (1971) 3.500000\n",
" Zed & Two Noughts, A (1985) 3.380952\n",
" Zero Effect (1998) 3.723140\n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) 3.500000\n",
" Zeus and Roxanne (1997) 2.357143\n",
" eXistenZ (1999) 3.289086\n",
"\n",
"[7152 rows x 1 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby([\"gender\", \"title\"])[[\"rating\"]].agg(\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
" 2.761905 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
" 3.352941 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
" 2.733333 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
" 2.962085 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
" 3.689024 | \n",
"
\n",
" \n",
" 1-900 (1994) | \n",
" 2.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" 10 Things I Hate About You (1999) | \n",
" 3.646552 | \n",
" 3.311966 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Your Friends and Neighbors (1998) | \n",
" 2.888889 | \n",
" 3.536585 | \n",
"
\n",
" \n",
" Zachariah (1971) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.500000 | \n",
" 3.380952 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.864407 | \n",
" 3.723140 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.777778 | \n",
" 2.357143 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.098592 | \n",
" 3.289086 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 2 columns
\n",
"
"
],
"text/plain": [
"gender F M\n",
"title \n",
"$1,000,000 Duck (1971) 3.375000 2.761905\n",
"'Night Mother (1986) 3.388889 3.352941\n",
"'Til There Was You (1997) 2.675676 2.733333\n",
"'burbs, The (1989) 2.793478 2.962085\n",
"...And Justice for All (1979) 3.828571 3.689024\n",
"1-900 (1994) 2.000000 3.000000\n",
"10 Things I Hate About You (1999) 3.646552 3.311966\n",
"... ... ...\n",
"Your Friends and Neighbors (1998) 2.888889 3.536585\n",
"Zachariah (1971) NaN 3.500000\n",
"Zed & Two Noughts, A (1985) 3.500000 3.380952\n",
"Zero Effect (1998) 3.864407 3.723140\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n",
"Zeus and Roxanne (1997) 2.777778 2.357143\n",
"eXistenZ (1999) 3.098592 3.289086\n",
"\n",
"[3706 rows x 2 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.pivot_table(\"rating\", index = \"title\", columns = \"gender\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
" 2.761905 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
" 3.352941 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
" 2.733333 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
" 2.962085 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
" 3.689024 | \n",
"
\n",
" \n",
" 1-900 (1994) | \n",
" 2.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" 10 Things I Hate About You (1999) | \n",
" 3.646552 | \n",
" 3.311966 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Your Friends and Neighbors (1998) | \n",
" 2.888889 | \n",
" 3.536585 | \n",
"
\n",
" \n",
" Zachariah (1971) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.500000 | \n",
" 3.380952 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.864407 | \n",
" 3.723140 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.777778 | \n",
" 2.357143 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.098592 | \n",
" 3.289086 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 2 columns
\n",
"
"
],
"text/plain": [
"gender F M\n",
"title \n",
"$1,000,000 Duck (1971) 3.375000 2.761905\n",
"'Night Mother (1986) 3.388889 3.352941\n",
"'Til There Was You (1997) 2.675676 2.733333\n",
"'burbs, The (1989) 2.793478 2.962085\n",
"...And Justice for All (1979) 3.828571 3.689024\n",
"1-900 (1994) 2.000000 3.000000\n",
"10 Things I Hate About You (1999) 3.646552 3.311966\n",
"... ... ...\n",
"Your Friends and Neighbors (1998) 2.888889 3.536585\n",
"Zachariah (1971) NaN 3.500000\n",
"Zed & Two Noughts, A (1985) 3.500000 3.380952\n",
"Zero Effect (1998) 3.864407 3.723140\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n",
"Zeus and Roxanne (1997) 2.777778 2.357143\n",
"eXistenZ (1999) 3.098592 3.289086\n",
"\n",
"[3706 rows x 2 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg = df.pivot_table(\"rating\", index = \"title\", columns = \"gender\", aggfunc = \"mean\")\n",
"avg"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
"
\n",
" \n",
" title | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 37 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 70 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 52 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 303 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 199 | \n",
"
\n",
" \n",
" 1-900 (1994) | \n",
" 2 | \n",
"
\n",
" \n",
" 10 Things I Hate About You (1999) | \n",
" 700 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Your Friends and Neighbors (1998) | \n",
" 109 | \n",
"
\n",
" \n",
" Zachariah (1971) | \n",
" 2 | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 29 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 301 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" 2 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 23 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 410 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 1 columns
\n",
"
"
],
"text/plain": [
" title\n",
"title \n",
"$1,000,000 Duck (1971) 37\n",
"'Night Mother (1986) 70\n",
"'Til There Was You (1997) 52\n",
"'burbs, The (1989) 303\n",
"...And Justice for All (1979) 199\n",
"1-900 (1994) 2\n",
"10 Things I Hate About You (1999) 700\n",
"... ...\n",
"Your Friends and Neighbors (1998) 109\n",
"Zachariah (1971) 2\n",
"Zed & Two Noughts, A (1985) 29\n",
"Zero Effect (1998) 301\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) 2\n",
"Zeus and Roxanne (1997) 23\n",
"eXistenZ (1999) 410\n",
"\n",
"[3706 rows x 1 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings_by_title = df.groupby(\"title\")[[\"title\"]].count()\n",
"ratings_by_title"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['American Beauty (1999)', 'Back to the Future (1985)', 'Fargo (1996)',\n",
" 'Jurassic Park (1993)', 'Matrix, The (1999)', 'Men in Black (1997)',\n",
" 'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',\n",
" 'Silence of the Lambs, The (1991)',\n",
" 'Star Wars: Episode IV - A New Hope (1977)',\n",
" 'Star Wars: Episode V - The Empire Strikes Back (1980)',\n",
" 'Star Wars: Episode VI - Return of the Jedi (1983)',\n",
" 'Terminator 2: Judgment Day (1991)'],\n",
" dtype='object', name='title')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings_2500 = ratings_by_title.index[ratings_by_title.title >= 2500]\n",
"ratings_2500"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" American Beauty (1999) | \n",
" 4.238901 | \n",
" 4.347301 | \n",
"
\n",
" \n",
" Back to the Future (1985) | \n",
" 3.932707 | \n",
" 4.009259 | \n",
"
\n",
" \n",
" Fargo (1996) | \n",
" 4.217656 | \n",
" 4.267780 | \n",
"
\n",
" \n",
" Jurassic Park (1993) | \n",
" 3.579407 | \n",
" 3.814197 | \n",
"
\n",
" \n",
" Matrix, The (1999) | \n",
" 4.128405 | \n",
" 4.362235 | \n",
"
\n",
" \n",
" Men in Black (1997) | \n",
" 3.817844 | \n",
" 3.719000 | \n",
"
\n",
" \n",
" Raiders of the Lost Ark (1981) | \n",
" 4.332168 | \n",
" 4.520597 | \n",
"
\n",
" \n",
" Saving Private Ryan (1998) | \n",
" 4.114783 | \n",
" 4.398941 | \n",
"
\n",
" \n",
" Silence of the Lambs, The (1991) | \n",
" 4.271955 | \n",
" 4.381944 | \n",
"
\n",
" \n",
" Star Wars: Episode IV - A New Hope (1977) | \n",
" 4.302937 | \n",
" 4.495307 | \n",
"
\n",
" \n",
" Star Wars: Episode V - The Empire Strikes Back (1980) | \n",
" 4.106481 | \n",
" 4.344577 | \n",
"
\n",
" \n",
" Star Wars: Episode VI - Return of the Jedi (1983) | \n",
" 3.865237 | \n",
" 4.069058 | \n",
"
\n",
" \n",
" Terminator 2: Judgment Day (1991) | \n",
" 3.785088 | \n",
" 4.115367 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"gender F M\n",
"title \n",
"American Beauty (1999) 4.238901 4.347301\n",
"Back to the Future (1985) 3.932707 4.009259\n",
"Fargo (1996) 4.217656 4.267780\n",
"Jurassic Park (1993) 3.579407 3.814197\n",
"Matrix, The (1999) 4.128405 4.362235\n",
"Men in Black (1997) 3.817844 3.719000\n",
"Raiders of the Lost Ark (1981) 4.332168 4.520597\n",
"Saving Private Ryan (1998) 4.114783 4.398941\n",
"Silence of the Lambs, The (1991) 4.271955 4.381944\n",
"Star Wars: Episode IV - A New Hope (1977) 4.302937 4.495307\n",
"Star Wars: Episode V - The Empire Strikes Back ... 4.106481 4.344577\n",
"Star Wars: Episode VI - Return of the Jedi (1983) 3.865237 4.069058\n",
"Terminator 2: Judgment Day (1991) 3.785088 4.115367"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg.loc[ratings_2500]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Clean Slate (Coup de Torchon) (1981) | \n",
" 5.0 | \n",
" 3.857143 | \n",
"
\n",
" \n",
" Ballad of Narayama, The (Narayama Bushiko) (1958) | \n",
" 5.0 | \n",
" 3.428571 | \n",
"
\n",
" \n",
" Raw Deal (1948) | \n",
" 5.0 | \n",
" 3.307692 | \n",
"
\n",
" \n",
" Bittersweet Motel (2000) | \n",
" 5.0 | \n",
" NaN | \n",
"
\n",
" \n",
" Skipped Parts (2000) | \n",
" 5.0 | \n",
" 4.000000 | \n",
"
\n",
" \n",
" Lamerica (1994) | \n",
" 5.0 | \n",
" 4.666667 | \n",
"
\n",
" \n",
" Gambler, The (A J�t�kos) (1997) | \n",
" 5.0 | \n",
" 3.166667 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Wings of Courage (1995) | \n",
" NaN | \n",
" 3.000000 | \n",
"
\n",
" \n",
" With Byrd at the South Pole (1930) | \n",
" NaN | \n",
" 2.000000 | \n",
"
\n",
" \n",
" With Friends Like These... (1998) | \n",
" NaN | \n",
" 4.000000 | \n",
"
\n",
" \n",
" Wooden Man's Bride, The (Wu Kui) (1994) | \n",
" NaN | \n",
" 3.000000 | \n",
"
\n",
" \n",
" Year of the Horse (1997) | \n",
" NaN | \n",
" 3.250000 | \n",
"
\n",
" \n",
" Zachariah (1971) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 2 columns
\n",
"
"
],
"text/plain": [
"gender F M\n",
"title \n",
"Clean Slate (Coup de Torchon) (1981) 5.0 3.857143\n",
"Ballad of Narayama, The (Narayama Bushiko) (1958) 5.0 3.428571\n",
"Raw Deal (1948) 5.0 3.307692\n",
"Bittersweet Motel (2000) 5.0 NaN\n",
"Skipped Parts (2000) 5.0 4.000000\n",
"Lamerica (1994) 5.0 4.666667\n",
"Gambler, The (A J�t�kos) (1997) 5.0 3.166667\n",
"... ... ...\n",
"Wings of Courage (1995) NaN 3.000000\n",
"With Byrd at the South Pole (1930) NaN 2.000000\n",
"With Friends Like These... (1998) NaN 4.000000\n",
"Wooden Man's Bride, The (Wu Kui) (1994) NaN 3.000000\n",
"Year of the Horse (1997) NaN 3.250000\n",
"Zachariah (1971) NaN 3.500000\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n",
"\n",
"[3706 rows x 2 columns]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_titles = avg.sort_values(ascending = False, by = \"F\")\n",
"sorted_titles"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"avg[\"mean_diff\"] = np.abs(avg.F - avg.M)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
" mean_diff | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
" 2.761905 | \n",
" 0.613095 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
" 3.352941 | \n",
" 0.035948 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
" 2.733333 | \n",
" 0.057658 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
" 2.962085 | \n",
" 0.168607 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
" 3.689024 | \n",
" 0.139547 | \n",
"
\n",
" \n",
" 1-900 (1994) | \n",
" 2.000000 | \n",
" 3.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 10 Things I Hate About You (1999) | \n",
" 3.646552 | \n",
" 3.311966 | \n",
" 0.334586 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Your Friends and Neighbors (1998) | \n",
" 2.888889 | \n",
" 3.536585 | \n",
" 0.647696 | \n",
"
\n",
" \n",
" Zachariah (1971) | \n",
" NaN | \n",
" 3.500000 | \n",
" NaN | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.500000 | \n",
" 3.380952 | \n",
" 0.119048 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.864407 | \n",
" 3.723140 | \n",
" 0.141266 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" NaN | \n",
" 3.500000 | \n",
" NaN | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.777778 | \n",
" 2.357143 | \n",
" 0.420635 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.098592 | \n",
" 3.289086 | \n",
" 0.190494 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 3 columns
\n",
"
"
],
"text/plain": [
"gender F M mean_diff\n",
"title \n",
"$1,000,000 Duck (1971) 3.375000 2.761905 0.613095\n",
"'Night Mother (1986) 3.388889 3.352941 0.035948\n",
"'Til There Was You (1997) 2.675676 2.733333 0.057658\n",
"'burbs, The (1989) 2.793478 2.962085 0.168607\n",
"...And Justice for All (1979) 3.828571 3.689024 0.139547\n",
"1-900 (1994) 2.000000 3.000000 1.000000\n",
"10 Things I Hate About You (1999) 3.646552 3.311966 0.334586\n",
"... ... ... ...\n",
"Your Friends and Neighbors (1998) 2.888889 3.536585 0.647696\n",
"Zachariah (1971) NaN 3.500000 NaN\n",
"Zed & Two Noughts, A (1985) 3.500000 3.380952 0.119048\n",
"Zero Effect (1998) 3.864407 3.723140 0.141266\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000 NaN\n",
"Zeus and Roxanne (1997) 2.777778 2.357143 0.420635\n",
"eXistenZ (1999) 3.098592 3.289086 0.190494\n",
"\n",
"[3706 rows x 3 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
" mean_diff | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Tigrero: A Film That Was Never Made (1994) | \n",
" 1.000000 | \n",
" 4.333333 | \n",
" 3.333333 | \n",
"
\n",
" \n",
" Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919) | \n",
" 4.000000 | \n",
" 1.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" Neon Bible, The (1995) | \n",
" 1.000000 | \n",
" 4.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" James Dean Story, The (1957) | \n",
" 4.000000 | \n",
" 1.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" Country Life (1994) | \n",
" 5.000000 | \n",
" 2.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" Enfer, L' (1994) | \n",
" 1.000000 | \n",
" 3.750000 | \n",
" 2.750000 | \n",
"
\n",
" \n",
" Babyfever (1994) | \n",
" 3.666667 | \n",
" 1.000000 | \n",
" 2.666667 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Santa with Muscles (1996) | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" Wirey Spindell (1999) | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" Nelly & Monsieur Arnaud (1995) | \n",
" 3.600000 | \n",
" 3.600000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" Walk in the Sun, A (1945) | \n",
" 4.000000 | \n",
" 4.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" Inferno (1980) | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" Shopping (1994) | \n",
" 2.000000 | \n",
" 2.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" Loaded (1994) | \n",
" 2.666667 | \n",
" 2.666667 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
3446 rows × 3 columns
\n",
"
"
],
"text/plain": [
"gender F M \\\n",
"title \n",
"Tigrero: A Film That Was Never Made (1994) 1.000000 4.333333 \n",
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene... 4.000000 1.000000 \n",
"Neon Bible, The (1995) 1.000000 4.000000 \n",
"James Dean Story, The (1957) 4.000000 1.000000 \n",
"Country Life (1994) 5.000000 2.000000 \n",
"Enfer, L' (1994) 1.000000 3.750000 \n",
"Babyfever (1994) 3.666667 1.000000 \n",
"... ... ... \n",
"Santa with Muscles (1996) 1.000000 1.000000 \n",
"Wirey Spindell (1999) 1.000000 1.000000 \n",
"Nelly & Monsieur Arnaud (1995) 3.600000 3.600000 \n",
"Walk in the Sun, A (1945) 4.000000 4.000000 \n",
"Inferno (1980) 3.000000 3.000000 \n",
"Shopping (1994) 2.000000 2.000000 \n",
"Loaded (1994) 2.666667 2.666667 \n",
"\n",
"gender mean_diff \n",
"title \n",
"Tigrero: A Film That Was Never Made (1994) 3.333333 \n",
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene... 3.000000 \n",
"Neon Bible, The (1995) 3.000000 \n",
"James Dean Story, The (1957) 3.000000 \n",
"Country Life (1994) 3.000000 \n",
"Enfer, L' (1994) 2.750000 \n",
"Babyfever (1994) 2.666667 \n",
"... ... \n",
"Santa with Muscles (1996) 0.000000 \n",
"Wirey Spindell (1999) 0.000000 \n",
"Nelly & Monsieur Arnaud (1995) 0.000000 \n",
"Walk in the Sun, A (1945) 0.000000 \n",
"Inferno (1980) 0.000000 \n",
"Shopping (1994) 0.000000 \n",
"Loaded (1994) 0.000000 \n",
"\n",
"[3446 rows x 3 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg.sort_values(by = \"mean_diff\", ascending = False).dropna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}