{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Analysis Examples"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"rc = {\"figure.figsize\" : (14, 6)}\n",
"sns.set(rc = rc)\n",
"my_palette = sns.color_palette(\"husl\", 4)\n",
"sns.set_palette(my_palette)\n",
"sns.set_style(\"whitegrid\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## USA.gov data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the dataset\n",
"Use the `json` module to load the data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Relative path to the USA.gov (bitly) sample file; it is parsed one JSON record per line below.\n",
"path = \"datasets/bitly_usagov/example.txt\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',\n",
" 'al': 'en-US,en;q=0.8',\n",
" 'c': 'US',\n",
" 'cy': 'Danvers',\n",
" 'g': 'A6qOVH',\n",
" 'gr': 'MA',\n",
" 'h': 'wfLQtf',\n",
" 'hc': 1331822918,\n",
" 'hh': '1.usa.gov',\n",
" 'l': 'orofrog',\n",
" 'll': [42.576698, -70.954903],\n",
" 'nk': 1,\n",
" 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',\n",
" 't': 1331923247,\n",
" 'tz': 'America/New_York',\n",
" 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"records = [json.loads(line) for line in open(path)]\n",
"records[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Count the number of timezones with pandas"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" _heartbeat_ | \n",
" a | \n",
" al | \n",
" c | \n",
" cy | \n",
" g | \n",
" gr | \n",
" h | \n",
" hc | \n",
" hh | \n",
" kw | \n",
" l | \n",
" ll | \n",
" nk | \n",
" r | \n",
" t | \n",
" tz | \n",
" u | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" NaN | \n",
" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | \n",
" en-US,en;q=0.8 | \n",
" US | \n",
" Danvers | \n",
" A6qOVH | \n",
" MA | \n",
" wfLQtf | \n",
" 1.331823e+09 | \n",
" 1.usa.gov | \n",
" NaN | \n",
" orofrog | \n",
" [42.576698, -70.954903] | \n",
" 1.0 | \n",
" http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | \n",
" 1.331923e+09 | \n",
" America/New_York | \n",
" http://www.ncbi.nlm.nih.gov/pubmed/22415991 | \n",
"
\n",
" \n",
" 1 | \n",
" NaN | \n",
" GoogleMaps/RochesterNY | \n",
" NaN | \n",
" US | \n",
" Provo | \n",
" mwszkS | \n",
" UT | \n",
" mwszkS | \n",
" 1.308262e+09 | \n",
" j.mp | \n",
" NaN | \n",
" bitly | \n",
" [40.218102, -111.613297] | \n",
" 0.0 | \n",
" http://www.AwareMap.com/ | \n",
" 1.331923e+09 | \n",
" America/Denver | \n",
" http://www.monroecounty.gov/etc/911/rss.php | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | \n",
" en-US | \n",
" US | \n",
" Washington | \n",
" xxr3Qb | \n",
" DC | \n",
" xxr3Qb | \n",
" 1.331920e+09 | \n",
" 1.usa.gov | \n",
" NaN | \n",
" bitly | \n",
" [38.9007, -77.043098] | \n",
" 1.0 | \n",
" http://t.co/03elZC4Q | \n",
" 1.331923e+09 | \n",
" America/New_York | \n",
" http://boxer.senate.gov/en/press/releases/0316... | \n",
"
\n",
" \n",
" 3 | \n",
" NaN | \n",
" Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | \n",
" pt-br | \n",
" BR | \n",
" Braz | \n",
" zCaLwp | \n",
" 27 | \n",
" zUtuOu | \n",
" 1.331923e+09 | \n",
" 1.usa.gov | \n",
" NaN | \n",
" alelex88 | \n",
" [-23.549999, -46.616699] | \n",
" 0.0 | \n",
" direct | \n",
" 1.331923e+09 | \n",
" America/Sao_Paulo | \n",
" http://apod.nasa.gov/apod/ap120312.html | \n",
"
\n",
" \n",
" 4 | \n",
" NaN | \n",
" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | \n",
" en-US,en;q=0.8 | \n",
" US | \n",
" Shrewsbury | \n",
" 9b6kNl | \n",
" MA | \n",
" 9b6kNl | \n",
" 1.273672e+09 | \n",
" bit.ly | \n",
" NaN | \n",
" bitly | \n",
" [42.286499, -71.714699] | \n",
" 0.0 | \n",
" http://www.shrewsbury-ma.gov/selco/ | \n",
" 1.331923e+09 | \n",
" America/New_York | \n",
" http://www.shrewsbury-ma.gov/egov/gallery/1341... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" _heartbeat_ a \\\n",
"0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n",
"1 NaN GoogleMaps/RochesterNY \n",
"2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... \n",
"3 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... \n",
"4 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n",
"\n",
" al c cy g gr h hc \\\n",
"0 en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1.331823e+09 \n",
"1 NaN US Provo mwszkS UT mwszkS 1.308262e+09 \n",
"2 en-US US Washington xxr3Qb DC xxr3Qb 1.331920e+09 \n",
"3 pt-br BR Braz zCaLwp 27 zUtuOu 1.331923e+09 \n",
"4 en-US,en;q=0.8 US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 \n",
"\n",
" hh kw l ll nk \\\n",
"0 1.usa.gov NaN orofrog [42.576698, -70.954903] 1.0 \n",
"1 j.mp NaN bitly [40.218102, -111.613297] 0.0 \n",
"2 1.usa.gov NaN bitly [38.9007, -77.043098] 1.0 \n",
"3 1.usa.gov NaN alelex88 [-23.549999, -46.616699] 0.0 \n",
"4 bit.ly NaN bitly [42.286499, -71.714699] 0.0 \n",
"\n",
" r t \\\n",
"0 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1.331923e+09 \n",
"1 http://www.AwareMap.com/ 1.331923e+09 \n",
"2 http://t.co/03elZC4Q 1.331923e+09 \n",
"3 direct 1.331923e+09 \n",
"4 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 \n",
"\n",
" tz u \n",
"0 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991 \n",
"1 America/Denver http://www.monroecounty.gov/etc/911/rss.php \n",
"2 America/New_York http://boxer.senate.gov/en/press/releases/0316... \n",
"3 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html \n",
"4 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341... "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(records)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 3560 entries, 0 to 3559\n",
"Data columns (total 18 columns):\n",
"_heartbeat_ 120 non-null float64\n",
"a 3440 non-null object\n",
"al 3094 non-null object\n",
"c 2919 non-null object\n",
"cy 2919 non-null object\n",
"g 3440 non-null object\n",
"gr 2919 non-null object\n",
"h 3440 non-null object\n",
"hc 3440 non-null float64\n",
"hh 3440 non-null object\n",
"kw 93 non-null object\n",
"l 3440 non-null object\n",
"ll 2919 non-null object\n",
"nk 3440 non-null float64\n",
"r 3440 non-null object\n",
"t 3440 non-null float64\n",
"tz 3440 non-null object\n",
"u 3440 non-null object\n",
"dtypes: float64(4), object(14)\n",
"memory usage: 500.7+ KB\n"
]
}
],
"source": [
"# Per-column dtype and non-null count, to see which fields are sparsely populated.\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"America/New_York 4\n",
" 3\n",
"Europe/Warsaw 1\n",
"America/Denver 1\n",
"America/Sao_Paulo 1\n",
"Name: tz, dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"tz\"][:10].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"clean_tz = df[\"tz\"].fillna(\"Missing\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"clean_tz[clean_tz == \"\"] = \"Unknown\""
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 America/New_York\n",
"1 America/Denver\n",
"2 America/New_York\n",
"3 America/Sao_Paulo\n",
"4 America/New_York\n",
"5 America/New_York\n",
"6 Europe/Warsaw\n",
"7 Unknown\n",
"8 Unknown\n",
"9 Unknown\n",
"10 America/Los_Angeles\n",
"11 America/New_York\n",
"12 America/New_York\n",
"13 Missing\n",
"14 America/New_York\n",
"15 Asia/Hong_Kong\n",
"16 Asia/Hong_Kong\n",
"17 America/New_York\n",
"18 America/Denver\n",
"19 Europe/Rome\n",
"Name: tz, dtype: object"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_tz.head(n = 20)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"America/New_York 8\n",
"Unknown 3\n",
"America/Denver 2\n",
"Asia/Hong_Kong 2\n",
"Europe/Warsaw 1\n",
"America/Los_Angeles 1\n",
"Europe/Rome 1\n",
"America/Sao_Paulo 1\n",
"Missing 1\n",
"Name: tz, dtype: int64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_tz[:20].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA4EAAAFlCAYAAABV4O1xAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XlY1XXe//HXYRMMNxqX25BySXFBUdAM19sFFUXcyLpL\ns5wyTfE2s4sRc5vQxru8R500nUZTy4WLmAiQLJeCUSQ9MgYuY5IlbS5jIhwVWc7vD3+d+/ZWEx31\ni36ej+vqujpwvt/zPud8u/TZ53u+x+Z0Op0CAAAAABjBzeoBAAAAAAB3DhEIAAAAAAYhAgEAAADA\nIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQD6sHwM2x2+1WjwAAAACgigsJCbniZ0TgXexqbyhgFbvd\nzjGJKoVjElURxyWqGo7Je9u1Fo44HRQAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG4cIwd7GT\ny96zegTAJUDSyS8OWj3GHVd3/FNWjwAAAHBDWAkEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACA\nQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQC\nAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAA\nDEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCAeN3LnLVu26MUXX9SmTZvUtGnTWz5Mbm6u\nkpOTNWPGjJvaPi0tTceOHVP9+vU1ffp0ffjhhwoMDJQkDRo0SG+//bb8/f1v2bwJCQnauXOn/vjH\nP0qSiouLNWTIEK1atUqNGjWq1D5efvll9e/fX3369Lllc6HyLpSVqrS83OoxcBfzKiy0egRcg8Ph\nUOFd8P54eXnJx8fH6jEAAAa5oQhMTU1VSEiI0tLSFBMTc0sHKSsrU1BQkIKCgm56HxkZGRo1apQO\nHz6sBg0a6O2333YF2u0QHR2tpKQk7dy5U2FhYVq0aJGGDx9e6QAsKyu7bbPh+lbmZOnT/ENyWj0I\n7m4p662eAHc5Nzc3RUZG3vI/VwEAuJZKnw7qcDhkt9sVHx+vtLQ0SVJ2draeeuopjR8/Xr1799Yb\nb7yhjz76SCNGjFBkZKSOHTsmSTp9+rQmTZqk4cOHa/jw4bLb7ZKkJUuWaNq0aXr88cf1yiuvKDs7\nW+PGjXM93u9+9ztFRkYqMjJSmzdvliTNmjVLw4YN08CBA7V48WLXfE6nUwcPHlTr1q0lST179tSR\nI0f09ddfX/Fc/va3v2nkyJEaOnSoYmJi5HA49OWXX2rixImSLq14tm3bVhcvXlRJSYl69+591dfE\nZrNp9uzZmjdvnnJzc7Vr1y6NHTtWkrR//35FR0crMjJSkyZNUlFRkSTpiSee0Lx58zRs2DC99957\nl+3vzTff1PTp01VRUVHZtwX/gk8IQABVQEVFhZKTk60eAwBgkEpH4NatW9WtWzc1btxYderUUV5e\nniTp0KFDmjNnjtLT05WcnKxvvvlGiYmJGjFihNauXStJio+P19NPP60PPvhAS5Ysuex0z/z8fL37\n7rtauHDhZY+3dOlS+fr6KiUlRSkpKercubMkacqUKUpKStJHH32k3bt369ChQ5KkAwcOKDAwUDab\n7dITc3PTb3/7Wy1fvvyy/Z4+fVrLli3TqlWr9Ne//lVt2rTRqlWr1KpVKx08eFCSZLfb9fDDDys3\nN1f79u1Tu3btrvm6BAYGqmvXrhozZoxmzJghLy8vSdK0adMUGxurlJQUNW7cWEuXLnVtU1FRoaSk\nJI0ZM8b1s3nz5qm4uFjx8fFyc+OjmndCeNNA2WSzegwAhnNzc1NUVJTVYwAADFLp00HT0tI0evRo\nSVJERITS0tLUs2dPBQUFqV69epKkgIAAdenS
RZLUvHlzZWdnS5J27typI0eOuPZVXFwsh8MhSerV\nq5e8vb2veLysrKzLwrBWrVqSpPT0dCUkJKisrEwnT55Ufn6+AgMDlZmZqe7du1+2j0GDBmnZsmUq\nKChw/Wzfvn06cuSInnjiCUlSaWmpgoOD5eHhoYCAAOXn5+vLL7/UM888oz179qi8vFwhISG/+to8\n+eSTysjI0COPPCJJ+vnnn1VSUuLabsiQIXrllVdc9x8wYMBl2y9evFgdOnTQ7Nmzf/VxcGs92/5R\n/UdQKJ8JxL/k/meirR4B13C9/4lXVfCZQADAnVapCDxz5ox27dqlw4cPy2azqby8XDabTT169HCt\nfEmX/m/mL7fd3NxU/v//cl1RUaGEhARVq1btin3fyB98BQUFWrlypRITE1WrVi3FxsaqpKREkrRj\nx47LTg+VJA8PDz377LP685//7PqZ0+lUly5drlh5lKTQ0FBlZGTIw8NDYWFhio2NVXl5+WUBdzU2\nm+2GVu+qV69+2e22bdsqNzdXhYWFrtjFneHt4SlvD0+rx8BdjP9mq6777ruP9wcAgKuoVLls3rxZ\nUVFR2r59u7Zt26bPP/9c/v7+2rNnT6UepGvXrq5TQyW5Trv8NWFhYXr//fddtwsLC+VwOOTj46Ma\nNWro1KlTysjIkCQVFRWprKxMderUuWI/Q4cOVVZWlk6fPi1JCg4O1t69e/Xtt99Kks6dO6ejR49K\nuhSBq1evVnBwsPz8/HTmzBkdPXpUzZs3r9Tz/EWdOnXk7e2tvXv3SpKSk5PVsWPHa96/Z8+eevbZ\nZzVu3DjXCikAAAAA3A6VisDU1NQrvsIgPDzcdYGY64mLi1NeXp4iIyMVERGh9euvfzW98ePH6+zZ\nsxo0aJAGDx6s7OxsBQYGqlWrVhowYICmTp2qDh06SLq0ChgWFnbV/Xh5eWnUqFH65z//KUny8/PT\n/Pnz9dJLLykyMlIjR450XTymXbt2OnXqlCvYWrRooebNm7s+Z3gjFixYoPnz5ysyMlL5+fmaMGHC\nr95/4MCBGjZsmCZMmOBa3QQAAACAW83mdDrv+gskxsXFKTo6WsHBwVaPcsfY7XYFfHH9FVUAt1fd\n8U9ZPQKuwW63X/cz3cCdxnGJqoZj8t52rff3hr4nsKqKj4+3egQAAAAAuCvcExF4J8yZM8f1Gb9f\njB49WsOHD7doIgAAAAC4cURgJc2aNcvqEQAAAADgX8a3kgMAAACAQYhAAAAAADAIEQgAAAAABiEC\nAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAA\nAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADOJh9QC4eXXHP2X1CICL3W5X\nSEiI1WMAAADgOlgJBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYh\nAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBPKweADcvcVV/q0e4\n64145mOrRwAAAADuKFYCAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAA\ngEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQW57BG7ZskUtWrRQfn7+bdl/bm6uXnvttZvePi0tTcuWLVNSUpI6d+6sIUOGKDw8\nXGPHjtXevXtv4aQAAAAAYL3bHoGpqakKCQlRWlraLd93WVmZgoKCNGPGjJveR0ZGhrp16yZJioiI\n0IcffqhP
PvlEzz33nCZNmnTb4vXXlJWV3fHHBAAAAGAGj9u5c4fDIbvdrjVr1uiFF15QTEyMsrOz\ntWTJEtWoUUOHDx/WgAED1Lx5c61Zs0YlJSV66623FBAQoNOnT2vWrFn64YcfJEnTp09XSEiIlixZ\nomPHjqmgoEANGzbUyJEjtXLlSi1fvlwOh0Ovvfaa8vLyJEkTJ05Uv379NGvWLOXm5qqkpET9+vVT\nTEyMJMnpdOrgwYNq3bq1Dh8+fNnsnTt31mOPPaaNGzdq+vTpOnbsmObMmaOff/5Z3t7e+v3vf6+m\nTZsqNjZWvr6+ysvL08mTJzVt2jT1799fU6ZMUVRUlHr27ClJio2NVc+ePdW3b1+98cYb+uKLL3Tx\n4kU9+eSTevzxx5Wdna1FixapZs2aOnr0qDZv3nw73xoAAAAAhrqtEbh161Z169ZNjRs3Vp06dVxx\ndujQIW3atEm1a9dW7969FR0drcTERK1evVpr165VXFyc4uPj9fTTTys0NFQ//PCDxo4dq/T0dElS\nfn6+1q1bJ29vb2VnZ7seb+nSpfL19VVKSookqbCwUJI0ZcoU1a5dW+Xl5RozZowOHTqkwMBAHThw\nQIGBgbLZbFedv3Xr1tqwYYMk6dVXX9WcOXP00EMPad++fZozZ47WrFkjSTpx4oTWrVunr7/+WuPH\nj1f//v0VERGh9PR09ezZUxcvXlRWVpZmz56txMRE1ahRQx988IEuXryoxx9/XF26dJEkHThwQCkp\nKWrUqNFteDcAAAAA4DZHYFpamkaPHi3p0qmWaWlp6tmzp4KCglSvXj1JUkBAgCuCmjdv7oq6nTt3\n6siRI659FRcXy+FwSJJ69eolb2/vKx4vKytLCxcudN2uVauWJCk9PV0JCQkqKyvTyZMnlZ+fr8DA\nQGVmZqp79+7XnN/pdEq6tKKZk5OjyZMnu3538eJF17/36dNHbm5uatasmU6dOiVJ6t69u+Lj43Xx\n4kVlZGQoNDRU3t7e2rFjh/7xj3+4VvqKior07bffytPTU0FBQQQgAAAAgNvqtkXgmTNntGvXLh0+\nfFg2m03l5eWy2Wzq0aOHvLy8XPdzc3Nz3XZzc1N5ebkkqaKiQgkJCapWrdoV+/bx8an0HAUFBVq5\ncqUSExNVq1YtxcbGqqSkRJK0Y8cOLV68+JrbHjhwQE2bNpXT6VTNmjWVnJx81fv97+fzi2rVqqlT\np07KzMxUenq6IiIiJF0KyxkzZrg+h/iL7OxsVa9evdLPCwAAAABuxm27MMzmzZsVFRWl7du3a9u2\nbfr888/l7++vPXv2VGr7rl27au3ata7bBw8evO42YWFhev/99123CwsL5XA45OPjoxo1aujUqVPK\nyMiQdGkFrqysTHXq1Lnqvr744gslJCTosccek6+vr/z9/V2nozqdTh06dOi680RERCgpKUl79uxx\nRV/Xrl21fv16lZaWSpKOHj2qc+fOXXdfAAAAAHAr3LYITE1NVZ8+fS77WXh4eKWvEhoXF6e8vDxF\nRkYqIiJC69evv+4248eP19mzZzVo0CANHjxY2dnZCgwMVKtWrTRgwABNnTpVHTp0kHRpFTAsLOyy\n7Tdt2qSoqCj169dPy5cv1+LFi9W0aVNJ0n/9138pMTFRgwcP1sCBA7Vly5brztOlSxft3r1bYWFh\nrtXC6OhoNWvWTMOGDdOgQYM0c+ZM1+onAAAAANxuNucvH3wzTFxcnKKjoxUcHGz1KDfFbrfr6Jdx\nVo9x1xvxzMdWj3DPsNvtCgkJsXoMwIVjElURxyWqGo7Je9u13t/bemGYqiw+Pt7qEQAAAADgjrvt\nXxYPAAAAAKg6iEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQ\nAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGI
QIBAAAAACDEIEAAAAA\nYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIB5WD4CbN+KZj60eAQAAAMBd\nhpVAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQC\nAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABjEw+oBcPMi/vqa1SMAlzuWbvUE94RN\nQ2dYPQIAALiHsRIIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAi\nEAAAAAAMUqkI3LJli1q0aKH8/PzbMkRubq5ee+21m94+LS1Ny5Yt06lTpzRu3DgNHjxYEREReu65\n527hlFLLli0VFRWlQYMGKSYmRufPn7+p/SxZskR/+ctfbulsAAAAAFAZlYrA1NRUhYSEKC0t7ZYP\nUFZWpqCgIM2YMeOm95GRkaFu3bpp8eLFCgsL00cffaRNmzZp6tSpt3BSydvbW8nJyUpNTZWnp6c2\nbNhwS/cPAAAAALebx/Xu4HA4ZLfbtWbNGr3wwguKiYlRdna2lixZoho1aujw4cMaMGCAmjdvrjVr\n1qikpERvvfWWAgICdPr0ac2aNUs//PCDJGn69OkKCQnRkiVLdOzYMRUUFKhhw4YaOXKkVq5cqeXL\nl8vhcOi1115TXl6eJGnixInq16+fZs2apdzcXJWUlKhfv36KiYmRJDmdTh08eFCtW7fWiRMn1KVL\nF9fsgYGBrucwYcIEnT17VmVlZZo8ebL69OkjSVq1apU++OADSdKIESM0ZsyYSr1woaGh+sc//iFJ\nmjBhgn766SeVlJRo9OjRGjlypCSpffv2ysnJkSR9/PHH+uyzz/T6669ftp+DBw9q1qxZOn/+vAIC\nAjRv3jzVqlWrUjMAAAAAwI26bgRu3bpV3bp1U+PGjVWnTh1XnB06dEibNm1S7dq11bt3b0VHRysx\nMVGrV6/W2rVrFRcXp/j4eD399NMKDQ3VDz/8oLFjxyo9PV2SlJ+fr3Xr1snb21vZ2dmux1u6dKl8\nfX2VkpIiSSosLJQkTZkyRbVr11Z5ebnGjBmjQ4cOKTAwUAcOHFBgYKBsNpuefPJJTZkyRe+9957C\nwsI0bNgw1a9fX9WqVdNbb70lX19fnT59WiNHjlTv3r21f/9+JSUlKSEhQU6nU4899pg6deqkVq1a\n/eprUlZW5lp9lKR58+apdu3aunDhgkaMGKHw8HDVqVOnUm/AK6+8oldffVWdOnXSokWL9Kc//Ulx\ncXGV2hYAAAAAbtR1IzAtLU2jR4+WJEVERCgtLU09e/ZUUFCQ6tWrJ0kKCAhwrcA1b97cFXU7d+7U\nkSNHXPsqLi6Ww+GQJPXq1Uve3t5XPF5WVpYWLlzouv3Lqlh6eroSEhJUVlamkydPKj8/X4GBgcrM\nzFT37t0lSd26ddOWLVuUmZmpjIwMDR06VKmpqapRo4YWLlyo3bt3y83NTcePH9epU6dkt9vVp08f\nVa9eXZLUt29f7dmz55oReOHCBUVFRUm6tBI4YsQISdLatWv16aefSpJ+/PFHffvtt5WKwKKiIhUV\nFalTp06SpKFDh2ry5MnX3Q4AAAAAbtavRuCZM2e0a9cuHT58WDabTeXl5bLZbOrRo4e8vLxc93Nz\nc3PddnNzU3l5uSSpoqJCCQkJqlat2hX79vHxqfSQBQUFWrlypRITE1WrVi3FxsaqpKREkrRjxw4t\nXrzYdd/atWsrMjJSkZGRGjdunHbv3i
2Hw6HTp08rKSlJnp6e6tWrl2v7G/HLZwL/t+zsbO3cuVMb\nN26Uj4+PRo0addV938zjAQAAAMCt9qsXhtm8ebOioqK0fft2bdu2TZ9//rn8/f21Z8+eSu28a9eu\nWrt2rev2wYMHr7tNWFiY3n//fdftwsJCORwO+fj4qEaNGjp16pQyMjIkXVpJKysrc626ZWVlua7Y\nWVxcrGPHjunf/u3fVFRUpPvvv1+enp7atWuXvv/+e0mXVvO2bNmi8+fP69y5c9qyZYtCQ0Mr9dx+\nUVRUpFq1asnHx0f5+fn6+9//7vrdb37zG+Xn56uiokJbtmy5YtsaNWqoZs2artczOTlZHTt2vKHH\nBwAAAIAb8asrgampqVd8zUJ4eLjWr1+vgICA6+48Li5Oc+fOVWRkpMrLyxUaGqq5c+f+6jbjx4/X\n3LlzNWjQILm5uWnixIkKDw9Xq1atNGDAADVo0EAdOnSQdGkVMCwszLXt/v379fvf/17u7u5yOp2K\njo5W27Zt5e/vr/HjxysyMlJt2rRRkyZNJEmtW7fWsGHDFB0dLenShWGu93nA/6t79+7asGGDBgwY\noMaNGys4ONj1u6lTp2rcuHHy8/NTmzZtdO7cuSu2/8Mf/uC6MEyjRo00f/78G3p8AAAAALgRNqfT\n6bR6iJsVFxen6Ojoy8LLFHa7Xa8eS7d6DAC3waahN/+VOfgfdrtdISEhVo8BXIbjElUNx+S97Vrv\n73UvDFOVxcfHWz0CAAAAANxV7uoIvB1+/vnnq35X4Lvvvlvpr30AAAAAgKqKCPw/6tSpc8UVQAEA\nAADgXvGrVwcFAAAAANxbiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAA\nAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACD\nEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEE8rB4AN2/T0BlWjwC42O12hYSEWD0GAAAAroOVQAAAAAAw\nCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAA\nAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYxMPqAXDzhn7wN6tHAC73DcckqhiOSVQxMx7ysXoE\nAGAlEAAAAABMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCB\nAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAA\nAIMQgQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGOSO\nRGDLli0VFRXl+mfFihV34mGvacWKFfroo4/0yCOPyOl0SpJycnLUokUL/fTTT5KkoqIiderUSRUV\nFVaOCgAAAAC3lMedeBBvb28lJyff1LZlZWXy8Li1Y2ZmZmrRokWqW7eu8vPz1axZM+Xk5KhVq1ba\nu3evIiIi9Pe//11BQUFyc6tcJ9+OOQEAAADgVrO0Wnr16qXExET5+fkpNzdXCxYs0Nq1a7VkyRId\nO3ZMBQUFatiwoebPn6/Zs2crLy9P7u7uio2NVefOnZWUlKRPP/1UxcXFOn78uAYPHqyJEydKkpKT\nk7V27VqVlpaqXbt2mjVrltzd3VVcXKzS0lL5+fmpffv2ysnJcUXg008/rZycHEVERCgnJ0cdOnSQ\nJCUkJGjjxo0qLS3Vgw8+qAULFsjHx0exsbHy8vLSwYMH1aFDB/Xu3Vvx8fGSJJvNpvfee082m00T\nJkzQ2bNnVVZWpsmTJ6tPnz5655135OXlpdGjR2vevHk6dOiQ1qxZo6ysLCUmJurNN9+07H0BAAAA\ncO
+6IxF44cIFRUVFuW6PGzdOERERv7pNfn6+1q1bJ29vb61cuVKSlJKSovz8fI0dO1abN2+WJOXm\n5iolJUU+Pj4aMWKEevTooerVqys9PV3r16+Xp6enZs+erZSUFA0ZMkQ7d+7Uo48+Kknq0KGDvvji\nC0VHR6ugoEADBgzQxo0bJV06PfT555+XJPXt21ePPfaYJOm///u/lZiYqFGjRkmSjh8/rg0bNsjd\n3V0vvPCCZs6cqZCQEDkcDlWrVk2S9NZbb8nX11enT5/WyJEj1bt3b4WGhmrlypUaPXq08vLydPHi\nRZWWlsput6tjx4636qUHAAAAgMtU2dNBe/XqJW9vb0mS3W7XU089JUlq2rSpGjZsqKNHj0qSwsLC\nVKdOHUmXYs1ut8vDw0N5eXkaMWKEpEsRev/990u6dCrosGHDJEnt27fX8uXLVVBQoAceeEDVqlWT\n0+mUw+HQ/v371bZtW0nSV199pT/+8Y8qKiqSw+FQ165dXXP2799f7u7uki5F5euvv67IyEiFh4fr\nvvvuU2lpqRYuXKjdu3fLzc1Nx48f16lTp9S6dWvt379fxcXF8vLyUqtWrZSXl6c9e/ZoxowZN/U6\nAwAAAMD1WHo6qLu7u+vCLCUlJZf9zsfHp1L7sNlsV9x2Op0aOnSopk6desX9v/zyS82ePVuS9NBD\nD6moqEjbt29XcHCwJKlNmzZKSkrSAw88oPvuu0+SFBsbq6VLlyowMFBJSUn64osvrjrn888/rx49\neujzzz/XE088oXfeeUf79u3T6dOnlZSUJE9PT/Xq1UslJSXy9PSUv7+/kpKS1L59e7Vo0ULZ2dk6\nduyYmjZtWqnnDgAAAAA3ytKviHjggQeUl5cnSfrkk0+ueb/Q0FClpKRIko4ePaoff/xRTZo0kSTt\n2LFDZ86c0YULF7RlyxZ16NBBjz76qDZv3qx//vOfkqQzZ87o+++/11dffaUmTZq4Vu4kqV27dlqz\nZo3at28vSQoODtbq1atdnweUJIfDobp166q0tNQ1x9UcO3ZMLVq00PPPP6+goCAdPXpURUVFuv/+\n++Xp6aldu3bp+++/v+x5rVy5Uh07dlRoaKg2bNigli1bXhG2AAAAAHCrWPKZwG7duunll1/WxIkT\nFRcXp0WLFumRRx655vb/8R//odmzZysyMlLu7u6aP3++vLy8JElt27bVpEmTXBeGCQoKkiT953/+\np5599llVVFTI09NTM2fOlN1uV7du3S7bd4cOHZSRkaE2bdpIuhSBBQUFriiUpMmTJys6Olp+fn5q\n166dHA4BsxV8AAAK+ElEQVTHVedcvXq1srOzZbPZ9PDDD6t79+4qLi7W+PHjFRkZqTZt2rjiVboU\ngW+//baCg4NVvXp1VatWTaGhoTf46gIAAABA5dmcv5yPeRdKSkpSXl6eZs6cWan7P/PMM/rDH/6g\nevXq3ebJbj+73a7Xvjlv9RgAAOAGzHjIRyEhIVaPAbjY7XaOyXvYtd5fo77YbtWqVVaPAAAAAACW\nuqsjcNiwYa4rfQIAAAAArs/SC8MAAAAAAO4sIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG\nIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgA\nAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAM4mH1ALh5fx3e1eoRABe7\n3a6QkBCrxwBcOCZRFdntdqtHAABWAgEAAADAJEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAA\nDEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgEA+r\nB8DNy3nnhNUjAC5uaqScHI5JVB0ck6iKOC5R5bS3egBYgZVAAAAA
ADAIEQgAAAAABiECAQAAAMAg\nRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEA\nAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG\nIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECMi8LvvvtOgQYMu+9mSJUv0l7/85ZrbJCUl\nae7cubd7NAAAAAC4ozysHgAAAAC4ESWlF1RWUWr1GPeECodDhYWFVo9xT/Dy8pKPj4/VY1SK8RE4\natQotW3bVtnZ2SoqKlJ8fLxCQ0Mvu89nn32mZcuWadmyZVqwYIF8fX2Vl5enkydPatq0aerfv7+c\nTqcWLFigzMxM2Ww2jR8/XhEREZozZ466du2q3r1768UXX1TNmjU1f/58JSYmqqCgQNHR0XruuecU\nEhKinJwc1a9fX0uXLpW3t7dFrwgAAEDVlbRnlbK++lROOa0e5d6QZPUA9w43NzdFRkYqJibG6lGu\ny4jTQa+nvLxciYmJmj59uv70pz9d9rtPP/1UK1as0IoVK+Tn5ydJOnHihNatW6fly5frzTfflCR9\n8sknOnTokJKTk7Vq1SotWLBAJ06cUGhoqPbs2SNJOn78uPLz8yVJdrvdFZvffvutnnzySaWlpalG\njRravHnznXrqAAAAd5WdX31CAKJKqqioUHJystVjVIoREWiz2X7153379pUktW7dWt9//73r97t2\n7dKf//xnrVixQrVq1XL9vE+fPnJzc1OzZs106tQpSZeibuDAgXJ3d9dvfvMbdezYUbm5uQoNDZXd\nbteRI0fUrFkz3X///Tpx4oRycnLUvn17SZK/v79atmx51RkAAADwP8IeDr/m3+0AK7m5uSkqKsrq\nMSrFiNNBa9eufcW5zoWFhfL395d06fxd6dIbV15e7rpPQECACgoKdPToUQUFBbl+/sv9K6N+/fo6\ne/asMjMzFRoaqsLCQqWnp6t69ery9fXVmTNnLtufu7u7SkpKbup5AgAA3OuGhT6jge2e4DOBt0hF\n0Pdq166d1WPcE/hMYBVz3333qW7dusrKytKjjz6qM2fOKDMzU6NHj1ZS0rVPhG7YsKGmTZumSZMm\nadGiRXr44Yeved/Q0FBt3LhRQ4cOVWFhofbs2aNXXnlFkhQcHKzVq1dr9erVOnPmjGJiYtSvX79b\n/jwBAABMUM3TW9XE9RNuhYr77rvsjDeYwYgIlKQFCxZozpw5ev311yVJL774ogICAq67XdOmTfXG\nG29o8uTJevvtt695v759+yonJ0dRUVGy2WyaNm2a6tatK0kKCQnR3/72Nz344INq2LChCgsLr7j4\nDAAAAADcCTan08kna+9CdrtdbjmNrB4DAAAAd7GK9gUKCQmxegzcJna7/arvrxEXhgEAAAAAXEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAi\nEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAbxsHoA3Lz2v61n9QiAi91uV0hIiNVjAC4ck6iK\nOC5R1djtBVaPAAuwEggAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAM\nYnM6nU6rh8CNs9vtVo8AAAAAoIq72tfSEIEAAAAAYBBOBwUAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgReJfJyMhQv3791LdvX61YscLqcWC4H3/8UaNGjVJERIQGDhyo1atXWz0S\nIEkqLy/XkCFDNG7cOKtHASRJ
Z8+eVUxMjPr3768BAwYoJyfH6pFguHfffVcDBw7UoEGD9NJLL6mk\npMTqkXAHEYF3kfLycs2dO1fvvPOO0tLSlJqaqiNHjlg9Fgzm7u6u2NhYbdq0SRs3btS6des4JlEl\nrFmzRk2bNrV6DMAlPj5e3bp108cff6zk5GSOT1jq+PHjWrNmjT744AOlpqaqvLxcaWlpVo+FO4gI\nvIt8+eWXevDBB9WoUSN5eXlp4MCB2rp1q9VjwWD16tVT69atJUm+vr5q0qSJjh8/bvFUMN1PP/2k\nzz77TCNGjLB6FECSVFRUpN27d7uOSS8vL9WsWdPiqWC68vJyXbhwQWVlZbpw4YLq1atn9Ui4g4jA\nu8jx48fVoEED1+369evzF25UGd99950OHjyodu3aWT0KDDdv3jxNmzZNbm78EYeq4bvvvpOfn59+\n97vfaciQIYqLi9O5c+esHgsGq1+/vp599ln9+7//u7p27SpfX1917drV6rFwB/EnJIB/mcPhUExM\njKZPny5fX1+rx4HBtm/fLj8/P7Vp08bqUQCXsrIyHThwQE888YQ+/PBD+fj48Ll+WKqwsFBbt27V\n1q1blZmZqfPnzys5OdnqsXAHEYF3kfr16+unn35y3T5+/Ljq169v4USAVFpaqpiYGEVGRio8PNzq\ncWC4vXv3atu2berVq5deeukl7dq1Sy+//LLVY8FwDRo0UIMGDVxnSvTv318HDhyweCqYbOfOnfL3\n95efn588PT0VHh7OxYoMQwTeRYKCgvTNN9+ooKBAFy9eVFpamnr16mX1WDCY0+lUXFycmjRpomee\necbqcQBNnTpVGRkZ2rZtmxYuXKjOnTvrjTfesHosGK5u3bpq0KCBvv76a0lSVlYWF4aBpRo2bKh9\n+/bp/PnzcjqdHJMG8rB6AFSeh4eHZs6cqd/+9rcqLy/X8OHD9fDDD1s9Fgxmt9uVnJys5s2bKyoq\nSpL00ksvqUePHhZPBgBVy6uvvqqXX35ZpaWlatSokebPn2/1SDBYu3bt1K9fPw0dOlQeHh5q2bKl\nRo4cafVYuINsTqfTafUQAAAAAIA7g9NBAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAA\nAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAb5f5cc9EBFqbCUAAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.barplot(y = clean_tz[:10].values, x = clean_tz[:10].index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## MovieLens data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" after removing the cwd from sys.path.\n",
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" \n"
]
}
],
"source": [
"# Make display smaller\n",
"pd.options.display.max_rows = 15\n",
"unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n",
"users = pd.read_table('datasets/movielens/users.dat', sep='::',header=None, names=unames)\n",
"rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
"ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',header=None, names=rnames)\n",
"mnames = ['movie_id', 'title', 'genres']\n",
"movies = pd.read_table('datasets/movielens/movies.dat', sep='::',header=None, names=mnames)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" gender | \n",
" age | \n",
" occupation | \n",
" zip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" M | \n",
" 56 | \n",
" 16 | \n",
" 70072 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" M | \n",
" 25 | \n",
" 15 | \n",
" 55117 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" M | \n",
" 45 | \n",
" 7 | \n",
" 02460 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" M | \n",
" 25 | \n",
" 20 | \n",
" 55455 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id gender age occupation zip\n",
"0 1 F 1 10 48067\n",
"1 2 M 56 16 70072\n",
"2 3 M 25 15 55117\n",
"3 4 M 45 7 02460\n",
"4 5 M 25 20 55455"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"users.head()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" movie_id | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 661 | \n",
" 3 | \n",
" 978302109 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 914 | \n",
" 3 | \n",
" 978301968 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 3408 | \n",
" 4 | \n",
" 978300275 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2355 | \n",
" 5 | \n",
" 978824291 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id movie_id rating timestamp\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings.head()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movie_id | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Animation|Children's|Comedy | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children's|Fantasy | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Waiting to Exhale (1995) | \n",
" Comedy|Drama | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Father of the Bride Part II (1995) | \n",
" Comedy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movie_id title genres\n",
"0 1 Toy Story (1995) Animation|Children's|Comedy\n",
"1 2 Jumanji (1995) Adventure|Children's|Fantasy\n",
"2 3 Grumpier Old Men (1995) Comedy|Romance\n",
"3 4 Waiting to Exhale (1995) Comedy|Drama\n",
"4 5 Father of the Bride Part II (1995) Comedy"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.head()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" movie_id | \n",
" rating | \n",
" timestamp | \n",
" gender | \n",
" age | \n",
" occupation | \n",
" zip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 661 | \n",
" 3 | \n",
" 978302109 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 914 | \n",
" 3 | \n",
" 978301968 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 3408 | \n",
" 4 | \n",
" 978300275 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2355 | \n",
" 5 | \n",
" 978824291 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id movie_id rating timestamp gender age occupation zip\n",
"0 1 1193 5 978300760 F 1 10 48067\n",
"1 1 661 3 978302109 F 1 10 48067\n",
"2 1 914 3 978301968 F 1 10 48067\n",
"3 1 3408 4 978300275 F 1 10 48067\n",
"4 1 2355 5 978824291 F 1 10 48067"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"first_merge = pd.merge(ratings, users, on = \"user_id\")\n",
"first_merge.head()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" movie_id | \n",
" rating | \n",
" timestamp | \n",
" gender | \n",
" age | \n",
" occupation | \n",
" zip | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
" F | \n",
" 1 | \n",
" 10 | \n",
" 48067 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1193 | \n",
" 5 | \n",
" 978298413 | \n",
" M | \n",
" 56 | \n",
" 16 | \n",
" 70072 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 2 | \n",
" 12 | \n",
" 1193 | \n",
" 4 | \n",
" 978220179 | \n",
" M | \n",
" 25 | \n",
" 12 | \n",
" 32793 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 3 | \n",
" 15 | \n",
" 1193 | \n",
" 4 | \n",
" 978199279 | \n",
" M | \n",
" 25 | \n",
" 7 | \n",
" 22903 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
" 4 | \n",
" 17 | \n",
" 1193 | \n",
" 5 | \n",
" 978158471 | \n",
" M | \n",
" 50 | \n",
" 1 | \n",
" 95350 | \n",
" One Flew Over the Cuckoo's Nest (1975) | \n",
" Drama | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id movie_id rating timestamp gender age occupation zip \\\n",
"0 1 1193 5 978300760 F 1 10 48067 \n",
"1 2 1193 5 978298413 M 56 16 70072 \n",
"2 12 1193 4 978220179 M 25 12 32793 \n",
"3 15 1193 4 978199279 M 25 7 22903 \n",
"4 17 1193 5 978158471 M 50 1 95350 \n",
"\n",
" title genres \n",
"0 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"1 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"2 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"3 One Flew Over the Cuckoo's Nest (1975) Drama \n",
"4 One Flew Over the Cuckoo's Nest (1975) Drama "
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"second_merge = pd.merge(first_merge, movies, on = \"movie_id\")\n",
"second_merge.head()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000209, 10)"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"second_merge.shape"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"# Short working name for the fully merged table (same object, not a copy).\n",
"df = second_merge"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" rating | \n",
"
\n",
" \n",
" gender | \n",
" title | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" F | \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" M | \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.380952 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.723140 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.357143 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.289086 | \n",
"
\n",
" \n",
"
\n",
"
7152 rows × 1 columns
\n",
"
"
],
"text/plain": [
" rating\n",
"gender title \n",
"F $1,000,000 Duck (1971) 3.375000\n",
" 'Night Mother (1986) 3.388889\n",
" 'Til There Was You (1997) 2.675676\n",
" 'burbs, The (1989) 2.793478\n",
" ...And Justice for All (1979) 3.828571\n",
"... ...\n",
"M Zed & Two Noughts, A (1985) 3.380952\n",
" Zero Effect (1998) 3.723140\n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) 3.500000\n",
" Zeus and Roxanne (1997) 2.357143\n",
" eXistenZ (1999) 3.289086\n",
"\n",
"[7152 rows x 1 columns]"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby([\"gender\", \"title\"])[[\"rating\"]].agg(\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
" 2.761905 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
" 3.352941 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
" 2.733333 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
" 2.962085 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
" 3.689024 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.500000 | \n",
" 3.380952 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.864407 | \n",
" 3.723140 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.777778 | \n",
" 2.357143 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.098592 | \n",
" 3.289086 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 2 columns
\n",
"
"
],
"text/plain": [
"gender F M\n",
"title \n",
"$1,000,000 Duck (1971) 3.375000 2.761905\n",
"'Night Mother (1986) 3.388889 3.352941\n",
"'Til There Was You (1997) 2.675676 2.733333\n",
"'burbs, The (1989) 2.793478 2.962085\n",
"...And Justice for All (1979) 3.828571 3.689024\n",
"... ... ...\n",
"Zed & Two Noughts, A (1985) 3.500000 3.380952\n",
"Zero Effect (1998) 3.864407 3.723140\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n",
"Zeus and Roxanne (1997) 2.777778 2.357143\n",
"eXistenZ (1999) 3.098592 3.289086\n",
"\n",
"[3706 rows x 2 columns]"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.pivot_table(\"rating\", index = \"title\", columns = \"gender\")"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" gender | \n",
" F | \n",
" M | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 3.375000 | \n",
" 2.761905 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 3.388889 | \n",
" 3.352941 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 2.675676 | \n",
" 2.733333 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 2.793478 | \n",
" 2.962085 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 3.828571 | \n",
" 3.689024 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 3.500000 | \n",
" 3.380952 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 3.864407 | \n",
" 3.723140 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" NaN | \n",
" 3.500000 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 2.777778 | \n",
" 2.357143 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 3.098592 | \n",
" 3.289086 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 2 columns
\n",
"
"
],
"text/plain": [
"gender F M\n",
"title \n",
"$1,000,000 Duck (1971) 3.375000 2.761905\n",
"'Night Mother (1986) 3.388889 3.352941\n",
"'Til There Was You (1997) 2.675676 2.733333\n",
"'burbs, The (1989) 2.793478 2.962085\n",
"...And Justice for All (1979) 3.828571 3.689024\n",
"... ... ...\n",
"Zed & Two Noughts, A (1985) 3.500000 3.380952\n",
"Zero Effect (1998) 3.864407 3.723140\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n",
"Zeus and Roxanne (1997) 2.777778 2.357143\n",
"eXistenZ (1999) 3.098592 3.289086\n",
"\n",
"[3706 rows x 2 columns]"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg = df.pivot_table(\"rating\", index = \"title\", columns = \"gender\", aggfunc = \"mean\")\n",
"avg"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
"
\n",
" \n",
" title | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" $1,000,000 Duck (1971) | \n",
" 37 | \n",
"
\n",
" \n",
" 'Night Mother (1986) | \n",
" 70 | \n",
"
\n",
" \n",
" 'Til There Was You (1997) | \n",
" 52 | \n",
"
\n",
" \n",
" 'burbs, The (1989) | \n",
" 303 | \n",
"
\n",
" \n",
" ...And Justice for All (1979) | \n",
" 199 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" Zed & Two Noughts, A (1985) | \n",
" 29 | \n",
"
\n",
" \n",
" Zero Effect (1998) | \n",
" 301 | \n",
"
\n",
" \n",
" Zero Kelvin (Kj�rlighetens kj�tere) (1995) | \n",
" 2 | \n",
"
\n",
" \n",
" Zeus and Roxanne (1997) | \n",
" 23 | \n",
"
\n",
" \n",
" eXistenZ (1999) | \n",
" 410 | \n",
"
\n",
" \n",
"
\n",
"
3706 rows × 1 columns
\n",
"
"
],
"text/plain": [
" title\n",
"title \n",
"$1,000,000 Duck (1971) 37\n",
"'Night Mother (1986) 70\n",
"'Til There Was You (1997) 52\n",
"'burbs, The (1989) 303\n",
"...And Justice for All (1979) 199\n",
"... ...\n",
"Zed & Two Noughts, A (1985) 29\n",
"Zero Effect (1998) 301\n",
"Zero Kelvin (Kj�rlighetens kj�tere) (1995) 2\n",
"Zeus and Roxanne (1997) 23\n",
"eXistenZ (1999) 410\n",
"\n",
"[3706 rows x 1 columns]"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings_by_title = df.groupby(\"title\")[[\"title\"]].count()\n",
"ratings_by_title"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Titles that received at least 2500 ratings.\n",
"ratings_2500 = ratings_by_title.index[ratings_by_title[\"title\"] >= 2500]\n",
"ratings_2500"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mean ratings by gender, restricted to the titles selected in ratings_2500.\n",
"avg.loc[ratings_2500, :]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}