{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Analysis Examples" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "rc = {\"figure.figsize\" : (14, 6)}\n", "sns.set(rc = rc)\n", "my_palette = sns.color_palette(\"husl\", 4)\n", "sns.set_palette(my_palette)\n", "sns.set_style(\"whitegrid\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## USA.gov data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the dataset\n", "Use the `json` module to load the data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "path = \"datasets/bitly_usagov/example.txt\"" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',\n", " 'al': 'en-US,en;q=0.8',\n", " 'c': 'US',\n", " 'cy': 'Danvers',\n", " 'g': 'A6qOVH',\n", " 'gr': 'MA',\n", " 'h': 'wfLQtf',\n", " 'hc': 1331822918,\n", " 'hh': '1.usa.gov',\n", " 'l': 'orofrog',\n", " 'll': [42.576698, -70.954903],\n", " 'nk': 1,\n", " 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',\n", " 't': 1331923247,\n", " 'tz': 'America/New_York',\n", " 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open(path) as f:\n", "    records = [json.loads(line) for line in f]\n", "records[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count the number of timezones with pandas" ] }, { "cell_type": 
"code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_heartbeat_aalccyggrhhchhkwlllnkrttzu
0NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USDanversA6qOVHMAwfLQtf1.331823e+091.usa.govNaNorofrog[42.576698, -70.954903]1.0http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...1.331923e+09America/New_Yorkhttp://www.ncbi.nlm.nih.gov/pubmed/22415991
1NaNGoogleMaps/RochesterNYNaNUSProvomwszkSUTmwszkS1.308262e+09j.mpNaNbitly[40.218102, -111.613297]0.0http://www.AwareMap.com/1.331923e+09America/Denverhttp://www.monroecounty.gov/etc/911/rss.php
2NaNMozilla/4.0 (compatible; MSIE 8.0; Windows NT ...en-USUSWashingtonxxr3QbDCxxr3Qb1.331920e+091.usa.govNaNbitly[38.9007, -77.043098]1.0http://t.co/03elZC4Q1.331923e+09America/New_Yorkhttp://boxer.senate.gov/en/press/releases/0316...
3NaNMozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...pt-brBRBrazzCaLwp27zUtuOu1.331923e+091.usa.govNaNalelex88[-23.549999, -46.616699]0.0direct1.331923e+09America/Sao_Paulohttp://apod.nasa.gov/apod/ap120312.html
4NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USShrewsbury9b6kNlMA9b6kNl1.273672e+09bit.lyNaNbitly[42.286499, -71.714699]0.0http://www.shrewsbury-ma.gov/selco/1.331923e+09America/New_Yorkhttp://www.shrewsbury-ma.gov/egov/gallery/1341...
\n", "
" ], "text/plain": [ " _heartbeat_ a \\\n", "0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n", "1 NaN GoogleMaps/RochesterNY \n", "2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... \n", "3 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... \n", "4 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n", "\n", " al c cy g gr h hc \\\n", "0 en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1.331823e+09 \n", "1 NaN US Provo mwszkS UT mwszkS 1.308262e+09 \n", "2 en-US US Washington xxr3Qb DC xxr3Qb 1.331920e+09 \n", "3 pt-br BR Braz zCaLwp 27 zUtuOu 1.331923e+09 \n", "4 en-US,en;q=0.8 US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 \n", "\n", " hh kw l ll nk \\\n", "0 1.usa.gov NaN orofrog [42.576698, -70.954903] 1.0 \n", "1 j.mp NaN bitly [40.218102, -111.613297] 0.0 \n", "2 1.usa.gov NaN bitly [38.9007, -77.043098] 1.0 \n", "3 1.usa.gov NaN alelex88 [-23.549999, -46.616699] 0.0 \n", "4 bit.ly NaN bitly [42.286499, -71.714699] 0.0 \n", "\n", " r t \\\n", "0 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1.331923e+09 \n", "1 http://www.AwareMap.com/ 1.331923e+09 \n", "2 http://t.co/03elZC4Q 1.331923e+09 \n", "3 direct 1.331923e+09 \n", "4 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 \n", "\n", " tz u \n", "0 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991 \n", "1 America/Denver http://www.monroecounty.gov/etc/911/rss.php \n", "2 America/New_York http://boxer.senate.gov/en/press/releases/0316... \n", "3 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html \n", "4 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341... 
" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(records)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 3560 entries, 0 to 3559\n", "Data columns (total 18 columns):\n", "_heartbeat_ 120 non-null float64\n", "a 3440 non-null object\n", "al 3094 non-null object\n", "c 2919 non-null object\n", "cy 2919 non-null object\n", "g 3440 non-null object\n", "gr 2919 non-null object\n", "h 3440 non-null object\n", "hc 3440 non-null float64\n", "hh 3440 non-null object\n", "kw 93 non-null object\n", "l 3440 non-null object\n", "ll 2919 non-null object\n", "nk 3440 non-null float64\n", "r 3440 non-null object\n", "t 3440 non-null float64\n", "tz 3440 non-null object\n", "u 3440 non-null object\n", "dtypes: float64(4), object(14)\n", "memory usage: 500.7+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "America/New_York 4\n", " 3\n", "Europe/Warsaw 1\n", "America/Denver 1\n", "America/Sao_Paulo 1\n", "Name: tz, dtype: int64" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"tz\"][:10].value_counts()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "clean_tz = df[\"tz\"].fillna(\"Missing\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "clean_tz[clean_tz == \"\"] = \"Unknown\"" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 America/New_York\n", "1 America/Denver\n", "2 America/New_York\n", "3 America/Sao_Paulo\n", "4 America/New_York\n", "5 America/New_York\n", "6 Europe/Warsaw\n", "7 Unknown\n", "8 Unknown\n", "9 Unknown\n", "10 America/Los_Angeles\n", "11 America/New_York\n", "12 
America/New_York\n", "13 Missing\n", "14 America/New_York\n", "15 Asia/Hong_Kong\n", "16 Asia/Hong_Kong\n", "17 America/New_York\n", "18 America/Denver\n", "19 Europe/Rome\n", "Name: tz, dtype: object" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_tz.head(n = 20)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "America/New_York 8\n", "Unknown 3\n", "America/Denver 2\n", "Asia/Hong_Kong 2\n", "Europe/Warsaw 1\n", "America/Los_Angeles 1\n", "Europe/Rome 1\n", "America/Sao_Paulo 1\n", "Missing 1\n", "Name: tz, dtype: int64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_tz[:20].value_counts()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAA4EAAAFlCAYAAABV4O1xAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XlY1XXe//HXYRMMNxqX25BySXFBUdAM19sFFUXcyLpL\ns5wyTfE2s4sRc5vQxru8R500nUZTy4WLmAiQLJeCUSQ9MgYuY5IlbS5jIhwVWc7vD3+d+/ZWEx31\ni36ej+vqujpwvt/zPud8u/TZ53u+x+Z0Op0CAAAAABjBzeoBAAAAAAB3DhEIAAAAAAYhAgEAAADA\nIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQD6sHwM2x2+1WjwAAAACgigsJCbniZ0TgXexqbyhgFbvd\nzjGJKoVjElURxyWqGo7Je9u1Fo44HRQAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG4cIwd7GT\ny96zegTAJUDSyS8OWj3GHVd3/FNWjwAAAHBDWAkEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACA\nQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQC\nAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAA\nDEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCAeN3LnLVu26MUXX9SmTZvUtGnTWz5Mbm6u\nkpOTNWPGjJvaPi0tTceOHVP9+vU1ffp0ffjhhwoMDJQkDRo0SG+//bb8/f1v2bwJCQnauXOn/vjH\nP0qSiouLNWTIEK1atUqNGjWq1D5efvll9e/fX3369Lllc6HyLpSVqrS83OoxcBfzKiy0egRcg8Ph\nUOFd8P54eXnJx8fH6jEAAAa5oQhMTU1VSEiI0tLSFBMTc0sHKSsrU1BQkIKCgm56HxkZGRo1apQO\nHz6s
Bg0a6O2333YF2u0QHR2tpKQk7dy5U2FhYVq0aJGGDx9e6QAsKyu7bbPh+lbmZOnT/ENyWj0I\n7m4p662eAHc5Nzc3RUZG3vI/VwEAuJZKnw7qcDhkt9sVHx+vtLQ0SVJ2draeeuopjR8/Xr1799Yb\nb7yhjz76SCNGjFBkZKSOHTsmSTp9+rQmTZqk4cOHa/jw4bLb7ZKkJUuWaNq0aXr88cf1yiuvKDs7\nW+PGjXM93u9+9ztFRkYqMjJSmzdvliTNmjVLw4YN08CBA7V48WLXfE6nUwcPHlTr1q0lST179tSR\nI0f09ddfX/Fc/va3v2nkyJEaOnSoYmJi5HA49OWXX2rixImSLq14tm3bVhcvXlRJSYl69+591dfE\nZrNp9uzZmjdvnnJzc7Vr1y6NHTtWkrR//35FR0crMjJSkyZNUlFRkSTpiSee0Lx58zRs2DC99957\nl+3vzTff1PTp01VRUVHZtwX/gk8IQABVQEVFhZKTk60eAwBgkEpH4NatW9WtWzc1btxYderUUV5e\nniTp0KFDmjNnjtLT05WcnKxvvvlGiYmJGjFihNauXStJio+P19NPP60PPvhAS5Ysuex0z/z8fL37\n7rtauHDhZY+3dOlS+fr6KiUlRSkpKercubMkacqUKUpKStJHH32k3bt369ChQ5KkAwcOKDAwUDab\n7dITc3PTb3/7Wy1fvvyy/Z4+fVrLli3TqlWr9Ne//lVt2rTRqlWr1KpVKx08eFCSZLfb9fDDDys3\nN1f79u1Tu3btrvm6BAYGqmvXrhozZoxmzJghLy8vSdK0adMUGxurlJQUNW7cWEuXLnVtU1FRoaSk\nJI0ZM8b1s3nz5qm4uFjx8fFyc+OjmndCeNNA2WSzegwAhnNzc1NUVJTVYwAADFLp00HT0tI0evRo\nSVJERITS0tLUs2dPBQUFqV69epKkgIAAdenSRZLUvHlzZWdnS5J27typI0eOuPZVXFwsh8MhSerV\nq5e8vb2veLysrKzLwrBWrVqSpPT0dCUkJKisrEwnT55Ufn6+AgMDlZmZqe7du1+2j0GDBmnZsmUq\nKChw/Wzfvn06cuSInnjiCUlSaWmpgoOD5eHhoYCAAOXn5+vLL7/UM888oz179qi8vFwhISG/+to8\n+eSTysjI0COPPCJJ+vnnn1VSUuLabsiQIXrllVdc9x8wYMBl2y9evFgdOnTQ7Nmzf/VxcGs92/5R\n/UdQKJ8JxL/k/meirR4B13C9/4lXVfCZQADAnVapCDxz5ox27dqlw4cPy2azqby8XDabTT169HCt\nfEmX/m/mL7fd3NxU/v//cl1RUaGEhARVq1btin3fyB98BQUFWrlypRITE1WrVi3FxsaqpKREkrRj\nx47LTg+VJA8PDz377LP685//7PqZ0+lUly5drlh5lKTQ0FBlZGTIw8NDYWFhio2NVXl5+WUBdzU2\nm+2GVu+qV69+2e22bdsqNzdXhYWFrtjFneHt4SlvD0+rx8BdjP9mq6777ruP9wcAgKuoVLls3rxZ\nUVFR2r59u7Zt26bPP/9c/v7+2rNnT6UepGvXrq5TQyW5Trv8NWFhYXr//fddtwsLC+VwOOTj46Ma\nNWro1KlTysjIkCQVFRWprKxMderUuWI/Q4cOVVZWlk6fPi1JCg4O1t69e/Xtt99Kks6dO6ejR49K\nuhSBq1evVnBwsPz8/HTmzBkdPXpUzZs3r9Tz/EWdOnXk7e2tvXv3SpKSk5PVsWPHa96/Z8+eevbZ\nZzVu3DjXCikAAAAA3A6VisDU1NQrvsIgPDzcdYGY64mLi1NeXp4iIyMVERGh9euvfzW98ePH6+zZ\nsxo0aJAGDx6s7OxsBQYGqlWrVhowYICmTp2qDh06SLq0ChgWFnbV/Xh5eWnUqFH65z//KUny8/PT\n/Pnz9dJLLykyMlIjR450XTymXbt2OnXqlCvYWrRooebNm7s+Z3gjFi
xYoPnz5ysyMlL5+fmaMGHC\nr95/4MCBGjZsmCZMmOBa3QQAAACAW83mdDrv+gskxsXFKTo6WsHBwVaPcsfY7XYFfHH9FVUAt1fd\n8U9ZPQKuwW63X/cz3cCdxnGJqoZj8t52rff3hr4nsKqKj4+3egQAAAAAuCvcExF4J8yZM8f1Gb9f\njB49WsOHD7doIgAAAAC4cURgJc2aNcvqEQAAAADgX8a3kgMAAACAQYhAAAAAADAIEQgAAAAABiEC\nAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAA\nAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADOJh9QC4eXXHP2X1CICL3W5X\nSEiI1WMAAADgOlgJBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYh\nAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBPKweADcvcVV/q0e4\n64145mOrRwAAAADuKFYCAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAA\ngEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQW57BG7ZskUtWrRQfn7+bdl/bm6uXnvttZvePi0tTcuWLVNSUpI6d+6sIUOGKDw8\nXGPHjtXevXtv4aQAAAAAYL3bHoGpqakKCQlRWlraLd93WVmZgoKCNGPGjJveR0ZGhrp16yZJioiI\n0IcffqhPPvlEzz33nCZNmnTb4vXXlJWV3fHHBAAAAGAGj9u5c4fDIbvdrjVr1uiFF15QTEyMsrOz\ntWTJEtWoUUOHDx/WgAED1Lx5c61Zs0YlJSV66623FBAQoNOnT2vWrFn64YcfJEnTp09XSEiIlixZ\nomPHjqmgoEANGzbUyJEjtXLlSi1fvlwOh0Ovvfaa8vLyJEkTJ05Uv379NGvWLOXm5qqkpET9+vVT\nTEyMJMnpdOrgwYNq3bq1Dh8+fNnsnTt31mOPPaaNGzdq+vTpOnbsmObMmaOff/5Z3t7e+v3vf6+m\nTZsqNjZWvr6+ysvL08mTJzVt2jT1799fU6ZMUVRUlHr27ClJio2NVc+ePdW3b1+98cYb+uKLL3Tx\n4kU9+eSTevzxx5Wdna1FixapZs2aOnr0qDZv3nw73xoAAAAAhrqtEbh161Z169ZNjRs3Vp06dVxx\ndujQIW3atEm1a9dW7969FR0drcTERK1evVpr165VXFyc4uPj9fTTTys0NFQ//PCDxo4dq/T0dElS\nfn6+1q1bJ29vb2VnZ7seb+nSpfL19VVKSookqbCwUJI0ZcoU1a5dW+Xl5RozZowOHTqkwMBAHThw\nQIGBgbLZbFedv3Xr1tqwYYMk6dVXX9WcOXP00EMPad++fZozZ47WrFkjSTpx4oTWrVunr7/+WuPH\nj1f//v0VERGh9PR09ezZUxcvXlRWVpZmz56txMRE1ahRQx988IEuXryoxx9/XF26dJEkHThwQCkp\nKWrUqNFteDcAAAAA4DZHYFpamkaPHi3p0qmWaWlp6tmzp4KCglSvXj1JUkBAgCuCmjdv7oq6nTt3\n6siRI659FRcXy+FwSJJ69eolb2/vKx4vKytLCxcudN2uVauWJCk9PV0JCQkqKyvTyZMnlZ+fr8DA\nQGVmZqp79+7XnN/pdEq6tKKZk5OjyZMnu3538eJF17/36dNHbm5uatasmU6dOiVJ6t69u+Lj43Xx\n4kVlZGQoNDRU3t7e2rFjh/7xj3
+4VvqKior07bffytPTU0FBQQQgAAAAgNvqtkXgmTNntGvXLh0+\nfFg2m03l5eWy2Wzq0aOHvLy8XPdzc3Nz3XZzc1N5ebkkqaKiQgkJCapWrdoV+/bx8an0HAUFBVq5\ncqUSExNVq1YtxcbGqqSkRJK0Y8cOLV68+JrbHjhwQE2bNpXT6VTNmjWVnJx81fv97+fzi2rVqqlT\np07KzMxUenq6IiIiJF0KyxkzZrg+h/iL7OxsVa9evdLPCwAAAABuxm27MMzmzZsVFRWl7du3a9u2\nbfr888/l7++vPXv2VGr7rl27au3ata7bBw8evO42YWFhev/99123CwsL5XA45OPjoxo1aujUqVPK\nyMiQdGkFrqysTHXq1Lnqvr744gslJCTosccek6+vr/z9/V2nozqdTh06dOi680RERCgpKUl79uxx\nRV/Xrl21fv16lZaWSpKOHj2qc+fOXXdfAAAAAHAr3LYITE1NVZ8+fS77WXh4eKWvEhoXF6e8vDxF\nRkYqIiJC69evv+4248eP19mzZzVo0CANHjxY2dnZCgwMVKtWrTRgwABNnTpVHTp0kHRpFTAsLOyy\n7Tdt2qSoqCj169dPy5cv1+LFi9W0aVNJ0n/9138pMTFRgwcP1sCBA7Vly5brztOlSxft3r1bYWFh\nrtXC6OhoNWvWTMOGDdOgQYM0c+ZM1+onAAAAANxuNucvH3wzTFxcnKKjoxUcHGz1KDfFbrfr6Jdx\nVo9x1xvxzMdWj3DPsNvtCgkJsXoMwIVjElURxyWqGo7Je9u13t/bemGYqiw+Pt7qEQAAAADgjrvt\nXxYPAAAAAKg6iEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQ\nAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAA\nYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIB5WD4CbN+KZj60eAQAAAMBd\nhpVAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQC\nAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABjEw+oBcPMi/vqa1SMAlzuWbvUE94RN\nQ2dYPQIAALiHsRIIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAi\nEAAAAAAMUqkI3LJli1q0aKH8/PzbMkRubq5ee+21m94+LS1Ny5Yt06lTpzRu3DgNHjxYEREReu65\n527hlFLLli0VFRWlQYMGKSYmRufPn7+p/SxZskR/+ctfbulsAAAAAFAZlYrA1NRUhYSEKC0t7ZYP\nUFZWpqCgIM2YMeOm95GRkaFu3bpp8eLFCgsL00cffaRNmzZp6tSpt3BSydvbW8nJyUpNTZWnp6c2\nbNhwS/cPAAAAALebx/Xu4HA4ZLfbtWbNGr3wwguKiYlRdna2lixZoho1aujw4cMaMGCAmjdvrjVr\n1qikpERvvfWWAgICdPr0ac2aNUs//PCDJGn69OkKCQnRkiVLdOzYMRUUFKhhw4YaOXKkVq5cqeXL\nl8vhcOi1115TXl6eJGnixInq16+fZs2apdzcXJWUlKhfv36KiYmRJDmdTh08eFCtW7fWiRMn1KVL\nF9fsgYGBrucwYcIEnT17VmVlZZo8ebL69OkjSVq1apU++OADSdKIESM0ZsyYSr1woaGh+sc//iFJ
\nmjBhgn766SeVlJRo9OjRGjlypCSpffv2ysnJkSR9/PHH+uyzz/T6669ftp+DBw9q1qxZOn/+vAIC\nAjRv3jzVqlWrUjMAAAAAwI26bgRu3bpV3bp1U+PGjVWnTh1XnB06dEibNm1S7dq11bt3b0VHRysx\nMVGrV6/W2rVrFRcXp/j4eD399NMKDQ3VDz/8oLFjxyo9PV2SlJ+fr3Xr1snb21vZ2dmux1u6dKl8\nfX2VkpIiSSosLJQkTZkyRbVr11Z5ebnGjBmjQ4cOKTAwUAcOHFBgYKBsNpuefPJJTZkyRe+9957C\nwsI0bNgw1a9fX9WqVdNbb70lX19fnT59WiNHjlTv3r21f/9+JSUlKSEhQU6nU4899pg6deqkVq1a\n/eprUlZW5lp9lKR58+apdu3aunDhgkaMGKHw8HDVqVOnUm/AK6+8oldffVWdOnXSokWL9Kc//Ulx\ncXGV2hYAAAAAbtR1IzAtLU2jR4+WJEVERCgtLU09e/ZUUFCQ6tWrJ0kKCAhwrcA1b97cFXU7d+7U\nkSNHXPsqLi6Ww+GQJPXq1Uve3t5XPF5WVpYWLlzouv3Lqlh6eroSEhJUVlamkydPKj8/X4GBgcrM\nzFT37t0lSd26ddOWLVuUmZmpjIwMDR06VKmpqapRo4YWLlyo3bt3y83NTcePH9epU6dkt9vVp08f\nVa9eXZLUt29f7dmz55oReOHCBUVFRUm6tBI4YsQISdLatWv16aefSpJ+/PFHffvtt5WKwKKiIhUV\nFalTp06SpKFDh2ry5MnX3Q4AAAAAbtavRuCZM2e0a9cuHT58WDabTeXl5bLZbOrRo4e8vLxc93Nz\nc3PddnNzU3l5uSSpoqJCCQkJqlat2hX79vHxqfSQBQUFWrlypRITE1WrVi3FxsaqpKREkrRjxw4t\nXrzYdd/atWsrMjJSkZGRGjdunHbv3i2Hw6HTp08rKSlJnp6e6tWrl2v7G/HLZwL/t+zsbO3cuVMb\nN26Uj4+PRo0addV938zjAQAAAMCt9qsXhtm8ebOioqK0fft2bdu2TZ9//rn8/f21Z8+eSu28a9eu\nWrt2rev2wYMHr7tNWFiY3n//fdftwsJCORwO+fj4qEaNGjp16pQyMjIkXVpJKysrc626ZWVlua7Y\nWVxcrGPHjunf/u3fVFRUpPvvv1+enp7atWuXvv/+e0mXVvO2bNmi8+fP69y5c9qyZYtCQ0Mr9dx+\nUVRUpFq1asnHx0f5+fn6+9//7vrdb37zG+Xn56uiokJbtmy5YtsaNWqoZs2artczOTlZHTt2vKHH\nBwAAAIAb8asrgampqVd8zUJ4eLjWr1+vgICA6+48Li5Oc+fOVWRkpMrLyxUaGqq5c+f+6jbjx4/X\n3LlzNWjQILm5uWnixIkKDw9Xq1atNGDAADVo0EAdOnSQdGkVMCwszLXt/v379fvf/17u7u5yOp2K\njo5W27Zt5e/vr/HjxysyMlJt2rRRkyZNJEmtW7fWsGHDFB0dLenShWGu93nA/6t79+7asGGDBgwY\noMaNGys4ONj1u6lTp2rcuHHy8/NTmzZtdO7cuSu2/8Mf/uC6MEyjRo00f/78G3p8AAAAALgRNqfT\n6bR6iJsVFxen6Ojoy8LLFHa7Xa8eS7d6DAC3waahN/+VOfgfdrtdISEhVo8BXIbjElUNx+S97Vrv\n73UvDFOVxcfHWz0CAAAAANxV7uoIvB1+/vnnq35X4Lvvvlvpr30AAAAAgKqKCPw/6tSpc8UVQAEA\nAADgXvGrVwcFAAAAANxbiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAA\nAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACD\nEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEE8rB4AN2/T0BlWjwC4
2O12hYSEWD0GAAAAroOVQAAAAAAw\nCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAA\nAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYxMPqAXDzhn7wN6tHAC73DcckqhiOSVQxMx7ysXoE\nAGAlEAAAAABMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEAAADAIEQgAAAAABiECAQAAAAAgxCB\nAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAA\nAIMQgQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGOSO\nRGDLli0VFRXl+mfFihV34mGvacWKFfroo4/0yCOPyOl0SpJycnLUokUL/fTTT5KkoqIiderUSRUV\nFVaOCgAAAAC3lMedeBBvb28lJyff1LZlZWXy8Li1Y2ZmZmrRokWqW7eu8vPz1axZM+Xk5KhVq1ba\nu3evIiIi9Pe//11BQUFyc6tcJ9+OOQEAAADgVrO0Wnr16qXExET5+fkpNzdXCxYs0Nq1a7VkyRId\nO3ZMBQUFatiwoebPn6/Zs2crLy9P7u7uio2NVefOnZWUlKRPP/1UxcXFOn78uAYPHqyJEydKkpKT\nk7V27VqVlpaqXbt2mjVrltzd3VVcXKzS0lL5+fmpffv2ysnJcUXg008/rZycHEVERCgnJ0cdOnSQ\nJCUkJGjjxo0qLS3Vgw8+qAULFsjHx0exsbHy8vLSwYMH1aFDB/Xu3Vvx8fGSJJvNpvfee082m00T\nJkzQ2bNnVVZWpsmTJ6tPnz5655135OXlpdGjR2vevHk6dOiQ1qxZo6ysLCUmJurNN9+07H0BAAAA\ncO+6IxF44cIFRUVFuW6PGzdOERERv7pNfn6+1q1bJ29vb61cuVKSlJKSovz8fI0dO1abN2+WJOXm\n5iolJUU+Pj4aMWKEevTooerVqys9PV3r16+Xp6enZs+erZSUFA0ZMkQ7d+7Uo48+Kknq0KGDvvji\nC0VHR6ugoEADBgzQxo0bJV06PfT555+XJPXt21ePPfaYJOm///u/lZiYqFGjRkmSjh8/rg0bNsjd\n3V0vvPCCZs6cqZCQEDkcDlWrVk2S9NZbb8nX11enT5/WyJEj1bt3b4WGhmrlypUaPXq08vLydPHi\nRZWWlsput6tjx4636qUHAAAAgMtU2dNBe/XqJW9vb0mS3W7XU089JUlq2rSpGjZsqKNHj0qSwsLC\nVKdOHUmXYs1ut8vDw0N5eXkaMWKEpEsRev/990u6dCrosGHDJEnt27fX8uXLVVBQoAceeEDVqlWT\n0+mUw+HQ/v371bZtW0nSV199pT/+8Y8qKiqSw+FQ165dXXP2799f7u7uki5F5euvv67IyEiFh4fr\nvvvuU2lpqRYuXKjdu3fLzc1Nx48f16lTp9S6dWvt379fxcXF8vLyUqtWrZSXl6c9e/ZoxowZN/U6\nAwAAAMD1WHo6qLu7u+vCLCUlJZf9zsfHp1L7sNlsV9x2Op0aOnSopk6desX9v/zyS82ePVuS9NBD\nD6moqEjbt29XcHCwJKlNmzZKSkrSAw88oPvuu0+SFBsbq6VLlyowMFBJSUn64osvrjrn888/rx49\neujzzz/XE088oXfeeUf79u3T6dOnlZSUJE9PT/Xq1UslJSXy9PSUv7+/kpKS1L59e7Vo0ULZ2dk6\nduyYmjZtWqnnDgAAAAA3ytKviHjggQeUl5cnSfrkk0+ueb/Q0FClpKRIko4ePaoff/xRTZo0kSTt\n2LFDZ86c0YULF7RlyxZ16NBBjz76qDZv3qx//vOfkqQzZ87o+++/11dffaUmTZq4Vu4kqV27dlqz\nZo3at28vSQoODtbq1atd
nweUJIfDobp166q0tNQ1x9UcO3ZMLVq00PPPP6+goCAdPXpURUVFuv/+\n++Xp6aldu3bp+++/v+x5rVy5Uh07dlRoaKg2bNigli1bXhG2AAAAAHCrWPKZwG7duunll1/WxIkT\nFRcXp0WLFumRRx655vb/8R//odmzZysyMlLu7u6aP3++vLy8JElt27bVpEmTXBeGCQoKkiT953/+\np5599llVVFTI09NTM2fOlN1uV7du3S7bd4cOHZSRkaE2bdpIuhSBBQUFriiUpMmTJys6Olp+fn5q\n166dHA4BsxV8AAAK+ElEQVTHVedcvXq1srOzZbPZ9PDDD6t79+4qLi7W+PHjFRkZqTZt2rjiVboU\ngW+//baCg4NVvXp1VatWTaGhoTf46gIAAABA5dmcv5yPeRdKSkpSXl6eZs6cWan7P/PMM/rDH/6g\nevXq3ebJbj+73a7Xvjlv9RgAAOAGzHjIRyEhIVaPAbjY7XaOyXvYtd5fo77YbtWqVVaPAAAAAACW\nuqsjcNiwYa4rfQIAAAAArs/SC8MAAAAAAO4sIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG\nIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAAAAxCBAIAAACAQYhAAAAAADAIEQgA\nAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAM4mH1ALh5fx3e1eoRABe7\n3a6QkBCrxwBcOCZRFdntdqtHAABWAgEAAADAJEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAA\nDEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgEA+r\nB8DNy3nnhNUjAC5uaqScHI5JVB0ck6iKOC5R5bS3egBYgZVAAAAAADAIEQgAAAAABiECAQAAAMAg\nRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAYhAgEA\nAADAIEQgAAAAABiECAQAAAAAgxCBAAAAAGAQIhAAAAAADEIEAgAAAIBBiEAAAAAAMAgRCAAAAAAG\nIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECMi8LvvvtOgQYMu+9mSJUv0l7/85ZrbJCUl\nae7cubd7NAAAAAC4ozysHgAAAAC4ESWlF1RWUWr1GPeECodDhYWFVo9xT/Dy8pKPj4/VY1SK8RE4\natQotW3bVtnZ2SoqKlJ8fLxCQ0Mvu89nn32mZcuWadmyZVqwYIF8fX2Vl5enkydPatq0aerfv7+c\nTqcWLFigzMxM2Ww2jR8/XhEREZozZ466du2q3r1768UXX1TNmjU1f/58JSYmqqCgQNHR0XruuecU\nEhKinJwc1a9fX0uXLpW3t7dFrwgAAEDVlbRnlbK++lROOa0e5d6QZPUA9w43NzdFRkYqJibG6lGu\ny4jTQa+nvLxciYmJmj59uv70pz9d9rtPP/1UK1as0IoVK+Tn5ydJOnHihNatW6fly5frzTfflCR9\n8sknOnTokJKTk7Vq1SotWLBAJ06cUGhoqPbs2SNJOn78uPLz8yVJdrvdFZvffvutnnzySaWlpalG\njRravHnznXrqAAAAd5WdX31CAKJKqqioUHJystVjVIoREWiz2X7153379pUktW7dWt9//73r97t2\n7dKf//xnrVixQrVq1XL9vE+fPnJzc1OzZs106tQpSZeibuDAgXJ3d9dvfvMbdezYUbm5uQoNDZXd\nbteRI0fUrFkz3X///Tpx4oRycnLUvn17SZK/v79atmx51RkAAADwP8IeDr/m3+0AK7m5uSkqKsrq\nMSrFiNNBa9eufcW5zoWFhfL395d06fxd6dIbV15e7rpPQECACgoKdPToUQUFBbl+/sv9K6
N+/fo6\ne/asMjMzFRoaqsLCQqWnp6t69ery9fXVmTNnLtufu7u7SkpKbup5AgAA3OuGhT6jge2e4DOBt0hF\n0Pdq166d1WPcE/hMYBVz3333qW7dusrKytKjjz6qM2fOKDMzU6NHj1ZS0rVPhG7YsKGmTZumSZMm\nadGiRXr44Yeved/Q0FBt3LhRQ4cOVWFhofbs2aNXXnlFkhQcHKzVq1dr9erVOnPmjGJiYtSvX79b\n/jwBAABMUM3TW9XE9RNuhYr77rvsjDeYwYgIlKQFCxZozpw5ev311yVJL774ogICAq67XdOmTfXG\nG29o8uTJevvtt695v759+yonJ0dRUVGy2WyaNm2a6tatK0kKCQnR3/72Nz344INq2LChCgsLr7j4\nDAAAAADcCTan08kna+9CdrtdbjmNrB4DAAAAd7GK9gUKCQmxegzcJna7/arvrxEXhgEAAAAAXEIE\nAgAAAIBBiEAAAAAAMAgRCAAAAAAGIQIBAAAAwCBEIAAAAAAYhAgEAAAAAIMQgQAAAABgECIQAAAA\nAAxCBAIAAACAQYhAAAAAADAIEQgAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAi\nEAAAAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAbxsHoA3Lz2v61n9QiAi91uV0hIiNVjAC4ck6iK\nOC5R1djtBVaPAAuwEggAAAAABiECAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAAAAAM\nYnM6nU6rh8CNs9vtVo8AAAAAoIq72tfSEIEAAAAAYBBOBwUAAAAAgxCBAAAAAGAQIhAAAAAADEIE\nAgAAAIBBiEAAAAAAMAgReJfJyMhQv3791LdvX61YscLqcWC4H3/8UaNGjVJERIQGDhyo1atXWz0S\nIEkqLy/XkCFDNG7cOKtHASRJZ8+eVUxMjPr3768BAwYoJyfH6pFguHfffVcDBw7UoEGD9NJLL6mk\npMTqkXAHEYF3kfLycs2dO1fvvPOO0tLSlJqaqiNHjlg9Fgzm7u6u2NhYbdq0SRs3btS6des4JlEl\nrFmzRk2bNrV6DMAlPj5e3bp108cff6zk5GSOT1jq+PHjWrNmjT744AOlpqaqvLxcaWlpVo+FO4gI\nvIt8+eWXevDBB9WoUSN5eXlp4MCB2rp1q9VjwWD16tVT69atJUm+vr5q0qSJjh8/bvFUMN1PP/2k\nzz77TCNGjLB6FECSVFRUpN27d7uOSS8vL9WsWdPiqWC68vJyXbhwQWVlZbpw4YLq1atn9Ui4g4jA\nu8jx48fVoEED1+369evzF25UGd99950OHjyodu3aWT0KDDdv3jxNmzZNbm78EYeq4bvvvpOfn59+\n97vfaciQIYqLi9O5c+esHgsGq1+/vp599ln9+7//u7p27SpfX1917drV6rFwB/EnJIB/mcPhUExM\njKZPny5fX1+rx4HBtm/fLj8/P7Vp08bqUQCXsrIyHThwQE888YQ+/PBD+fj48Ll+WKqwsFBbt27V\n1q1blZmZqfPnzys5OdnqsXAHEYF3kfr16+unn35y3T5+/Ljq169v4USAVFpaqpiYGEVGRio8PNzq\ncWC4vXv3atu2berVq5deeukl7dq1Sy+//LLVY8FwDRo0UIMGDVxnSvTv318HDhyweCqYbOfOnfL3\n95efn588PT0VHh7OxYoMQwTeRYKCgvTNN9+ooKBAFy9eVFpamnr16mX1WDCY0+lUXFycmjRpomee\necbqcQBNnTpVGRkZ2rZtmxYuXKjOnTvrjTfesHosGK5u3bpq0KCBvv76a0lSVlYWF4aBpRo2bKh9\n+/bp/PnzcjqdHJMG8rB6AFSeh4eHZs6cqd/+9rcqLy/X8OHD9fDDD1s9Fgxmt9uVnJys5s2bKyoq\nSpL00ksvqUePHhZPBgBVy6uvvqqXX35ZpaWlatSoke
bPn2/1SDBYu3bt1K9fPw0dOlQeHh5q2bKl\nRo4cafVYuINsTqfTafUQAAAAAIA7g9NBAQAAAMAgRCAAAAAAGIQIBAAAAACDEIEAAAAAYBAiEAAA\nAAAMQgQCAAAAgEGIQAAAAAAwCBEIAAAAAAb5f5cc9EBFqbCUAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "top_tz = clean_tz.value_counts().head(10)\n", "sns.barplot(y = top_tz.index, x = top_tz.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## MovieLens data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", " after removing the cwd from sys.path.\n", "/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", " \n" ] } ], "source": [ "# Make display smaller\n", "pd.options.display.max_rows = 15\n", "# The multi-character '::' separator is only supported by the Python parser engine\n", "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", "users = pd.read_table('datasets/movielens/users.dat', sep='::', header=None, names=unames, engine='python')\n", "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n", "ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', header=None, names=rnames, engine='python')\n", "mnames = ['movie_id', 'title', 'genres']\n", "movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header=None, names=mnames, engine='python')" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idgenderageoccupationzip
01F11048067
12M561670072
23M251555117
34M45702460
45M252055455
\n", "
" ], "text/plain": [ " user_id gender age occupation zip\n", "0 1 F 1 10 48067\n", "1 2 M 56 16 70072\n", "2 3 M 25 15 55117\n", "3 4 M 45 7 02460\n", "4 5 M 25 20 55455" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.head()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idmovie_idratingtimestamp
0111935978300760
116613978302109
219143978301968
3134084978300275
4123555978824291
\n", "
" ], "text/plain": [ " user_id movie_id rating timestamp\n", "0 1 1193 5 978300760\n", "1 1 661 3 978302109\n", "2 1 914 3 978301968\n", "3 1 3408 4 978300275\n", "4 1 2355 5 978824291" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head()" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
\n", "
" ], "text/plain": [ " movie_id title genres\n", "0 1 Toy Story (1995) Animation|Children's|Comedy\n", "1 2 Jumanji (1995) Adventure|Children's|Fantasy\n", "2 3 Grumpier Old Men (1995) Comedy|Romance\n", "3 4 Waiting to Exhale (1995) Comedy|Drama\n", "4 5 Father of the Bride Part II (1995) Comedy" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idmovie_idratingtimestampgenderageoccupationzip
0111935978300760F11048067
116613978302109F11048067
219143978301968F11048067
3134084978300275F11048067
4123555978824291F11048067
\n", "
" ], "text/plain": [ " user_id movie_id rating timestamp gender age occupation zip\n", "0 1 1193 5 978300760 F 1 10 48067\n", "1 1 661 3 978302109 F 1 10 48067\n", "2 1 914 3 978301968 F 1 10 48067\n", "3 1 3408 4 978300275 F 1 10 48067\n", "4 1 2355 5 978824291 F 1 10 48067" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_merge = pd.merge(ratings, users, on = \"user_id\")\n", "first_merge.head()" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idmovie_idratingtimestampgenderageoccupationziptitlegenres
0111935978300760F11048067One Flew Over the Cuckoo's Nest (1975)Drama
1211935978298413M561670072One Flew Over the Cuckoo's Nest (1975)Drama
21211934978220179M251232793One Flew Over the Cuckoo's Nest (1975)Drama
31511934978199279M25722903One Flew Over the Cuckoo's Nest (1975)Drama
41711935978158471M50195350One Flew Over the Cuckoo's Nest (1975)Drama
\n", "
" ], "text/plain": [ " user_id movie_id rating timestamp gender age occupation zip \\\n", "0 1 1193 5 978300760 F 1 10 48067 \n", "1 2 1193 5 978298413 M 56 16 70072 \n", "2 12 1193 4 978220179 M 25 12 32793 \n", "3 15 1193 4 978199279 M 25 7 22903 \n", "4 17 1193 5 978158471 M 50 1 95350 \n", "\n", " title genres \n", "0 One Flew Over the Cuckoo's Nest (1975) Drama \n", "1 One Flew Over the Cuckoo's Nest (1975) Drama \n", "2 One Flew Over the Cuckoo's Nest (1975) Drama \n", "3 One Flew Over the Cuckoo's Nest (1975) Drama \n", "4 One Flew Over the Cuckoo's Nest (1975) Drama " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "second_merge = pd.merge(first_merge, movies, on = \"movie_id\")\n", "second_merge.head()" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1000209, 10)" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "second_merge.shape" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "df = second_merge" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rating
gendertitle
F$1,000,000 Duck (1971)3.375000
'Night Mother (1986)3.388889
'Til There Was You (1997)2.675676
'burbs, The (1989)2.793478
...And Justice for All (1979)3.828571
.........
MZed & Two Noughts, A (1985)3.380952
Zero Effect (1998)3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)3.500000
Zeus and Roxanne (1997)2.357143
eXistenZ (1999)3.289086
\n", "

7152 rows × 1 columns

\n", "
" ], "text/plain": [ " rating\n", "gender title \n", "F $1,000,000 Duck (1971) 3.375000\n", " 'Night Mother (1986) 3.388889\n", " 'Til There Was You (1997) 2.675676\n", " 'burbs, The (1989) 2.793478\n", " ...And Justice for All (1979) 3.828571\n", "... ...\n", "M Zed & Two Noughts, A (1985) 3.380952\n", " Zero Effect (1998) 3.723140\n", " Zero Kelvin (Kj�rlighetens kj�tere) (1995) 3.500000\n", " Zeus and Roxanne (1997) 2.357143\n", " eXistenZ (1999) 3.289086\n", "\n", "[7152 rows x 1 columns]" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"gender\", \"title\"])[[\"rating\"]].agg(\"mean\")" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFM
title
$1,000,000 Duck (1971)3.3750002.761905
'Night Mother (1986)3.3888893.352941
'Til There Was You (1997)2.6756762.733333
'burbs, The (1989)2.7934782.962085
...And Justice for All (1979)3.8285713.689024
.........
Zed & Two Noughts, A (1985)3.5000003.380952
Zero Effect (1998)3.8644073.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)NaN3.500000
Zeus and Roxanne (1997)2.7777782.357143
eXistenZ (1999)3.0985923.289086
\n", "

3706 rows × 2 columns

\n", "
" ], "text/plain": [ "gender F M\n", "title \n", "$1,000,000 Duck (1971) 3.375000 2.761905\n", "'Night Mother (1986) 3.388889 3.352941\n", "'Til There Was You (1997) 2.675676 2.733333\n", "'burbs, The (1989) 2.793478 2.962085\n", "...And Justice for All (1979) 3.828571 3.689024\n", "... ... ...\n", "Zed & Two Noughts, A (1985) 3.500000 3.380952\n", "Zero Effect (1998) 3.864407 3.723140\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n", "Zeus and Roxanne (1997) 2.777778 2.357143\n", "eXistenZ (1999) 3.098592 3.289086\n", "\n", "[3706 rows x 2 columns]" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.pivot_table(\"rating\", index = \"title\", columns = \"gender\")" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFM
title
$1,000,000 Duck (1971)3.3750002.761905
'Night Mother (1986)3.3888893.352941
'Til There Was You (1997)2.6756762.733333
'burbs, The (1989)2.7934782.962085
...And Justice for All (1979)3.8285713.689024
.........
Zed & Two Noughts, A (1985)3.5000003.380952
Zero Effect (1998)3.8644073.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)NaN3.500000
Zeus and Roxanne (1997)2.7777782.357143
eXistenZ (1999)3.0985923.289086
\n", "

3706 rows × 2 columns

\n", "
" ], "text/plain": [ "gender F M\n", "title \n", "$1,000,000 Duck (1971) 3.375000 2.761905\n", "'Night Mother (1986) 3.388889 3.352941\n", "'Til There Was You (1997) 2.675676 2.733333\n", "'burbs, The (1989) 2.793478 2.962085\n", "...And Justice for All (1979) 3.828571 3.689024\n", "... ... ...\n", "Zed & Two Noughts, A (1985) 3.500000 3.380952\n", "Zero Effect (1998) 3.864407 3.723140\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n", "Zeus and Roxanne (1997) 2.777778 2.357143\n", "eXistenZ (1999) 3.098592 3.289086\n", "\n", "[3706 rows x 2 columns]" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avg = df.pivot_table(\"rating\", index = \"title\", columns = \"gender\", aggfunc = \"mean\")\n", "avg" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
title
title
$1,000,000 Duck (1971)37
'Night Mother (1986)70
'Til There Was You (1997)52
'burbs, The (1989)303
...And Justice for All (1979)199
......
Zed & Two Noughts, A (1985)29
Zero Effect (1998)301
Zero Kelvin (Kj�rlighetens kj�tere) (1995)2
Zeus and Roxanne (1997)23
eXistenZ (1999)410
\n", "

3706 rows × 1 columns

\n", "
" ], "text/plain": [ " title\n", "title \n", "$1,000,000 Duck (1971) 37\n", "'Night Mother (1986) 70\n", "'Til There Was You (1997) 52\n", "'burbs, The (1989) 303\n", "...And Justice for All (1979) 199\n", "... ...\n", "Zed & Two Noughts, A (1985) 29\n", "Zero Effect (1998) 301\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) 2\n", "Zeus and Roxanne (1997) 23\n", "eXistenZ (1999) 410\n", "\n", "[3706 rows x 1 columns]" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings_by_title = df.groupby(\"title\")[[\"title\"]].count()\n", "ratings_by_title" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ratings_2500 = ratings_by_title.index[ratings_by_title.title >= 2500]\n", "ratings_2500" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "avg.loc[ratings_2500]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }