{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Analysis Examples" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "rc = {\"figure.figsize\" : (14, 6)}\n", "sns.set(rc = rc)\n", "my_palette = sns.color_palette(\"husl\", 4)\n", "sns.set_palette(my_palette)\n", "sns.set_style(\"whitegrid\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## USA.gov data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the dataset\n", "Use the `json` module to load the data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "path = \"datasets/bitly_usagov/example.txt\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',\n", " 'al': 'en-US,en;q=0.8',\n", " 'c': 'US',\n", " 'cy': 'Danvers',\n", " 'g': 'A6qOVH',\n", " 'gr': 'MA',\n", " 'h': 'wfLQtf',\n", " 'hc': 1331822918,\n", " 'hh': '1.usa.gov',\n", " 'l': 'orofrog',\n", " 'll': [42.576698, -70.954903],\n", " 'nk': 1,\n", " 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',\n", " 't': 1331923247,\n", " 'tz': 'America/New_York',\n", " 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The file holds one JSON document per line; parse each line into a dict.\n", "# The with-block closes the handle as soon as parsing finishes, and the explicit\n", "# encoding avoids depending on the platform default (JSON text is UTF-8).\n", "with open(path, encoding=\"utf-8\") as f:\n", "    records = [json.loads(line) for line in f]\n", "records[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count the number of timezones with pandas" ] }, { "cell_type": "code", 
"execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_heartbeat_aalccyggrhhchhkwlllnkrttzu
0NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USDanversA6qOVHMAwfLQtf1.331823e+091.usa.govNaNorofrog[42.576698, -70.954903]1.0http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...1.331923e+09America/New_Yorkhttp://www.ncbi.nlm.nih.gov/pubmed/22415991
1NaNGoogleMaps/RochesterNYNaNUSProvomwszkSUTmwszkS1.308262e+09j.mpNaNbitly[40.218102, -111.613297]0.0http://www.AwareMap.com/1.331923e+09America/Denverhttp://www.monroecounty.gov/etc/911/rss.php
2NaNMozilla/4.0 (compatible; MSIE 8.0; Windows NT ...en-USUSWashingtonxxr3QbDCxxr3Qb1.331920e+091.usa.govNaNbitly[38.9007, -77.043098]1.0http://t.co/03elZC4Q1.331923e+09America/New_Yorkhttp://boxer.senate.gov/en/press/releases/0316...
3NaNMozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...pt-brBRBrazzCaLwp27zUtuOu1.331923e+091.usa.govNaNalelex88[-23.549999, -46.616699]0.0direct1.331923e+09America/Sao_Paulohttp://apod.nasa.gov/apod/ap120312.html
4NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USShrewsbury9b6kNlMA9b6kNl1.273672e+09bit.lyNaNbitly[42.286499, -71.714699]0.0http://www.shrewsbury-ma.gov/selco/1.331923e+09America/New_Yorkhttp://www.shrewsbury-ma.gov/egov/gallery/1341...
\n", "
" ], "text/plain": [ " _heartbeat_ a \\\n", "0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n", "1 NaN GoogleMaps/RochesterNY \n", "2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... \n", "3 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... \n", "4 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... \n", "\n", " al c cy g gr h hc \\\n", "0 en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1.331823e+09 \n", "1 NaN US Provo mwszkS UT mwszkS 1.308262e+09 \n", "2 en-US US Washington xxr3Qb DC xxr3Qb 1.331920e+09 \n", "3 pt-br BR Braz zCaLwp 27 zUtuOu 1.331923e+09 \n", "4 en-US,en;q=0.8 US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 \n", "\n", " hh kw l ll nk \\\n", "0 1.usa.gov NaN orofrog [42.576698, -70.954903] 1.0 \n", "1 j.mp NaN bitly [40.218102, -111.613297] 0.0 \n", "2 1.usa.gov NaN bitly [38.9007, -77.043098] 1.0 \n", "3 1.usa.gov NaN alelex88 [-23.549999, -46.616699] 0.0 \n", "4 bit.ly NaN bitly [42.286499, -71.714699] 0.0 \n", "\n", " r t \\\n", "0 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1.331923e+09 \n", "1 http://www.AwareMap.com/ 1.331923e+09 \n", "2 http://t.co/03elZC4Q 1.331923e+09 \n", "3 direct 1.331923e+09 \n", "4 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 \n", "\n", " tz u \n", "0 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991 \n", "1 America/Denver http://www.monroecounty.gov/etc/911/rss.php \n", "2 America/New_York http://boxer.senate.gov/en/press/releases/0316... \n", "3 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html \n", "4 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341... 
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(records)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 3560 entries, 0 to 3559\n", "Data columns (total 18 columns):\n", "_heartbeat_ 120 non-null float64\n", "a 3440 non-null object\n", "al 3094 non-null object\n", "c 2919 non-null object\n", "cy 2919 non-null object\n", "g 3440 non-null object\n", "gr 2919 non-null object\n", "h 3440 non-null object\n", "hc 3440 non-null float64\n", "hh 3440 non-null object\n", "kw 93 non-null object\n", "l 3440 non-null object\n", "ll 2919 non-null object\n", "nk 3440 non-null float64\n", "r 3440 non-null object\n", "t 3440 non-null float64\n", "tz 3440 non-null object\n", "u 3440 non-null object\n", "dtypes: float64(4), object(14)\n", "memory usage: 500.7+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "America/New_York 4\n", " 3\n", "Europe/Warsaw 1\n", "America/Denver 1\n", "America/Sao_Paulo 1\n", "Name: tz, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"tz\"][:10].value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "clean_tz = df[\"tz\"].fillna(\"Missing\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "clean_tz[clean_tz == \"\"] = \"Unknown\"" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 America/New_York\n", "1 America/Denver\n", "2 America/New_York\n", "3 America/Sao_Paulo\n", "4 America/New_York\n", "5 America/New_York\n", "6 Europe/Warsaw\n", "7 Unknown\n", "8 Unknown\n", "9 Unknown\n", "10 America/Los_Angeles\n", "11 America/New_York\n", "12 
America/New_York\n", "13 Missing\n", "14 America/New_York\n", "15 Asia/Hong_Kong\n", "16 Asia/Hong_Kong\n", "17 America/New_York\n", "18 America/Denver\n", "19 Europe/Rome\n", "Name: tz, dtype: object" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_tz.head(n = 20)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "America/New_York 8\n", "Unknown 3\n", "America/Denver 2\n", "Asia/Hong_Kong 2\n", "Missing 1\n", "America/Los_Angeles 1\n", "Europe/Rome 1\n", "America/Sao_Paulo 1\n", "Europe/Warsaw 1\n", "Name: tz, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_tz[:20].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Chart how often each of the ten most common time zones occurs.\n", "# Passing clean_tz[:10] straight to barplot would plot the raw labels of the\n", "# first ten rows against their row numbers, so aggregate with value_counts first\n", "# (it returns the counts sorted in descending order).\n", "tz_counts = clean_tz.value_counts().head(10)\n", "ax = sns.barplot(x = tz_counts.values, y = tz_counts.index)\n", "# Trailing semicolon suppresses the repr of the list returned by ax.set.\n", "ax.set(xlabel = \"Number of records\", ylabel = \"Time zone\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## MovieLens data" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Make display smaller\n", "pd.options.display.max_rows = 15\n", "# '::' is a multi-character separator, which pandas treats as a regex and which\n", "# only the python parser engine supports; requesting that engine explicitly\n", "# avoids the ParserWarning emitted when pandas falls back from the C engine.\n", "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", "users = pd.read_table('datasets/movielens/users.dat', sep='::', header=None, names=unames, engine='python')\n", "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n", "ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', header=None, names=rnames, engine='python')\n", "mnames = ['movie_id', 'title', 'genres']\n", "movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header=None, names=mnames, engine='python')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idgenderageoccupationzip
01F11048067
12M561670072
23M251555117
34M45702460
45M252055455
\n", "
" ], "text/plain": [ " user_id gender age occupation zip\n", "0 1 F 1 10 48067\n", "1 2 M 56 16 70072\n", "2 3 M 25 15 55117\n", "3 4 M 45 7 02460\n", "4 5 M 25 20 55455" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idmovie_idratingtimestamp
0111935978300760
116613978302109
219143978301968
3134084978300275
4123555978824291
\n", "
" ], "text/plain": [ " user_id movie_id rating timestamp\n", "0 1 1193 5 978300760\n", "1 1 661 3 978302109\n", "2 1 914 3 978301968\n", "3 1 3408 4 978300275\n", "4 1 2355 5 978824291" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
\n", "
" ], "text/plain": [ " movie_id title genres\n", "0 1 Toy Story (1995) Animation|Children's|Comedy\n", "1 2 Jumanji (1995) Adventure|Children's|Fantasy\n", "2 3 Grumpier Old Men (1995) Comedy|Romance\n", "3 4 Waiting to Exhale (1995) Comedy|Drama\n", "4 5 Father of the Bride Part II (1995) Comedy" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idmovie_idratingtimestampgenderageoccupationzip
0111935978300760F11048067
116613978302109F11048067
219143978301968F11048067
3134084978300275F11048067
4123555978824291F11048067
\n", "
" ], "text/plain": [ " user_id movie_id rating timestamp gender age occupation zip\n", "0 1 1193 5 978300760 F 1 10 48067\n", "1 1 661 3 978302109 F 1 10 48067\n", "2 1 914 3 978301968 F 1 10 48067\n", "3 1 3408 4 978300275 F 1 10 48067\n", "4 1 2355 5 978824291 F 1 10 48067" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_merge = pd.merge(ratings, users, on = \"user_id\")\n", "first_merge.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idmovie_idratingtimestampgenderageoccupationziptitlegenres
0111935978300760F11048067One Flew Over the Cuckoo's Nest (1975)Drama
1211935978298413M561670072One Flew Over the Cuckoo's Nest (1975)Drama
21211934978220179M251232793One Flew Over the Cuckoo's Nest (1975)Drama
31511934978199279M25722903One Flew Over the Cuckoo's Nest (1975)Drama
41711935978158471M50195350One Flew Over the Cuckoo's Nest (1975)Drama
\n", "
" ], "text/plain": [ " user_id movie_id rating timestamp gender age occupation zip \\\n", "0 1 1193 5 978300760 F 1 10 48067 \n", "1 2 1193 5 978298413 M 56 16 70072 \n", "2 12 1193 4 978220179 M 25 12 32793 \n", "3 15 1193 4 978199279 M 25 7 22903 \n", "4 17 1193 5 978158471 M 50 1 95350 \n", "\n", " title genres \n", "0 One Flew Over the Cuckoo's Nest (1975) Drama \n", "1 One Flew Over the Cuckoo's Nest (1975) Drama \n", "2 One Flew Over the Cuckoo's Nest (1975) Drama \n", "3 One Flew Over the Cuckoo's Nest (1975) Drama \n", "4 One Flew Over the Cuckoo's Nest (1975) Drama " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "second_merge = pd.merge(first_merge, movies, on = \"movie_id\")\n", "second_merge.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1000209, 10)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "second_merge.shape" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "df = second_merge" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rating
gendertitle
F$1,000,000 Duck (1971)3.375000
'Night Mother (1986)3.388889
'Til There Was You (1997)2.675676
'burbs, The (1989)2.793478
...And Justice for All (1979)3.828571
1-900 (1994)2.000000
10 Things I Hate About You (1999)3.646552
.........
MYour Friends and Neighbors (1998)3.536585
Zachariah (1971)3.500000
Zed & Two Noughts, A (1985)3.380952
Zero Effect (1998)3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)3.500000
Zeus and Roxanne (1997)2.357143
eXistenZ (1999)3.289086
\n", "

7152 rows × 1 columns

\n", "
" ], "text/plain": [ " rating\n", "gender title \n", "F $1,000,000 Duck (1971) 3.375000\n", " 'Night Mother (1986) 3.388889\n", " 'Til There Was You (1997) 2.675676\n", " 'burbs, The (1989) 2.793478\n", " ...And Justice for All (1979) 3.828571\n", " 1-900 (1994) 2.000000\n", " 10 Things I Hate About You (1999) 3.646552\n", "... ...\n", "M Your Friends and Neighbors (1998) 3.536585\n", " Zachariah (1971) 3.500000\n", " Zed & Two Noughts, A (1985) 3.380952\n", " Zero Effect (1998) 3.723140\n", " Zero Kelvin (Kj�rlighetens kj�tere) (1995) 3.500000\n", " Zeus and Roxanne (1997) 2.357143\n", " eXistenZ (1999) 3.289086\n", "\n", "[7152 rows x 1 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"gender\", \"title\"])[[\"rating\"]].agg(\"mean\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFM
title
$1,000,000 Duck (1971)3.3750002.761905
'Night Mother (1986)3.3888893.352941
'Til There Was You (1997)2.6756762.733333
'burbs, The (1989)2.7934782.962085
...And Justice for All (1979)3.8285713.689024
1-900 (1994)2.0000003.000000
10 Things I Hate About You (1999)3.6465523.311966
.........
Your Friends and Neighbors (1998)2.8888893.536585
Zachariah (1971)NaN3.500000
Zed & Two Noughts, A (1985)3.5000003.380952
Zero Effect (1998)3.8644073.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)NaN3.500000
Zeus and Roxanne (1997)2.7777782.357143
eXistenZ (1999)3.0985923.289086
\n", "

3706 rows × 2 columns

\n", "
" ], "text/plain": [ "gender F M\n", "title \n", "$1,000,000 Duck (1971) 3.375000 2.761905\n", "'Night Mother (1986) 3.388889 3.352941\n", "'Til There Was You (1997) 2.675676 2.733333\n", "'burbs, The (1989) 2.793478 2.962085\n", "...And Justice for All (1979) 3.828571 3.689024\n", "1-900 (1994) 2.000000 3.000000\n", "10 Things I Hate About You (1999) 3.646552 3.311966\n", "... ... ...\n", "Your Friends and Neighbors (1998) 2.888889 3.536585\n", "Zachariah (1971) NaN 3.500000\n", "Zed & Two Noughts, A (1985) 3.500000 3.380952\n", "Zero Effect (1998) 3.864407 3.723140\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n", "Zeus and Roxanne (1997) 2.777778 2.357143\n", "eXistenZ (1999) 3.098592 3.289086\n", "\n", "[3706 rows x 2 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.pivot_table(\"rating\", index = \"title\", columns = \"gender\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFM
title
$1,000,000 Duck (1971)3.3750002.761905
'Night Mother (1986)3.3888893.352941
'Til There Was You (1997)2.6756762.733333
'burbs, The (1989)2.7934782.962085
...And Justice for All (1979)3.8285713.689024
1-900 (1994)2.0000003.000000
10 Things I Hate About You (1999)3.6465523.311966
.........
Your Friends and Neighbors (1998)2.8888893.536585
Zachariah (1971)NaN3.500000
Zed & Two Noughts, A (1985)3.5000003.380952
Zero Effect (1998)3.8644073.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)NaN3.500000
Zeus and Roxanne (1997)2.7777782.357143
eXistenZ (1999)3.0985923.289086
\n", "

3706 rows × 2 columns

\n", "
" ], "text/plain": [ "gender F M\n", "title \n", "$1,000,000 Duck (1971) 3.375000 2.761905\n", "'Night Mother (1986) 3.388889 3.352941\n", "'Til There Was You (1997) 2.675676 2.733333\n", "'burbs, The (1989) 2.793478 2.962085\n", "...And Justice for All (1979) 3.828571 3.689024\n", "1-900 (1994) 2.000000 3.000000\n", "10 Things I Hate About You (1999) 3.646552 3.311966\n", "... ... ...\n", "Your Friends and Neighbors (1998) 2.888889 3.536585\n", "Zachariah (1971) NaN 3.500000\n", "Zed & Two Noughts, A (1985) 3.500000 3.380952\n", "Zero Effect (1998) 3.864407 3.723140\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n", "Zeus and Roxanne (1997) 2.777778 2.357143\n", "eXistenZ (1999) 3.098592 3.289086\n", "\n", "[3706 rows x 2 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avg = df.pivot_table(\"rating\", index = \"title\", columns = \"gender\", aggfunc = \"mean\")\n", "avg" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
title
title
$1,000,000 Duck (1971)37
'Night Mother (1986)70
'Til There Was You (1997)52
'burbs, The (1989)303
...And Justice for All (1979)199
1-900 (1994)2
10 Things I Hate About You (1999)700
......
Your Friends and Neighbors (1998)109
Zachariah (1971)2
Zed & Two Noughts, A (1985)29
Zero Effect (1998)301
Zero Kelvin (Kj�rlighetens kj�tere) (1995)2
Zeus and Roxanne (1997)23
eXistenZ (1999)410
\n", "

3706 rows × 1 columns

\n", "
" ], "text/plain": [ " title\n", "title \n", "$1,000,000 Duck (1971) 37\n", "'Night Mother (1986) 70\n", "'Til There Was You (1997) 52\n", "'burbs, The (1989) 303\n", "...And Justice for All (1979) 199\n", "1-900 (1994) 2\n", "10 Things I Hate About You (1999) 700\n", "... ...\n", "Your Friends and Neighbors (1998) 109\n", "Zachariah (1971) 2\n", "Zed & Two Noughts, A (1985) 29\n", "Zero Effect (1998) 301\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) 2\n", "Zeus and Roxanne (1997) 23\n", "eXistenZ (1999) 410\n", "\n", "[3706 rows x 1 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings_by_title = df.groupby(\"title\")[[\"title\"]].count()\n", "ratings_by_title" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['American Beauty (1999)', 'Back to the Future (1985)', 'Fargo (1996)',\n", " 'Jurassic Park (1993)', 'Matrix, The (1999)', 'Men in Black (1997)',\n", " 'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',\n", " 'Silence of the Lambs, The (1991)',\n", " 'Star Wars: Episode IV - A New Hope (1977)',\n", " 'Star Wars: Episode V - The Empire Strikes Back (1980)',\n", " 'Star Wars: Episode VI - Return of the Jedi (1983)',\n", " 'Terminator 2: Judgment Day (1991)'],\n", " dtype='object', name='title')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings_2500 = ratings_by_title.index[ratings_by_title.title >= 2500]\n", "ratings_2500" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFM
title
American Beauty (1999)4.2389014.347301
Back to the Future (1985)3.9327074.009259
Fargo (1996)4.2176564.267780
Jurassic Park (1993)3.5794073.814197
Matrix, The (1999)4.1284054.362235
Men in Black (1997)3.8178443.719000
Raiders of the Lost Ark (1981)4.3321684.520597
Saving Private Ryan (1998)4.1147834.398941
Silence of the Lambs, The (1991)4.2719554.381944
Star Wars: Episode IV - A New Hope (1977)4.3029374.495307
Star Wars: Episode V - The Empire Strikes Back (1980)4.1064814.344577
Star Wars: Episode VI - Return of the Jedi (1983)3.8652374.069058
Terminator 2: Judgment Day (1991)3.7850884.115367
\n", "
" ], "text/plain": [ "gender F M\n", "title \n", "American Beauty (1999) 4.238901 4.347301\n", "Back to the Future (1985) 3.932707 4.009259\n", "Fargo (1996) 4.217656 4.267780\n", "Jurassic Park (1993) 3.579407 3.814197\n", "Matrix, The (1999) 4.128405 4.362235\n", "Men in Black (1997) 3.817844 3.719000\n", "Raiders of the Lost Ark (1981) 4.332168 4.520597\n", "Saving Private Ryan (1998) 4.114783 4.398941\n", "Silence of the Lambs, The (1991) 4.271955 4.381944\n", "Star Wars: Episode IV - A New Hope (1977) 4.302937 4.495307\n", "Star Wars: Episode V - The Empire Strikes Back ... 4.106481 4.344577\n", "Star Wars: Episode VI - Return of the Jedi (1983) 3.865237 4.069058\n", "Terminator 2: Judgment Day (1991) 3.785088 4.115367" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avg.loc[ratings_2500]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFM
title
Clean Slate (Coup de Torchon) (1981)5.03.857143
Ballad of Narayama, The (Narayama Bushiko) (1958)5.03.428571
Raw Deal (1948)5.03.307692
Bittersweet Motel (2000)5.0NaN
Skipped Parts (2000)5.04.000000
Lamerica (1994)5.04.666667
Gambler, The (A J�t�kos) (1997)5.03.166667
.........
Wings of Courage (1995)NaN3.000000
With Byrd at the South Pole (1930)NaN2.000000
With Friends Like These... (1998)NaN4.000000
Wooden Man's Bride, The (Wu Kui) (1994)NaN3.000000
Year of the Horse (1997)NaN3.250000
Zachariah (1971)NaN3.500000
Zero Kelvin (Kj�rlighetens kj�tere) (1995)NaN3.500000
\n", "

3706 rows × 2 columns

\n", "
" ], "text/plain": [ "gender F M\n", "title \n", "Clean Slate (Coup de Torchon) (1981) 5.0 3.857143\n", "Ballad of Narayama, The (Narayama Bushiko) (1958) 5.0 3.428571\n", "Raw Deal (1948) 5.0 3.307692\n", "Bittersweet Motel (2000) 5.0 NaN\n", "Skipped Parts (2000) 5.0 4.000000\n", "Lamerica (1994) 5.0 4.666667\n", "Gambler, The (A J�t�kos) (1997) 5.0 3.166667\n", "... ... ...\n", "Wings of Courage (1995) NaN 3.000000\n", "With Byrd at the South Pole (1930) NaN 2.000000\n", "With Friends Like These... (1998) NaN 4.000000\n", "Wooden Man's Bride, The (Wu Kui) (1994) NaN 3.000000\n", "Year of the Horse (1997) NaN 3.250000\n", "Zachariah (1971) NaN 3.500000\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000\n", "\n", "[3706 rows x 2 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted_titles = avg.sort_values(ascending = False, by = \"F\")\n", "sorted_titles" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "avg[\"mean_diff\"] = np.abs(avg.F - avg.M)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFMmean_diff
title
$1,000,000 Duck (1971)3.3750002.7619050.613095
'Night Mother (1986)3.3888893.3529410.035948
'Til There Was You (1997)2.6756762.7333330.057658
'burbs, The (1989)2.7934782.9620850.168607
...And Justice for All (1979)3.8285713.6890240.139547
1-900 (1994)2.0000003.0000001.000000
10 Things I Hate About You (1999)3.6465523.3119660.334586
............
Your Friends and Neighbors (1998)2.8888893.5365850.647696
Zachariah (1971)NaN3.500000NaN
Zed & Two Noughts, A (1985)3.5000003.3809520.119048
Zero Effect (1998)3.8644073.7231400.141266
Zero Kelvin (Kj�rlighetens kj�tere) (1995)NaN3.500000NaN
Zeus and Roxanne (1997)2.7777782.3571430.420635
eXistenZ (1999)3.0985923.2890860.190494
\n", "

3706 rows × 3 columns

\n", "
" ], "text/plain": [ "gender F M mean_diff\n", "title \n", "$1,000,000 Duck (1971) 3.375000 2.761905 0.613095\n", "'Night Mother (1986) 3.388889 3.352941 0.035948\n", "'Til There Was You (1997) 2.675676 2.733333 0.057658\n", "'burbs, The (1989) 2.793478 2.962085 0.168607\n", "...And Justice for All (1979) 3.828571 3.689024 0.139547\n", "1-900 (1994) 2.000000 3.000000 1.000000\n", "10 Things I Hate About You (1999) 3.646552 3.311966 0.334586\n", "... ... ... ...\n", "Your Friends and Neighbors (1998) 2.888889 3.536585 0.647696\n", "Zachariah (1971) NaN 3.500000 NaN\n", "Zed & Two Noughts, A (1985) 3.500000 3.380952 0.119048\n", "Zero Effect (1998) 3.864407 3.723140 0.141266\n", "Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000 NaN\n", "Zeus and Roxanne (1997) 2.777778 2.357143 0.420635\n", "eXistenZ (1999) 3.098592 3.289086 0.190494\n", "\n", "[3706 rows x 3 columns]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avg" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderFMmean_diff
title
Tigrero: A Film That Was Never Made (1994)1.0000004.3333333.333333
Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)4.0000001.0000003.000000
Neon Bible, The (1995)1.0000004.0000003.000000
James Dean Story, The (1957)4.0000001.0000003.000000
Country Life (1994)5.0000002.0000003.000000
Enfer, L' (1994)1.0000003.7500002.750000
Babyfever (1994)3.6666671.0000002.666667
............
Santa with Muscles (1996)1.0000001.0000000.000000
Wirey Spindell (1999)1.0000001.0000000.000000
Nelly & Monsieur Arnaud (1995)3.6000003.6000000.000000
Walk in the Sun, A (1945)4.0000004.0000000.000000
Inferno (1980)3.0000003.0000000.000000
Shopping (1994)2.0000002.0000000.000000
Loaded (1994)2.6666672.6666670.000000
\n", "

3446 rows × 3 columns

\n", "
" ], "text/plain": [ "gender F M \\\n", "title \n", "Tigrero: A Film That Was Never Made (1994) 1.000000 4.333333 \n", "Spiders, The (Die Spinnen, 1. Teil: Der Goldene... 4.000000 1.000000 \n", "Neon Bible, The (1995) 1.000000 4.000000 \n", "James Dean Story, The (1957) 4.000000 1.000000 \n", "Country Life (1994) 5.000000 2.000000 \n", "Enfer, L' (1994) 1.000000 3.750000 \n", "Babyfever (1994) 3.666667 1.000000 \n", "... ... ... \n", "Santa with Muscles (1996) 1.000000 1.000000 \n", "Wirey Spindell (1999) 1.000000 1.000000 \n", "Nelly & Monsieur Arnaud (1995) 3.600000 3.600000 \n", "Walk in the Sun, A (1945) 4.000000 4.000000 \n", "Inferno (1980) 3.000000 3.000000 \n", "Shopping (1994) 2.000000 2.000000 \n", "Loaded (1994) 2.666667 2.666667 \n", "\n", "gender mean_diff \n", "title \n", "Tigrero: A Film That Was Never Made (1994) 3.333333 \n", "Spiders, The (Die Spinnen, 1. Teil: Der Goldene... 3.000000 \n", "Neon Bible, The (1995) 3.000000 \n", "James Dean Story, The (1957) 3.000000 \n", "Country Life (1994) 3.000000 \n", "Enfer, L' (1994) 2.750000 \n", "Babyfever (1994) 2.666667 \n", "... ... 
\n", "Santa with Muscles (1996) 0.000000 \n", "Wirey Spindell (1999) 0.000000 \n", "Nelly & Monsieur Arnaud (1995) 0.000000 \n", "Walk in the Sun, A (1945) 0.000000 \n", "Inferno (1980) 0.000000 \n", "Shopping (1994) 0.000000 \n", "Loaded (1994) 0.000000 \n", "\n", "[3446 rows x 3 columns]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avg.sort_values(by = \"mean_diff\", ascending = False).dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }