{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import warnings\n",
    "# Silence library warnings so they do not clutter the notebook output\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')\n",
    "plots.rcParams[\"patch.force_edgecolor\"] = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lecture 8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Categorical Distribution ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the movie table from a CSV file in the working directory\n",
    "top = Table.read_table('top_movies_2017.csv')\n",
    "top"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the Studio column\n",
    "studios = top.select('Studio')\n",
    "studios"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per distinct studio, with the number of rows it appears in\n",
    "studio_distribution = studios.group('Studio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "studio_distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total of the counts column (column index 1)\n",
    "sum(studio_distribution.column(1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bar Charts ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "studio_distribution.barh('Studio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same chart with the rows sorted by count, largest first\n",
    "studio_distribution.sort(1, descending=True).barh(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Numerical Distribution ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Age of each movie, measured from the fixed reference year 2018\n",
    "ages = 2018 - top.column('Year')\n",
    "# Rebind `top` so the Age column is available to every later cell\n",
    "top = top.with_column('Age', ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binning ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smallest and largest age in the data\n",
    "[min(ages), max(ages)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bin edges of unequal width\n",
    "my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100)\n",
    "my_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top.bin('Age', bins=my_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total count across all bins\n",
    "sum(top.bin('Age', bins=my_bins).column(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equal-width bins with edges 0, 25, 50, 75, 100\n",
    "top.bin('Age', bins=np.arange(0, 101, 25))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Edges 0, 25, 50 only\n",
    "top.bin('Age', bins=np.arange(0, 60, 25))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows with Age exactly 50, the right-hand edge of the last bin above\n",
    "top.where('Age', 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histograms ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top.bin('Age', bins=my_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top.hist('Age', bins=my_bins, unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What *not* to do:\n",
    "top.hist('Age', bins=my_bins, unit='Year', normed=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top.hist('Age', bins=my_bins, unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten equal-width bins with edges 0, 10, ..., 100\n",
    "top.hist('Age', bins=np.arange(0, 110, 10), unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An integer is passed here instead of an array of bin edges\n",
    "top.hist('Age', bins=20, unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No bins argument: hist uses its default binning\n",
    "top.hist('Age', unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top.hist('Age', bins=my_bins, unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "distribution = top.bin('Age', bins=my_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Worked example: height of the [40, 65) bar in the histogram above.\n",
    "bin_left, bin_right = 40, 65\n",
    "\n",
    "# Count of movies in that bin, read from the binned table instead of being\n",
    "# copied by hand (the original walkthrough typed it in as 52 out of 200).\n",
    "count_in_bin = distribution.where('bin', bin_left).column(1).item(0)\n",
    "\n",
    "percent = (count_in_bin / top.num_rows) * 100\n",
    "percent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Width of the bin in years\n",
    "width = bin_right - bin_left\n",
    "width"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar height = percent of movies in the bin per year of bin width\n",
    "height = percent / width\n",
    "height"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}