{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": "## Exploring a dataset in the Notebook" }, { "cell_type": "markdown", "metadata": {}, "source": "### Provenance of the data" }, { "cell_type": "markdown", "metadata": {}, "source": "### Downloading and loading a dataset" }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n%matplotlib inline" }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": "%cd ~/minibook/chapter2/" }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": "!wget https://raw.githubusercontent.com/ipython-books/minibook-2nd-data/master/nyc_taxi.zip\n!unzip nyc_taxi.zip" }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": "nyc_data.csv nyc_fare.csv [...]" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": "%ls data" }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": "data_filename = 'data/nyc_data.csv'\nfare_filename = 'data/nyc_fare.csv'" }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": "data = pd.read_csv(data_filename, parse_dates=['pickup_datetime',\n 'dropoff_datetime'])\nfare = pd.read_csv(fare_filename, parse_dates=['pickup_datetime'])" }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": "data.head(3)" }, { "cell_type": "markdown", "metadata": {}, "source": "### Making plots with matplotlib" }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": "Index(['medallion',\n ...\n 'pickup_datetime',\n 'dropoff_datetime',\n 'passenger_count',\n 'trip_time_in_secs',\n 'trip_distance',\n 'pickup_longitude',\n 'pickup_latitude',\n 'dropoff_longitude',\n 'dropoff_latitude'], dtype='object')" }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": "data.columns" }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": "p_lng = data.pickup_longitude\np_lat = data.pickup_latitude\nd_lng = data.dropoff_longitude\nd_lat = data.dropoff_latitude" }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 -73.955925\n1 -74.005501\n...\n846943 -73.978477\n846944 -73.987206\nName: pickup_longitude, Length: 846945, dtype: float64" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": "p_lng" }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": "def lat_lng_to_pixels(lat, lng):\n lat_rad = lat * np.pi / 180.0\n lat_rad = np.log(np.tan((lat_rad + np.pi / 2.0) / 2.0))\n x = 100 * (lng + 180.0) / 360.0\n y = 100 * (lat_rad - np.pi) / (2.0 * np.pi)\n return (x, y)" }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": "px, py = lat_lng_to_pixels(p_lat, p_lng)" }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 29.456688\n1 29.442916\n...\n846943 29.450423\n846944 29.447998\nName: pickup_longitude, dtype: float64" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": "px" }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": "plt.scatter(px, py)" }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": "plt.figure(figsize=(8, 6))\nplt.scatter(px, py, s=.1, alpha=.03)\nplt.axis('equal')\nplt.xlim(29.40, 29.55)\nplt.ylim(-37.63, -37.54)\nplt.axis('off')" }, { "cell_type": "markdown", "metadata": {}, "source": "### Descriptive statistics with pandas and seaborn" }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": "(846945, 29.417137499999995, 29.714313055555561)" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": "px.count(), px.min(), px.max()" }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": "(29.451345807768575, 29.449418333333337, 0.0097616942794720614)" }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": "px.mean(), px.median(), px.std()" }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": "!conda install seaborn -q -y" }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": "'0.6.0'" }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": "import seaborn as sns\nsns.__version__" }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": "data.trip_distance.hist(bins=np.linspace(0., 10., 100))" } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 0 }