{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "colab": { "name": "01-explore-data.ipynb", "provenance": [] } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "OsjvyRyVU3_G", "colab_type": "text" }, "source": [ "# Understanding the data\n", "\n", "In this first part, we load the data and perform some initial exploration on it. The main goal of this step is to acquire some basic knowledge about the data: how the various features are distributed, whether there are missing values, and so on." ] }, { "cell_type": "code", "metadata": { "id": "_wmNcEoRU3_K", "colab_type": "code", "colab": {} }, "source": [ "### imports\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "%matplotlib inline\n", "\n", "# load hourly data\n", "hourly_data = pd.read_csv('https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter01/data/hour.csv')" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "qDc53XA2U3_P", "colab_type": "text" }, "source": [ "Check the data format, the number of missing values in the data, and general statistics:" ] }, { "cell_type": "code", "metadata": { "id": "9QFrElsEU3_Q", "colab_type": "code", "colab": {}, "outputId": "6fda5579-55de-478b-a00d-ede2cfecd346" }, "source": [ "# print some generic statistics about the data\n", "print(f\"Shape of data: {hourly_data.shape}\")\n", "print(f\"Number of missing values in the data: {hourly_data.isnull().sum().sum()}\")\n", "\n", "# get statistics on the numerical columns\n", "hourly_data.describe().T" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": 
[ "Shape of data: (17379, 17)\n", "Number of missing values in the data: 0\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
instant17379.08690.0000005017.0295001.004345.50008690.000013034.500017379.0000
season17379.02.5016401.1069181.002.00003.00003.00004.0000
yr17379.00.5025610.5000080.000.00001.00001.00001.0000
mnth17379.06.5377753.4387761.004.00007.000010.000012.0000
hr17379.011.5467526.9144050.006.000012.000018.000023.0000
holiday17379.00.0287700.1671650.000.00000.00000.00001.0000
weekday17379.03.0036832.0057710.001.00003.00005.00006.0000
workingday17379.00.6827210.4654310.000.00001.00001.00001.0000
weathersit17379.01.4252830.6393571.001.00001.00002.00004.0000
temp17379.00.4969870.1925560.020.34000.50000.66001.0000
atemp17379.00.4757750.1718500.000.33330.48480.62121.0000
hum17379.00.6272290.1929300.000.48000.63000.78001.0000
windspeed17379.00.1900980.1223400.000.10450.19400.25370.8507
casual17379.035.67621849.3050300.004.000017.000048.0000367.0000
registered17379.0153.786869151.3572860.0034.0000115.0000220.0000886.0000
cnt17379.0189.463088181.3875991.0040.0000142.0000281.0000977.0000
\n", "
" ], "text/plain": [ " count mean std min 25% 50% \\\n", "instant 17379.0 8690.000000 5017.029500 1.00 4345.5000 8690.0000 \n", "season 17379.0 2.501640 1.106918 1.00 2.0000 3.0000 \n", "yr 17379.0 0.502561 0.500008 0.00 0.0000 1.0000 \n", "mnth 17379.0 6.537775 3.438776 1.00 4.0000 7.0000 \n", "hr 17379.0 11.546752 6.914405 0.00 6.0000 12.0000 \n", "holiday 17379.0 0.028770 0.167165 0.00 0.0000 0.0000 \n", "weekday 17379.0 3.003683 2.005771 0.00 1.0000 3.0000 \n", "workingday 17379.0 0.682721 0.465431 0.00 0.0000 1.0000 \n", "weathersit 17379.0 1.425283 0.639357 1.00 1.0000 1.0000 \n", "temp 17379.0 0.496987 0.192556 0.02 0.3400 0.5000 \n", "atemp 17379.0 0.475775 0.171850 0.00 0.3333 0.4848 \n", "hum 17379.0 0.627229 0.192930 0.00 0.4800 0.6300 \n", "windspeed 17379.0 0.190098 0.122340 0.00 0.1045 0.1940 \n", "casual 17379.0 35.676218 49.305030 0.00 4.0000 17.0000 \n", "registered 17379.0 153.786869 151.357286 0.00 34.0000 115.0000 \n", "cnt 17379.0 189.463088 181.387599 1.00 40.0000 142.0000 \n", "\n", " 75% max \n", "instant 13034.5000 17379.0000 \n", "season 3.0000 4.0000 \n", "yr 1.0000 1.0000 \n", "mnth 10.0000 12.0000 \n", "hr 18.0000 23.0000 \n", "holiday 0.0000 1.0000 \n", "weekday 5.0000 6.0000 \n", "workingday 1.0000 1.0000 \n", "weathersit 2.0000 4.0000 \n", "temp 0.6600 1.0000 \n", "atemp 0.6212 1.0000 \n", "hum 0.7800 1.0000 \n", "windspeed 0.2537 0.8507 \n", "casual 48.0000 367.0000 \n", "registered 220.0000 886.0000 \n", "cnt 281.0000 977.0000 " ] }, "metadata": { "tags": [] }, "execution_count": 2 } ] }, { "cell_type": "markdown", "metadata": { "id": "GwZrQdjybjKO", "colab_type": "text" }, "source": [ "Exercise 1.01: Preprocessing temporal and weather features " ] }, { "cell_type": "code", "metadata": { "id": "7gGtv-TzU3_W", "colab_type": "code", "colab": {}, "outputId": "8a79860f-0a5c-435a-c84f-859bdebf07d2" }, "source": [ "# create a copy of the original data\n", "preprocessed_data = hourly_data.copy()\n", "\n", "# transform seasons\n", 
"seasons_mapping = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}\n", "preprocessed_data['season'] = preprocessed_data['season'].map(seasons_mapping)\n", "\n", "# transform yr\n", "yr_mapping = {0: 2011, 1: 2012}\n", "preprocessed_data['yr'] = preprocessed_data['yr'].map(yr_mapping)\n", "\n", "# transform weekday\n", "weekday_mapping = {0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}\n", "preprocessed_data['weekday'] = preprocessed_data['weekday'].map(weekday_mapping)\n", "\n", "# transform weathersit\n", "weather_mapping = {1: 'clear', 2: 'cloudy', 3: 'light_rain_snow', 4: 'heavy_rain_snow'}\n", "preprocessed_data['weathersit'] = preprocessed_data['weathersit'].map(weather_mapping)\n", "\n", "# Series.map leaves NaN for codes that are absent from a mapping, so fail loudly if any slipped through\n", "mapped_cols = ['season', 'yr', 'weekday', 'weathersit']\n", "assert preprocessed_data[mapped_cols].notna().all().all(), 'found category codes without a mapping'\n", "\n", "# transform hum and windspeed (rescale by fixed factors; presumably this undoes the dataset's normalization - confirm against the data description)\n", "preprocessed_data['hum'] = preprocessed_data['hum']*100\n", "preprocessed_data['windspeed'] = preprocessed_data['windspeed']*67\n", "\n", "# visualize preprocessed columns\n", "cols = ['season', 'yr', 'weekday', 'weathersit', 'hum', 'windspeed']\n", "preprocessed_data[cols].sample(10, random_state=123)" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seasonyrweekdayweathersithumwindspeed
5792summer2011Saturdayclear74.08.9981
7823fall2011Sundayclear43.031.0009
15426fall2012Tuesdaycloudy77.06.0032
15028fall2012Sundayclear51.022.0028
12290spring2012Fridaycloudy89.012.9980
3262spring2011Fridayclear64.07.0015
10763spring2012Thursdayclear42.023.9994
12384spring2012Tuesdaylight_rain_snow82.011.0014
6051summer2011Wednesdayclear52.019.0012
948winter2011Saturdayclear80.00.0000
\n", "
" ], "text/plain": [ " season yr weekday weathersit hum windspeed\n", "5792 summer 2011 Saturday clear 74.0 8.9981\n", "7823 fall 2011 Sunday clear 43.0 31.0009\n", "15426 fall 2012 Tuesday cloudy 77.0 6.0032\n", "15028 fall 2012 Sunday clear 51.0 22.0028\n", "12290 spring 2012 Friday cloudy 89.0 12.9980\n", "3262 spring 2011 Friday clear 64.0 7.0015\n", "10763 spring 2012 Thursday clear 42.0 23.9994\n", "12384 spring 2012 Tuesday light_rain_snow 82.0 11.0014\n", "6051 summer 2011 Wednesday clear 52.0 19.0012\n", "948 winter 2011 Saturday clear 80.0 0.0000" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] } ] }