{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"colab": {
"name": "01-explore-data.ipynb",
"provenance": []
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "OsjvyRyVU3_G",
"colab_type": "text"
},
"source": [
"# Understanding the data\n",
"\n",
"In this first part, we load the data and perform some initial exploration on it. The main goal of this step is to acquire some basic knowledge about the data, how the various features are distributed, if there are missing values in it and so on."
]
},
{
"cell_type": "code",
"metadata": {
"id": "_wmNcEoRU3_K",
"colab_type": "code",
"colab": {}
},
"source": [
"### imports\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"%matplotlib inline\n",
"\n",
"# load hourly data\n",
"hourly_data = pd.read_csv('https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter01/data/hour.csv')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "qDc53XA2U3_P",
"colab_type": "text"
},
"source": [
"Check data format, number of missing values in the data and general statistics:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "9QFrElsEU3_Q",
"colab_type": "code",
"colab": {},
"outputId": "6fda5579-55de-478b-a00d-ede2cfecd346"
},
"source": [
"# print some generic statistics about the data\n",
"print(f\"Shape of data: {hourly_data.shape}\")\n",
"print(f\"Number of missing values in the data: {hourly_data.isnull().sum().sum()}\")\n",
"\n",
"# get statistics on the numerical columns\n",
"hourly_data.describe().T"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Shape of data: (17379, 17)\n",
"Number of missing values in the data: 0\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" instant | \n",
" 17379.0 | \n",
" 8690.000000 | \n",
" 5017.029500 | \n",
" 1.00 | \n",
" 4345.5000 | \n",
" 8690.0000 | \n",
" 13034.5000 | \n",
" 17379.0000 | \n",
"
\n",
" \n",
" season | \n",
" 17379.0 | \n",
" 2.501640 | \n",
" 1.106918 | \n",
" 1.00 | \n",
" 2.0000 | \n",
" 3.0000 | \n",
" 3.0000 | \n",
" 4.0000 | \n",
"
\n",
" \n",
" yr | \n",
" 17379.0 | \n",
" 0.502561 | \n",
" 0.500008 | \n",
" 0.00 | \n",
" 0.0000 | \n",
" 1.0000 | \n",
" 1.0000 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" mnth | \n",
" 17379.0 | \n",
" 6.537775 | \n",
" 3.438776 | \n",
" 1.00 | \n",
" 4.0000 | \n",
" 7.0000 | \n",
" 10.0000 | \n",
" 12.0000 | \n",
"
\n",
" \n",
" hr | \n",
" 17379.0 | \n",
" 11.546752 | \n",
" 6.914405 | \n",
" 0.00 | \n",
" 6.0000 | \n",
" 12.0000 | \n",
" 18.0000 | \n",
" 23.0000 | \n",
"
\n",
" \n",
" holiday | \n",
" 17379.0 | \n",
" 0.028770 | \n",
" 0.167165 | \n",
" 0.00 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" weekday | \n",
" 17379.0 | \n",
" 3.003683 | \n",
" 2.005771 | \n",
" 0.00 | \n",
" 1.0000 | \n",
" 3.0000 | \n",
" 5.0000 | \n",
" 6.0000 | \n",
"
\n",
" \n",
" workingday | \n",
" 17379.0 | \n",
" 0.682721 | \n",
" 0.465431 | \n",
" 0.00 | \n",
" 0.0000 | \n",
" 1.0000 | \n",
" 1.0000 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" weathersit | \n",
" 17379.0 | \n",
" 1.425283 | \n",
" 0.639357 | \n",
" 1.00 | \n",
" 1.0000 | \n",
" 1.0000 | \n",
" 2.0000 | \n",
" 4.0000 | \n",
"
\n",
" \n",
" temp | \n",
" 17379.0 | \n",
" 0.496987 | \n",
" 0.192556 | \n",
" 0.02 | \n",
" 0.3400 | \n",
" 0.5000 | \n",
" 0.6600 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" atemp | \n",
" 17379.0 | \n",
" 0.475775 | \n",
" 0.171850 | \n",
" 0.00 | \n",
" 0.3333 | \n",
" 0.4848 | \n",
" 0.6212 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" hum | \n",
" 17379.0 | \n",
" 0.627229 | \n",
" 0.192930 | \n",
" 0.00 | \n",
" 0.4800 | \n",
" 0.6300 | \n",
" 0.7800 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" windspeed | \n",
" 17379.0 | \n",
" 0.190098 | \n",
" 0.122340 | \n",
" 0.00 | \n",
" 0.1045 | \n",
" 0.1940 | \n",
" 0.2537 | \n",
" 0.8507 | \n",
"
\n",
" \n",
" casual | \n",
" 17379.0 | \n",
" 35.676218 | \n",
" 49.305030 | \n",
" 0.00 | \n",
" 4.0000 | \n",
" 17.0000 | \n",
" 48.0000 | \n",
" 367.0000 | \n",
"
\n",
" \n",
" registered | \n",
" 17379.0 | \n",
" 153.786869 | \n",
" 151.357286 | \n",
" 0.00 | \n",
" 34.0000 | \n",
" 115.0000 | \n",
" 220.0000 | \n",
" 886.0000 | \n",
"
\n",
" \n",
" cnt | \n",
" 17379.0 | \n",
" 189.463088 | \n",
" 181.387599 | \n",
" 1.00 | \n",
" 40.0000 | \n",
" 142.0000 | \n",
" 281.0000 | \n",
" 977.0000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"instant 17379.0 8690.000000 5017.029500 1.00 4345.5000 8690.0000 \n",
"season 17379.0 2.501640 1.106918 1.00 2.0000 3.0000 \n",
"yr 17379.0 0.502561 0.500008 0.00 0.0000 1.0000 \n",
"mnth 17379.0 6.537775 3.438776 1.00 4.0000 7.0000 \n",
"hr 17379.0 11.546752 6.914405 0.00 6.0000 12.0000 \n",
"holiday 17379.0 0.028770 0.167165 0.00 0.0000 0.0000 \n",
"weekday 17379.0 3.003683 2.005771 0.00 1.0000 3.0000 \n",
"workingday 17379.0 0.682721 0.465431 0.00 0.0000 1.0000 \n",
"weathersit 17379.0 1.425283 0.639357 1.00 1.0000 1.0000 \n",
"temp 17379.0 0.496987 0.192556 0.02 0.3400 0.5000 \n",
"atemp 17379.0 0.475775 0.171850 0.00 0.3333 0.4848 \n",
"hum 17379.0 0.627229 0.192930 0.00 0.4800 0.6300 \n",
"windspeed 17379.0 0.190098 0.122340 0.00 0.1045 0.1940 \n",
"casual 17379.0 35.676218 49.305030 0.00 4.0000 17.0000 \n",
"registered 17379.0 153.786869 151.357286 0.00 34.0000 115.0000 \n",
"cnt 17379.0 189.463088 181.387599 1.00 40.0000 142.0000 \n",
"\n",
" 75% max \n",
"instant 13034.5000 17379.0000 \n",
"season 3.0000 4.0000 \n",
"yr 1.0000 1.0000 \n",
"mnth 10.0000 12.0000 \n",
"hr 18.0000 23.0000 \n",
"holiday 0.0000 1.0000 \n",
"weekday 5.0000 6.0000 \n",
"workingday 1.0000 1.0000 \n",
"weathersit 2.0000 4.0000 \n",
"temp 0.6600 1.0000 \n",
"atemp 0.6212 1.0000 \n",
"hum 0.7800 1.0000 \n",
"windspeed 0.2537 0.8507 \n",
"casual 48.0000 367.0000 \n",
"registered 220.0000 886.0000 \n",
"cnt 281.0000 977.0000 "
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GwZrQdjybjKO",
"colab_type": "text"
},
"source": [
"Exercise 1.01: Preprocessing temporal and weather features "
]
},
{
"cell_type": "code",
"metadata": {
"id": "7gGtv-TzU3_W",
"colab_type": "code",
"colab": {},
"outputId": "8a79860f-0a5c-435a-c84f-859bdebf07d2"
},
"source": [
"# create a copy of the original data\n",
"preprocessed_data = hourly_data.copy()\n",
"\n",
"# tranform seasons\n",
"seasons_mapping = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}\n",
"preprocessed_data['season'] = preprocessed_data['season'].apply(lambda x: seasons_mapping[x])\n",
"\n",
"# transform yr\n",
"yr_mapping = {0: 2011, 1: 2012}\n",
"preprocessed_data['yr'] = preprocessed_data['yr'].apply(lambda x: yr_mapping[x])\n",
"\n",
"# transform weekday\n",
"weekday_mapping = {0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}\n",
"preprocessed_data['weekday'] = preprocessed_data['weekday'].apply(lambda x: weekday_mapping[x])\n",
"\n",
"# transform weathersit\n",
"weather_mapping = {1: 'clear', 2: 'cloudy', 3: 'light_rain_snow', 4: 'heavy_rain_snow'}\n",
"preprocessed_data['weathersit'] = preprocessed_data['weathersit'].apply(lambda x: weather_mapping[x]) \n",
"\n",
"# transorm hum and windspeed\n",
"preprocessed_data['hum'] = preprocessed_data['hum']*100\n",
"preprocessed_data['windspeed'] = preprocessed_data['windspeed']*67\n",
"\n",
"# visualize preprocessed columns\n",
"cols = ['season', 'yr', 'weekday', 'weathersit', 'hum', 'windspeed']\n",
"preprocessed_data[cols].sample(10, random_state=123)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" season | \n",
" yr | \n",
" weekday | \n",
" weathersit | \n",
" hum | \n",
" windspeed | \n",
"
\n",
" \n",
" \n",
" \n",
" 5792 | \n",
" summer | \n",
" 2011 | \n",
" Saturday | \n",
" clear | \n",
" 74.0 | \n",
" 8.9981 | \n",
"
\n",
" \n",
" 7823 | \n",
" fall | \n",
" 2011 | \n",
" Sunday | \n",
" clear | \n",
" 43.0 | \n",
" 31.0009 | \n",
"
\n",
" \n",
" 15426 | \n",
" fall | \n",
" 2012 | \n",
" Tuesday | \n",
" cloudy | \n",
" 77.0 | \n",
" 6.0032 | \n",
"
\n",
" \n",
" 15028 | \n",
" fall | \n",
" 2012 | \n",
" Sunday | \n",
" clear | \n",
" 51.0 | \n",
" 22.0028 | \n",
"
\n",
" \n",
" 12290 | \n",
" spring | \n",
" 2012 | \n",
" Friday | \n",
" cloudy | \n",
" 89.0 | \n",
" 12.9980 | \n",
"
\n",
" \n",
" 3262 | \n",
" spring | \n",
" 2011 | \n",
" Friday | \n",
" clear | \n",
" 64.0 | \n",
" 7.0015 | \n",
"
\n",
" \n",
" 10763 | \n",
" spring | \n",
" 2012 | \n",
" Thursday | \n",
" clear | \n",
" 42.0 | \n",
" 23.9994 | \n",
"
\n",
" \n",
" 12384 | \n",
" spring | \n",
" 2012 | \n",
" Tuesday | \n",
" light_rain_snow | \n",
" 82.0 | \n",
" 11.0014 | \n",
"
\n",
" \n",
" 6051 | \n",
" summer | \n",
" 2011 | \n",
" Wednesday | \n",
" clear | \n",
" 52.0 | \n",
" 19.0012 | \n",
"
\n",
" \n",
" 948 | \n",
" winter | \n",
" 2011 | \n",
" Saturday | \n",
" clear | \n",
" 80.0 | \n",
" 0.0000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" season yr weekday weathersit hum windspeed\n",
"5792 summer 2011 Saturday clear 74.0 8.9981\n",
"7823 fall 2011 Sunday clear 43.0 31.0009\n",
"15426 fall 2012 Tuesday cloudy 77.0 6.0032\n",
"15028 fall 2012 Sunday clear 51.0 22.0028\n",
"12290 spring 2012 Friday cloudy 89.0 12.9980\n",
"3262 spring 2011 Friday clear 64.0 7.0015\n",
"10763 spring 2012 Thursday clear 42.0 23.9994\n",
"12384 spring 2012 Tuesday light_rain_snow 82.0 11.0014\n",
"6051 summer 2011 Wednesday clear 52.0 19.0012\n",
"948 winter 2011 Saturday clear 80.0 0.0000"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
}
]
}