{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Manupaltion using Python Pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The following operations in Pandas are done to achieve the same results shown in vignette of data.table package in R."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Basics"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" month | \n",
" day | \n",
" dep_time | \n",
" dep_delay | \n",
" arr_time | \n",
" arr_delay | \n",
" cancelled | \n",
" carrier | \n",
" tailnum | \n",
" flight | \n",
" origin | \n",
" dest | \n",
" air_time | \n",
" distance | \n",
" hour | \n",
" min | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 914 | \n",
" 14 | \n",
" 1238 | \n",
" 13 | \n",
" 0 | \n",
" AA | \n",
" N338AA | \n",
" 1 | \n",
" JFK | \n",
" LAX | \n",
" 359 | \n",
" 2475 | \n",
" 9 | \n",
" 14 | \n",
"
\n",
" \n",
" 1 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 1157 | \n",
" -3 | \n",
" 1523 | \n",
" 13 | \n",
" 0 | \n",
" AA | \n",
" N335AA | \n",
" 3 | \n",
" JFK | \n",
" LAX | \n",
" 363 | \n",
" 2475 | \n",
" 11 | \n",
" 57 | \n",
"
\n",
" \n",
" 2 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 1902 | \n",
" 2 | \n",
" 2224 | \n",
" 9 | \n",
" 0 | \n",
" AA | \n",
" N327AA | \n",
" 21 | \n",
" JFK | \n",
" LAX | \n",
" 351 | \n",
" 2475 | \n",
" 19 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 722 | \n",
" -8 | \n",
" 1014 | \n",
" -26 | \n",
" 0 | \n",
" AA | \n",
" N3EHAA | \n",
" 29 | \n",
" LGA | \n",
" PBI | \n",
" 157 | \n",
" 1035 | \n",
" 7 | \n",
" 22 | \n",
"
\n",
" \n",
" 4 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 1347 | \n",
" 2 | \n",
" 1706 | \n",
" 1 | \n",
" 0 | \n",
" AA | \n",
" N319AA | \n",
" 117 | \n",
" JFK | \n",
" LAX | \n",
" 350 | \n",
" 2475 | \n",
" 13 | \n",
" 47 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year month day dep_time dep_delay arr_time arr_delay cancelled \\\n",
"0 2014 1 1 914 14 1238 13 0 \n",
"1 2014 1 1 1157 -3 1523 13 0 \n",
"2 2014 1 1 1902 2 2224 9 0 \n",
"3 2014 1 1 722 -8 1014 -26 0 \n",
"4 2014 1 1 1347 2 1706 1 0 \n",
"\n",
" carrier tailnum flight origin dest air_time distance hour min \n",
"0 AA N338AA 1 JFK LAX 359 2475 9 14 \n",
"1 AA N335AA 3 JFK LAX 363 2475 11 57 \n",
"2 AA N327AA 21 JFK LAX 351 2475 19 2 \n",
"3 AA N3EHAA 29 LGA PBI 157 1035 7 22 \n",
"4 AA N319AA 117 JFK LAX 350 2475 13 47 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read csv file in pandas:\n",
"\n",
"flights = pd.read_csv(\"D:\\PythonPandas\\FlightsNew.csv\")\n",
"flights.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" In this notebook, we will :
\n",
"start with basics - how to subset rows, select and compute on columns
\n",
"and then we will look at performing data aggregations by group.
"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get all the flights with “JFK” as the origin airport in the month of March."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" year | \n",
" month | \n",
" day | \n",
" dep_time | \n",
" dep_delay | \n",
" arr_time | \n",
" arr_delay | \n",
" cancelled | \n",
" carrier | \n",
" tailnum | \n",
" flight | \n",
" origin | \n",
" dest | \n",
" air_time | \n",
" distance | \n",
" hour | \n",
" min | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 43609 | \n",
" 2014 | \n",
" 3 | \n",
" 1 | \n",
" 849 | \n",
" -11 | \n",
" 1306 | \n",
" 36 | \n",
" 0 | \n",
" AA | \n",
" N784AA | \n",
" 1 | \n",
" JFK | \n",
" LAX | \n",
" 375 | \n",
" 2475 | \n",
" 8 | \n",
" 49 | \n",
"
\n",
" \n",
" 1 | \n",
" 43610 | \n",
" 2014 | \n",
" 3 | \n",
" 1 | \n",
" 1157 | \n",
" -3 | \n",
" 1529 | \n",
" 14 | \n",
" 0 | \n",
" AA | \n",
" N787AA | \n",
" 3 | \n",
" JFK | \n",
" LAX | \n",
" 368 | \n",
" 2475 | \n",
" 11 | \n",
" 57 | \n",
"
\n",
" \n",
" 2 | \n",
" 43611 | \n",
" 2014 | \n",
" 3 | \n",
" 1 | \n",
" 652 | \n",
" -3 | \n",
" 1019 | \n",
" 4 | \n",
" 0 | \n",
" AA | \n",
" N785AA | \n",
" 9 | \n",
" JFK | \n",
" LAX | \n",
" 366 | \n",
" 2475 | \n",
" 6 | \n",
" 52 | \n",
"
\n",
" \n",
" 3 | \n",
" 43612 | \n",
" 2014 | \n",
" 3 | \n",
" 1 | \n",
" 1117 | \n",
" 47 | \n",
" 1502 | \n",
" 77 | \n",
" 0 | \n",
" AA | \n",
" N788AA | \n",
" 19 | \n",
" JFK | \n",
" LAX | \n",
" 379 | \n",
" 2475 | \n",
" 11 | \n",
" 17 | \n",
"
\n",
" \n",
" 4 | \n",
" 43613 | \n",
" 2014 | \n",
" 3 | \n",
" 1 | \n",
" 1950 | \n",
" 50 | \n",
" 2321 | \n",
" 66 | \n",
" 0 | \n",
" AA | \n",
" N336AA | \n",
" 21 | \n",
" JFK | \n",
" LAX | \n",
" 350 | \n",
" 2475 | \n",
" 19 | \n",
" 50 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index year month day dep_time dep_delay arr_time arr_delay \\\n",
"0 43609 2014 3 1 849 -11 1306 36 \n",
"1 43610 2014 3 1 1157 -3 1529 14 \n",
"2 43611 2014 3 1 652 -3 1019 4 \n",
"3 43612 2014 3 1 1117 47 1502 77 \n",
"4 43613 2014 3 1 1950 50 2321 66 \n",
"\n",
" cancelled carrier tailnum flight origin dest air_time distance hour \\\n",
"0 0 AA N784AA 1 JFK LAX 375 2475 8 \n",
"1 0 AA N787AA 3 JFK LAX 368 2475 11 \n",
"2 0 AA N785AA 9 JFK LAX 366 2475 6 \n",
"3 0 AA N788AA 19 JFK LAX 379 2475 11 \n",
"4 0 AA N336AA 21 JFK LAX 350 2475 19 \n",
"\n",
" min \n",
"0 49 \n",
"1 57 \n",
"2 52 \n",
"3 17 \n",
"4 50 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ans=flights.loc[(flights[\"origin\"].isin([\"JFK\"]))&(flights[\"month\"]==3)].reset_index()\n",
"ans.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get the first two rows from flights. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" month | \n",
" day | \n",
" dep_time | \n",
" dep_delay | \n",
" arr_time | \n",
" arr_delay | \n",
" cancelled | \n",
" carrier | \n",
" tailnum | \n",
" flight | \n",
" origin | \n",
" dest | \n",
" air_time | \n",
" distance | \n",
" hour | \n",
" min | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 914 | \n",
" 14 | \n",
" 1238 | \n",
" 13 | \n",
" 0 | \n",
" AA | \n",
" N338AA | \n",
" 1 | \n",
" JFK | \n",
" LAX | \n",
" 359 | \n",
" 2475 | \n",
" 9 | \n",
" 14 | \n",
"
\n",
" \n",
" 1 | \n",
" 2014 | \n",
" 1 | \n",
" 1 | \n",
" 1157 | \n",
" -3 | \n",
" 1523 | \n",
" 13 | \n",
" 0 | \n",
" AA | \n",
" N335AA | \n",
" 3 | \n",
" JFK | \n",
" LAX | \n",
" 363 | \n",
" 2475 | \n",
" 11 | \n",
" 57 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year month day dep_time dep_delay arr_time arr_delay cancelled \\\n",
"0 2014 1 1 914 14 1238 13 0 \n",
"1 2014 1 1 1157 -3 1523 13 0 \n",
"\n",
" carrier tailnum flight origin dest air_time distance hour min \n",
"0 AA N338AA 1 JFK LAX 359 2475 9 14 \n",
"1 AA N335AA 3 JFK LAX 363 2475 11 57 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights_tworows=flights.loc[0:1]\n",
"flights_tworows.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sort flights first by column origin in ascending order, and then by dest in descending order:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" year | \n",
" month | \n",
" day | \n",
" dep_time | \n",
" dep_delay | \n",
" arr_time | \n",
" arr_delay | \n",
" cancelled | \n",
" carrier | \n",
" tailnum | \n",
" flight | \n",
" origin | \n",
" dest | \n",
" air_time | \n",
" distance | \n",
" hour | \n",
" min | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2790 | \n",
" 2014 | \n",
" 1 | \n",
" 5 | \n",
" 836 | \n",
" 6 | \n",
" 1151 | \n",
" 49 | \n",
" 0 | \n",
" EV | \n",
" N12175 | \n",
" 4419 | \n",
" EWR | \n",
" XNA | \n",
" 195 | \n",
" 1131 | \n",
" 8 | \n",
" 36 | \n",
"
\n",
" \n",
" 1 | \n",
" 3351 | \n",
" 2014 | \n",
" 1 | \n",
" 6 | \n",
" 833 | \n",
" 7 | \n",
" 1111 | \n",
" 13 | \n",
" 0 | \n",
" EV | \n",
" N24128 | \n",
" 4419 | \n",
" EWR | \n",
" XNA | \n",
" 190 | \n",
" 1131 | \n",
" 8 | \n",
" 33 | \n",
"
\n",
" \n",
" 2 | \n",
" 4066 | \n",
" 2014 | \n",
" 1 | \n",
" 7 | \n",
" 811 | \n",
" -6 | \n",
" 1035 | \n",
" -13 | \n",
" 0 | \n",
" EV | \n",
" N12142 | \n",
" 4419 | \n",
" EWR | \n",
" XNA | \n",
" 179 | \n",
" 1131 | \n",
" 8 | \n",
" 11 | \n",
"
\n",
" \n",
" 3 | \n",
" 4825 | \n",
" 2014 | \n",
" 1 | \n",
" 8 | \n",
" 810 | \n",
" -7 | \n",
" 1036 | \n",
" -12 | \n",
" 0 | \n",
" EV | \n",
" N11193 | \n",
" 4419 | \n",
" EWR | \n",
" XNA | \n",
" 184 | \n",
" 1131 | \n",
" 8 | \n",
" 10 | \n",
"
\n",
" \n",
" 4 | \n",
" 5658 | \n",
" 2014 | \n",
" 1 | \n",
" 9 | \n",
" 833 | \n",
" 16 | \n",
" 1055 | \n",
" 7 | \n",
" 0 | \n",
" EV | \n",
" N14198 | \n",
" 4419 | \n",
" EWR | \n",
" XNA | \n",
" 181 | \n",
" 1131 | \n",
" 8 | \n",
" 33 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index year month day dep_time dep_delay arr_time arr_delay \\\n",
"0 2790 2014 1 5 836 6 1151 49 \n",
"1 3351 2014 1 6 833 7 1111 13 \n",
"2 4066 2014 1 7 811 -6 1035 -13 \n",
"3 4825 2014 1 8 810 -7 1036 -12 \n",
"4 5658 2014 1 9 833 16 1055 7 \n",
"\n",
" cancelled carrier tailnum flight origin dest air_time distance hour \\\n",
"0 0 EV N12175 4419 EWR XNA 195 1131 8 \n",
"1 0 EV N24128 4419 EWR XNA 190 1131 8 \n",
"2 0 EV N12142 4419 EWR XNA 179 1131 8 \n",
"3 0 EV N11193 4419 EWR XNA 184 1131 8 \n",
"4 0 EV N14198 4419 EWR XNA 181 1131 8 \n",
"\n",
" min \n",
"0 36 \n",
"1 33 \n",
"2 11 \n",
"3 10 \n",
"4 33 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights_sort=flights.sort_values([\"origin\",\"dest\"],ascending=[1,0]).reset_index()\n",
"flights_sort.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Select both arr_delay and dep_delay columns."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" arr_delay | \n",
" dep_delay | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 13 | \n",
" 14 | \n",
"
\n",
" \n",
" 1 | \n",
" 13 | \n",
" -3 | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" -26 | \n",
" -8 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" arr_delay dep_delay\n",
"0 13 14\n",
"1 13 -3\n",
"2 9 2\n",
"3 -26 -8\n",
"4 1 2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arr_dep_delay=flights[[\"arr_delay\",\"dep_delay\"]].copy()\n",
"arr_dep_delay.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Select both arr_delay and dep_delay columns and rename them to delay_arr and delay_dep."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" delay_arr | \n",
" delay_dep | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 13 | \n",
" 14 | \n",
"
\n",
" \n",
" 1 | \n",
" 13 | \n",
" -3 | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" -26 | \n",
" -8 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" delay_arr delay_dep\n",
"0 13 14\n",
"1 13 -3\n",
"2 9 2\n",
"3 -26 -8\n",
"4 1 2"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"delay_arr_dep=flights[[\"arr_delay\",\"dep_delay\"]].rename(columns={\"arr_delay\":\"delay_arr\",\"dep_delay\":\"delay_dep\"})\n",
"delay_arr_dep.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### How many trips have had total delay < 0?"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"141814"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Step1: Create a column called \"delay\" in \"flights\" dataframe and assign 0 to it.\n",
"flights[\"delay\"]=0\n",
"#Step2: Check for cases where arr_delay + dep_delay is less than 0\n",
"flights.loc[(flights[\"arr_delay\"]+flights[\"dep_delay\"])<0,\"delay\"]=1\n",
"#Step3: Count the number of 1 in the delay column\n",
"flights.loc[flights[\"delay\"]==1][\"delay\"].count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Calculate the average arrival and departure delay for all flights with “JFK” as the origin airport in the month of June."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"arr_delay 7.731465\n",
"dep_delay 11.446167\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights.loc[flights[\"origin\"].isin([\"JFK\"])][[\"arr_delay\",\"dep_delay\"]].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### How many trips have been made in 2014 from “JFK” airport in the month of June?"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8422"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(flights.loc[(flights[\"origin\"].isin([\"JFK\"]))&(flights[\"month\"]==6)][\"origin\"].tolist())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Aggregations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### How can we get the number of trips corresponding to each origin airport?"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"aggregations={\"origin\":\"count\"}\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" origin | \n",
" number_trips | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EWR | \n",
" 87400 | \n",
"
\n",
" \n",
" 1 | \n",
" JFK | \n",
" 81483 | \n",
"
\n",
" \n",
" 2 | \n",
" LGA | \n",
" 84433 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" origin number_trips\n",
"0 EWR 87400\n",
"1 JFK 81483\n",
"2 LGA 84433"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights.groupby([\"origin\"]).agg({\"origin\":\"count\"}).rename(columns={\"origin\":\"number_trips\"}).reset_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### How can we calculate the number of trips for each origin airport for carrier code “AA”?"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" origin | \n",
" number_trips | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EWR | \n",
" 2649 | \n",
"
\n",
" \n",
" 1 | \n",
" JFK | \n",
" 11923 | \n",
"
\n",
" \n",
" 2 | \n",
" LGA | \n",
" 11730 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" origin number_trips\n",
"0 EWR 2649\n",
"1 JFK 11923\n",
"2 LGA 11730"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights.loc[flights[\"carrier\"].isin([\"AA\"])].groupby([\"origin\"]).count().reset_index()[['origin','flight']].rename(columns={\"flight\":\"number_trips\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### How can we get the total number of trips for each origin, dest pair for carrier code “AA”?"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" origin | \n",
" dest | \n",
" number_trips | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EWR | \n",
" DFW | \n",
" 1618 | \n",
"
\n",
" \n",
" 1 | \n",
" EWR | \n",
" LAX | \n",
" 62 | \n",
"
\n",
" \n",
" 2 | \n",
" EWR | \n",
" MIA | \n",
" 848 | \n",
"
\n",
" \n",
" 3 | \n",
" EWR | \n",
" PHX | \n",
" 121 | \n",
"
\n",
" \n",
" 4 | \n",
" JFK | \n",
" AUS | \n",
" 297 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" origin dest number_trips\n",
"0 EWR DFW 1618\n",
"1 EWR LAX 62\n",
"2 EWR MIA 848\n",
"3 EWR PHX 121\n",
"4 JFK AUS 297"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights.loc[flights[\"carrier\"].isin([\"AA\"])].groupby([\"origin\",\"dest\"]).count().reset_index()[[\"origin\",\"dest\",\"flight\"]].rename(columns={\"flight\":\"number_trips\"}).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### How can we get the average arrival and departure delay for each orig,dest pair for each month for carrier code “AA”? "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" origin | \n",
" dest | \n",
" month | \n",
" arr_delay | \n",
" dep_delay | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EWR | \n",
" DFW | \n",
" 1 | \n",
" 6.427673 | \n",
" 10.012579 | \n",
"
\n",
" \n",
" 1 | \n",
" EWR | \n",
" DFW | \n",
" 2 | \n",
" 10.536765 | \n",
" 11.345588 | \n",
"
\n",
" \n",
" 2 | \n",
" EWR | \n",
" DFW | \n",
" 3 | \n",
" 12.865031 | \n",
" 8.079755 | \n",
"
\n",
" \n",
" 3 | \n",
" EWR | \n",
" DFW | \n",
" 4 | \n",
" 17.792683 | \n",
" 12.920732 | \n",
"
\n",
" \n",
" 4 | \n",
" EWR | \n",
" DFW | \n",
" 5 | \n",
" 18.487805 | \n",
" 18.682927 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" origin dest month arr_delay dep_delay\n",
"0 EWR DFW 1 6.427673 10.012579\n",
"1 EWR DFW 2 10.536765 11.345588\n",
"2 EWR DFW 3 12.865031 8.079755\n",
"3 EWR DFW 4 17.792683 12.920732\n",
"4 EWR DFW 5 18.487805 18.682927"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flights.loc[flights[\"carrier\"].isin([\"AA\"])].groupby([\"origin\",\"dest\",\"month\"])[[\"arr_delay\",\"dep_delay\"]].mean().reset_index().head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Melting Operation\n",
"#### Melting operation is converting from wide form to long form"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" family_id | \n",
" age_mother | \n",
" dob_child1 | \n",
" dob_child2 | \n",
" dob_child3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" 1998-11-26 | \n",
" NA | \n",
" NA | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 27 | \n",
" 1996-06-22 | \n",
" 2004-04-05 | \n",
" NA | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 26 | \n",
" 2002-07-11 | \n",
" 2004-04-05 | \n",
" 2007-09-02 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 32 | \n",
" 2004-10-10 | \n",
" 2009-08-27 | \n",
" 2012-07-21 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 29 | \n",
" 2000-12-05 | \n",
" 2005-02-28 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" family_id age_mother dob_child1 dob_child2 dob_child3\n",
"0 1 30 1998-11-26 NA NA\n",
"1 2 27 1996-06-22 2004-04-05 NA\n",
"2 3 26 2002-07-11 2004-04-05 2007-09-02\n",
"3 4 32 2004-10-10 2009-08-27 2012-07-21\n",
"4 5 29 2000-12-05 2005-02-28 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"family_data=pd.DataFrame({'family_id':[1,2,3,4,5], \n",
" 'age_mother':[30,27,26,32,29],\n",
" 'dob_child1':['1998-11-26','1996-06-22','2002-07-11','2004-10-10','2000-12-05'],\n",
" 'dob_child2':['NA','2004-04-05','2004-04-05','2009-08-27','2005-02-28'],\n",
" 'dob_child3':['NA','NA','2007-09-02','2012-07-21', \"\"]\n",
" \n",
" })[['family_id','age_mother','dob_child1','dob_child2','dob_child3']]\n",
"family_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### We need to melt the above dataframe"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" family_id | \n",
" age_mother | \n",
" dob_child | \n",
" dob | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" dob_child1 | \n",
" 1998-11-26 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 27 | \n",
" dob_child1 | \n",
" 1996-06-22 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 26 | \n",
" dob_child1 | \n",
" 2002-07-11 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 32 | \n",
" dob_child1 | \n",
" 2004-10-10 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 29 | \n",
" dob_child1 | \n",
" 2000-12-05 | \n",
"
\n",
" \n",
" 5 | \n",
" 1 | \n",
" 30 | \n",
" dob_child2 | \n",
" NA | \n",
"
\n",
" \n",
" 6 | \n",
" 2 | \n",
" 27 | \n",
" dob_child2 | \n",
" 2004-04-05 | \n",
"
\n",
" \n",
" 7 | \n",
" 3 | \n",
" 26 | \n",
" dob_child2 | \n",
" 2004-04-05 | \n",
"
\n",
" \n",
" 8 | \n",
" 4 | \n",
" 32 | \n",
" dob_child2 | \n",
" 2009-08-27 | \n",
"
\n",
" \n",
" 9 | \n",
" 5 | \n",
" 29 | \n",
" dob_child2 | \n",
" 2005-02-28 | \n",
"
\n",
" \n",
" 10 | \n",
" 1 | \n",
" 30 | \n",
" dob_child3 | \n",
" NA | \n",
"
\n",
" \n",
" 11 | \n",
" 2 | \n",
" 27 | \n",
" dob_child3 | \n",
" NA | \n",
"
\n",
" \n",
" 12 | \n",
" 3 | \n",
" 26 | \n",
" dob_child3 | \n",
" 2007-09-02 | \n",
"
\n",
" \n",
" 13 | \n",
" 4 | \n",
" 32 | \n",
" dob_child3 | \n",
" 2012-07-21 | \n",
"
\n",
" \n",
" 14 | \n",
" 5 | \n",
" 29 | \n",
" dob_child3 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" family_id age_mother dob_child dob\n",
"0 1 30 dob_child1 1998-11-26\n",
"1 2 27 dob_child1 1996-06-22\n",
"2 3 26 dob_child1 2002-07-11\n",
"3 4 32 dob_child1 2004-10-10\n",
"4 5 29 dob_child1 2000-12-05\n",
"5 1 30 dob_child2 NA\n",
"6 2 27 dob_child2 2004-04-05\n",
"7 3 26 dob_child2 2004-04-05\n",
"8 4 32 dob_child2 2009-08-27\n",
"9 5 29 dob_child2 2005-02-28\n",
"10 1 30 dob_child3 NA\n",
"11 2 27 dob_child3 NA\n",
"12 3 26 dob_child3 2007-09-02\n",
"13 4 32 dob_child3 2012-07-21\n",
"14 5 29 dob_child3 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"family_melt=pd.melt(family_data,\n",
" id_vars=['family_id','age_mother'],\n",
" value_vars=['dob_child1','dob_child2','dob_child3'],\n",
" var_name=['dob_child'],\n",
" value_name='dob')\n",
"family_melt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Dcast operation"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"#family_melt.set_index(['dob','family_id']).unstack('dob')\n",
"\n",
"family_dcast=family_melt.pivot_table(index=['family_id','age_mother'],columns=\"dob_child\",values=\"dob\",aggfunc='first')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" dob_child | \n",
" dob_child1 | \n",
" dob_child2 | \n",
" dob_child3 | \n",
"
\n",
" \n",
" family_id | \n",
" age_mother | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 30 | \n",
" 1998-11-26 | \n",
" NA | \n",
" NA | \n",
"
\n",
" \n",
" 2 | \n",
" 27 | \n",
" 1996-06-22 | \n",
" 2004-04-05 | \n",
" NA | \n",
"
\n",
" \n",
" 3 | \n",
" 26 | \n",
" 2002-07-11 | \n",
" 2004-04-05 | \n",
" 2007-09-02 | \n",
"
\n",
" \n",
" 4 | \n",
" 32 | \n",
" 2004-10-10 | \n",
" 2009-08-27 | \n",
" 2012-07-21 | \n",
"
\n",
" \n",
" 5 | \n",
" 29 | \n",
" 2000-12-05 | \n",
" 2005-02-28 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"dob_child dob_child1 dob_child2 dob_child3\n",
"family_id age_mother \n",
"1 30 1998-11-26 NA NA\n",
"2 27 1996-06-22 2004-04-05 NA\n",
"3 26 2002-07-11 2004-04-05 2007-09-02\n",
"4 32 2004-10-10 2009-08-27 2012-07-21\n",
"5 29 2000-12-05 2005-02-28 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"family_dcast"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['dob_child1', 'dob_child2', 'dob_child3'], dtype='object', name='dob_child')"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"family_dcast.columns"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" family_id | \n",
" age_mother | \n",
" dob_child1 | \n",
" dob_child2 | \n",
" dob_child3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" 1998-11-26 | \n",
" NA | \n",
" NA | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 27 | \n",
" 1996-06-22 | \n",
" 2004-04-05 | \n",
" NA | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 26 | \n",
" 2002-07-11 | \n",
" 2004-04-05 | \n",
" 2007-09-02 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 32 | \n",
" 2004-10-10 | \n",
" 2009-08-27 | \n",
" 2012-07-21 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 29 | \n",
" 2000-12-05 | \n",
" 2005-02-28 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" family_id age_mother dob_child1 dob_child2 dob_child3\n",
"0 1 30 1998-11-26 NA NA\n",
"1 2 27 1996-06-22 2004-04-05 NA\n",
"2 3 26 2002-07-11 2004-04-05 2007-09-02\n",
"3 4 32 2004-10-10 2009-08-27 2012-07-21\n",
"4 5 29 2000-12-05 2005-02-28 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"family_dcast.reset_index().rename_axis(\"\",axis=\"columns\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}