In [28]:
print("What's up data nerds!")
What's up data nerds!
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv(r"C:\Users\abdir\Desktop\ncr_ride_bookings.csv")
In [3]:
df.head ()
Out[3]:
Date Time Booking ID Booking Status Customer ID Vehicle Type Pickup Location Drop Location Avg VTAT Avg CTAT ... Reason for cancelling by Customer Cancelled Rides by Driver Driver Cancellation Reason Incomplete Rides Incomplete Rides Reason Booking Value Ride Distance Driver Ratings Customer Rating Payment Method
0 2024-03-23 12:29:38 "CNR5884300" No Driver Found "CID1982111" eBike Palam Vihar Jhilmil NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2024-11-29 18:01:39 "CNR1326809" Incomplete "CID4604802" Go Sedan Shastri Nagar Gurgaon Sector 56 4.9 14.0 ... NaN NaN NaN 1.0 Vehicle Breakdown 237.0 5.73 NaN NaN UPI
2 2024-08-23 08:56:10 "CNR8494506" Completed "CID9202816" Auto Khandsa Malviya Nagar 13.4 25.8 ... NaN NaN NaN NaN NaN 627.0 13.58 4.9 4.9 Debit Card
3 2024-10-21 17:17:25 "CNR8906825" Completed "CID2610914" Premier Sedan Central Secretariat Inderlok 13.1 28.5 ... NaN NaN NaN NaN NaN 416.0 34.02 4.6 5.0 UPI
4 2024-09-16 22:08:00 "CNR1950162" Completed "CID9933542" Bike Ghitorni Village Khan Market 5.3 19.6 ... NaN NaN NaN NaN NaN 737.0 48.21 4.1 4.3 UPI

5 rows × 21 columns

In [4]:
df.isnull ().sum()   # Nulls per column
Out[4]:
Date                                      0
Time                                      0
Booking ID                                0
Booking Status                            0
Customer ID                               0
Vehicle Type                              0
Pickup Location                           0
Drop Location                             0
Avg VTAT                              10500
Avg CTAT                              48000
Cancelled Rides by Customer          139500
Reason for cancelling by Customer    139500
Cancelled Rides by Driver            123000
Driver Cancellation Reason           123000
Incomplete Rides                     141000
Incomplete Rides Reason              141000
Booking Value                         48000
Ride Distance                         48000
Driver Ratings                        57000
Customer Rating                       57000
Payment Method                        48000
dtype: int64
In [6]:
df.columns
Out[6]:
Index(['Date', 'Time', 'Booking ID', 'Booking Status', 'Customer ID',
       'Vehicle Type', 'Pickup Location', 'Drop Location', 'Avg VTAT',
       'Avg CTAT', 'Cancelled Rides by Customer',
       'Reason for cancelling by Customer', 'Cancelled Rides by Driver',
       'Driver Cancellation Reason', 'Incomplete Rides',
       'Incomplete Rides Reason', 'Booking Value', 'Ride Distance',
       'Driver Ratings', 'Customer Rating', 'Payment Method'],
      dtype='object')
In [7]:
import pandas as pd

df = pd.read_csv("C:\\Users\\abdir\\Desktop\\ncr_ride_bookings.csv")
   # <-- this creates df
df.head(10)   # now it works
Out[7]:
Date Time Booking ID Booking Status Customer ID Vehicle Type Pickup Location Drop Location Avg VTAT Avg CTAT ... Reason for cancelling by Customer Cancelled Rides by Driver Driver Cancellation Reason Incomplete Rides Incomplete Rides Reason Booking Value Ride Distance Driver Ratings Customer Rating Payment Method
0 2024-03-23 12:29:38 "CNR5884300" No Driver Found "CID1982111" eBike Palam Vihar Jhilmil NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2024-11-29 18:01:39 "CNR1326809" Incomplete "CID4604802" Go Sedan Shastri Nagar Gurgaon Sector 56 4.9 14.0 ... NaN NaN NaN 1.0 Vehicle Breakdown 237.0 5.73 NaN NaN UPI
2 2024-08-23 08:56:10 "CNR8494506" Completed "CID9202816" Auto Khandsa Malviya Nagar 13.4 25.8 ... NaN NaN NaN NaN NaN 627.0 13.58 4.9 4.9 Debit Card
3 2024-10-21 17:17:25 "CNR8906825" Completed "CID2610914" Premier Sedan Central Secretariat Inderlok 13.1 28.5 ... NaN NaN NaN NaN NaN 416.0 34.02 4.6 5.0 UPI
4 2024-09-16 22:08:00 "CNR1950162" Completed "CID9933542" Bike Ghitorni Village Khan Market 5.3 19.6 ... NaN NaN NaN NaN NaN 737.0 48.21 4.1 4.3 UPI
5 2024-02-06 09:44:56 "CNR4096693" Completed "CID4670564" Auto AIIMS Narsinghpur 5.1 18.1 ... NaN NaN NaN NaN NaN 316.0 4.85 4.1 4.6 UPI
6 2024-06-17 15:45:58 "CNR2002539" Completed "CID6800553" Go Mini Vaishali Punjabi Bagh 7.1 20.4 ... NaN NaN NaN NaN NaN 640.0 41.24 4.0 4.1 UPI
7 2024-03-19 17:37:37 "CNR6568000" Completed "CID8610436" Auto Mayur Vihar Cyber Hub 12.1 16.5 ... NaN NaN NaN NaN NaN 136.0 6.56 4.4 4.2 UPI
8 2024-09-14 12:49:09 "CNR4510807" No Driver Found "CID7873618" Go Sedan Noida Sector 62 Noida Sector 18 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 2024-12-16 19:06:48 "CNR7721892" Incomplete "CID5214275" Auto Rohini Adarsh Nagar 6.1 26.0 ... NaN NaN NaN 1.0 Other Issue 135.0 10.36 NaN NaN Cash

10 rows × 21 columns

In [8]:
# Convert Date and Time into one datetime column
df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"], errors="coerce")

# Extract useful features
df["year"] = df["datetime"].dt.year
df["month"] = df["datetime"].dt.month
df["day"] = df["datetime"].dt.day
df["day_of_week"] = df["datetime"].dt.day_name()
df["hour"] = df["datetime"].dt.hour
In [10]:
# Remove extra quotes
df["Booking ID"] = df["Booking ID"].str.replace('"', '')
df["Customer ID"] = df["Customer ID"].str.replace('"', '')
In [11]:
# Drop high-null columns
drop_cols = [
    "Cancelled Rides by Customer", "Reason for cancelling by Customer",
    "Cancelled Rides by Driver", "Driver Cancellation Reason",
    "Incomplete Rides", "Incomplete Rides Reason"
]
df.drop(columns=drop_cols, inplace=True)

# Fill numeric columns with median
for col in ["Avg VTAT", "Avg CTAT", "Booking Value", "Ride Distance", "Driver Ratings", "Customer Rating"]:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with mode
for col in ["Payment Method", "Vehicle Type", "Pickup Location", "Drop Location"]:
    df[col].fillna(df[col].mode()[0], inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\2665887149.py:15: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
In [12]:
df.drop_duplicates(subset="Booking ID", inplace=True)
In [13]:
df.info()
df.isnull().sum()
df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 148767 entries, 0 to 149999
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Date             148767 non-null  object        
 1   Time             148767 non-null  object        
 2   Booking ID       148767 non-null  object        
 3   Booking Status   148767 non-null  object        
 4   Customer ID      148767 non-null  object        
 5   Vehicle Type     148767 non-null  object        
 6   Pickup Location  148767 non-null  object        
 7   Drop Location    148767 non-null  object        
 8   Avg VTAT         148767 non-null  float64       
 9   Avg CTAT         148767 non-null  float64       
 10  Booking Value    148767 non-null  float64       
 11  Ride Distance    148767 non-null  float64       
 12  Driver Ratings   148767 non-null  float64       
 13  Customer Rating  148767 non-null  float64       
 14  Payment Method   148767 non-null  object        
 15  datetime         148767 non-null  datetime64[ns]
 16  year             148767 non-null  int32         
 17  month            148767 non-null  int32         
 18  day              148767 non-null  int32         
 19  day_of_week      148767 non-null  object        
 20  hour             148767 non-null  int32         
dtypes: datetime64[ns](1), float64(6), int32(4), object(10)
memory usage: 22.7+ MB
Out[13]:
Date Time Booking ID Booking Status Customer ID Vehicle Type Pickup Location Drop Location Avg VTAT Avg CTAT ... Ride Distance Driver Ratings Customer Rating Payment Method datetime year month day day_of_week hour
0 2024-03-23 12:29:38 CNR5884300 No Driver Found CID1982111 eBike Palam Vihar Jhilmil 8.3 28.8 ... 23.72 4.3 4.5 UPI 2024-03-23 12:29:38 2024 3 23 Saturday 12
1 2024-11-29 18:01:39 CNR1326809 Incomplete CID4604802 Go Sedan Shastri Nagar Gurgaon Sector 56 4.9 14.0 ... 5.73 4.3 4.5 UPI 2024-11-29 18:01:39 2024 11 29 Friday 18
2 2024-08-23 08:56:10 CNR8494506 Completed CID9202816 Auto Khandsa Malviya Nagar 13.4 25.8 ... 13.58 4.9 4.9 Debit Card 2024-08-23 08:56:10 2024 8 23 Friday 8
3 2024-10-21 17:17:25 CNR8906825 Completed CID2610914 Premier Sedan Central Secretariat Inderlok 13.1 28.5 ... 34.02 4.6 5.0 UPI 2024-10-21 17:17:25 2024 10 21 Monday 17
4 2024-09-16 22:08:00 CNR1950162 Completed CID9933542 Bike Ghitorni Village Khan Market 5.3 19.6 ... 48.21 4.1 4.3 UPI 2024-09-16 22:08:00 2024 9 16 Monday 22

5 rows × 21 columns

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

# Booking status distribution
status_counts = df["Booking Status"].value_counts()

plt.figure(figsize=(6,4))
sns.barplot(x=status_counts.index, y=status_counts.values, palette="viridis")
plt.title("Booking Status Distribution")
plt.ylabel("Number of Bookings")
plt.xticks(rotation=30)
plt.show()
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\302591104.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=status_counts.index, y=status_counts.values, palette="viridis")
No description has been provided for this image
In [15]:
vehicle_counts = df["Vehicle Type"].value_counts()

plt.figure(figsize=(6,4))
sns.barplot(x=vehicle_counts.index, y=vehicle_counts.values, palette="magma")
plt.title("Most Popular Vehicle Types")
plt.ylabel("Number of Rides")
plt.xticks(rotation=30)
plt.show()
C:\Users\abdir\AppData\Local\Temp\ipykernel_7248\665332025.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=vehicle_counts.index, y=vehicle_counts.values, palette="magma")
No description has been provided for this image
In [17]:
print("Total Revenue:", df["Booking Value"].sum())
print("Average Booking Value:", df["Booking Value"].mean())

plt.figure(figsize=(6,4))
sns.histplot(df["Booking Value"], bins=50, kde=True)
plt.title("Distribution of Booking Value")
plt.xlabel("Booking Value")
plt.ylabel("Frequency")
plt.show()
Total Revenue: 71129352.0
Average Booking Value: 478.1258746899514
No description has been provided for this image
In [18]:
plt.figure(figsize=(6,4))
sns.histplot(df["Driver Ratings"], bins=20, kde=True, color="blue", label="Driver")
sns.histplot(df["Customer Rating"], bins=20, kde=True, color="orange", label="Customer")
plt.title("Driver vs Customer Ratings")
plt.legend()
plt.show()
No description has been provided for this image
In [20]:
# Monthly ride counts
monthly_rides = df.groupby(["year","month"])["Booking ID"].count()

monthly_rides.plot(kind="bar", figsize=(12,6), color="teal")
plt.title("Monthly Ride Counts")
plt.ylabel("Number of Rides")
plt.show()
No description has been provided for this image
In [21]:
df.head(10)
Out[21]:
Date Time Booking ID Booking Status Customer ID Vehicle Type Pickup Location Drop Location Avg VTAT Avg CTAT ... Ride Distance Driver Ratings Customer Rating Payment Method datetime year month day day_of_week hour
0 2024-03-23 12:29:38 CNR5884300 No Driver Found CID1982111 eBike Palam Vihar Jhilmil 8.3 28.8 ... 23.72 4.3 4.5 UPI 2024-03-23 12:29:38 2024 3 23 Saturday 12
1 2024-11-29 18:01:39 CNR1326809 Incomplete CID4604802 Go Sedan Shastri Nagar Gurgaon Sector 56 4.9 14.0 ... 5.73 4.3 4.5 UPI 2024-11-29 18:01:39 2024 11 29 Friday 18
2 2024-08-23 08:56:10 CNR8494506 Completed CID9202816 Auto Khandsa Malviya Nagar 13.4 25.8 ... 13.58 4.9 4.9 Debit Card 2024-08-23 08:56:10 2024 8 23 Friday 8
3 2024-10-21 17:17:25 CNR8906825 Completed CID2610914 Premier Sedan Central Secretariat Inderlok 13.1 28.5 ... 34.02 4.6 5.0 UPI 2024-10-21 17:17:25 2024 10 21 Monday 17
4 2024-09-16 22:08:00 CNR1950162 Completed CID9933542 Bike Ghitorni Village Khan Market 5.3 19.6 ... 48.21 4.1 4.3 UPI 2024-09-16 22:08:00 2024 9 16 Monday 22
5 2024-02-06 09:44:56 CNR4096693 Completed CID4670564 Auto AIIMS Narsinghpur 5.1 18.1 ... 4.85 4.1 4.6 UPI 2024-02-06 09:44:56 2024 2 6 Tuesday 9
6 2024-06-17 15:45:58 CNR2002539 Completed CID6800553 Go Mini Vaishali Punjabi Bagh 7.1 20.4 ... 41.24 4.0 4.1 UPI 2024-06-17 15:45:58 2024 6 17 Monday 15
7 2024-03-19 17:37:37 CNR6568000 Completed CID8610436 Auto Mayur Vihar Cyber Hub 12.1 16.5 ... 6.56 4.4 4.2 UPI 2024-03-19 17:37:37 2024 3 19 Tuesday 17
8 2024-09-14 12:49:09 CNR4510807 No Driver Found CID7873618 Go Sedan Noida Sector 62 Noida Sector 18 8.3 28.8 ... 23.72 4.3 4.5 UPI 2024-09-14 12:49:09 2024 9 14 Saturday 12
9 2024-12-16 19:06:48 CNR7721892 Incomplete CID5214275 Auto Rohini Adarsh Nagar 6.1 26.0 ... 10.36 4.3 4.5 Cash 2024-12-16 19:06:48 2024 12 16 Monday 19

10 rows × 21 columns

In [ ]: