{ "cells": [ { "cell_type": "markdown", "id": "58041ec7-6bbf-4fa8-b76d-1f4af1a3cccd", "metadata": {}, "source": [ "Importing pandas, a leading data manipulation python library" ] }, { "cell_type": "code", "execution_count": 1, "id": "7c04b7c9-fd43-4f0c-91e5-bfca643d0bb6", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "92d05092-b42f-4943-bc7d-eff9db6198eb", "metadata": {}, "source": [ "Reading in the data set." ] }, { "cell_type": "code", "execution_count": 2, "id": "32b6b294-acf7-411a-aee6-affd819fbc04", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('raw/data.csv', names=['station_id', 'bike_id', 'from', 'to'])" ] }, { "cell_type": "markdown", "id": "3980c0e3-1d94-4ccd-b863-c63e11561963", "metadata": {}, "source": [ "Observing the data set" ] }, { "cell_type": "code", "execution_count": 3, "id": "c35503ec-e37c-4397-8e0d-8968fa18c70a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(df)" ] }, { "cell_type": "code", "execution_count": 4, "id": "baaa8207-6042-43cf-8433-d371238eac86", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 5, "id": "c812ac6a-47cc-4f61-8612-a8e704fcfb66", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1000, 4)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "8987a405-9b2b-43bf-a049-aed7c21880f1", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>station_id</th>\n", " <th>bike_id</th>\n", " <th>from</th>\n", " <th>to</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>212</td>\n", " <td>2294</td>\n", " <td>2020-12-13T11:26:54Z</td>\n", " <td>2020-12-17T16:13:54Z</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>418</td>\n", " <td>3441</td>\n", " <td>2020-08-25T11:37:11Z</td>\n", " <td>2020-08-31T11:18:11Z</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>301</td>\n", " <td>6467</td>\n", " <td>2021-04-10T17:05:16Z</td>\n", " <td>2021-04-11T15:00:16Z</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>560</td>\n", " <td>9386</td>\n", " <td>2021-04-28T12:10:24Z</td>\n", " <td>2021-05-02T07:31:24Z</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>87</td>\n", " <td>8755</td>\n", " <td>2021-01-10T10:40:53Z</td>\n", " <td>2021-01-11T20:58:53Z</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>651</td>\n", " <td>1525</td>\n", " <td>2020-08-08T06:44:20Z</td>\n", " <td>2020-08-11T01:15:20Z</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>40</td>\n", " <td>8238</td>\n", " <td>2021-01-16T10:20:47Z</td>\n", " <td>2021-01-16T14:06:47Z</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>97</td>\n", " <td>2437</td>\n", " <td>2021-06-21T15:52:08Z</td>\n", " <td>2021-06-24T00:56:08Z</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>362</td>\n", " <td>8428</td>\n", " <td>2021-03-12T16:38:10Z</td>\n", " <td>2021-03-17T05:07:10Z</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>234</td>\n", " <td>7266</td>\n", " <td>2021-01-02T19:17:57Z</td>\n", " <td>2021-01-08T01:42:57Z</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " station_id bike_id from to\n", "0 212 2294 2020-12-13T11:26:54Z 2020-12-17T16:13:54Z\n", "1 418 3441 2020-08-25T11:37:11Z 2020-08-31T11:18:11Z\n", "2 301 6467 2021-04-10T17:05:16Z 2021-04-11T15:00:16Z\n", "3 560 9386 2021-04-28T12:10:24Z 2021-05-02T07:31:24Z\n", "4 87 8755 2021-01-10T10:40:53Z 2021-01-11T20:58:53Z\n", "5 651 1525 2020-08-08T06:44:20Z 2020-08-11T01:15:20Z\n", "6 40 8238 2021-01-16T10:20:47Z 2021-01-16T14:06:47Z\n", "7 97 2437 2021-06-21T15:52:08Z 2021-06-24T00:56:08Z\n", "8 362 8428 2021-03-12T16:38:10Z 2021-03-17T05:07:10Z\n", "9 234 7266 2021-01-02T19:17:57Z 2021-01-08T01:42:57Z" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(10)" ] }, { "cell_type": "code", "execution_count": 7, "id": "b96093e2-356d-46e8-8c09-7b0ae547886f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 1000 entries, 0 to 999\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 station_id 1000 non-null int64 \n", " 1 bike_id 1000 non-null int64 \n", " 2 from 1000 non-null object\n", " 3 to 1000 non-null object\n", "dtypes: int64(2), object(2)\n", "memory usage: 31.4+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "id": "715ff011-2d0c-473a-a672-f9b632e69ad8", "metadata": {}, "source": [ "Using the datetime objects in Pandas DataFrame." ] }, { "cell_type": "code", "execution_count": 8, "id": "bd204a28-f525-4f4e-9ef8-ba7fa30158ec", "metadata": {}, "outputs": [], "source": [ "df['from'] = pd.to_datetime(df['from'])\n", "df['to'] = pd.to_datetime(df['to'])" ] }, { "cell_type": "markdown", "id": "932734b3-4155-4491-924e-57fcd8fc24d2", "metadata": {}, "source": [ "Subracting the two date columns and return a Series of timedelta objects with the time diff between each row." ] }, { "cell_type": "code", "execution_count": 9, "id": "74b0da85-5b62-4b81-a8a9-6a3f4196539c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 4 days 04:47:00\n", "1 5 days 23:41:00\n", "2 0 days 21:55:00\n", "3 3 days 19:21:00\n", "4 1 days 10:18:00\n", " ... \n", "995 5 days 02:16:00\n", "996 0 days 20:08:00\n", "997 5 days 14:35:00\n", "998 3 days 04:50:00\n", "999 3 days 00:12:00\n", "Length: 1000, dtype: timedelta64[ns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "difference = (df['to'] - df['from'])\n", "difference" ] }, { "cell_type": "code", "execution_count": 10, "id": "c6fc4dd7-cefd-4d1f-b26e-32865137521f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3 days 13:43:53.760000 is the average (mean) journey duration across all bikes and all stations for this reporting period.\n" ] } ], "source": [ "avg_journey_duration = difference.mean()\n", "print(f'{avg_journey_duration} is the average (mean) journey duration across all bikes and all stations for this reporting period.')" ] }, { "cell_type": "code", "execution_count": null, "id": "6cb0be5f-6889-4d04-9704-9c4a7567ac66", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 5 }