{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# This notebook expects that Modin and Ray are installed, e.g. by `pip install modin[ray]`.\n", "# For all ways to install Modin see official documentation at:\n", "# https://modin.readthedocs.io/en/latest/installation.html\n", "import modin.pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "columns_names = [\n", " \"trip_id\", \"vendor_id\", \"pickup_datetime\", \"dropoff_datetime\", \"store_and_fwd_flag\",\n", " \"rate_code_id\", \"pickup_longitude\", \"pickup_latitude\", \"dropoff_longitude\", \"dropoff_latitude\",\n", " \"passenger_count\", \"trip_distance\", \"fare_amount\", \"extra\", \"mta_tax\", \"tip_amount\",\n", " \"tolls_amount\", \"ehail_fee\", \"improvement_surcharge\", \"total_amount\", \"payment_type\",\n", " \"trip_type\", \"pickup\", \"dropoff\", \"cab_type\", \"precipitation\", \"snow_depth\", \"snowfall\",\n", " \"max_temperature\", \"min_temperature\", \"average_wind_speed\", \"pickup_nyct2010_gid\",\n", " \"pickup_ctlabel\", \"pickup_borocode\", \"pickup_boroname\", \"pickup_ct2010\",\n", " \"pickup_boroct2010\", \"pickup_cdeligibil\", \"pickup_ntacode\", \"pickup_ntaname\", \"pickup_puma\",\n", " \"dropoff_nyct2010_gid\", \"dropoff_ctlabel\", \"dropoff_borocode\", \"dropoff_boroname\",\n", " \"dropoff_ct2010\", \"dropoff_boroct2010\", \"dropoff_cdeligibil\", \"dropoff_ntacode\",\n", " \"dropoff_ntaname\", \"dropoff_puma\",\n", " ]\n", "parse_dates=[\"pickup_datetime\", \"dropoff_datetime\"]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "UserWarning: Parameters provided defaulting to pandas implementation.\n", "To request implementation, send an email to feature_requests@modin.org.\n" ] } ], "source": [ "df = pd.read_csv('https://modin-datasets.s3.amazonaws.com/trips_data.csv', names=columns_names,\n", " header=None, parse_dates=parse_dates)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
trip_idvendor_idpickup_datetimedropoff_datetimestore_and_fwd_flagrate_code_idpickup_longitudepickup_latitudedropoff_longitudedropoff_latitude...dropoff_nyct2010_giddropoff_ctlabeldropoff_borocodedropoff_boronamedropoff_ct2010dropoff_boroct2010dropoff_cdeligibildropoff_ntacodedropoff_ntanamedropoff_puma
0122013-08-01 08:14:372013-08-01 09:09:06N1NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1222013-08-01 09:13:002013-08-01 11:38:00N1NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2322013-08-01 09:48:002013-08-01 09:49:00N5NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3422013-08-01 10:38:352013-08-01 10:38:51N1NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4522013-08-01 11:51:452013-08-01 12:03:52N1NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
9995988122013-09-29 01:03:482013-09-29 01:09:49N1-73.95833640.820820-73.95377340.825195...907.0225.01.0Manhattan22500.01022500.0EMN04Hamilton Heights3802.0
9996988222013-09-29 03:04:102013-09-29 03:09:37N1-73.95882440.820251-73.93417440.853394...912.0271.01.0Manhattan27100.01027100.0EMN35Washington Heights North3801.0
9997988322013-09-30 16:28:122013-09-30 16:56:03N1-73.95610040.818974-73.94105540.789993...1318.0170.01.0Manhattan17000.01017000.0EMN33East Harlem South3804.0
9998988422013-09-01 13:15:152013-09-01 13:23:10N1-73.95534540.820053-73.94244440.841507...911.0251.01.0Manhattan25100.01025100.0EMN36Washington Heights South3801.0
9999988522013-09-20 07:32:172013-09-20 08:01:06N1-73.95535340.820213-73.95768040.765190...1758.0116.01.0Manhattan11600.01011600.0IMN31Lenox Hill-Roosevelt Island3805.0
\n", "

10000 rows x 51 columns

\n", "
" ], "text/plain": [ " trip_id vendor_id pickup_datetime dropoff_datetime \\\n", "0 1 2 2013-08-01 08:14:37 2013-08-01 09:09:06 \n", "1 2 2 2013-08-01 09:13:00 2013-08-01 11:38:00 \n", "2 3 2 2013-08-01 09:48:00 2013-08-01 09:49:00 \n", "3 4 2 2013-08-01 10:38:35 2013-08-01 10:38:51 \n", "4 5 2 2013-08-01 11:51:45 2013-08-01 12:03:52 \n", "... ... ... ... ... \n", "9995 9881 2 2013-09-29 01:03:48 2013-09-29 01:09:49 \n", "9996 9882 2 2013-09-29 03:04:10 2013-09-29 03:09:37 \n", "9997 9883 2 2013-09-30 16:28:12 2013-09-30 16:56:03 \n", "9998 9884 2 2013-09-01 13:15:15 2013-09-01 13:23:10 \n", "9999 9885 2 2013-09-20 07:32:17 2013-09-20 08:01:06 \n", "\n", " store_and_fwd_flag rate_code_id pickup_longitude pickup_latitude \\\n", "0 N 1 NaN NaN \n", "1 N 1 NaN NaN \n", "2 N 5 NaN NaN \n", "3 N 1 NaN NaN \n", "4 N 1 NaN NaN \n", "... ... ... ... ... \n", "9995 N 1 -73.958336 40.820820 \n", "9996 N 1 -73.958824 40.820251 \n", "9997 N 1 -73.956100 40.818974 \n", "9998 N 1 -73.955345 40.820053 \n", "9999 N 1 -73.955353 40.820213 \n", "\n", " dropoff_longitude dropoff_latitude ... dropoff_nyct2010_gid \\\n", "0 NaN NaN ... NaN \n", "1 NaN NaN ... NaN \n", "2 NaN NaN ... NaN \n", "3 NaN NaN ... NaN \n", "4 NaN NaN ... NaN \n", "... ... ... ... ... \n", "9995 -73.953773 40.825195 ... 907.0 \n", "9996 -73.934174 40.853394 ... 912.0 \n", "9997 -73.941055 40.789993 ... 1318.0 \n", "9998 -73.942444 40.841507 ... 911.0 \n", "9999 -73.957680 40.765190 ... 1758.0 \n", "\n", " dropoff_ctlabel dropoff_borocode dropoff_boroname dropoff_ct2010 \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "... ... ... ... ... \n", "9995 225.0 1.0 Manhattan 22500.0 \n", "9996 271.0 1.0 Manhattan 27100.0 \n", "9997 170.0 1.0 Manhattan 17000.0 \n", "9998 251.0 1.0 Manhattan 25100.0 \n", "9999 116.0 1.0 Manhattan 11600.0 \n", "\n", " dropoff_boroct2010 dropoff_cdeligibil dropoff_ntacode \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... \n", "9995 1022500.0 E MN04 \n", "9996 1027100.0 E MN35 \n", "9997 1017000.0 E MN33 \n", "9998 1025100.0 E MN36 \n", "9999 1011600.0 I MN31 \n", "\n", " dropoff_ntaname dropoff_puma \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "... ... ... \n", "9995 Hamilton Heights 3802.0 \n", "9996 Washington Heights North 3801.0 \n", "9997 East Harlem South 3804.0 \n", "9998 Washington Heights South 3801.0 \n", "9999 Lenox Hill-Roosevelt Island 3805.0 \n", "\n", "[10000 rows x 51 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def q1(df):\n", " return df.groupby(\"cab_type\")[\"cab_type\"].count()\n", "def q2(df):\n", " return df.groupby(\"passenger_count\", as_index=False).mean()[[\"passenger_count\", \"total_amount\"]]\n", "def q3(df):\n", " return df.groupby([\"passenger_count\", \"pickup_datetime\"]).size().reset_index()\n", "def q4(df):\n", " transformed = pd.DataFrame({\n", " \"passenger_count\": df[\"passenger_count\"],\n", " \"pickup_datetime\": df[\"pickup_datetime\"].dt.year,\n", " \"trip_distance\": df[\"trip_distance\"].astype(\"int64\"),\n", " })\n", " return transformed.groupby([\"passenger_count\", \"pickup_datetime\", \"trip_distance\"]) \\\n", " .size().reset_index().sort_values(by=[\"pickup_datetime\", 0], ascending=[True, False])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q1(df)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
passenger_counttotal_amount
0018.333333
1115.258850
2220.332356
3313.748845
4419.742688
5514.786221
6615.400085
\n", "
" ], "text/plain": [ " passenger_count total_amount\n", "0 0 18.333333\n", "1 1 15.258850\n", "2 2 20.332356\n", "3 3 13.748845\n", "4 4 19.742688\n", "5 5 14.786221\n", "6 6 15.400085" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q2(df)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
passenger_countpickup_datetime0
002013-08-14 12:07:001
102013-08-14 12:37:001
202013-08-15 00:00:001
312013-08-01 08:14:371
412013-08-01 09:48:001
............
990962013-09-28 18:30:151
991062013-09-28 19:57:221
991162013-09-29 18:47:291
991262013-09-30 02:27:331
991362013-09-30 21:31:061
\n", "

9914 rows x 3 columns

\n", "
" ], "text/plain": [ " passenger_count pickup_datetime 0\n", "0 0 2013-08-14 12:07:00 1\n", "1 0 2013-08-14 12:37:00 1\n", "2 0 2013-08-15 00:00:00 1\n", "3 1 2013-08-01 08:14:37 1\n", "4 1 2013-08-01 09:48:00 1\n", "... ... ... ..\n", "9909 6 2013-09-28 18:30:15 1\n", "9910 6 2013-09-28 19:57:22 1\n", "9911 6 2013-09-29 18:47:29 1\n", "9912 6 2013-09-30 02:27:33 1\n", "9913 6 2013-09-30 21:31:06 1\n", "\n", "[9914 rows x 3 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q3(df)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
passenger_countpickup_datetimetrip_distance0
21201301991
31201311270
4120132853
80520130551
81520131537
...............
7742013101
7842013111
7942013141
10252013281
11562013141
\n", "

116 rows x 4 columns

\n", "
" ], "text/plain": [ " passenger_count pickup_datetime trip_distance 0\n", "2 1 2013 0 1991\n", "3 1 2013 1 1270\n", "4 1 2013 2 853\n", "80 5 2013 0 551\n", "81 5 2013 1 537\n", ".. ... ... ... ...\n", "77 4 2013 10 1\n", "78 4 2013 11 1\n", "79 4 2013 14 1\n", "102 5 2013 28 1\n", "115 6 2013 14 1\n", "\n", "[116 rows x 4 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q4(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%timeit q1(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%timeit q2(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%timeit q3(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%timeit q4(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "license": "http://www.apache.org/licenses/LICENSE-2.0", "license_notice": [ "Licensed to Modin Development Team under one or more contributor license agreements.", "See the NOTICE file distributed with this work for additional information regarding", "copyright ownership. The Modin Development Team licenses this file to you under the", "Apache License, Version 2.0 (the \"License\"); you may not use this file except in", "compliance with the License. You may obtain a copy of the License at", "", " http://www.apache.org/licenses/LICENSE-2.0", "", "Unless required by applicable law or agreed to in writing, software distributed under", "the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF", "ANY KIND, either express or implied. See the License for the specific language", "governing permissions and limitations under the License." ], "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.8" } }, "nbformat": 4, "nbformat_minor": 4 }