{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "3cd196ae-caaa-483e-bfaa-c1af386967c0", "metadata": {}, "outputs": [], "source": [ "\n", "import pyspark\n", "from pyspark.sql import SparkSession" ] }, { "cell_type": "code", "execution_count": 5, "id": "90d20aba-2bfb-4fd9-824f-d4bbcab8274d", "metadata": {}, "outputs": [], "source": [ "spark = SparkSession.builder \\\n", " .master(\"local[*]\") \\\n", " .appName('test') \\\n", " .getOrCreate()" ] }, { "cell_type": "code", "execution_count": null, "id": "b42ef949-278f-4c3e-8393-3786d91106b5", "metadata": {}, "outputs": [], "source": [ "spark.version" ] }, { "cell_type": "code", "execution_count": 7, "id": "c85e752e-a66f-4773-803c-36ccdd28fa35", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading: 100%|██████████| 67.8M/67.8M [00:04<00:00, 14.9MB/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "✅ Download complete!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import requests\n", "from tqdm import tqdm # uv add tqdm for progress bar\n", "\n", "url = \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-11.parquet\"\n", "\n", "response = requests.get(url, stream=True)\n", "response.raise_for_status()\n", "\n", "total_size = int(response.headers.get('content-length', 0))\n", "\n", "with open(\"yellow_tripdata_2025-11.parquet\", \"wb\") as file, tqdm(\n", " desc=\"Downloading\",\n", " total=total_size,\n", " unit=\"B\",\n", " unit_scale=True,\n", " unit_divisor=1024,\n", ") as pbar:\n", " for chunk in response.iter_content(chunk_size=8192):\n", " if chunk:\n", " file.write(chunk)\n", " pbar.update(len(chunk))\n", "\n", "print(\"\\n✅ Download complete!\")\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "c7c81906-eeb4-4be9-b316-09354666fd06", "metadata": {}, "outputs": [], "source": [ "df = spark.read \\\n", " .option(\"header\", \"true\") \\\n", " .parquet('yellow_tripdata_2025-11.parquet')" ] }, { "cell_type": "code", "execution_count": 10, "id": "58c4f13b-14c6-4b2c-a907-3faa9b13eeea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+\n", "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|\n", "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+\n", "| 7| 2025-11-01 00:13:25| 2025-11-01 00:13:25| 1| 1.68| 1| N| 43| 186| 1| 14.9| 0.0| 0.5| 1.5| 0.0| 1.0| 22.15| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:49:07| 2025-11-01 01:01:22| 1| 2.28| 1| N| 142| 237| 1| 14.2| 1.0| 0.5| 4.99| 0.0| 1.0| 24.94| 2.5| 0.0| 0.75|\n", "| 1| 2025-11-01 00:07:19| 2025-11-01 00:20:41| 0| 2.7| 1| N| 163| 238| 1| 15.6| 4.25| 0.5| 4.27| 0.0| 1.0| 25.62| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:00:00| 2025-11-01 01:01:03| 3| 12.87| 1| N| 138| 261| 1| 66.7| 6.0| 0.5| 0.0| 6.94| 1.0| 86.14| 2.5| 1.75| 0.75|\n", "| 1| 2025-11-01 00:18:50| 2025-11-01 00:49:32| 0| 8.4| 1| N| 138| 37| 2| 39.4| 7.75| 0.5| 0.0| 0.0| 1.0| 48.65| 0.0| 1.75| 0.0|\n", "| 2| 2025-11-01 00:21:11| 2025-11-01 00:31:39| 1| 0.85| 1| N| 90| 100| 2| 10.7| 1.0| 0.5| 0.0| 0.0| 1.0| 16.45| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:07:31| 2025-11-01 00:25:44| 1| 3.01| 1| N| 142| 170| 1| 19.1| 1.0| 0.5| 1.0| 0.0| 1.0| 25.85| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:46:52| 2025-11-01 01:38:55| 3| 3.82| 1| N| 237| 144| 1| 42.2| 1.0| 0.5| 9.59| 0.0| 1.0| 57.54| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:56:59| 2025-11-01 01:02:05| 1| 0.89| 1| N| 162| 161| 2| 7.2| 1.0| 0.5| 0.0| 0.0| 1.0| 12.95| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:10:43| 2025-11-01 00:39:25| 3| 2.28| 1| N| 234| 162| 1| 24.0| 1.0| 0.5| 8.93| 0.0| 1.0| 38.68| 2.5| 0.0| 0.75|\n", "| 1| 2025-11-01 00:00:03| 2025-11-01 00:42:25| 2| 3.3| 1| N| 158| 88| 1| 35.9| 4.25| 0.5| 2.35| 0.0| 1.0| 44.0| 2.5| 0.0| 0.75|\n", "| 1| 2025-11-01 00:43:53| 2025-11-01 00:56:46| 2| 1.5| 1| N| 88| 148| 1| 12.8| 4.25| 0.5| 1.0| 0.0| 1.0| 19.55| 2.5| 0.0| 0.75|\n", "| 1| 2025-11-01 00:58:02| 2025-11-01 01:32:36| 2| 4.7| 1| N| 148| 236| 1| 32.4| 4.25| 0.5| 9.5| 0.0| 1.0| 47.65| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:52:48| 2025-11-01 01:23:18| 1| 5.61| 1| N| 87| 255| 1| 33.1| 1.0| 0.5| 0.0| 0.0| 1.0| 38.85| 2.5| 0.0| 0.75|\n", "| 1| 2025-11-01 00:05:53| 2025-11-01 00:58:03| 1| 3.9| 1| N| 231| 43| 1| 40.8| 4.25| 0.5| 0.0| 0.0| 1.0| 46.55| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:13:44| 2025-11-01 00:19:30| 2| 1.14| 1| N| 141| 262| 1| 7.9| 1.0| 0.5| 2.0| 0.0| 1.0| 14.9| 2.5| 0.0| 0.0|\n", "| 1| 2025-11-01 00:03:18| 2025-11-01 00:06:48| 2| 0.6| 1| N| 238| 24| 1| 5.1| 1.0| 0.5| 1.52| 0.0| 1.0| 9.12| 0.0| 0.0| 0.0|\n", "| 1| 2025-11-01 00:19:55| 2025-11-01 00:45:37| 1| 4.3| 1| N| 236| 147| 1| 24.7| 1.0| 0.5| 2.0| 0.0| 1.0| 29.2| 0.0| 0.0| 0.0|\n", "| 1| 2025-11-01 00:45:55| 2025-11-01 01:11:30| 1| 3.0| 1| N| 231| 137| 1| 24.0| 4.25| 0.5| 3.0| 0.0| 1.0| 32.75| 2.5| 0.0| 0.75|\n", "| 2| 2025-11-01 00:11:12| 2025-11-01 00:15:02| 1| 0.69| 1| N| 237| 237| 2| 6.5| 1.0| 0.5| 0.0| 0.0| 1.0| 11.5| 2.5| 0.0| 0.0|\n", "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df.show()" ] }, { "cell_type": "code", "execution_count": 12, "id": "fc69c30e-fcbd-4e33-ad26-950172dec322", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "StructType([StructField('VendorID', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('Airport_fee', DoubleType(), True), StructField('cbd_congestion_fee', DoubleType(), True)])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.schema" ] }, { "cell_type": "code", "execution_count": 6, "id": "1ba5c35a-84cf-4928-9765-b16fee506bdf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+\n", "|hvfhs_license_num|dispatching_base_num| pickup_datetime| dropoff_datetime|PULocationID|DOLocationID|SR_Flag|\n", "+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+\n", "| HV0003| B02682|2021-01-01 00:33:44|2021-01-01 00:49:07| 230| 166| NULL|\n", "| HV0003| B02682|2021-01-01 00:55:19|2021-01-01 01:18:21| 152| 167| NULL|\n", "| HV0003| B02764|2021-01-01 00:23:56|2021-01-01 00:38:05| 233| 142| NULL|\n", "| HV0003| B02764|2021-01-01 00:42:51|2021-01-01 00:45:50| 142| 143| NULL|\n", "| HV0003| B02764|2021-01-01 00:48:14|2021-01-01 01:08:42| 143| 78| NULL|\n", "| HV0005| B02510|2021-01-01 00:06:59|2021-01-01 00:43:01| 88| 42| NULL|\n", "| HV0005| B02510|2021-01-01 00:50:00|2021-01-01 01:04:57| 42| 151| NULL|\n", "| HV0003| B02764|2021-01-01 00:14:30|2021-01-01 00:50:27| 71| 226| NULL|\n", "| HV0003| B02875|2021-01-01 00:22:54|2021-01-01 00:30:20| 112| 255| NULL|\n", "| HV0003| B02875|2021-01-01 00:40:12|2021-01-01 00:53:31| 255| 232| NULL|\n", "| HV0003| B02875|2021-01-01 00:56:45|2021-01-01 01:17:42| 232| 198| NULL|\n", "| HV0003| B02835|2021-01-01 00:29:04|2021-01-01 00:36:27| 113| 48| NULL|\n", "| HV0003| B02835|2021-01-01 00:48:56|2021-01-01 00:59:12| 239| 75| NULL|\n", "| HV0004| B02800|2021-01-01 00:15:24|2021-01-01 00:38:31| 181| 237| NULL|\n", "| HV0004| B02800|2021-01-01 00:45:00|2021-01-01 01:06:45| 236| 68| NULL|\n", "| HV0003| B02682|2021-01-01 00:11:53|2021-01-01 00:18:06| 256| 148| NULL|\n", "| HV0003| B02682|2021-01-01 00:28:31|2021-01-01 00:41:40| 79| 80| NULL|\n", "| HV0003| B02682|2021-01-01 00:50:49|2021-01-01 00:55:59| 17| 217| NULL|\n", "| HV0005| B02510|2021-01-01 00:08:40|2021-01-01 00:39:39| 62| 29| NULL|\n", "| HV0003| B02836|2021-01-01 00:53:48|2021-01-01 01:11:40| 22| 22| NULL|\n", "+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df.show()" ] }, { "cell_type": "code", "execution_count": 13, "id": "b1b46682-3b18-4e44-8918-bb75730c6356", "metadata": {}, "outputs": [], "source": [ "df = df.repartition(4)" ] }, { "cell_type": "code", "execution_count": 15, "id": "2e9e23d5-a25b-4375-9de0-65e0e77a023f", "metadata": {}, "outputs": [], "source": [ "df.write.parquet('yellow/2025/11/')" ] }, { "cell_type": "code", "execution_count": 16, "id": "0e61cd48-f4ed-444c-a07d-6d16ee6e7b7c", "metadata": {}, "outputs": [], "source": [ "df = spark.read.parquet('yellow/2025/11/')" ] }, { "cell_type": "code", "execution_count": 17, "id": "0f27cdac-c34d-49ea-b23d-bf6b26e5fb81", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- VendorID: integer (nullable = true)\n", " |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)\n", " |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)\n", " |-- passenger_count: long (nullable = true)\n", " |-- trip_distance: double (nullable = true)\n", " |-- RatecodeID: long (nullable = true)\n", " |-- store_and_fwd_flag: string (nullable = true)\n", " |-- PULocationID: integer (nullable = true)\n", " |-- DOLocationID: integer (nullable = true)\n", " |-- payment_type: long (nullable = true)\n", " |-- fare_amount: double (nullable = true)\n", " |-- extra: double (nullable = true)\n", " |-- mta_tax: double (nullable = true)\n", " |-- tip_amount: double (nullable = true)\n", " |-- tolls_amount: double (nullable = true)\n", " |-- improvement_surcharge: double (nullable = true)\n", " |-- total_amount: double (nullable = true)\n", " |-- congestion_surcharge: double (nullable = true)\n", " |-- Airport_fee: double (nullable = true)\n", " |-- cbd_congestion_fee: double (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema() " ] }, { "cell_type": "code", "execution_count": 18, "id": "3c366864-cf2e-4a8f-a55a-0e313d957ba6", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 jovyan users 68M Mar 9 10:34 yellow_tripdata_2025-11.parquet\n" ] } ], "source": [ "!ls -lh yellow_tripdata_2025-11.parquet" ] }, { "cell_type": "code", "execution_count": 19, "id": "61071374-50ef-49f0-b7f0-171fe763eff5", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 98M\n", "-rw-r--r-- 1 jovyan users 25M Mar 9 10:38 part-00000-cd13b5d4-0502-492f-9ea0-b2519d9595ec-c000.snappy.parquet\n", "-rw-r--r-- 1 jovyan users 25M Mar 9 10:38 part-00001-cd13b5d4-0502-492f-9ea0-b2519d9595ec-c000.snappy.parquet\n", "-rw-r--r-- 1 jovyan users 25M Mar 9 10:38 part-00002-cd13b5d4-0502-492f-9ea0-b2519d9595ec-c000.snappy.parquet\n", "-rw-r--r-- 1 jovyan users 25M Mar 9 10:38 part-00003-cd13b5d4-0502-492f-9ea0-b2519d9595ec-c000.snappy.parquet\n", "-rw-r--r-- 1 jovyan users 0 Mar 9 10:38 _SUCCESS\n" ] } ], "source": [ "!ls -lh yellow/2025/11/\n" ] }, { "cell_type": "code", "execution_count": 20, "id": "cabe651e-ea02-445a-a6cc-fc96f27106ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Taxi trips started on 2025-11-15: 162,604\n" ] } ], "source": [ "from pyspark.sql.functions import to_timestamp, col\n", "\n", "df_filtered = df.filter(\n", " to_timestamp(col(\"tpep_pickup_datetime\"), \"yyyy-MM-dd HH:mm:ss\") >= \"2025-11-15 00:00:00\"\n", ").filter(\n", " to_timestamp(col(\"tpep_pickup_datetime\"), \"yyyy-MM-dd HH:mm:ss\") < \"2025-11-16 00:00:00\"\n", ")\n", "\n", "trip_count = df_filtered.count()\n", "print(f\"Taxi trips started on 2025-11-15: {trip_count:,}\")\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "a8908efd-c0ab-4fef-9722-95548f686c84", "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.functions import unix_timestamp, col, round\n", "\n", "df = df.withColumn(\n", " \"total_trip_time_minutes\",\n", " round(\n", " (unix_timestamp(\"tpep_dropoff_datetime\") - unix_timestamp(\"tpep_pickup_datetime\")) / 60.0\n", " )\n", ")\n" ] }, { "cell_type": "code", "execution_count": 22, "id": "d1fbec1c-a0df-44a4-b3c7-25c285fe5429", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+-----------------------+\n", "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|total_trip_time_minutes|\n", "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+-----------------------+\n", "| 2| 2025-11-07 18:37:45| 2025-11-07 18:41:51| 1| 0.78| 1| N| 140| 262| 1| 5.8| 2.5| 0.5| 2.46| 0.0| 1.0| 14.76| 2.5| 0.0| 0.0| 4.0|\n", "| 2| 2025-11-07 23:14:45| 2025-11-08 00:10:08| 1| 18.2| 2| N| 132| 141| 1| 70.0| 0.0| 0.5| 16.69| 6.94| 1.0| 100.13| 2.5| 1.75| 0.75| 55.0|\n", "| 2| 2025-11-01 22:40:45| 2025-11-01 22:44:52| 1| 0.44| 1| N| 262| 236| 1| 5.8| 1.0| 0.5| 2.16| 0.0| 1.0| 12.96| 2.5| 0.0| 0.0| 4.0|\n", "| 1| 2025-11-07 13:57:39| 2025-11-07 14:19:11| 1| 7.6| 1| N| 141| 33| 2| 32.4| 2.5| 0.5| 0.0| 0.0| 1.0| 36.4| 2.5| 0.0| 0.0| 22.0|\n", "| 2| 2025-11-02 18:02:18| 2025-11-02 18:37:23| 1| 4.2| 1| N| 141| 211| 1| 31.7| 0.0| 0.5| 2.0| 0.0| 1.0| 38.45| 2.5| 0.0| 0.75| 35.0|\n", "| 2| 2025-11-05 09:05:48| 2025-11-05 09:18:11| 1| 0.94| 1| N| 100| 170| 1| 12.1| 0.0| 0.5| 3.37| 0.0| 1.0| 20.22| 2.5| 0.0| 0.75| 12.0|\n", "| 2| 2025-11-09 18:01:11| 2025-11-09 18:17:31| 1| 1.1| 1| N| 233| 230| 1| 14.9| 0.0| 0.5| 3.93| 0.0| 1.0| 23.58| 2.5| 0.0| 0.75| 16.0|\n", "| 2| 2025-11-08 11:20:15| 2025-11-08 11:26:17| 2| 0.46| 1| N| 186| 90| 1| 7.2| 0.0| 0.5| 2.39| 0.0| 1.0| 14.34| 2.5| 0.0| 0.75| 6.0|\n", "| 2| 2025-11-02 10:32:27| 2025-11-02 10:41:20| 1| 2.11| 1| N| 142| 151| 1| 12.1| 0.0| 0.5| 3.22| 0.0| 1.0| 19.32| 2.5| 0.0| 0.0| 9.0|\n", "| 1| 2025-11-08 13:25:10| 2025-11-08 13:43:13| 1| 2.6| 1| N| 79| 162| 4| 17.0| 3.25| 0.5| 0.0| 0.0| 1.0| 21.75| 2.5| 0.0| 0.75| 18.0|\n", "| 2| 2025-11-08 16:39:52| 2025-11-08 16:51:32| 1| 1.51| 1| N| 162| 141| 1| 12.1| 0.0| 0.5| 3.15| 0.0| 1.0| 20.0| 2.5| 0.0| 0.75| 12.0|\n", "| 2| 2025-11-07 11:14:32| 2025-11-07 11:21:22| 1| 0.86| 1| N| 262| 140| 2| 7.9| 0.0| 0.5| 0.0| 0.0| 1.0| 11.9| 2.5| 0.0| 0.0| 7.0|\n", "| 2| 2025-11-04 20:39:14| 2025-11-04 20:50:37| 1| 1.55| 1| N| 113| 68| 1| 11.4| 1.0| 0.5| 3.43| 0.0| 1.0| 20.58| 2.5| 0.0| 0.75| 11.0|\n", "| 1| 2025-11-05 09:15:31| 2025-11-05 09:48:05| 1| 1.9| 99| N| 42| 116| 1| 19.5| 0.0| 0.5| 0.0| 0.0| 0.0| 20.0| 0.0| 0.0| 0.0| 33.0|\n", "| 1| 2025-11-01 10:24:47| 2025-11-01 10:58:29| 1| 9.2| 1| N| 138| 230| 1| 44.3| 10.0| 0.5| 12.55| 6.94| 1.0| 75.29| 2.5| 1.75| 0.75| 34.0|\n", "| 2| 2025-11-03 22:07:46| 2025-11-03 22:17:37| 1| 1.26| 1| N| 163| 141| 1| 10.7| 1.0| 0.5| 1.5| 0.0| 1.0| 17.95| 2.5| 0.0| 0.75| 10.0|\n", "| 1| 2025-11-10 11:14:54| 2025-11-10 11:29:47| 2| 2.7| 1| N| 163| 75| 1| 15.6| 3.25| 0.5| 4.05| 0.0| 1.0| 24.4| 2.5| 0.0| 0.75| 15.0|\n", "| 2| 2025-11-02 01:17:59| 2025-11-02 01:46:28| 2| 5.45| 1| N| 246| 33| 1| 31.0| 1.0| 0.5| 7.35| 0.0| 1.0| 44.1| 2.5| 0.0| 0.75| 28.0|\n", "| 2| 2025-11-02 15:11:44| 2025-11-02 15:29:36| 1| 1.53| 1| N| 236| 238| 1| 16.3| 0.0| 0.5| 4.06| 0.0| 1.0| 24.36| 2.5| 0.0| 0.0| 18.0|\n", "| 2| 2025-11-06 13:14:11| 2025-11-06 13:21:23| 1| 0.69| 1| N| 48| 50| 1| 7.9| 0.0| 0.5| 2.53| 0.0| 1.0| 15.18| 2.5| 0.0| 0.75| 7.0|\n", "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+-----------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df.show()" ] }, { "cell_type": "code", "execution_count": 23, "id": "b6fc027c-ebad-4230-b768-9872907a0f2c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Longest trip: 90.65 hours\n" ] } ], "source": [ "from pyspark.sql.functions import unix_timestamp, col, hour, max\n", "\n", "df_longest = df.filter(\n", " col(\"total_trip_time_minutes\") > 0 # Exclude invalid trips\n", ").agg(\n", " max((unix_timestamp(\"tpep_dropoff_datetime\") - unix_timestamp(\"tpep_pickup_datetime\")) / 3600.0).alias(\"max_trip_hours\")\n", ").collect()[0][\"max_trip_hours\"]\n", "\n", "print(f\"Longest trip: {df_longest:.2f} hours\")\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "bde85e58-4421-49c8-be36-3c5fefff5730", "metadata": {}, "outputs": [], "source": [ "zone_df = spark.read \\\n", " .option(\"header\", \"true\") \\\n", " .csv(\"taxi_zone_lookup.csv\")\n", "\n", "zone_df.createOrReplaceTempView(\"zones\")\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "caf57862-3048-408e-884e-77759ed5f550", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------+-------------+-----------------------+------------+\n", "|LocationID|Borough |Zone |service_zone|\n", "+----------+-------------+-----------------------+------------+\n", "|1 |EWR |Newark Airport |EWR |\n", "|2 |Queens |Jamaica Bay |Boro Zone |\n", "|3 |Bronx |Allerton/Pelham Gardens|Boro Zone |\n", "|4 |Manhattan |Alphabet City |Yellow Zone |\n", "|5 |Staten Island|Arden Heights |Boro Zone |\n", "|6 |Staten Island|Arrochar/Fort Wadsworth|Boro Zone |\n", "|7 |Queens |Astoria |Boro Zone |\n", "|8 |Queens |Astoria Park |Boro Zone |\n", "|9 |Queens |Auburndale |Boro Zone |\n", "|10 |Queens |Baisley Park |Boro Zone |\n", "|11 |Brooklyn |Bath Beach |Boro Zone |\n", "|12 |Manhattan |Battery Park |Yellow Zone |\n", "|13 |Manhattan |Battery Park City |Yellow Zone |\n", "|14 |Brooklyn |Bay Ridge |Boro Zone |\n", "|15 |Queens |Bay Terrace/Fort Totten|Boro Zone |\n", "|16 |Queens |Bayside |Boro Zone |\n", "|17 |Brooklyn |Bedford |Boro Zone |\n", "|18 |Bronx |Bedford Park |Boro Zone |\n", "|19 |Queens |Bellerose |Boro Zone |\n", "|20 |Bronx |Belmont |Boro Zone |\n", "+----------+-------------+-----------------------+------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "zone_df.show(20, truncate=False) # Full column names" ] }, { "cell_type": "code", "execution_count": 28, "id": "8829812c-0547-4b87-b060-aa818bc641a4", "metadata": {}, "outputs": [], "source": [ "df.createOrReplaceTempView(\"yellow_taxi\") # Name it" ] }, { "cell_type": "code", "execution_count": 32, "id": "fb3b8d2d-42fa-4043-b9c9-00ce785ff6a5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----+\n", "| Zone|trips|\n", "+--------------------+-----+\n", "| Arden Heights| 1|\n", "|Governor's Island...| 1|\n", "|Eltingville/Annad...| 1|\n", "| Port Richmond| 3|\n", "| Green-Wood Cemetery| 4|\n", "| Rikers Island| 4|\n", "| Great Kills| 4|\n", "| Rossville/Woodrow| 4|\n", "| Jamaica Bay| 5|\n", "| Westerleigh| 12|\n", "+--------------------+-----+\n", "\n" ] } ], "source": [ "spark.sql(\"\"\"\n", "SELECT zones.Zone, COUNT(*) as trips\n", "FROM yellow_taxi \n", "LEFT JOIN zones ON yellow_taxi.PULocationID = zones.LocationID\n", "WHERE zones.Zone IS NOT NULL -- Exclude NULL zones\n", "GROUP BY zones.Zone\n", "ORDER BY trips ASC\n", "LIMIT 10\n", "\"\"\").show()" ] }, { "cell_type": "code", "execution_count": null, "id": "31beaf75-180d-437a-b25a-dbe93e53f7f9", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }