{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "5280fe89-8532-4f44-8672-97696cadf83c", "metadata": {}, "outputs": [], "source": [ "Oracle AI Data Platform v1.0\n", "\n", "Copyright © 2025, Oracle and/or its affiliates.\n", "\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/" ] }, { "cell_type": "markdown", "id": "8a76b357-10b2-43f5-864e-2747ce0040dc", "metadata": { "type": "python" }, "source": [ "# Analyze Data Using PySpark Examples\n", "\n", " **Learn basic PySpark features in AI Data Platform**\n", "\n", " See https://spark.apache.org/docs/latest/api/python/index.html for detailed documentation.\n", " \n", " This notebook demonstrates different ways to read and write data from tables in AI Data Platform. It covers:\n", " \n", " 1. **Setup catalog and schemas**\n", " 2. **Create Table**\n", " 3. **Table operations**\n", " - Insert data\n", " 4. **Schema exploration**\n", " - Listing schemas and tables\n", " 5. **Visualize data using matplotlib**\n", "\n", " **Prerequisites**\n", "\n", " Install requirements.txt which contains matplotlib into the cluster before running the notebook." ] }, { "cell_type": "markdown", "id": "1a670927-fc2f-445c-8d2c-338645fa428b", "metadata": {}, "source": [ "# **1. Setup catalog and schemas**" ] }, { "cell_type": "markdown", "id": "f843ec04-b72b-44f6-9a72-2be2a289cd58", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T15:16:21.700Z" }, "type": "markdown" }, "source": [ "## Show Catalogs" ] }, { "cell_type": "code", "execution_count": 1, "id": "4031b6f5-d673-474d-86e2-41bd8113d678", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T17:48:11.692Z" }, "type": "sql" }, "outputs": [], "source": [ "%sql\n", "show catalogs" ] }, { "cell_type": "markdown", "id": "15bb6d74-ff08-4327-9cbe-bb80e16cb9e7", "metadata": {}, "source": [ "## Create Managed Catalog" ] }, { "cell_type": "code", "execution_count": 1, "id": "6bbeaab3-2a11-4ec1-acec-b1c816bb78fd", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:38:08.376Z" }, "type": "sql" }, "outputs": [], "source": [ "%sql\n", "create catalog if not exists lake" ] }, { "cell_type": "code", "execution_count": 1, "id": "5691975e-f0bc-4ffe-9c98-ac1af61cc1eb", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T15:21:47.091Z" } }, "outputs": [ { "data": { "text/html": [ "
+------------------+--------------------+\n",
       "|         attribute|     attribute_value|\n",
       "+------------------+--------------------+\n",
       "|              name|                lake|\n",
       "|         createdOn|       1742755977633|\n",
       "|         updatedOn|       1742755977275|\n",
       "|connection_details|                  {}|\n",
       "|   catalog_details|{\"createdOn\":\"174...|\n",
       "|         createdBy|ocid1.user.oc1..a...|\n",
       "|              type|            INTERNAL|\n",
       "|         updatedBy|ocid1.user.oc1..a...|\n",
       "+------------------+--------------------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Describe Catalog\n", "spark.sql(\"describe catalog lake\").show()" ] }, { "cell_type": "markdown", "id": "55560927-d68d-4321-8e9b-9ec13d6e97f1", "metadata": {}, "source": [ "## Create Schema in Managed Catalog" ] }, { "cell_type": "code", "execution_count": 1, "id": "72fe94b6-e810-4613-8aa3-a79c736fc4e1", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:38:15.186Z" }, "type": "sql" }, "outputs": [ { "data": { "text/html": [ "
OK
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%sql\n", "create schema if not exists lake.bronze" ] }, { "cell_type": "code", "execution_count": 1, "id": "1be56743-d407-4612-b2ae-231188946db0", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T17:04:41.651Z" } }, "outputs": [ { "data": { "text/html": [ "
+--------------+----------+\n",
       "|     info_name|info_value|\n",
       "+--------------+----------+\n",
       "|  Catalog Name|      lake|\n",
       "|Namespace Name|    bronze|\n",
       "+--------------+----------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Describe Schema\n", "spark.sql(\"describe schema lake.bronze\").show()" ] }, { "cell_type": "markdown", "id": "a63df75e-333d-455a-a419-4b950a266b3e", "metadata": { "type": "markdown" }, "source": [ "# **2. Create Table**" ] }, { "cell_type": "markdown", "id": "0f67c7b9-900b-40c0-b98f-c7d4d133134f", "metadata": {}, "source": [ "## Create Managed Table\n", "Default managed table is created in Parquet format." ] }, { "cell_type": "code", "execution_count": 1, "id": "939ae194-fa80-46c3-bffc-65d297498039", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:38:59.305Z" } }, "outputs": [], "source": [ "# Sample patient data\n", "data = [\n", " (\"2025-03-24T08:00:00Z\", \"P001\", 72, \"120/80\", 98, 36.8, \"302A\", \"stable\"),\n", " (\"2025-03-24T08:05:00Z\", \"P001\", 75, \"118/78\", 97, 36.9, \"302A\", \"stable\"),\n", " (\"2025-03-24T08:10:00Z\", \"P002\", 90, \"140/90\", 92, 38.2, \"215B\", \"critical\"),\n", " (\"2025-03-24T08:15:00Z\", \"P002\", 95, \"145/95\", 90, 38.5, \"215B\", \"critical\")\n", "]\n", "\n", "columns = [\"timestamp\", \"patient_id\", \"heart_rate_bpm\", \"blood_pressure\", \"oxygen_saturation\", \"temperature_C\", \"room_number\", \"status\"]\n", "\n", "df = spark.createDataFrame(data, columns)\n", "df.write.mode(\"overwrite\").saveAsTable(\"lake.bronze.patient_data_tab\")" ] }, { "cell_type": "markdown", "id": "bcaf3b6c-a747-4917-9464-9b3012ece36d", "metadata": { "type": "markdown" }, "source": [ "# **3. Table Operations**" ] }, { "cell_type": "markdown", "id": "52ba5b85-1814-4af5-b2cc-59bb8fac7582", "metadata": {}, "source": [ "# Insert Data in Table" ] }, { "cell_type": "code", "execution_count": 1, "id": "6ede6a23-56b8-4d86-b495-6e0a7ccaaef6", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:40:47.183Z" }, "type": "python" }, "outputs": [], "source": [ "data = [\n", " (\"2025-03-24T08:20:00Z\", \"P003\", 93, \"143/95\", 91, 37.5, \"215B\", \"critical\")\n", "]\n", "\n", "columns = [\"timestamp\", \"patient_id\", \"heart_rate_bpm\", \"blood_pressure\", \"oxygen_saturation\", \"temperature_C\", \"room_number\", \"status\"]\n", "\n", "df = spark.createDataFrame(data, columns)\n", "df.write.insertInto(\"lake.bronze.patient_data_tab\")\n" ] }, { "cell_type": "markdown", "id": "019a0092-2684-4d4e-9c02-cfe7cbf41a2e", "metadata": {}, "source": [ "## Query Data" ] }, { "cell_type": "code", "execution_count": 1, "id": "67341f65-d4e4-4235-98bd-c228e763d1d1", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:40:57.928Z" } }, "outputs": [ { "data": { "text/html": [ "
+--------------------+----------+--------------+--------------+-----------------+-------------+-----------+--------+\n",
       "|           timestamp|patient_id|heart_rate_bpm|blood_pressure|oxygen_saturation|temperature_C|room_number|  status|\n",
       "+--------------------+----------+--------------+--------------+-----------------+-------------+-----------+--------+\n",
       "|2025-03-24T08:10:00Z|      P002|            90|        140/90|               92|         38.2|       215B|critical|\n",
       "|2025-03-24T08:15:00Z|      P002|            95|        145/95|               90|         38.5|       215B|critical|\n",
       "|2025-03-24T08:00:00Z|      P001|            72|        120/80|               98|         36.8|       302A|  stable|\n",
       "|2025-03-24T08:05:00Z|      P001|            75|        118/78|               97|         36.9|       302A|  stable|\n",
       "|2025-03-24T08:20:00Z|      P003|            93|        143/95|               91|         37.5|       215B|critical|\n",
       "+--------------------+----------+--------------+--------------+-----------------+-------------+-----------+--------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Query managed table\n", "spark.sql(\"select * from lake.bronze.patient_data_tab\").show()" ] }, { "cell_type": "markdown", "id": "e6c983b9-b0c3-480f-9ba2-0d111569c88b", "metadata": { "type": "markdown" }, "source": [ "# **4. Schema Exploration**" ] }, { "cell_type": "markdown", "id": "694f0bd2-8760-40c5-a41e-1ffc8779959f", "metadata": {}, "source": [ "## Show Schemas, Tables, Describe Tables" ] }, { "cell_type": "code", "execution_count": 1, "id": "0aabaf9d-b680-46ec-8310-83253c634148", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T17:05:55.007Z" } }, "outputs": [ { "data": { "text/html": [ "
+---------+\n",
       "|namespace|\n",
       "+---------+\n",
       "|  default|\n",
       "|   bronze|\n",
       "+---------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "spark.sql(\"show schemas in lake\").show()" ] }, { "cell_type": "code", "execution_count": 1, "id": "b399a240-5092-40d1-b226-7f19c8c3f549", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:41:24.598Z" } }, "outputs": [ { "data": { "text/html": [ "
+---------+----------------+-----------+\n",
       "|namespace|       tableName|isTemporary|\n",
       "+---------+----------------+-----------+\n",
       "|   bronze|patient_data_tab|      false|\n",
       "+---------+----------------+-----------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "spark.sql(\"show tables in lake.bronze\").show()" ] }, { "cell_type": "code", "execution_count": 1, "id": "b83c9dbe-2bfe-49ed-853d-91a34ccb03ae", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:41:19.330Z" } }, "outputs": [ { "data": { "text/html": [ "
+---------+----------------+-----------+\n",
       "|namespace|       tableName|isTemporary|\n",
       "+---------+----------------+-----------+\n",
       "|   bronze|patient_data_tab|      false|\n",
       "+---------+----------------+-----------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# List tables using a pattern eg. tables beginning with e\n", "spark.sql(\"show tables in lake.bronze like 'p.*'\").show()" ] }, { "cell_type": "code", "execution_count": 1, "id": "0e726160-679d-48e8-8641-9d20f0f80568", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:41:42.702Z" } }, "outputs": [ { "data": { "text/html": [ "
+--------------------+--------------------+-------+\n",
       "|            col_name|           data_type|comment|\n",
       "+--------------------+--------------------+-------+\n",
       "|           timestamp|              string|   NULL|\n",
       "|          patient_id|              string|   NULL|\n",
       "|      heart_rate_bpm|              bigint|   NULL|\n",
       "|      blood_pressure|              string|   NULL|\n",
       "|   oxygen_saturation|              bigint|   NULL|\n",
       "|       temperature_C|              double|   NULL|\n",
       "|         room_number|              string|   NULL|\n",
       "|              status|              string|   NULL|\n",
       "|                    |                    |       |\n",
       "|# Detailed Table ...|                    |       |\n",
       "|                Name|bronze.patient_da...|       |\n",
       "|                Type|             MANAGED|       |\n",
       "|            Location|oci://t7q7elmpr2y...|       |\n",
       "|            Provider|             parquet|       |\n",
       "|               Owner|ocid1.user.oc1..a...|       |\n",
       "| Is_managed_location|                true|       |\n",
       "|    Table Properties|[com.oracle.dcat....|       |\n",
       "+--------------------+--------------------+-------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Describe managed table, see type is MANAGED and provider is parquet\n", "spark.sql(\"describe table extended lake.bronze.patient_data_tab\").show()" ] }, { "cell_type": "markdown", "id": "70d38c33-f674-4263-93f6-a6ececfac98d", "metadata": {}, "source": [ "# List all tables in all schemas in a catalog" ] }, { "cell_type": "code", "execution_count": 1, "id": "8a97a248-3131-40b8-9ac0-29c95744e890", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:42:04.282Z" } }, "outputs": [ { "data": { "text/html": [ "
[\"lake.bronze.patient_data_tab\"]\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "catalog_name=\"lake\"\n", "databases = [\n", " catalog_name+\".\"+db.namespace \n", " for db in spark.sql('show schemas in '+catalog_name).collect()\n", "]\n", "tables = [\n", " catalog_name+\".\"+f\"{row['namespace']}.{row['tableName']}\" #.. format\n", " for db_rows in [\n", " spark.sql(f'show tables in {db}').collect() for db in databases\n", " ] \n", " for row in db_rows\n", "]\n", "print(tables)" ] }, { "cell_type": "markdown", "id": "e989bda7-a7b2-4dda-aa67-c8c050d9d80f", "metadata": { "type": "markdown" }, "source": [ "# Visualize Data using Matplotlib" ] }, { "cell_type": "code", "execution_count": 1, "id": "c29a6704-90dc-4802-a655-c58540789bb0", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:42:47.317Z" }, "type": "python" }, "outputs": [ { "data": { "text/html": [ "\"Image\"" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# Convert Delta Table to Pandas\n", "deltaTable = spark.sql(\"select * from lake.bronze.patient_data_tab\")\n", "df_pandas = deltaTable.toPandas()\n", "\n", "# Plot\n", "plt.figure(figsize=(10,5))\n", "for patient in df_pandas[\"patient_id\"].unique():\n", " patient_data = df_pandas[df_pandas[\"patient_id\"] == patient]\n", " plt.plot(patient_data[\"timestamp\"], patient_data[\"heart_rate_bpm\"], marker='o', label=f\"Patient {patient}\")\n", "\n", "plt.xlabel(\"Timestamp\")\n", "plt.ylabel(\"Heart Rate (BPM)\")\n", "plt.title(\"Patient Heart Rate Over Time\")\n", "plt.legend()\n", "plt.xticks(rotation=45)\n", "plt" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }