{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "5280fe89-8532-4f44-8672-97696cadf83c", "metadata": {}, "outputs": [], "source": [ "Oracle AI Data Platform v1.0\n", "\n", "Copyright © 2025, Oracle and/or its affiliates.\n", "\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/" ] }, { "cell_type": "markdown", "id": "8a76b357-10b2-43f5-864e-2747ce0040dc", "metadata": { "type": "python" }, "source": [ "# Analyze Data Using PySpark Examples\n", "\n", " **Learn basic PySpark features in AI Data Platform**\n", "\n", " See https://spark.apache.org/docs/latest/api/python/index.html for detailed documentation.\n", " \n", " This notebook demonstrates different ways to read and write data from tables in AI Data Platform. It covers:\n", " \n", " 1. **Setup catalog and schemas**\n", " 2. **Create Table**\n", " 3. **Table operations**\n", " - Insert data\n", " 4. **Schema exploration**\n", " - Listing schemas and tables\n", " 5. **Visualize data using matplotlib**\n", "\n", " **Prerequisites**\n", "\n", " Install requirements.txt which contains matplotlib into the cluster before running the notebook." ] }, { "cell_type": "markdown", "id": "1a670927-fc2f-445c-8d2c-338645fa428b", "metadata": {}, "source": [ "# **1. Setup catalog and schemas**" ] }, { "cell_type": "markdown", "id": "f843ec04-b72b-44f6-9a72-2be2a289cd58", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T15:16:21.700Z" }, "type": "markdown" }, "source": [ "## Show Catalogs" ] }, { "cell_type": "code", "execution_count": 1, "id": "4031b6f5-d673-474d-86e2-41bd8113d678", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T17:48:11.692Z" }, "type": "sql" }, "outputs": [], "source": [ "%sql\n", "show catalogs" ] }, { "cell_type": "markdown", "id": "15bb6d74-ff08-4327-9cbe-bb80e16cb9e7", "metadata": {}, "source": [ "## Create Managed Catalog" ] }, { "cell_type": "code", "execution_count": 1, "id": "6bbeaab3-2a11-4ec1-acec-b1c816bb78fd", "metadata": { "execution": { "iopub.status.busy": "2025-04-08T18:38:08.376Z" }, "type": "sql" }, "outputs": [], "source": [ "%sql\n", "create catalog if not exists lake" ] }, { "cell_type": "code", "execution_count": 1, "id": "5691975e-f0bc-4ffe-9c98-ac1af61cc1eb", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T15:21:47.091Z" } }, "outputs": [ { "data": { "text/html": [ "
+------------------+--------------------+\n",
"| attribute| attribute_value|\n",
"+------------------+--------------------+\n",
"| name| lake|\n",
"| createdOn| 1742755977633|\n",
"| updatedOn| 1742755977275|\n",
"|connection_details| {}|\n",
"| catalog_details|{\"createdOn\":\"174...|\n",
"| createdBy|ocid1.user.oc1..a...|\n",
"| type| INTERNAL|\n",
"| updatedBy|ocid1.user.oc1..a...|\n",
"+------------------+--------------------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Describe Catalog\n",
"spark.sql(\"describe catalog lake\").show()"
]
},
{
"cell_type": "markdown",
"id": "55560927-d68d-4321-8e9b-9ec13d6e97f1",
"metadata": {},
"source": [
"## Create Schema in Managed Catalog"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "72fe94b6-e810-4613-8aa3-a79c736fc4e1",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:38:15.186Z"
},
"type": "sql"
},
"outputs": [
{
"data": {
"text/html": [
"OK" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%sql\n", "create schema if not exists lake.bronze" ] }, { "cell_type": "code", "execution_count": 1, "id": "1be56743-d407-4612-b2ae-231188946db0", "metadata": { "execution": { "iopub.status.busy": "2025-03-24T17:04:41.651Z" } }, "outputs": [ { "data": { "text/html": [ "
+--------------+----------+\n",
"| info_name|info_value|\n",
"+--------------+----------+\n",
"| Catalog Name| lake|\n",
"|Namespace Name| bronze|\n",
"+--------------+----------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Describe Schema\n",
"spark.sql(\"describe schema lake.bronze\").show()"
]
},
{
"cell_type": "markdown",
"id": "a63df75e-333d-455a-a419-4b950a266b3e",
"metadata": {
"type": "markdown"
},
"source": [
"# **2. Create Table**"
]
},
{
"cell_type": "markdown",
"id": "0f67c7b9-900b-40c0-b98f-c7d4d133134f",
"metadata": {},
"source": [
"## Create Managed Table\n",
"Default managed table is created in Parquet format."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "939ae194-fa80-46c3-bffc-65d297498039",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:38:59.305Z"
}
},
"outputs": [],
"source": [
"# Sample patient data\n",
"data = [\n",
" (\"2025-03-24T08:00:00Z\", \"P001\", 72, \"120/80\", 98, 36.8, \"302A\", \"stable\"),\n",
" (\"2025-03-24T08:05:00Z\", \"P001\", 75, \"118/78\", 97, 36.9, \"302A\", \"stable\"),\n",
" (\"2025-03-24T08:10:00Z\", \"P002\", 90, \"140/90\", 92, 38.2, \"215B\", \"critical\"),\n",
" (\"2025-03-24T08:15:00Z\", \"P002\", 95, \"145/95\", 90, 38.5, \"215B\", \"critical\")\n",
"]\n",
"\n",
"columns = [\"timestamp\", \"patient_id\", \"heart_rate_bpm\", \"blood_pressure\", \"oxygen_saturation\", \"temperature_C\", \"room_number\", \"status\"]\n",
"\n",
"df = spark.createDataFrame(data, columns)\n",
"df.write.mode(\"overwrite\").saveAsTable(\"lake.bronze.patient_data_tab\")"
]
},
{
"cell_type": "markdown",
"id": "bcaf3b6c-a747-4917-9464-9b3012ece36d",
"metadata": {
"type": "markdown"
},
"source": [
"# **3. Table Operations**"
]
},
{
"cell_type": "markdown",
"id": "52ba5b85-1814-4af5-b2cc-59bb8fac7582",
"metadata": {},
"source": [
"# Insert Data in Table"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6ede6a23-56b8-4d86-b495-6e0a7ccaaef6",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:40:47.183Z"
},
"type": "python"
},
"outputs": [],
"source": [
"data = [\n",
" (\"2025-03-24T08:20:00Z\", \"P003\", 93, \"143/95\", 91, 37.5, \"215B\", \"critical\")\n",
"]\n",
"\n",
"columns = [\"timestamp\", \"patient_id\", \"heart_rate_bpm\", \"blood_pressure\", \"oxygen_saturation\", \"temperature_C\", \"room_number\", \"status\"]\n",
"\n",
"df = spark.createDataFrame(data, columns)\n",
"df.write.insertInto(\"lake.bronze.patient_data_tab\")\n"
]
},
{
"cell_type": "markdown",
"id": "019a0092-2684-4d4e-9c02-cfe7cbf41a2e",
"metadata": {},
"source": [
"## Query Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "67341f65-d4e4-4235-98bd-c228e763d1d1",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:40:57.928Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+--------------------+----------+--------------+--------------+-----------------+-------------+-----------+--------+\n",
"| timestamp|patient_id|heart_rate_bpm|blood_pressure|oxygen_saturation|temperature_C|room_number| status|\n",
"+--------------------+----------+--------------+--------------+-----------------+-------------+-----------+--------+\n",
"|2025-03-24T08:10:00Z| P002| 90| 140/90| 92| 38.2| 215B|critical|\n",
"|2025-03-24T08:15:00Z| P002| 95| 145/95| 90| 38.5| 215B|critical|\n",
"|2025-03-24T08:00:00Z| P001| 72| 120/80| 98| 36.8| 302A| stable|\n",
"|2025-03-24T08:05:00Z| P001| 75| 118/78| 97| 36.9| 302A| stable|\n",
"|2025-03-24T08:20:00Z| P003| 93| 143/95| 91| 37.5| 215B|critical|\n",
"+--------------------+----------+--------------+--------------+-----------------+-------------+-----------+--------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Query managed table\n",
"spark.sql(\"select * from lake.bronze.patient_data_tab\").show()"
]
},
{
"cell_type": "markdown",
"id": "e6c983b9-b0c3-480f-9ba2-0d111569c88b",
"metadata": {
"type": "markdown"
},
"source": [
"# **4. Schema Exploration**"
]
},
{
"cell_type": "markdown",
"id": "694f0bd2-8760-40c5-a41e-1ffc8779959f",
"metadata": {},
"source": [
"## Show Schemas, Tables, Describe Tables"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0aabaf9d-b680-46ec-8310-83253c634148",
"metadata": {
"execution": {
"iopub.status.busy": "2025-03-24T17:05:55.007Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+---------+\n",
"|namespace|\n",
"+---------+\n",
"| default|\n",
"| bronze|\n",
"+---------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"spark.sql(\"show schemas in lake\").show()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b399a240-5092-40d1-b226-7f19c8c3f549",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:41:24.598Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+---------+----------------+-----------+\n",
"|namespace| tableName|isTemporary|\n",
"+---------+----------------+-----------+\n",
"| bronze|patient_data_tab| false|\n",
"+---------+----------------+-----------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"spark.sql(\"show tables in lake.bronze\").show()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b83c9dbe-2bfe-49ed-853d-91a34ccb03ae",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:41:19.330Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+---------+----------------+-----------+\n",
"|namespace| tableName|isTemporary|\n",
"+---------+----------------+-----------+\n",
"| bronze|patient_data_tab| false|\n",
"+---------+----------------+-----------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# List tables using a pattern eg. tables beginning with e\n",
"spark.sql(\"show tables in lake.bronze like 'p.*'\").show()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0e726160-679d-48e8-8641-9d20f0f80568",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:41:42.702Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+--------------------+--------------------+-------+\n",
"| col_name| data_type|comment|\n",
"+--------------------+--------------------+-------+\n",
"| timestamp| string| NULL|\n",
"| patient_id| string| NULL|\n",
"| heart_rate_bpm| bigint| NULL|\n",
"| blood_pressure| string| NULL|\n",
"| oxygen_saturation| bigint| NULL|\n",
"| temperature_C| double| NULL|\n",
"| room_number| string| NULL|\n",
"| status| string| NULL|\n",
"| | | |\n",
"|# Detailed Table ...| | |\n",
"| Name|bronze.patient_da...| |\n",
"| Type| MANAGED| |\n",
"| Location|oci://t7q7elmpr2y...| |\n",
"| Provider| parquet| |\n",
"| Owner|ocid1.user.oc1..a...| |\n",
"| Is_managed_location| true| |\n",
"| Table Properties|[com.oracle.dcat....| |\n",
"+--------------------+--------------------+-------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Describe managed table, see type is MANAGED and provider is parquet\n",
"spark.sql(\"describe table extended lake.bronze.patient_data_tab\").show()"
]
},
{
"cell_type": "markdown",
"id": "70d38c33-f674-4263-93f6-a6ececfac98d",
"metadata": {},
"source": [
"# List all tables in all schemas in a catalog"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8a97a248-3131-40b8-9ac0-29c95744e890",
"metadata": {
"execution": {
"iopub.status.busy": "2025-04-08T18:42:04.282Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"[\"lake.bronze.patient_data_tab\"]\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"catalog_name=\"lake\"\n",
"databases = [\n",
" catalog_name+\".\"+db.namespace \n",
" for db in spark.sql('show schemas in '+catalog_name).collect()\n",
"]\n",
"tables = [\n",
" catalog_name+\".\"+f\"{row['namespace']}.{row['tableName']}\" #