{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "11183b59-8662-4a14-ac6a-35e44c1209f6", "metadata": {}, "outputs": [], "source": [ "Oracle AI Data Platform v1.0\n", "\n", "Copyright © 2025, Oracle and/or its affiliates.\n", "\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/" ] }, { "cell_type": "markdown", "id": "079f9106-82b1-48b2-b42b-bacff883df9e", "metadata": { "type": "python" }, "source": [ "# Delta Schema Evolution\n", "**This notebook illustrates how you can configure your AI Data Platform to allow for a schema that evolves over time.**\n", " \n", "It covers:\n", " \n", " 1. **Direct Write to OCI Object Storage**\n", " 2. **Accessing OCI Object Storage via an External Table**\n", " - Creating an external table\n", " - Querying an external table\n", " 3. **Writing new data with additional column to OCI Object Storage**\n", " - Writing to object storage\n", " - Reading data\n", " \n", "# **Parameters**\n", " - OCI Object Storage bucket name and Namespace name\n", " - Catalog name\n", " - Schema name\n", " - External table name\n", " - Folder name" ] }, { "cell_type": "code", "execution_count": null, "id": "6cf09b6f-ae3f-41cd-9e0a-9e5baed672ba", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:28:49.092Z" } }, "outputs": [], "source": [ "# Change these values to your env\n", "oci_bucket=oidlUtils.parameters.getParameter(\"OCI_BUCKET\", \"oci://replace_bucket_name@replace_namespace\")\n", "p_catalog_name=oidlUtils.parameters.getParameter(\"CATALOG_NAME\", \"default\")\n", "p_schema_name=oidlUtils.parameters.getParameter(\"SCHEMA_NAME\", \"default\")\n", "p_table_name=oidlUtils.parameters.getParameter(\"TABLE_NAME\", \"deltatab\")\n", "p_folder_name=oidlUtils.parameters.getParameter(\"FOLDER_NAME\", \"mydata\")\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "8a1f34ad-2dca-41e4-bf7b-bf0df0ebf999", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:06.122Z" } }, "outputs": [], "source": [ "import pyspark\n", "from pyspark.sql.types import *\n", "from pyspark.sql.functions import *" ] }, { "cell_type": "code", "execution_count": 1, "id": "2b9afc19-63d9-4f88-a7cf-55e78a3178e4", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:22.807Z" } }, "outputs": [], "source": [ "# Create a spark dataframe and write as a delta table\n", "data = [(\"Robert\", \"Baratheon\", \"Baratheon\", \"Storms End\", 48),\n", " (\"Eddard\", \"Stark\", \"Stark\", \"Winterfell\", 46),\n", " (\"Jamie\", \"Lannister\", \"Lannister\", \"Casterly Rock\", 29)\n", " ]\n", "schema = StructType([\n", " StructField(\"firstname\", StringType(), True),\n", " StructField(\"lastname\", StringType(), True),\n", " StructField(\"house\", StringType(), True),\n", " StructField(\"location\", StringType(), True),\n", " StructField(\"age\", IntegerType(), True)\n", "])\n", "\n", "sample_dataframe = spark.createDataFrame(data=data, schema=schema)\n", "sample_dataframe.write.mode(saveMode=\"overwrite\").format(\"delta\").save(oci_bucket+'/'+p_folder_name+'/')" ] }, { "cell_type": "code", "execution_count": 1, "id": "60467d58-51cd-4b93-8ce1-7f39ab19a8d1", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:33.435Z" } }, "outputs": [ { "data": { "text/html": [ "
DataFrame[]\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Create an external table on the folder for the delta table above\n",
"spark.sql(\"create table if not exists \"+p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name+\" USING DELTA LOCATION '\"+oci_bucket+\"/\"+p_folder_name+\"/'\");"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "67be876f-2362-4f6f-a38b-a0c12b9825c2",
"metadata": {
"execution": {
"iopub.status.busy": "2025-03-28T17:27:44.987Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+---------+---------+---------+-------------+---+\n",
"|firstname| lastname| house| location|age|\n",
"+---------+---------+---------+-------------+---+\n",
"| Robert|Baratheon|Baratheon| Storms End| 48|\n",
"| Eddard| Stark| Stark| Winterfell| 46|\n",
"| Jamie|Lannister|Lannister|Casterly Rock| 29|\n",
"+---------+---------+---------+-------------+---+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Query the Delta table\n",
"spark.sql(\"select * from \" + p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name).show()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b46d9ee6-3273-4eb8-967a-15e872740ecf",
"metadata": {
"execution": {
"iopub.status.busy": "2025-03-28T17:27:50.509Z"
}
},
"outputs": [],
"source": [
"# Let's evolve the schema, add a salary column\n",
"data = [(\"Jim\", \"Benson\", \"Hillmount\", \"Glasgow\", 34,100),\n",
" (\"Jen\", \"Oliver\", \"Sleepy Hollow\", \"Cheddar\", 37,200)\n",
" ]\n",
"schema = StructType([\n",
" StructField(\"firstname\", StringType(), True),\n",
" StructField(\"lastname\", StringType(), True),\n",
" StructField(\"house\", StringType(), True),\n",
" StructField(\"location\", StringType(), True),\n",
" StructField(\"age\", IntegerType(), True),\n",
" StructField(\"salary\", IntegerType(), True)\n",
"])\n",
"\n",
"sample_dataframe = spark.createDataFrame(data=data, schema=schema)\n",
"sample_dataframe.write.mode(saveMode=\"append\").format(\"delta\").option(\"mergeSchema\", \"true\").save(oci_bucket+'/'+p_folder_name+'/')"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c559af35-bde2-47b2-8475-26e2e36de4e2",
"metadata": {
"execution": {
"iopub.status.busy": "2025-03-28T17:28:02.486Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"+---------+---------+-------------+-------------+---+------+\n",
"|firstname| lastname| house| location|age|salary|\n",
"+---------+---------+-------------+-------------+---+------+\n",
"| Jen| Oliver|Sleepy Hollow| Cheddar| 37| 200|\n",
"| Jim| Benson| Hillmount| Glasgow| 34| 100|\n",
"| Robert|Baratheon| Baratheon| Storms End| 48| NULL|\n",
"| Eddard| Stark| Stark| Winterfell| 46| NULL|\n",
"| Jamie|Lannister| Lannister|Casterly Rock| 29| NULL|\n",
"+---------+---------+-------------+-------------+---+------+\n",
"\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Query the Delta table and see the new column 'salary', the null values in the table for historical rows\n",
"\n",
"spark.sql(\"select * from \" + p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name).show()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "43e4af8d-8812-4cbc-93f4-06c4df5582d1",
"metadata": {
"execution": {
"iopub.status.busy": "2025-03-28T17:28:26.052Z"
},
"type": "python"
},
"outputs": [
{
"data": {
"text/html": [
"DataFrame[]\n",
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"spark.sql(\"drop table \"+p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name);"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}