{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "11183b59-8662-4a14-ac6a-35e44c1209f6", "metadata": {}, "outputs": [], "source": [ "Oracle AI Data Platform v1.0\n", "\n", "Copyright © 2025, Oracle and/or its affiliates.\n", "\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/" ] }, { "cell_type": "markdown", "id": "079f9106-82b1-48b2-b42b-bacff883df9e", "metadata": { "type": "python" }, "source": [ "# Delta Schema Evolution\n", "**This notebook illustrates how you can configure your AI Data Platform to allow for a schema that evolves over time.**\n", " \n", "It covers:\n", " \n", " 1. **Direct Write to OCI Object Storage**\n", " 2. **Accessing OCI Object Storage via an External Table**\n", " - Creating an external table\n", " - Querying an external table\n", " 3. **Writing new data with additional column to OCI Object Storage**\n", " - Writing to object storage\n", " - Reading data\n", " \n", "# **Parameters**\n", " - OCI Object Storage bucket name and Namespace name\n", " - Catalog name\n", " - Schema name\n", " - External table name\n", " - Folder name" ] }, { "cell_type": "code", "execution_count": null, "id": "6cf09b6f-ae3f-41cd-9e0a-9e5baed672ba", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:28:49.092Z" } }, "outputs": [], "source": [ "# Change these values to your env\n", "oci_bucket=oidlUtils.parameters.getParameter(\"OCI_BUCKET\", \"oci://replace_bucket_name@replace_namespace\")\n", "p_catalog_name=oidlUtils.parameters.getParameter(\"CATALOG_NAME\", \"default\")\n", "p_schema_name=oidlUtils.parameters.getParameter(\"SCHEMA_NAME\", \"default\")\n", "p_table_name=oidlUtils.parameters.getParameter(\"TABLE_NAME\", \"deltatab\")\n", "p_folder_name=oidlUtils.parameters.getParameter(\"FOLDER_NAME\", \"mydata\")\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "8a1f34ad-2dca-41e4-bf7b-bf0df0ebf999", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:06.122Z" } }, "outputs": [], "source": [ "import pyspark\n", "from pyspark.sql.types import *\n", "from pyspark.sql.functions import *" ] }, { "cell_type": "code", "execution_count": 1, "id": "2b9afc19-63d9-4f88-a7cf-55e78a3178e4", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:22.807Z" } }, "outputs": [], "source": [ "# Create a spark dataframe and write as a delta table\n", "data = [(\"Robert\", \"Baratheon\", \"Baratheon\", \"Storms End\", 48),\n", " (\"Eddard\", \"Stark\", \"Stark\", \"Winterfell\", 46),\n", " (\"Jamie\", \"Lannister\", \"Lannister\", \"Casterly Rock\", 29)\n", " ]\n", "schema = StructType([\n", " StructField(\"firstname\", StringType(), True),\n", " StructField(\"lastname\", StringType(), True),\n", " StructField(\"house\", StringType(), True),\n", " StructField(\"location\", StringType(), True),\n", " StructField(\"age\", IntegerType(), True)\n", "])\n", "\n", "sample_dataframe = spark.createDataFrame(data=data, schema=schema)\n", "sample_dataframe.write.mode(saveMode=\"overwrite\").format(\"delta\").save(oci_bucket+'/'+p_folder_name+'/')" ] }, { "cell_type": "code", "execution_count": 1, "id": "60467d58-51cd-4b93-8ce1-7f39ab19a8d1", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:33.435Z" } }, "outputs": [ { "data": { "text/html": [ "
DataFrame[]\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create an external table on the folder for the delta table above\n", "spark.sql(\"create table if not exists \"+p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name+\" USING DELTA LOCATION '\"+oci_bucket+\"/\"+p_folder_name+\"/'\");" ] }, { "cell_type": "code", "execution_count": 1, "id": "67be876f-2362-4f6f-a38b-a0c12b9825c2", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:44.987Z" } }, "outputs": [ { "data": { "text/html": [ "
+---------+---------+---------+-------------+---+\n",
       "|firstname| lastname|    house|     location|age|\n",
       "+---------+---------+---------+-------------+---+\n",
       "|   Robert|Baratheon|Baratheon|   Storms End| 48|\n",
       "|   Eddard|    Stark|    Stark|   Winterfell| 46|\n",
       "|    Jamie|Lannister|Lannister|Casterly Rock| 29|\n",
       "+---------+---------+---------+-------------+---+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Query the Delta table\n", "spark.sql(\"select * from \" + p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name).show()" ] }, { "cell_type": "code", "execution_count": 1, "id": "b46d9ee6-3273-4eb8-967a-15e872740ecf", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:27:50.509Z" } }, "outputs": [], "source": [ "# Let's evolve the schema, add a salary column\n", "data = [(\"Jim\", \"Benson\", \"Hillmount\", \"Glasgow\", 34,100),\n", " (\"Jen\", \"Oliver\", \"Sleepy Hollow\", \"Cheddar\", 37,200)\n", " ]\n", "schema = StructType([\n", " StructField(\"firstname\", StringType(), True),\n", " StructField(\"lastname\", StringType(), True),\n", " StructField(\"house\", StringType(), True),\n", " StructField(\"location\", StringType(), True),\n", " StructField(\"age\", IntegerType(), True),\n", " StructField(\"salary\", IntegerType(), True)\n", "])\n", "\n", "sample_dataframe = spark.createDataFrame(data=data, schema=schema)\n", "sample_dataframe.write.mode(saveMode=\"append\").format(\"delta\").option(\"mergeSchema\", \"true\").save(oci_bucket+'/'+p_folder_name+'/')" ] }, { "cell_type": "code", "execution_count": 1, "id": "c559af35-bde2-47b2-8475-26e2e36de4e2", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:28:02.486Z" } }, "outputs": [ { "data": { "text/html": [ "
+---------+---------+-------------+-------------+---+------+\n",
       "|firstname| lastname|        house|     location|age|salary|\n",
       "+---------+---------+-------------+-------------+---+------+\n",
       "|      Jen|   Oliver|Sleepy Hollow|      Cheddar| 37|   200|\n",
       "|      Jim|   Benson|    Hillmount|      Glasgow| 34|   100|\n",
       "|   Robert|Baratheon|    Baratheon|   Storms End| 48|  NULL|\n",
       "|   Eddard|    Stark|        Stark|   Winterfell| 46|  NULL|\n",
       "|    Jamie|Lannister|    Lannister|Casterly Rock| 29|  NULL|\n",
       "+---------+---------+-------------+-------------+---+------+\n",
       "\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Query the Delta table and see the new column 'salary', the null values in the table for historical rows\n", "\n", "spark.sql(\"select * from \" + p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name).show()" ] }, { "cell_type": "code", "execution_count": 1, "id": "43e4af8d-8812-4cbc-93f4-06c4df5582d1", "metadata": { "execution": { "iopub.status.busy": "2025-03-28T17:28:26.052Z" }, "type": "python" }, "outputs": [ { "data": { "text/html": [ "
DataFrame[]\n",
       "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "spark.sql(\"drop table \"+p_catalog_name+\".\"+p_schema_name+\".\"+p_table_name);" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }