{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "9e2171fd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.4.8"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.version"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2c23ce58",
"metadata": {},
"outputs": [],
"source": [
"import java.sql.Date\n",
"import org.apache.spark.sql.expressions.Window\n",
"import org.apache.spark.sql.functions._"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "077c8236",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"purchasesDF = [purchase_date: date, device_id: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[purchase_date: date, device_id: string]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val purchasesDF = Seq(\n",
" (Date.valueOf(\"2019-01-01\"), \"01\"),\n",
" (Date.valueOf(\"2019-05-10\"), \"01\"),\n",
" (Date.valueOf(\"2019-03-05\"), \"02\"),\n",
" (Date.valueOf(\"2019-02-20\"), \"03\"),\n",
" (Date.valueOf(\"2019-01-20\"), \"02\")\n",
").toDF(\"purchase_date\", \"device_id\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1b9cd20b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"devicesDF = [device_id: string, device_name: string ... 1 more field]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device_id: string, device_name: string ... 1 more field]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val devicesDF = Seq(\n",
" (\"01\", \"notebook\", 600.00),\n",
" (\"02\", \"small phone\", 100.00),\n",
" (\"03\", \"camera\",150.00),\n",
" (\"04\", \"large phone\", 700.00)\n",
").toDF(\"device_id\", \"device_name\", \"price\")"
]
},
{
"cell_type": "markdown",
"id": "bfe05688",
"metadata": {},
"source": [
"### error message example"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a1694fdd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"lastException: Throwable = null\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"An error occurred converting DataFrame to html.\n",
"Reference 'device_id' is ambiguous, could be: device_id, device_id.;"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"purchasesDF.join(devicesDF, col(\"device_id\") === col(\"device_id\"), \"inner\")"
]
},
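{
"cell_type": "markdown",
"id": "b3f1c9aa",
"metadata": {},
"source": [
"One way to sidestep the ambiguity entirely is a USING-style join: passing the key as `Seq(\"device_id\")` makes Spark keep a single `device_id` column. A minimal sketch (not executed here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7d2e4f1",
"metadata": {},
"outputs": [],
"source": [
"// USING-style join: the join key is coalesced into one column,\n",
"// so the result has a single, unambiguous device_id\n",
"purchasesDF.join(devicesDF, Seq(\"device_id\"), \"inner\")"
]
},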
{
"cell_type": "markdown",
"id": "7522d989",
"metadata": {},
"source": [
"## adding aliases"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4326cd71",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
purchase_date | device_id | device_id | device_name | price |
---|
2019-05-10 | 01 | 01 | notebook | 600.0 |
2019-01-01 | 01 | 01 | notebook | 600.0 |
2019-01-20 | 02 | 02 | small phone | 100.0 |
2019-03-05 | 02 | 02 | small phone | 100.0 |
2019-02-20 | 03 | 03 | camera | 150.0 |
"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"purchasesDF.as(\"purchases\")\n",
" .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))"
]
},
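{
"cell_type": "markdown",
"id": "e5a8d310",
"metadata": {},
"source": [
"With the aliases in place, qualified names work anywhere a column is expected. A minimal sketch (not executed here) that selects one side of each column explicitly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f19b2c47",
"metadata": {},
"outputs": [],
"source": [
"// qualified references pick one side of the join explicitly\n",
"purchasesDF.as(\"purchases\")\n",
"  .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n",
"  .select(col(\"purchases.purchase_date\"), col(\"devices.device_name\"), col(\"devices.price\"))"
]
},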
{
"cell_type": "markdown",
"id": "2d1f5b4b",
"metadata": {},
"source": [
"## Viewing qualified column names via exceptions\n",
"\n",
"An engineer's gotta do what an engineer's gotta do"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e2838cee",
"metadata": {},
"outputs": [
{
"ename": "org.apache.spark.sql.AnalysisException",
"evalue": "cannot resolve '`xxxx`' given input columns: [purchases.device_id, devices.device_id, purchases.purchase_date, devices.device_name, devices.price];;\n'Project ['xxxx]\n+- Join Inner, (device_id#6 = device_id#16)\n :- SubqueryAlias `purchases`\n : +- Project [_1#2 AS purchase_date#5, _2#3 AS device_id#6]\n : +- LocalRelation [_1#2, _2#3]\n +- SubqueryAlias `devices`\n +- Project [_1#12 AS device_id#16, _2#13 AS device_name#17, _3#14 AS price#18]\n +- LocalRelation [_1#12, _2#13, _3#14]\n",
"output_type": "error",
"traceback": [
"org.apache.spark.sql.AnalysisException: cannot resolve '`xxxx`' given input columns: [purchases.device_id, devices.device_id, purchases.purchase_date, devices.device_name, devices.price];;",
"'Project ['xxxx]",
"+- Join Inner, (device_id#6 = device_id#16)",
" :- SubqueryAlias `purchases`",
" : +- Project [_1#2 AS purchase_date#5, _2#3 AS device_id#6]",
" : +- LocalRelation [_1#2, _2#3]",
" +- SubqueryAlias `devices`",
" +- Project [_1#12 AS device_id#16, _2#13 AS device_name#17, _3#14 AS price#18]",
" +- LocalRelation [_1#12, _2#13, _3#14]",
" at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)",
" at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:113)",
" at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:110)",
" at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:280)",
" at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:280)",
" at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)",
" at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:279)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)",
" at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:104)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:116)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$2.apply(QueryPlan.scala:121)",
" at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)",
" at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)",
" at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)",
" at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)",
" at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)",
" at scala.collection.AbstractTraversable.map(Traversable.scala:104)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:121)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:126)",
" at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:186)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:126)",
" at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:93)",
" at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:110)",
" at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:88)",
" at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)",
" at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:88)",
" at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:95)",
" at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:108)",
" at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:105)",
" at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201)",
" at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)",
" at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:58)",
" at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:56)",
" at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:48)",
" at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78)",
" at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3411)",
" at org.apache.spark.sql.Dataset.select(Dataset.scala:1340)",
" at org.apache.spark.sql.Dataset.select(Dataset.scala:1358)",
" ... 44 elided"
]
}
],
"source": [
"purchasesDF.as(\"purchases\")\n",
" .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n",
" .select(\"xxxx\")"
]
},
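{
"cell_type": "markdown",
"id": "a9c4e7b2",
"metadata": {},
"source": [
"A gentler alternative (a sketch, not executed here): `explain(true)` prints the analyzed plan without throwing, and its `SubqueryAlias` nodes show which alias owns each column."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8e3f5a6",
"metadata": {},
"outputs": [],
"source": [
"// explain(true) prints the parsed, analyzed, optimized and physical plans;\n",
"// the SubqueryAlias nodes reveal which side contributes each attribute\n",
"purchasesDF.as(\"purchases\")\n",
"  .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n",
"  .explain(true)"
]
},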
{
"cell_type": "markdown",
"id": "8f40e29f",
"metadata": {},
"source": [
"## Dropping"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "40ffac92",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"lastException: Throwable = null\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"purchase_date | device_name | price |
---|
2019-05-10 | notebook | 600.0 |
2019-01-01 | notebook | 600.0 |
2019-01-20 | small phone | 100.0 |
2019-03-05 | small phone | 100.0 |
2019-02-20 | camera | 150.0 |
"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"purchasesDF.as(\"purchases\")\n",
" .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n",
" .drop(\"device_id\")"
]
},
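{
"cell_type": "markdown",
"id": "b7f0a1c3",
"metadata": {},
"source": [
"To keep one copy of the key instead of losing both, drop a specific `Column` rather than a name. A minimal sketch (not executed here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2d9e8f4",
"metadata": {},
"outputs": [],
"source": [
"// drop(Column) removes only the referenced column,\n",
"// so purchases.device_id survives in the result\n",
"purchasesDF.as(\"purchases\")\n",
"  .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n",
"  .drop(col(\"devices.device_id\"))"
]
}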
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}