{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "9e2171fd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4.8" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark.version" ] }, { "cell_type": "code", "execution_count": 2, "id": "2c23ce58", "metadata": {}, "outputs": [], "source": [ "import java.sql.Date\n", "import org.apache.spark.sql.expressions.Window\n", "import org.apache.spark.sql.functions._" ] }, { "cell_type": "code", "execution_count": 3, "id": "077c8236", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "purchasesDF = [purchase_date: date, device_id: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[purchase_date: date, device_id: string]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val purchasesDF = Seq(\n", " (Date.valueOf(\"2019-01-01\"), \"01\"),\n", " (Date.valueOf(\"2019-05-10\"), \"01\"),\n", " (Date.valueOf(\"2019-03-05\"), \"02\"),\n", " (Date.valueOf(\"2019-02-20\"), \"03\"),\n", " (Date.valueOf(\"2019-01-20\"), \"02\")\n", ").toDF(\"purchase_date\", \"device_id\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "1b9cd20b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "devicesDF = [device_id: string, device_name: string ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device_id: string, device_name: string ... 1 more field]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val devicesDF = Seq(\n", " (\"01\", \"notebook\", 600.00),\n", " (\"02\", \"small phone\", 100.00),\n", " (\"03\", \"camera\",150.00),\n", " (\"04\", \"large phone\", 700.00)\n", ").toDF(\"device_id\", \"device_name\", \"price\")" ] }, { "cell_type": "markdown", "id": "bfe05688", "metadata": {}, "source": [ "### error message example" ] }, { "cell_type": "code", "execution_count": 5, "id": "a1694fdd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lastException: Throwable = null\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "An error occurred converting DataFrame to html.\n", "Reference 'device_id' is ambiguous, could be: device_id, device_id.;" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "purchasesDF.join(devicesDF, col(\"device_id\") === col(\"device_id\"), \"inner\")" ] }, { "cell_type": "markdown", "id": "7522d989", "metadata": {}, "source": [ "## adding aliases" ] }, { "cell_type": "code", "execution_count": 6, "id": "4326cd71", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevice_iddevice_iddevice_nameprice
2019-05-100101notebook600.0
2019-01-010101notebook600.0
2019-01-200202small phone100.0
2019-03-050202small phone100.0
2019-02-200303camera150.0
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "purchasesDF.as(\"purchases\")\n", " .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))" ] }, { "cell_type": "markdown", "id": "2d1f5b4b", "metadata": {}, "source": [ "## Viewing qualified column names via exceptions\n", "\n", "An engineer's gotta do what an engineer's gotta do" ] }, { "cell_type": "code", "execution_count": 7, "id": "e2838cee", "metadata": {}, "outputs": [ { "ename": "org.apache.spark.sql.AnalysisException", "evalue": "cannot resolve '`xxxx`' given input columns: [purchases.device_id, devices.device_id, purchases.purchase_date, devices.device_name, devices.price];;\n'Project ['xxxx]\n+- Join Inner, (device_id#6 = device_id#16)\n :- SubqueryAlias `purchases`\n : +- Project [_1#2 AS purchase_date#5, _2#3 AS device_id#6]\n : +- LocalRelation [_1#2, _2#3]\n +- SubqueryAlias `devices`\n +- Project [_1#12 AS device_id#16, _2#13 AS device_name#17, _3#14 AS price#18]\n +- LocalRelation [_1#12, _2#13, _3#14]\n", "output_type": "error", "traceback": [ "org.apache.spark.sql.AnalysisException: cannot resolve '`xxxx`' given input columns: [purchases.device_id, devices.device_id, purchases.purchase_date, devices.device_name, devices.price];;", "'Project ['xxxx]", "+- Join Inner, (device_id#6 = device_id#16)", " :- SubqueryAlias `purchases`", " : +- Project [_1#2 AS purchase_date#5, _2#3 AS device_id#6]", " : +- LocalRelation [_1#2, _2#3]", " +- SubqueryAlias `devices`", " +- Project [_1#12 AS device_id#16, _2#13 AS device_name#17, _3#14 AS price#18]", " +- LocalRelation [_1#12, _2#13, _3#14]", " at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)", " at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:113)", " at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:110)", " at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:280)", " at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:280)", " at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)", " at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:279)", " at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)", " at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)", " at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)", " at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)", " at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)", " at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:104)", " at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:116)", " at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$2.apply(QueryPlan.scala:121)", " at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)", " at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)", " at 
"org.apache.spark.sql.AnalysisException: cannot resolve '`xxxx`' given input columns: [purchases.device_id, devices.device_id, purchases.purchase_date, devices.device_name, devices.price];;", "'Project ['xxxx]", "+- Join Inner, (device_id#6 = device_id#16)", " :- SubqueryAlias `purchases`", " : +- Project [_1#2 AS purchase_date#5, _2#3 AS device_id#6]", " : +- LocalRelation [_1#2, _2#3]", " +- SubqueryAlias `devices`", " +- Project [_1#12 AS device_id#16, _2#13 AS device_name#17, _3#14 AS price#18]", " +- LocalRelation [_1#12, _2#13, _3#14]", " at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)", " ... (remaining stack frames elided)" ] } ], "source": [ "purchasesDF.as(\"purchases\")\n", " .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n", " .select(\"xxxx\")" ] },
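{ "cell_type": "markdown", "id": "f6a7b8c9", "metadata": {}, "source": [ "A gentler alternative, sketched below and not executed here: printing the analyzed plan shows the same `SubqueryAlias` qualifiers without triggering an exception." ] }, { "cell_type": "code", "execution_count": null, "id": "a7b8c9d0", "metadata": {}, "outputs": [], "source": [ "// explain(true) prints the parsed, analyzed, optimized and physical plans;\n", "// the analyzed plan carries the purchases/devices qualifiers\n", "purchasesDF.as(\"purchases\")\n", " .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n", " .explain(true)" ] }, { "cell_type": "markdown", "id": "8f40e29f", "metadata": {}, "source": [ "## Dropping\n", "\n", "`drop(\"device_id\")` removes every column with that name, so both copies of the join key disappear from the result." ] }, { "cell_type": "code", "execution_count": 8, "id": "40ffac92", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lastException: Throwable = null\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "<table>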
purchase_datedevice_nameprice
2019-05-10notebook600.0
2019-01-01notebook600.0
2019-01-20small phone100.0
2019-03-05small phone100.0
2019-02-20camera150.0
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "purchasesDF.as(\"purchases\")\n", " .join(devicesDF.as(\"devices\"), col(\"purchases.device_id\") === col(\"devices.device_id\"))\n", " .drop(\"device_id\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7e8d076e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2d981c56", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7b6804c0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Apache Toree - Scala", "language": "scala", "name": "apache_toree_scala" }, "language_info": { "codemirror_mode": "text/x-scala", "file_extension": ".scala", "mimetype": "text/x-scala", "name": "scala", "pygments_lexer": "scala", "version": "2.11.12" } }, "nbformat": 4, "nbformat_minor": 5 }