{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4.8" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark.version" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import java.sql.Date\n", "import org.apache.spark.sql.expressions.Window\n", "import org.apache.spark.sql.functions._" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "devicesDf = [purchase_date: date, device: string ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[purchase_date: date, device: string ... 1 more field]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val devicesDf = Seq(\n", " (Date.valueOf(\"2019-01-01\"), \"notebook\", 600.00),\n", " (Date.valueOf(\"2019-05-10\"), \"notebook\", 1200.00),\n", " (Date.valueOf(\"2019-03-05\"), \"small phone\", 100.00),\n", " (Date.valueOf(\"2019-02-20\"), \"camera\",150.00),\n", " (Date.valueOf(\"2019-01-20\"), \"small phone\", 300.00),\n", " (Date.valueOf(\"2019-02-15\"), \"large phone\", 700.00),\n", " (Date.valueOf(\"2019-07-01\"), \"camera\", 300.00),\n", " (Date.valueOf(\"2019-04-01\"), \"small phone\", 50.00)\n", ").toDF(\"purchase_date\", \"device\", \"price\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedeviceprice
2019-01-01notebook600.0
2019-01-20small phone300.0
2019-02-15large phone700.0
2019-02-20camera150.0
2019-03-05small phone100.0
2019-04-01small phone50.0
2019-05-10notebook1200.0
2019-07-01camera300.0
" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "devicesDf.sort(\"purchase_date\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## average value per group" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepriceaverage_price_in_group
2019-02-15large phone700.0700.0
2019-03-05small phone100.0150.0
2019-01-20small phone300.0150.0
2019-04-01small phone50.0150.0
2019-01-01notebook600.0900.0
2019-05-10notebook1200.0900.0
2019-02-20camera150.0225.0
2019-07-01camera300.0225.0
" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "\n", "devicesDf\n", " .withColumn(\"average_price_in_group\", mean(\"price\") over Window.partitionBy(\"device\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## where row is largest in group" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricemax_price_in_group
2019-02-15large phone700.0700.0
2019-03-05small phone100.0300.0
2019-01-20small phone300.0300.0
2019-04-01small phone50.0300.0
2019-01-01notebook600.01200.0
2019-05-10notebook1200.01200.0
2019-02-20camera150.0300.0
2019-07-01camera300.0300.0
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "\n", "devicesDf.withColumn(\"max_price_in_group\", max(\"price\") over Window.partitionBy(\"device\"))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricemax_price_in_group
2019-02-15large phone700.0700.0
2019-01-20small phone300.0300.0
2019-05-10notebook1200.01200.0
2019-07-01camera300.0300.0
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "\n", "devicesDf\n", " .withColumn(\"max_price_in_group\", max(\"price\") over Window.partitionBy(\"device\"))\n", " .filter($\"price\" === $\"max_price_in_group\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## where row is most recent value in group" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricemost_recent_purchase_in_group
2019-02-15large phone700.02019-02-15
2019-03-05small phone100.02019-04-01
2019-01-20small phone300.02019-04-01
2019-04-01small phone50.02019-04-01
2019-01-01notebook600.02019-05-10
2019-05-10notebook1200.02019-05-10
2019-02-20camera150.02019-07-01
2019-07-01camera300.02019-07-01
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "\n", "devicesDf\n", " .withColumn(\"most_recent_purchase_in_group\", max(\"purchase_date\") over Window.partitionBy(\"device\"))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricemost_recent_purchase_in_group
2019-02-15large phone700.02019-02-15
2019-04-01small phone50.02019-04-01
2019-05-10notebook1200.02019-05-10
2019-07-01camera300.02019-07-01
" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "\n", "devicesDf\n", " .withColumn(\"most_recent_purchase_in_group\", max(\"purchase_date\") over Window.partitionBy(\"device\"))\n", " .filter($\"purchase_date\" === $\"most_recent_purchase_in_group\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## get median value\n", "\n", "median = percentile 50\n", "\n", "what's the lowest price over percentile 50?" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricepercentile
2019-04-01small phone50.00.0
2019-03-05small phone100.00.14285714285714285
2019-02-20camera150.00.2857142857142857
2019-01-20small phone300.00.42857142857142855
2019-07-01camera300.00.42857142857142855
2019-01-01notebook600.00.7142857142857143
2019-02-15large phone700.00.8571428571428571
2019-05-10notebook1200.01.0
" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", " .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricepercentile
2019-01-01notebook600.00.7142857142857143
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", " .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))\n", " .filter($\"percentile\" >= 0.5)\n", " .limit(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## get percentile\n", "\n", "what's the lowest price over percentile 85?" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricepercentile
2019-04-01small phone50.00.0
2019-03-05small phone100.00.14285714285714285
2019-02-20camera150.00.2857142857142857
2019-01-20small phone300.00.42857142857142855
2019-07-01camera300.00.42857142857142855
2019-01-01notebook600.00.7142857142857143
2019-02-15large phone700.00.8571428571428571
2019-05-10notebook1200.01.0
" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", " .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricepercentile
2019-02-15large phone700.00.8571428571428571
" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", " .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))\n", " .filter($\"percentile\" >= 0.85)\n", " .limit(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## cumulative sum\n", "\n", "cumulative sum requires an ordered window" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricecumulative_sum
2019-01-01notebook600.0600.0
2019-01-20small phone300.0900.0
2019-02-15large phone700.01600.0
2019-02-20camera150.01750.0
2019-03-05small phone100.01850.0
2019-04-01small phone50.01900.0
2019-05-10notebook1200.03100.0
2019-07-01camera300.03400.0
" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", " .withColumn(\"cumulative_sum\", sum(\"price\") over Window.orderBy(\"purchase_date\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## get row number\n", "\n", "Also called **rank**.\n", "\n", "`row_number` requires an ordered window" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricerow_number
2019-01-01notebook600.01
2019-01-20small phone300.02
2019-02-15large phone700.03
2019-02-20camera150.04
2019-03-05small phone100.05
2019-04-01small phone50.06
2019-05-10notebook1200.07
2019-07-01camera300.08
" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", " .withColumn(\"row_number\", row_number() over Window.orderBy(\"purchase_date\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## row number in partition" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "also called **rank**\n", "\n", "`row_number` requires an ordered window" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
purchase_datedevicepricerow_number
2019-02-15large phone700.01
2019-01-20small phone300.01
2019-03-05small phone100.02
2019-04-01small phone50.03
2019-01-01notebook600.01
2019-05-10notebook1200.02
2019-02-20camera150.01
2019-07-01camera300.02
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", ":paste\n", "devicesDf\n", ".withColumn(\"row_number\", row_number() over Window.partitionBy(\"device\").orderBy(\"purchase_date\"))" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df1 = [a: string, b: string]\n", "df2 = [a: string, b: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[a: string, b: string]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val df1 = Seq((\"1\",\"x\"), (\"2\", \"y\")).toDF(\"a\",\"b\");\n", "\n", "val df2 = Seq((\"1\",\"x2\"), (\"3\", \"z\")).toDF(\"a\",\"b\");" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ab
1x
2y
" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df1" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ab
1x2
3z
" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df2" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
abab
nullnull3z
1x1x2
2ynullnull
" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df1.as(\"df1\").join(df2.as(\"df2\"),col(\"df1.a\")===col(\"df2.a\"),\"outer\")" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
abb
1xx2
" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df1.join(df2, Seq(\"a\"), \"inner\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Apache Toree - Scala", "language": "scala", "name": "apache_toree_scala" }, "language_info": { "codemirror_mode": "text/x-scala", "file_extension": ".scala", "mimetype": "text/x-scala", "name": "scala", "pygments_lexer": "scala", "version": "2.11.12" } }, "nbformat": 4, "nbformat_minor": 2 }