{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.4.8"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.version"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import java.sql.Date\n",
"import org.apache.spark.sql.expressions.Window\n",
"import org.apache.spark.sql.functions._"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"devicesDf = [purchase_date: date, device: string ... 1 more field]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[purchase_date: date, device: string ... 1 more field]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val devicesDf = Seq(\n",
" (Date.valueOf(\"2019-01-01\"), \"notebook\", 600.00),\n",
" (Date.valueOf(\"2019-05-10\"), \"notebook\", 1200.00),\n",
" (Date.valueOf(\"2019-03-05\"), \"small phone\", 100.00),\n",
" (Date.valueOf(\"2019-02-20\"), \"camera\",150.00),\n",
" (Date.valueOf(\"2019-01-20\"), \"small phone\", 300.00),\n",
" (Date.valueOf(\"2019-02-15\"), \"large phone\", 700.00),\n",
" (Date.valueOf(\"2019-07-01\"), \"camera\", 300.00),\n",
" (Date.valueOf(\"2019-04-01\"), \"small phone\", 50.00)\n",
").toDF(\"purchase_date\", \"device\", \"price\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
purchase_date | device | price |
---|
2019-01-01 | notebook | 600.0 |
2019-01-20 | small phone | 300.0 |
2019-02-15 | large phone | 700.0 |
2019-02-20 | camera | 150.0 |
2019-03-05 | small phone | 100.0 |
2019-04-01 | small phone | 50.0 |
2019-05-10 | notebook | 1200.0 |
2019-07-01 | camera | 300.0 |
"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"devicesDf.sort(\"purchase_date\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## average value per group"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | average_price_in_group |
---|
2019-02-15 | large phone | 700.0 | 700.0 |
2019-03-05 | small phone | 100.0 | 150.0 |
2019-01-20 | small phone | 300.0 | 150.0 |
2019-04-01 | small phone | 50.0 | 150.0 |
2019-01-01 | notebook | 600.0 | 900.0 |
2019-05-10 | notebook | 1200.0 | 900.0 |
2019-02-20 | camera | 150.0 | 225.0 |
2019-07-01 | camera | 300.0 | 225.0 |
"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"\n",
"devicesDf\n",
" .withColumn(\"average_price_in_group\", mean(\"price\") over Window.partitionBy(\"device\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## where row is largest in group"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | max_price_in_group |
---|
2019-02-15 | large phone | 700.0 | 700.0 |
2019-03-05 | small phone | 100.0 | 300.0 |
2019-01-20 | small phone | 300.0 | 300.0 |
2019-04-01 | small phone | 50.0 | 300.0 |
2019-01-01 | notebook | 600.0 | 1200.0 |
2019-05-10 | notebook | 1200.0 | 1200.0 |
2019-02-20 | camera | 150.0 | 300.0 |
2019-07-01 | camera | 300.0 | 300.0 |
"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"\n",
"devicesDf.withColumn(\"max_price_in_group\", max(\"price\") over Window.partitionBy(\"device\"))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | max_price_in_group |
---|
2019-02-15 | large phone | 700.0 | 700.0 |
2019-01-20 | small phone | 300.0 | 300.0 |
2019-05-10 | notebook | 1200.0 | 1200.0 |
2019-07-01 | camera | 300.0 | 300.0 |
"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"\n",
"devicesDf\n",
" .withColumn(\"max_price_in_group\", max(\"price\") over Window.partitionBy(\"device\"))\n",
" .filter($\"price\" === $\"max_price_in_group\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## where row is most recent value in group"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | most_recent_purchase_in_group |
---|
2019-02-15 | large phone | 700.0 | 2019-02-15 |
2019-03-05 | small phone | 100.0 | 2019-04-01 |
2019-01-20 | small phone | 300.0 | 2019-04-01 |
2019-04-01 | small phone | 50.0 | 2019-04-01 |
2019-01-01 | notebook | 600.0 | 2019-05-10 |
2019-05-10 | notebook | 1200.0 | 2019-05-10 |
2019-02-20 | camera | 150.0 | 2019-07-01 |
2019-07-01 | camera | 300.0 | 2019-07-01 |
"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"\n",
"devicesDf\n",
" .withColumn(\"most_recent_purchase_in_group\", max(\"purchase_date\") over Window.partitionBy(\"device\"))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | most_recent_purchase_in_group |
---|
2019-02-15 | large phone | 700.0 | 2019-02-15 |
2019-04-01 | small phone | 50.0 | 2019-04-01 |
2019-05-10 | notebook | 1200.0 | 2019-05-10 |
2019-07-01 | camera | 300.0 | 2019-07-01 |
"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"\n",
"devicesDf\n",
" .withColumn(\"most_recent_purchase_in_group\", max(\"purchase_date\") over Window.partitionBy(\"device\"))\n",
" .filter($\"purchase_date\" === $\"most_recent_purchase_in_group\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## get median value\n",
"\n",
"median = percentile 50\n",
"\n",
"what's the lowest price over percentile 50?"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | percentile |
---|
2019-04-01 | small phone | 50.0 | 0.0 |
2019-03-05 | small phone | 100.0 | 0.14285714285714285 |
2019-02-20 | camera | 150.0 | 0.2857142857142857 |
2019-01-20 | small phone | 300.0 | 0.42857142857142855 |
2019-07-01 | camera | 300.0 | 0.42857142857142855 |
2019-01-01 | notebook | 600.0 | 0.7142857142857143 |
2019-02-15 | large phone | 700.0 | 0.8571428571428571 |
2019-05-10 | notebook | 1200.0 | 1.0 |
"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
" .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | percentile |
---|
2019-01-01 | notebook | 600.0 | 0.7142857142857143 |
"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
" .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))\n",
" .filter($\"percentile\" >= 0.5)\n",
" .limit(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## get percentile\n",
"\n",
"what's the lowest price over percentile 85?"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | percentile |
---|
2019-04-01 | small phone | 50.0 | 0.0 |
2019-03-05 | small phone | 100.0 | 0.14285714285714285 |
2019-02-20 | camera | 150.0 | 0.2857142857142857 |
2019-01-20 | small phone | 300.0 | 0.42857142857142855 |
2019-07-01 | camera | 300.0 | 0.42857142857142855 |
2019-01-01 | notebook | 600.0 | 0.7142857142857143 |
2019-02-15 | large phone | 700.0 | 0.8571428571428571 |
2019-05-10 | notebook | 1200.0 | 1.0 |
"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
" .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | percentile |
---|
2019-02-15 | large phone | 700.0 | 0.8571428571428571 |
"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
" .withColumn(\"percentile\", percent_rank() over Window.orderBy(\"price\"))\n",
" .filter($\"percentile\" >= 0.85)\n",
" .limit(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## cumulative sum\n",
"\n",
"cumulative sum requires an ordered window"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | cumulative_sum |
---|
2019-01-01 | notebook | 600.0 | 600.0 |
2019-01-20 | small phone | 300.0 | 900.0 |
2019-02-15 | large phone | 700.0 | 1600.0 |
2019-02-20 | camera | 150.0 | 1750.0 |
2019-03-05 | small phone | 100.0 | 1850.0 |
2019-04-01 | small phone | 50.0 | 1900.0 |
2019-05-10 | notebook | 1200.0 | 3100.0 |
2019-07-01 | camera | 300.0 | 3400.0 |
"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
" .withColumn(\"cumulative_sum\", sum(\"price\") over Window.orderBy(\"purchase_date\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## get row number\n",
"\n",
"Also called **rank**.\n",
"\n",
"`row_number` requires an ordered window"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | row_number |
---|
2019-01-01 | notebook | 600.0 | 1 |
2019-01-20 | small phone | 300.0 | 2 |
2019-02-15 | large phone | 700.0 | 3 |
2019-02-20 | camera | 150.0 | 4 |
2019-03-05 | small phone | 100.0 | 5 |
2019-04-01 | small phone | 50.0 | 6 |
2019-05-10 | notebook | 1200.0 | 7 |
2019-07-01 | camera | 300.0 | 8 |
"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
" .withColumn(\"row_number\", row_number() over Window.orderBy(\"purchase_date\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## row number in partition"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"also called **rank**\n",
"\n",
"`row_number` requires an ordered window"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"purchase_date | device | price | row_number |
---|
2019-02-15 | large phone | 700.0 | 1 |
2019-01-20 | small phone | 300.0 | 1 |
2019-03-05 | small phone | 100.0 | 2 |
2019-04-01 | small phone | 50.0 | 3 |
2019-01-01 | notebook | 600.0 | 1 |
2019-05-10 | notebook | 1200.0 | 2 |
2019-02-20 | camera | 150.0 | 1 |
2019-07-01 | camera | 300.0 | 2 |
"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
":paste\n",
"devicesDf\n",
".withColumn(\"row_number\", row_number() over Window.partitionBy(\"device\").orderBy(\"purchase_date\"))"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df1 = [a: string, b: string]\n",
"df2 = [a: string, b: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[a: string, b: string]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val df1 = Seq((\"1\",\"x\"), (\"2\", \"y\")).toDF(\"a\",\"b\");\n",
"\n",
"val df2 = Seq((\"1\",\"x2\"), (\"3\", \"z\")).toDF(\"a\",\"b\");"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"a | b | a | b |
---|
null | null | 3 | z |
1 | x | 1 | x2 |
2 | y | null | null |
"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df1.as(\"df1\").join(df2.as(\"df2\"),col(\"df1.a\")===col(\"df2.a\"),\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df1.join(df2, Seq(\"a\"), \"inner\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}