{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "385e66a8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.4.8"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.version"
]
},
{
"cell_type": "markdown",
"id": "170dfc6e",
"metadata": {},
"source": [
"## date_add / date_sub"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "255697e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_time: timestamp]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device: string, purchase_time: timestamp]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Timestamp.valueOf\n",
"import org.apache.spark.sql.functions.{date_add, date_sub}\n",
"\n",
"// note that the dates are just strings\n",
"val df = Seq(\n",
" (\"notebook\", valueOf(\"2019-01-29 12:00:00\")),\n",
" (\"notebook\", valueOf(\"2019-01-01 00:00:00\")),\n",
" (\"small_phone\", valueOf(\"2019-01-15 23:00:00\")),\n",
" (\"small_phone\", valueOf(\"2019-01-01 09:00:00\"))\n",
").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "38f57761",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
device | purchase_time |
---|
notebook | 2019-01-01 00:00:00.0 |
notebook | 2019-01-29 12:00:00.0 |
small_phone | 2019-01-01 09:00:00.0 |
small_phone | 2019-01-15 23:00:00.0 |
"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4d8107cc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_time | plus_2_days |
---|
notebook | 2019-01-01 00:00:00.0 | 2019-01-03 |
notebook | 2019-01-29 12:00:00.0 | 2019-01-31 |
small_phone | 2019-01-01 09:00:00.0 | 2019-01-03 |
small_phone | 2019-01-15 23:00:00.0 | 2019-01-17 |
"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"plus_2_days\", date_add($\"purchase_time\",2))"
]
},
{
"cell_type": "markdown",
"id": "8dfffc71",
"metadata": {},
"source": [
"## datediff"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "15218f01",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_date: date ... 1 more field]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device: string, purchase_date: date ... 1 more field]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Date.valueOf\n",
"import org.apache.spark.sql.functions.datediff\n",
"\n",
"val df = Seq(\n",
" (\"notebook\", valueOf(\"2019-01-29\"), valueOf(\"2019-02-10\")),\n",
" (\"notebook\", valueOf(\"2019-01-01\"), valueOf(\"2019-01-15\")),\n",
" (\"small_phone\", valueOf(\"2019-01-15\"), valueOf(\"2019-01-05\")),\n",
" (\"small_phone\", valueOf(\"2019-01-01\"), valueOf(\"2019-01-20\"))\n",
").toDF(\"device\", \"purchase_date\", \"arrival_date\").sort(\"device\",\"purchase_date\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f5591075",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_date | arrival_date |
---|
notebook | 2019-01-01 | 2019-01-15 |
notebook | 2019-01-29 | 2019-02-10 |
small_phone | 2019-01-01 | 2019-01-20 |
small_phone | 2019-01-15 | 2019-01-05 |
"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "62d19889",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_date | arrival_date | days_to_arrive |
---|
notebook | 2019-01-01 | 2019-01-15 | 14 |
notebook | 2019-01-29 | 2019-02-10 | 12 |
small_phone | 2019-01-01 | 2019-01-20 | 19 |
small_phone | 2019-01-15 | 2019-01-05 | -10 |
"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"days_to_arrive\",datediff($\"arrival_date\", $\"purchase_date\"))"
]
},
{
"cell_type": "markdown",
"id": "a05f4191",
"metadata": {},
"source": [
"## difference in seconds"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d53bee7f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [col1: string, purchase_time: timestamp ... 1 more field]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[col1: string, purchase_time: timestamp ... 1 more field]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Timestamp.valueOf\n",
"import org.apache.spark.sql.functions.unix_timestamp\n",
"\n",
"val df = Seq(\n",
" (\"foo\", valueOf(\"2019-01-01 00:00:00\"), valueOf(\"2019-01-01 01:00:00\")), // 1 hour apart\n",
" (\"bar\", valueOf(\"2019-01-01 00:00:00\"), valueOf(\"2019-01-02 00:00:00\")), // 24 hours apart\n",
" (\"baz\", valueOf(\"2019-01-01 00:00:00\"), valueOf(\"2019-01-07 00:00:00\")) // 7 days apart\n",
").toDF(\"col1\", \"purchase_time\", \"arrival_time\").sort(\"col1\", \"purchase_time\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c4cbbe02",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | purchase_time | arrival_time |
---|
bar | 2019-01-01 00:00:00.0 | 2019-01-02 00:00:00.0 |
baz | 2019-01-01 00:00:00.0 | 2019-01-07 00:00:00.0 |
foo | 2019-01-01 00:00:00.0 | 2019-01-01 01:00:00.0 |
"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f3b31763",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | purchase_time | arrival_time | diff_in_seconds_2 |
---|
bar | 2019-01-01 00:00:00.0 | 2019-01-02 00:00:00.0 | 86400 |
baz | 2019-01-01 00:00:00.0 | 2019-01-07 00:00:00.0 | 518400 |
foo | 2019-01-01 00:00:00.0 | 2019-01-01 01:00:00.0 | 3600 |
"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"\n",
"df.withColumn(\"diff_in_seconds_2\", unix_timestamp($\"arrival_time\") - unix_timestamp($\"purchase_time\"))"
]
},
{
"cell_type": "markdown",
"id": "8004a502",
"metadata": {},
"source": [
"## difference in milliseconds"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "76c6b901",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [col1: string, time_before: timestamp ... 1 more field]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[col1: string, time_before: timestamp ... 1 more field]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Timestamp.valueOf\n",
"import org.apache.spark.sql.functions.to_timestamp\n",
"\n",
"val df = Seq(\n",
" (\"foo\", valueOf(\"2019-01-01 00:00:00.000\"), valueOf(\"2019-01-01 00:00:00.400\")), \n",
" (\"bar\", valueOf(\"2019-01-01 00:00:00.000\"), valueOf(\"2019-01-01 00:00:00.650\")), \n",
" (\"baz\", valueOf(\"2019-01-01 00:00:00.000\"), valueOf(\"2019-01-01 00:01:00.000\")) \n",
").toDF(\"col1\", \"time_before\", \"time_after\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "095040a7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | time_before | time_after |
---|
foo | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.4 |
bar | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.65 |
baz | 2019-01-01 00:00:00.0 | 2019-01-01 00:01:00.0 |
"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "d9fcbbb5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | time_before | time_after | diff_millis |
---|
foo | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.4 | 400 |
bar | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.65 | 650 |
baz | 2019-01-01 00:00:00.0 | 2019-01-01 00:01:00.0 | 60000 |
"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"\n",
"(df\n",
".withColumn(\"diff_millis\", ($\"time_after\".cast(\"double\") - $\"time_before\".cast(\"double\")))\n",
".withColumn(\"diff_millis\", ($\"diff_millis\"*1000).cast(\"long\")))"
]
},
{
"cell_type": "markdown",
"id": "80b31c12",
"metadata": {},
"source": [
"## expr interval"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c7d1cd49",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [col1: string, timestamp_col: timestamp]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[col1: string, timestamp_col: timestamp]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Timestamp.valueOf\n",
"import org.apache.spark.sql.functions.expr\n",
"\n",
"val df = Seq(\n",
" (\"foo\", valueOf(\"2019-10-10 00:45:00\")), \n",
" (\"bar\", valueOf(\"2019-10-10 12:34:56\")), \n",
" (\"baz\", valueOf(\"2019-10-10 23:59:00\")) \n",
").toDF(\"col1\", \"timestamp_col\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "261c90fe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | timestamp_col |
---|
foo | 2019-10-10 00:45:00.0 |
bar | 2019-10-10 12:34:56.0 |
baz | 2019-10-10 23:59:00.0 |
"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "55db67ea",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | timestamp_col | timestamp_minus_24_hours |
---|
foo | 2019-10-10 00:45:00.0 | 2019-10-09 00:45:00.0 |
bar | 2019-10-10 12:34:56.0 | 2019-10-09 12:34:56.0 |
baz | 2019-10-10 23:59:00.0 | 2019-10-09 23:59:00.0 |
"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"\n",
"df\n",
".withColumn(\"timestamp_minus_24_hours\", $\"timestamp_col\" - expr(\"INTERVAL 24 HOURS\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1fb27c7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}