{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import java.sql.{Date,Timestamp}"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.4.8"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.version"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## to_date"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_date: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Array((device,StringType), (purchase_date,StringType))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.functions.to_date\n",
"\n",
"val df = Seq(\n",
" (\"notebook\",\"2019-01-01\"),\n",
" (\"notebook\", \"2019-01-10\"),\n",
" (\"small_phone\", \"2019-01-15\"),\n",
" (\"small_phone\", \"2019-01-30\")\n",
").toDF(\"device\", \"purchase_date\").sort(\"device\",\"purchase_date\")\n",
"\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Array((device,StringType), (purchase_date,DateType))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.withColumn(\"purchase_date\",to_date($\"purchase_date\")).dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## to_date, custom format"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_date: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device: string, purchase_date: string]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.functions.to_date\n",
"\n",
"val df = Seq(\n",
" (\"notebook\",\"27/12/2019\"),\n",
" (\"notebook\", \"01/12/2019\"),\n",
" (\"small_phone\", \"23/01/2019\"),\n",
" (\"small_phone\", \"27/12/2019\")\n",
").toDF(\"device\", \"purchase_date\").sort(\"device\",\"purchase_date\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
device | purchase_date |
---|
notebook | 01/12/2019 |
notebook | 27/12/2019 |
small_phone | 23/01/2019 |
small_phone | 27/12/2019 |
"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_date |
---|
notebook | 2019-12-01 |
notebook | 2019-12-27 |
small_phone | 2019-01-23 |
small_phone | 2019-12-27 |
"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"purchase_date\",to_date($\"purchase_date\", \"dd/MM/yyyy\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## to_timestamp"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_time: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device: string, purchase_time: string]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val df = Seq(\n",
" (\"notebook\",\"2019-01-01 00:00:00\"),\n",
" (\"notebook\", \"2019-01-10 13:00:00\"),\n",
" (\"small_phone\", \"2019-01-15 12:00:00\"),\n",
" (\"small_phone\", \"2019-01-30 09:30:00\")\n",
").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Array((device,StringType), (purchase_time,StringType))"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import org.apache.spark.sql.functions.to_timestamp"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Array((device,StringType), (purchase_time,TimestampType))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.withColumn(\"purchase_time\",to_timestamp($\"purchase_time\")).dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## to_timestamp custom format"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_time: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Array((device,StringType), (purchase_time,StringType))"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val df = Seq(\n",
" (\"notebook\",\"27/12/2019 12:00\"),\n",
" (\"notebook\", \"01/12/2019 00:00\"),\n",
" (\"small_phone\", \"23/01/2019 12:00\"),\n",
" (\"small_phone\", \"27/12/2019 12:00\")\n",
").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")\n",
"\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_time |
---|
notebook | null |
notebook | null |
small_phone | null |
small_phone | null |
"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"purchase_time\",to_timestamp($\"purchase_time\"))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_time |
---|
notebook | 2019-12-01 00:00:00.0 |
notebook | 2019-12-27 12:00:00.0 |
small_phone | 2019-01-23 12:00:00.0 |
small_phone | 2019-12-27 12:00:00.0 |
"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"purchase_time\",to_timestamp($\"purchase_time\",\"d/M/y H:m\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## timestamp to date"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_time: timestamp]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Array((device,StringType), (purchase_time,TimestampType))"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val df = Seq(\n",
" (\"notebook\",Timestamp.valueOf(\"2019-01-29 12:00:00\")),\n",
" (\"notebook\", Timestamp.valueOf(\"2019-01-01 00:00:00\")),\n",
" (\"small_phone\", Timestamp.valueOf(\"2019-01-15 23:00:00\")),\n",
" (\"small_phone\", Timestamp.valueOf(\"2019-01-01 09:00:00\"))\n",
").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")\n",
"\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_time | purchase_date |
---|
notebook | 2019-01-01 00:00:00.0 | 2019-01-01 |
notebook | 2019-01-29 12:00:00.0 | 2019-01-29 |
small_phone | 2019-01-01 09:00:00.0 | 2019-01-01 |
small_phone | 2019-01-15 23:00:00.0 | 2019-01-15 |
"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"purchase_date\",to_date($\"purchase_time\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## date to timestamp with zero hours"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_date: date]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device: string, purchase_date: date]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Date\n",
"import org.apache.spark.sql.functions.to_timestamp\n",
"\n",
"val df = Seq(\n",
" (\"notebook\",Date.valueOf(\"2019-01-29\")),\n",
" (\"notebook\", Date.valueOf(\"2019-01-01\")),\n",
" (\"small_phone\", Date.valueOf(\"2019-01-15\")),\n",
" (\"small_phone\", Date.valueOf(\"2019-01-01\"))\n",
").toDF(\"device\", \"purchase_date\").sort(\"device\",\"purchase_date\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_date |
---|
notebook | 2019-01-01 |
notebook | 2019-01-29 |
small_phone | 2019-01-01 |
small_phone | 2019-01-15 |
"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_date | purchase_time |
---|
notebook | 2019-01-01 | 2019-01-01 00:00:00.0 |
notebook | 2019-01-29 | 2019-01-29 00:00:00.0 |
small_phone | 2019-01-01 | 2019-01-01 00:00:00.0 |
small_phone | 2019-01-15 | 2019-01-15 00:00:00.0 |
"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"purchase_time\",to_timestamp($\"purchase_date\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## custom date/timestamp formatting"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [device: string, purchase_time: timestamp]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[device: string, purchase_time: timestamp]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import java.sql.Timestamp\n",
"import org.apache.spark.sql.functions.date_format\n",
"\n",
"val df = Seq(\n",
" (\"notebook\",Timestamp.valueOf(\"2019-01-29 12:00:00\")),\n",
" (\"notebook\", Timestamp.valueOf(\"2019-01-01 00:00:00\")),\n",
" (\"small_phone\", Timestamp.valueOf(\"2019-01-15 23:00:00\")),\n",
" (\"small_phone\", Timestamp.valueOf(\"2019-01-01 09:00:00\"))\n",
").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"device | purchase_time | formatted_purchase_time |
---|
notebook | 2019-01-01 00:00:00.0 | 2019-01 |
notebook | 2019-01-29 12:00:00.0 | 2019-01 |
small_phone | 2019-01-01 09:00:00.0 | 2019-01 |
small_phone | 2019-01-15 23:00:00.0 | 2019-01 |
"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df.withColumn(\"formatted_purchase_time\",date_format($\"purchase_time\",\"y-MM\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## current timestamp"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [col1: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[col1: string]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.functions.current_timestamp\n",
"\n",
"val df = Seq(\n",
" (\"foo\"), \n",
" (\"bar\"), \n",
" (\"baz\") \n",
").toDF(\"col1\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | now |
---|
foo | 2022-06-12 01:51:20.683 |
bar | 2022-06-12 01:51:20.683 |
baz | 2022-06-12 01:51:20.683 |
"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"\n",
"df\n",
" .withColumn(\"now\", current_timestamp)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## current date"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [col1: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[col1: string]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.functions.current_date\n",
"\n",
"val df = Seq(\n",
" (\"foo\"), \n",
" (\"bar\"), \n",
" (\"baz\") \n",
").toDF(\"col1\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | today |
---|
foo | 2022-06-12 |
bar | 2022-06-12 |
baz | 2022-06-12 |
"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"\n",
"df\n",
" .withColumn(\"today\", current_date)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## hour"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [col1: string, some_timestamp: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[col1: string, some_timestamp: string]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.functions.hour\n",
"\n",
"val df = Seq(\n",
" (\"foo\", \"2019-01-01 01:00:00.000\"), \n",
" (\"bar\", \"2019-01-01 12:30:00.000\"), \n",
" (\"baz\", \"2019-01-01 23:01:00.000\") \n",
").toDF(\"col1\", \"some_timestamp\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | some_timestamp |
---|
foo | 2019-01-01 01:00:00.000 |
bar | 2019-01-01 12:30:00.000 |
baz | 2019-01-01 23:01:00.000 |
"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"col1 | some_timestamp | hour |
---|
foo | 2019-01-01 01:00:00.000 | 1 |
bar | 2019-01-01 12:30:00.000 | 12 |
baz | 2019-01-01 23:01:00.000 | 23 |
"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"%%scan\n",
"df\n",
" .withColumn(\"hour\", hour($\"some_timestamp\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## beginning of week"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"df = [source_date: string]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[source_date: string]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.{Column, DataFrame}\n",
"import org.apache.spark.sql.functions._\n",
"\n",
"// dummy dataframe for testing\n",
"val df = Seq(\n",
" (\"2018-12-28\"), \n",
" (\"2019-01-01\"), \n",
" (\"2019-01-04\") \n",
").toDF(\"source_date\")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"source_date |
---|
2018-12-28 |
2019-01-01 |
2019-01-04 |
"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dayNameToIndex: (col: org.apache.spark.sql.Column)org.apache.spark.sql.Column\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"// turn a day name (like \"Wed\") to its position on the week (e.g. 3)\n",
"def dayNameToIndex(col: Column) : Column = {\n",
" when(col.isNull, null)\n",
" .when(col === \"Sun\", 0)\n",
" .when(col === \"Mon\", 1)\n",
" .when(col === \"Tue\", 2)\n",
" .when(col === \"Wed\", 3)\n",
" .when(col === \"Thu\", 4)\n",
" .when(col === \"Fri\", 5)\n",
" .when(col === \"Sat\", 7)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"source_date | week_start |
---|
2018-12-28 | 2018-12-23 |
2019-01-01 | 2018-12-30 |
2019-01-04 | 2018-12-30 |
"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%dataframe\n",
"// need to use expr because the number of days to subtract is a column value\n",
"df\n",
" .withColumn(\"day_index\", dayNameToIndex(date_format(col(\"source_date\"), \"E\")))\n",
" .withColumn(\"week_start\", expr(\"date_sub(source_date, day_index)\"))\n",
" .drop(\"day_index\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}