{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import java.sql.{Date,Timestamp}" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4.8" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark.version" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## to_date" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_date: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Array((device,StringType), (purchase_date,StringType))" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import org.apache.spark.sql.functions.to_date\n", "\n", "val df = Seq(\n", " (\"notebook\",\"2019-01-01\"),\n", " (\"notebook\", \"2019-01-10\"),\n", " (\"small_phone\", \"2019-01-15\"),\n", " (\"small_phone\", \"2019-01-30\")\n", ").toDF(\"device\", \"purchase_date\").sort(\"device\",\"purchase_date\")\n", "\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Array((device,StringType), (purchase_date,DateType))" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.withColumn(\"purchase_date\",to_date($\"purchase_date\")).dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## to_date, custom format" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_date: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device: string, purchase_date: string]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import org.apache.spark.sql.functions.to_date\n", "\n", "val df = Seq(\n", " (\"notebook\",\"27/12/2019\"),\n", " (\"notebook\", \"01/12/2019\"),\n", " (\"small_phone\", \"23/01/2019\"),\n", " (\"small_phone\", \"27/12/2019\")\n", ").toDF(\"device\", \"purchase_date\").sort(\"device\",\"purchase_date\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_date</th></tr>
<tr><td>notebook</td><td>01/12/2019</td></tr>
<tr><td>notebook</td><td>27/12/2019</td></tr>
<tr><td>small_phone</td><td>23/01/2019</td></tr>
<tr><td>small_phone</td><td>27/12/2019</td></tr>
</table>
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_date</th></tr>
<tr><td>notebook</td><td>2019-12-01</td></tr>
<tr><td>notebook</td><td>2019-12-27</td></tr>
<tr><td>small_phone</td><td>2019-01-23</td></tr>
<tr><td>small_phone</td><td>2019-12-27</td></tr>
</table>
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"purchase_date\",to_date($\"purchase_date\", \"dd/MM/yyyy\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## to_timestamp" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_time: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device: string, purchase_time: string]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val df = Seq(\n", " (\"notebook\",\"2019-01-01 00:00:00\"),\n", " (\"notebook\", \"2019-01-10 13:00:00\"),\n", " (\"small_phone\", \"2019-01-15 12:00:00\"),\n", " (\"small_phone\", \"2019-01-30 09:30:00\")\n", ").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Array((device,StringType), (purchase_time,StringType))" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import org.apache.spark.sql.functions.to_timestamp" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Array((device,StringType), (purchase_time,TimestampType))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.withColumn(\"purchase_time\",to_timestamp($\"purchase_time\")).dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## to_timestamp custom format" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_time: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Array((device,StringType), (purchase_time,StringType))" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val df = Seq(\n", " (\"notebook\",\"27/12/2019 12:00\"),\n", " (\"notebook\", \"01/12/2019 00:00\"),\n", " (\"small_phone\", \"23/01/2019 12:00\"),\n", " (\"small_phone\", \"27/12/2019 12:00\")\n", ").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")\n", "\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_time</th></tr>
<tr><td>notebook</td><td>null</td></tr>
<tr><td>notebook</td><td>null</td></tr>
<tr><td>small_phone</td><td>null</td></tr>
<tr><td>small_phone</td><td>null</td></tr>
</table>
" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"purchase_time\",to_timestamp($\"purchase_time\"))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_time</th></tr>
<tr><td>notebook</td><td>2019-12-01 00:00:00.0</td></tr>
<tr><td>notebook</td><td>2019-12-27 12:00:00.0</td></tr>
<tr><td>small_phone</td><td>2019-01-23 12:00:00.0</td></tr>
<tr><td>small_phone</td><td>2019-12-27 12:00:00.0</td></tr>
</table>
" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"purchase_time\",to_timestamp($\"purchase_time\",\"d/M/y H:m\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## timestamp to date" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_time: timestamp]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Array((device,StringType), (purchase_time,TimestampType))" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val df = Seq(\n", " (\"notebook\",Timestamp.valueOf(\"2019-01-29 12:00:00\")),\n", " (\"notebook\", Timestamp.valueOf(\"2019-01-01 00:00:00\")),\n", " (\"small_phone\", Timestamp.valueOf(\"2019-01-15 23:00:00\")),\n", " (\"small_phone\", Timestamp.valueOf(\"2019-01-01 09:00:00\"))\n", ").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")\n", "\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_time</th><th>purchase_date</th></tr>
<tr><td>notebook</td><td>2019-01-01 00:00:00.0</td><td>2019-01-01</td></tr>
<tr><td>notebook</td><td>2019-01-29 12:00:00.0</td><td>2019-01-29</td></tr>
<tr><td>small_phone</td><td>2019-01-01 09:00:00.0</td><td>2019-01-01</td></tr>
<tr><td>small_phone</td><td>2019-01-15 23:00:00.0</td><td>2019-01-15</td></tr>
</table>
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"purchase_date\",to_date($\"purchase_time\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## date to timestamp with zero hours" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_date: date]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device: string, purchase_date: date]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Date\n", "import org.apache.spark.sql.functions.to_timestamp\n", "\n", "val df = Seq(\n", " (\"notebook\",Date.valueOf(\"2019-01-29\")),\n", " (\"notebook\", Date.valueOf(\"2019-01-01\")),\n", " (\"small_phone\", Date.valueOf(\"2019-01-15\")),\n", " (\"small_phone\", Date.valueOf(\"2019-01-01\"))\n", ").toDF(\"device\", \"purchase_date\").sort(\"device\",\"purchase_date\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_date</th></tr>
<tr><td>notebook</td><td>2019-01-01</td></tr>
<tr><td>notebook</td><td>2019-01-29</td></tr>
<tr><td>small_phone</td><td>2019-01-01</td></tr>
<tr><td>small_phone</td><td>2019-01-15</td></tr>
</table>
" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_date</th><th>purchase_time</th></tr>
<tr><td>notebook</td><td>2019-01-01</td><td>2019-01-01 00:00:00.0</td></tr>
<tr><td>notebook</td><td>2019-01-29</td><td>2019-01-29 00:00:00.0</td></tr>
<tr><td>small_phone</td><td>2019-01-01</td><td>2019-01-01 00:00:00.0</td></tr>
<tr><td>small_phone</td><td>2019-01-15</td><td>2019-01-15 00:00:00.0</td></tr>
</table>
" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"purchase_time\",to_timestamp($\"purchase_date\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## custom date/timestamp formatting" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_time: timestamp]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device: string, purchase_time: timestamp]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Timestamp\n", "import org.apache.spark.sql.functions.date_format\n", "\n", "val df = Seq(\n", " (\"notebook\",Timestamp.valueOf(\"2019-01-29 12:00:00\")),\n", " (\"notebook\", Timestamp.valueOf(\"2019-01-01 00:00:00\")),\n", " (\"small_phone\", Timestamp.valueOf(\"2019-01-15 23:00:00\")),\n", " (\"small_phone\", Timestamp.valueOf(\"2019-01-01 09:00:00\"))\n", ").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>device</th><th>purchase_time</th><th>formatted_purchase_time</th></tr>
<tr><td>notebook</td><td>2019-01-01 00:00:00.0</td><td>2019-01</td></tr>
<tr><td>notebook</td><td>2019-01-29 12:00:00.0</td><td>2019-01</td></tr>
<tr><td>small_phone</td><td>2019-01-01 09:00:00.0</td><td>2019-01</td></tr>
<tr><td>small_phone</td><td>2019-01-15 23:00:00.0</td><td>2019-01</td></tr>
</table>
" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"formatted_purchase_time\",date_format($\"purchase_time\",\"y-MM\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## current timestamp" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [col1: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[col1: string]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import org.apache.spark.sql.functions.current_timestamp\n", "\n", "val df = Seq(\n", " (\"foo\"), \n", " (\"bar\"), \n", " (\"baz\") \n", ").toDF(\"col1\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>col1</th><th>now</th></tr>
<tr><td>foo</td><td>2022-06-12 01:51:20.683</td></tr>
<tr><td>bar</td><td>2022-06-12 01:51:20.683</td></tr>
<tr><td>baz</td><td>2022-06-12 01:51:20.683</td></tr>
</table>
" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "\n", "df\n", " .withColumn(\"now\", current_timestamp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## current date" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [col1: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[col1: string]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import org.apache.spark.sql.functions.current_date\n", "\n", "val df = Seq(\n", " (\"foo\"), \n", " (\"bar\"), \n", " (\"baz\") \n", ").toDF(\"col1\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>col1</th><th>today</th></tr>
<tr><td>foo</td><td>2022-06-12</td></tr>
<tr><td>bar</td><td>2022-06-12</td></tr>
<tr><td>baz</td><td>2022-06-12</td></tr>
</table>
" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "\n", "df\n", " .withColumn(\"today\", current_date)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## hour" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [col1: string, some_timestamp: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[col1: string, some_timestamp: string]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import org.apache.spark.sql.functions.hour\n", "\n", "val df = Seq(\n", " (\"foo\", \"2019-01-01 01:00:00.000\"), \n", " (\"bar\", \"2019-01-01 12:30:00.000\"), \n", " (\"baz\", \"2019-01-01 23:01:00.000\") \n", ").toDF(\"col1\", \"some_timestamp\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>col1</th><th>some_timestamp</th></tr>
<tr><td>foo</td><td>2019-01-01 01:00:00.000</td></tr>
<tr><td>bar</td><td>2019-01-01 12:30:00.000</td></tr>
<tr><td>baz</td><td>2019-01-01 23:01:00.000</td></tr>
</table>
" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>col1</th><th>some_timestamp</th><th>hour</th></tr>
<tr><td>foo</td><td>2019-01-01 01:00:00.000</td><td>1</td></tr>
<tr><td>bar</td><td>2019-01-01 12:30:00.000</td><td>12</td></tr>
<tr><td>baz</td><td>2019-01-01 23:01:00.000</td><td>23</td></tr>
</table>
" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "df\n", " .withColumn(\"hour\", hour($\"some_timestamp\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## beginning of week" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [source_date: string]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[source_date: string]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import org.apache.spark.sql.{Column, DataFrame}\n", "import org.apache.spark.sql.functions._\n", "\n", "// dummy dataframe for testing\n", "val df = Seq(\n", " (\"2018-12-28\"), \n", " (\"2019-01-01\"), \n", " (\"2019-01-04\") \n", ").toDF(\"source_date\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>source_date</th></tr>
<tr><td>2018-12-28</td></tr>
<tr><td>2019-01-01</td></tr>
<tr><td>2019-01-04</td></tr>
</table>
" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dayNameToIndex: (col: org.apache.spark.sql.Column)org.apache.spark.sql.Column\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "// turn a day name (like \"Wed\") to its position on the week (e.g. 3)\n", "def dayNameToIndex(col: Column) : Column = {\n", " when(col.isNull, null)\n", " .when(col === \"Sun\", 0)\n", " .when(col === \"Mon\", 1)\n", " .when(col === \"Tue\", 2)\n", " .when(col === \"Wed\", 3)\n", " .when(col === \"Thu\", 4)\n", " .when(col === \"Fri\", 5)\n", " .when(col === \"Sat\", 7)\n", "}" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<table>
<tr><th>source_date</th><th>week_start</th></tr>
<tr><td>2018-12-28</td><td>2018-12-23</td></tr>
<tr><td>2019-01-01</td><td>2018-12-30</td></tr>
<tr><td>2019-01-04</td><td>2018-12-30</td></tr>
</table>
" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "// need to use expr because the number of days to subtract is a column value\n", "df\n", " .withColumn(\"day_index\", dayNameToIndex(date_format(col(\"source_date\"), \"E\")))\n", " .withColumn(\"week_start\", expr(\"date_sub(source_date, day_index)\"))\n", " .drop(\"day_index\")" ] } ], "metadata": { "kernelspec": { "display_name": "Apache Toree - Scala", "language": "scala", "name": "apache_toree_scala" }, "language_info": { "codemirror_mode": "text/x-scala", "file_extension": ".scala", "mimetype": "text/x-scala", "name": "scala", "pygments_lexer": "scala", "version": "2.11.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }