{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "385e66a8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4.8" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark.version" ] }, { "cell_type": "markdown", "id": "170dfc6e", "metadata": {}, "source": [ "## date_add / date_sub" ] }, { "cell_type": "code", "execution_count": 8, "id": "255697e0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_time: timestamp]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device: string, purchase_time: timestamp]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Timestamp.valueOf\n", "import org.apache.spark.sql.functions.{date_add, date_sub}\n", "\n", "// note that the dates are just strings\n", "val df = Seq(\n", " (\"notebook\", valueOf(\"2019-01-29 12:00:00\")),\n", " (\"notebook\", valueOf(\"2019-01-01 00:00:00\")),\n", " (\"small_phone\", valueOf(\"2019-01-15 23:00:00\")),\n", " (\"small_phone\", valueOf(\"2019-01-01 09:00:00\"))\n", ").toDF(\"device\", \"purchase_time\").sort(\"device\",\"purchase_time\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "38f57761", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
devicepurchase_time
notebook2019-01-01 00:00:00.0
notebook2019-01-29 12:00:00.0
small_phone2019-01-01 09:00:00.0
small_phone2019-01-15 23:00:00.0
" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "id": "4d8107cc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
devicepurchase_timeplus_2_days
notebook2019-01-01 00:00:00.02019-01-03
notebook2019-01-29 12:00:00.02019-01-31
small_phone2019-01-01 09:00:00.02019-01-03
small_phone2019-01-15 23:00:00.02019-01-17
" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"plus_2_days\", date_add($\"purchase_time\",2))" ] }, { "cell_type": "markdown", "id": "8dfffc71", "metadata": {}, "source": [ "## datediff" ] }, { "cell_type": "code", "execution_count": 11, "id": "15218f01", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [device: string, purchase_date: date ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[device: string, purchase_date: date ... 1 more field]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Date.valueOf\n", "import org.apache.spark.sql.functions.datediff\n", "\n", "val df = Seq(\n", " (\"notebook\", valueOf(\"2019-01-29\"), valueOf(\"2019-02-10\")),\n", " (\"notebook\", valueOf(\"2019-01-01\"), valueOf(\"2019-01-15\")),\n", " (\"small_phone\", valueOf(\"2019-01-15\"), valueOf(\"2019-01-05\")),\n", " (\"small_phone\", valueOf(\"2019-01-01\"), valueOf(\"2019-01-20\"))\n", ").toDF(\"device\", \"purchase_date\", \"arrival_date\").sort(\"device\",\"purchase_date\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "f5591075", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
devicepurchase_datearrival_date
notebook2019-01-012019-01-15
notebook2019-01-292019-02-10
small_phone2019-01-012019-01-20
small_phone2019-01-152019-01-05
" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 13, "id": "62d19889", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
devicepurchase_datearrival_datedays_to_arrive
notebook2019-01-012019-01-1514
notebook2019-01-292019-02-1012
small_phone2019-01-012019-01-2019
small_phone2019-01-152019-01-05-10
" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df.withColumn(\"days_to_arrive\",datediff($\"arrival_date\", $\"purchase_date\"))" ] }, { "cell_type": "markdown", "id": "a05f4191", "metadata": {}, "source": [ "## difference in seconds" ] }, { "cell_type": "code", "execution_count": 14, "id": "d53bee7f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [col1: string, purchase_time: timestamp ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[col1: string, purchase_time: timestamp ... 1 more field]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Timestamp.valueOf\n", "import org.apache.spark.sql.functions.unix_timestamp\n", "\n", "val df = Seq(\n", " (\"foo\", valueOf(\"2019-01-01 00:00:00\"), valueOf(\"2019-01-01 01:00:00\")), // 1 hour apart\n", " (\"bar\", valueOf(\"2019-01-01 00:00:00\"), valueOf(\"2019-01-02 00:00:00\")), // 24 hours apart\n", " (\"baz\", valueOf(\"2019-01-01 00:00:00\"), valueOf(\"2019-01-07 00:00:00\")) // 7 days apart\n", ").toDF(\"col1\", \"purchase_time\", \"arrival_time\").sort(\"col1\", \"purchase_time\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "c4cbbe02", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
col1purchase_timearrival_time
bar2019-01-01 00:00:00.02019-01-02 00:00:00.0
baz2019-01-01 00:00:00.02019-01-07 00:00:00.0
foo2019-01-01 00:00:00.02019-01-01 01:00:00.0
" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 16, "id": "f3b31763", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
col1purchase_timearrival_timediff_in_seconds_2
bar2019-01-01 00:00:00.02019-01-02 00:00:00.086400
baz2019-01-01 00:00:00.02019-01-07 00:00:00.0518400
foo2019-01-01 00:00:00.02019-01-01 01:00:00.03600
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "\n", "df.withColumn(\"diff_in_seconds_2\", unix_timestamp($\"arrival_time\") - unix_timestamp($\"purchase_time\"))" ] }, { "cell_type": "markdown", "id": "8004a502", "metadata": {}, "source": [ "## difference in milliseconds" ] }, { "cell_type": "code", "execution_count": 17, "id": "76c6b901", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [col1: string, time_before: timestamp ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[col1: string, time_before: timestamp ... 1 more field]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Timestamp.valueOf\n", "import org.apache.spark.sql.functions.to_timestamp\n", "\n", "val df = Seq(\n", " (\"foo\", valueOf(\"2019-01-01 00:00:00.000\"), valueOf(\"2019-01-01 00:00:00.400\")), \n", " (\"bar\", valueOf(\"2019-01-01 00:00:00.000\"), valueOf(\"2019-01-01 00:00:00.650\")), \n", " (\"baz\", valueOf(\"2019-01-01 00:00:00.000\"), valueOf(\"2019-01-01 00:01:00.000\")) \n", ").toDF(\"col1\", \"time_before\", \"time_after\")" ] }, { "cell_type": "code", "execution_count": 18, "id": "095040a7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
col1time_beforetime_after
foo2019-01-01 00:00:00.02019-01-01 00:00:00.4
bar2019-01-01 00:00:00.02019-01-01 00:00:00.65
baz2019-01-01 00:00:00.02019-01-01 00:01:00.0
" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 19, "id": "d9fcbbb5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
col1time_beforetime_afterdiff_millis
foo2019-01-01 00:00:00.02019-01-01 00:00:00.4400
bar2019-01-01 00:00:00.02019-01-01 00:00:00.65650
baz2019-01-01 00:00:00.02019-01-01 00:01:00.060000
" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "\n", "(df\n", ".withColumn(\"diff_millis\", ($\"time_after\".cast(\"double\") - $\"time_before\".cast(\"double\")))\n", ".withColumn(\"diff_millis\", ($\"diff_millis\"*1000).cast(\"long\")))" ] }, { "cell_type": "markdown", "id": "80b31c12", "metadata": {}, "source": [ "## expr interval" ] }, { "cell_type": "code", "execution_count": 20, "id": "c7d1cd49", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "df = [col1: string, timestamp_col: timestamp]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[col1: string, timestamp_col: timestamp]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import java.sql.Timestamp.valueOf\n", "import org.apache.spark.sql.functions.expr\n", "\n", "val df = Seq(\n", " (\"foo\", valueOf(\"2019-10-10 00:45:00\")), \n", " (\"bar\", valueOf(\"2019-10-10 12:34:56\")), \n", " (\"baz\", valueOf(\"2019-10-10 23:59:00\")) \n", ").toDF(\"col1\", \"timestamp_col\")" ] }, { "cell_type": "code", "execution_count": 21, "id": "261c90fe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
col1timestamp_col
foo2019-10-10 00:45:00.0
bar2019-10-10 12:34:56.0
baz2019-10-10 23:59:00.0
" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "df" ] }, { "cell_type": "code", "execution_count": 23, "id": "55db67ea", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
col1timestamp_coltimestamp_minus_24_hours
foo2019-10-10 00:45:00.02019-10-09 00:45:00.0
bar2019-10-10 12:34:56.02019-10-09 12:34:56.0
baz2019-10-10 23:59:00.02019-10-09 23:59:00.0
" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "%%scan\n", "\n", "df\n", ".withColumn(\"timestamp_minus_24_hours\", $\"timestamp_col\" - expr(\"INTERVAL 24 HOURS\"))" ] }, { "cell_type": "code", "execution_count": null, "id": "f1fb27c7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Apache Toree - Scala", "language": "scala", "name": "apache_toree_scala" }, "language_info": { "codemirror_mode": "text/x-scala", "file_extension": ".scala", "mimetype": "text/x-scala", "name": "scala", "pygments_lexer": "scala", "version": "2.11.12" } }, "nbformat": 4, "nbformat_minor": 5 }