{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Available line magics:\n", "%lsmagic %showtypes %showoutput %adddeps %truncation %addjar\n", "\n", "Available cell magics:\n", "%%sql %%html %%javascript %%dataframe %%scala\n", "\n", "Type % for usage info.\n", " \n" ] } ], "source": [ "%lsmagic" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4.3" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sc.version" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pivot using .pivot" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "unpivotedDf = [salesperson: string, device: string ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[salesperson: string, device: string ... 1 more field]" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val unpivotedDf = Seq(\n", " (\"john\", \"notebook\", 2),\n", " (\"gary\", \"notebook\", 3),\n", " (\"john\", \"small_phone\", 2),\n", " (\"mary\", \"small_phone\", 3),\n", " (\"john\", \"large_phone\", 3),\n", " (\"john\", \"camera\", 3)\n", ").toDF(\"salesperson\",\"device\", \"amount_sold\").sort(\"salesperson\",\"device\")" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
salespersondeviceamount_sold
garynotebook3
johncamera3
johnlarge_phone3
johnnotebook2
johnsmall_phone2
marysmall_phone3
" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "unpivotedDf" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
salespersoncameralarge_phonenotebooksmall_phone
garynullnull3null
marynullnullnull3
john3322
" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "unpivotedDf.groupBy(\"salesperson\").pivot(\"device\").sum(\"amount_sold\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## unpivot using stack" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pivotedDf = [salesperson: string, camera: int ... 3 more fields]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[salesperson: string, camera: int ... 3 more fields]" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val pivotedDf = Seq(\n", " (\"gary\", None, None, Some(3),None),\n", " (\"mary\", None, None, None, Some(3)),\n", " (\"john\", Some(3),Some(3),Some(2),Some(2))\n", ").toDF(\"salesperson\", \"camera\", \"large_phone\", \"notebook\", \"small_phone\")" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
salespersoncameralarge_phonenotebooksmall_phone
garynullnull3null
marynullnullnull3
john3322
" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe\n", "pivotedDf" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "unpivotedDf = [salesperson: string, device: string ... 1 more field]\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[salesperson: string, device: string ... 1 more field]" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val unpivotedDf = pivotedDf\n", " .selectExpr(\"salesperson\",\"stack(4,'camera',camera,'large_phone',large_phone,'notebook',notebook,'small_phone',small_phone)\")\n", " .withColumnRenamed(\"col0\",\"device\") // default name of this column is col0\n", " .withColumnRenamed(\"col1\",\"amount_sold\") // default name of this column is col1\n", " .filter($\"amount_sold\".isNotNull) // must explicitly remove nulls" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
salespersondeviceamount_sold
garynotebook3
johncamera3
johnlarge_phone3
johnnotebook2
johnsmall_phone2
marysmall_phone3
" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%dataframe --limit=100\n", "unpivotedDf.sort(\"salesperson\",\"device\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Apache Toree - Scala", "language": "scala", "name": "apache_toree_scala" }, "language_info": { "codemirror_mode": "text/x-scala", "file_extension": ".scala", "mimetype": "text/x-scala", "name": "scala", "pygments_lexer": "scala", "version": "2.11.12" } }, "nbformat": 4, "nbformat_minor": 2 }