{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "# almond-spark\n",
    "\n",
    "* Based on [ammonite-spark](https://github.com/alexarchambault/ammonite-spark)\n",
    "* Works for any Spark version >= 2.0\n",
    "* Currently supports local, standalone and YARN clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:2.4.3` // Or use any other 2.x version here\n",
    "import $ivy.`sh.almond::almond-spark:0.5.0`\n",
    "\n",
    "import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}\n",
    "// Switch off log4j output for every logger under the org namespace\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "// Notebook-aware Spark session running on a local master that uses all available cores\n",
    "val spark = {\n",
    "  NotebookSparkSession.builder()\n",
    "    .master(\"local[*]\")\n",
    "    .getOrCreate()\n",
    "}\n",
    "def sc = spark.sparkContext"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "## Load a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import spark.implicits._\n",
    "// Read titanic.csv (path relative to the kernel's working directory), using the\n",
    "// first line as column names and letting Spark infer the column types\n",
    "val titanic = spark\n",
    "  .read\n",
    "  .format(\"csv\")\n",
    "  .option(\"inferSchema\", \"true\")\n",
    "  .option(\"header\", \"true\")\n",
    "  .load(\"titanic.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "## Show as text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "titanic.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "## Let's make the output a bit nicer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "/** Extension methods that render a DataFrame as an HTML table in the notebook. */\n",
    "implicit class RichDF(val df: DataFrame) {\n",
    "  /**\n",
    "   * Publishes the first `limit` rows of the DataFrame as an HTML table.\n",
    "   *\n",
    "   * Column names and cell values are XML-escaped before being embedded in the markup.\n",
    "   * Null cells are shown as the text null, byte arrays as space-separated hex pairs,\n",
    "   * other arrays and sequences as comma-separated lists, everything else via toString.\n",
    "   *\n",
    "   * @param limit maximum number of rows to fetch and render (default 20)\n",
    "   */\n",
    "  def showHTML(limit:Int = 20) = {\n",
    "    import xml.Utility.escape\n",
    "    val data = df.take(limit)\n",
    "    val header = df.schema.fieldNames.toSeq\n",
    "    // Convert every cell to its display string; the Array[Byte] case must come\n",
    "    // before the generic Array case so binary data is rendered as hex\n",
    "    val rows: Seq[Seq[String]] = data.map { row =>\n",
    "      row.toSeq.map { cell =>\n",
    "        cell match {\n",
    "          case null => \"null\"\n",
    "          case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "          case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "          case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "          case _ => cell.toString\n",
    "        }\n",
    "      }: Seq[String]\n",
    "    }\n",
    "\n",
    "    publish.html(s\"\"\"\n",
    "      <table>\n",
    "        <thead>\n",
    "          <tr>\n",
    "            ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "          </tr>\n",
    "        </thead>\n",
    "        <tbody>\n",
    "          ${rows.map { row =>\n",
    "            s\"<tr>${row.map { c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "          }.mkString\n",
    "          }\n",
    "        </tbody>\n",
    "      </table>\n",
    "      \"\"\")\n",
    "  }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "titanic.showHTML(8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "# Let's try some visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import $ivy.`org.vegas-viz::vegas-spark:0.3.11`\n",
    "import vegas._, vegas.data.External._, vegas.sparkExt._\n",
    "\n",
    "// Bar chart of passenger counts: one facet column per Pclass, bars split and colored by Survived\n",
    "Vegas(\"Titanic Survivors\").\n",
    "  withDataFrame(titanic).\n",
    "  mark(Bar).\n",
    "  encodeY(\"*\", aggregate=AggOps.Count, axis=Axis(title=\"Number of People\", grid=false)).\n",
    "  encodeColumn(\"Pclass\", Ord, scale=Scale(padding=10.0), axis=Axis(orient=Orient.Bottom, axisWidth=1.0, offset= -8.0)).\n",
    "  encodeX(\"Survived\", Nominal, scale=Scale(bandSize = 16.0), hideAxis=true).\n",
    "  encodeColor(\"Survived\", Nominal, scale=Scale(rangeNominals=List(\"red\", \"green\"))).\n",
    "  configFacet(cell=CellConfig(strokeWidth = 0)).\n",
    "  configCell(height=400).\n",
    "  show"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}