{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "# almond-spark\n", "\n", "* Based on [ammonite-spark](https://github.com/alexarchambault/ammonite-spark)\n", "* Works for any Spark version >= 2.0\n", "* Currently supports local, standalone and YARN clusters" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "import $ivy.`org.apache.spark::spark-sql:2.4.3` // Or use any other 2.x version here\n", "import $ivy.`sh.almond::almond-spark:0.5.0`\n", "\n", "import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}\n", "Logger.getLogger(\"org\").setLevel(Level.OFF)\n", "\n", "val spark = {\n", " NotebookSparkSession.builder()\n", " .master(\"local[*]\")\n", " .getOrCreate()\n", "}\n", "def sc = spark.sparkContext" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "## Load a DataFrame" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "import spark.implicits._\n", "val titanic = spark\n", " .read\n", " .format(\"csv\")\n", " .option(\"inferSchema\", \"true\")\n", " .option(\"header\", \"true\")\n", " .load(\"titanic.csv\")" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "## Show as text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "titanic.show()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "## Let's make the output a bit nicer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "implicit class RichDF(val df: DataFrame) {\n", " def showHTML(limit:Int = 20) = {\n", " import xml.Utility.escape\n", " val data = df.take(limit)\n", " val header = 
df.schema.fieldNames.toSeq\n", " val rows: Seq[Seq[String]] = data.map { row =>\n", " row.toSeq.map { cell =>\n", " cell match {\n", " case null => \"null\"\n", " case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n", " case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n", " case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n", " case _ => cell.toString\n", " }\n", " }: Seq[String]\n", " }\n", "\n", " publish.html(s\"\"\"\n", "
${escape(h)} | \").mkString}\n", "
---|
${escape(c)} | \" }.mkString}