{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "# almond-spark\n",
    "\n",
    "* Based [ammonite-spark](https://github.com/alexarchambault/ammonite-spark)\n",
    "* Works for any Spark version >= 2.0\n",
    "* Currently support for local, standalone and yarn clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:2.4.3` // Or use any other 2.x version here\n",
    "import $ivy.`sh.almond::almond-spark:0.5.0`\n",
    "\n",
    "import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "val spark = {\n",
    "  NotebookSparkSession.builder()\n",
    "    .master(\"local[*]\")\n",
    "    .getOrCreate()\n",
    "}\n",
    "def sc = spark.sparkContext"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "## Load a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import spark.implicits._\n",
    "val titanic = spark\n",
    "  .read\n",
    "  .format(\"csv\")\n",
    "  .option(\"inferSchema\", \"true\")\n",
    "  .option(\"header\", \"true\")\n",
    "  .load(\"titanic.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "## Show as text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "titanic.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "## Let's make the output a bit nicer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "implicit class RichDF(val df: DataFrame) {\n",
    "  def showHTML(limit:Int = 20) = {\n",
    "    import xml.Utility.escape\n",
    "    val data = df.take(limit)\n",
    "    val header = df.schema.fieldNames.toSeq\n",
    "    val rows: Seq[Seq[String]] = data.map { row =>\n",
    "      row.toSeq.map { cell =>\n",
    "        cell match {\n",
    "          case null => \"null\"\n",
    "          case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "          case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "          case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "          case _ => cell.toString\n",
    "        }\n",
    "      }: Seq[String]\n",
    "    }\n",
    "\n",
    "    publish.html(s\"\"\"\n",
    "    <div>\n",
    "      <table border=\"1\" class=\"dataframe\">\n",
    "      <thead>\n",
    "        <tr>\n",
    "        ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "        </tr>\n",
    "         </thead>\n",
    "         <tbody>\n",
    "        ${rows.map { row =>\n",
    "          s\"<tr>${row.map { c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "        }.mkString\n",
    "        }\n",
    "        </tbody>\n",
    "      </table>\n",
    "    </div>\"\"\")\n",
    "  }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "titanic.showHTML(8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "# Let's try some visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import $ivy.`org.vegas-viz::vegas-spark:0.3.11`\n",
    "import vegas._, vegas.data.External._, vegas.sparkExt._\n",
    "\n",
    "Vegas(\"Titanic Survivors\").\n",
    "  withDataFrame(titanic).\n",
    "  mark(Bar).\n",
    "  encodeY(\"*\", aggregate=AggOps.Count, axis=Axis(title=\"Number of People\", grid=false)).\n",
    "  encodeColumn(\"Pclass\", Ord, scale=Scale(padding=10.0), axis=Axis(orient=Orient.Bottom, axisWidth=1.0, offset= -8.0)).\n",
    "  encodeX(\"Survived\", Nominal, scale=Scale(bandSize = 16.0), hideAxis=true).\n",
    "  encodeColor(\"Survived\", Nominal, scale=Scale(rangeNominals=List(\"red\", \"green\"))).\n",
    "  configFacet(cell=CellConfig(strokeWidth = 0)).\n",
    "  configCell(height=400).\n",
    "  show"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}