{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# replace this with a Spark history log of your own or parameterize with Papermill!\n", "\n", "metrics_file = \"metrics/application_1601392010735_0030\"\n", "\n", "wide_output_file = \"output.db\"\n", "output_file = \"output.db\"\n", "\n", "transform_structure_prefixes = True\n", "use_structure_prefixes = False\n", "driver_memory = '8g'\n", "executor_memory = '8g'\n", "master = 'local[*]'\n", "\n", "debug_me = False\n", "interactive = True\n", "store_parquet = True" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "if debug_me:\n", " pd.options.display.max_columns = None\n", " pd.options.display.max_rows = None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pyspark\n", "import pyspark.sql.functions as F\n", "from pyspark import SparkConf\n", "\n", "import json\n", "import eventlog\n", "\n", "spark = pyspark.sql.SparkSession.\\\n", " builder.\\\n", " master(master).\\\n", " config(\"spark.ui.showConsoleProgress\", False).\\\n", " config(\"spark.driver.memory\", driver_memory).\\\n", " config(\"spark.executor.memory\", executor_memory).\\\n", " getOrCreate()\n", "\n", "sc = spark.sparkContext\n", "sc.setLogLevel(\"OFF\")\n", "\n", "logger = sc._jvm.org.apache.log4j\n", "logger.LogManager.getLogger(\"org\").setLevel(logger.Level.ERROR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrics = spark.read.json(metrics_file)\n", "eventlog.init_eventlog(metrics, use_structure_prefixes=use_structure_prefixes, transform_structure_prefixes=transform_structure_prefixes, debug_me=debug_me)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from eventlog import *\n", "\n", "app_id, app_name = metrics.select(\"App ID\", 
\"App Name\").dropna().collect()[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "app_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accumulator_updates = driver_accumulator_updates(metrics)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "appevents = drop_null_cols(metrics.where((metrics.Event == \"SparkListenerApplicationEnd\") | (metrics.Event == \"SparkListenerApplicationStart\")))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if debug_me:\n", " appevents.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "app_runtime = app_timeline(metrics)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if debug_me:\n", " app_runtime.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "job_runtime = job_timeline(metrics)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sql_runtime = sql_timeline(metrics)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "jobs_to_stages = all_stage_meta(metrics).select('Job ID', 'Stage ID').distinct()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plan_nodes, accumulable_nodes, metadata_nodes = plan_dfs(metrics)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if debug_me:\n", " accumulable_nodes.groupBy(\"name\", \"accumulatorId\", \"execution_id\").count().where(F.col(\"count\") > 1).orderBy(\"name\").show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accumulator_updates = driver_accumulator_updates(metrics)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if debug_me:\n", " accumulable_nodes.groupBy(\"name\", \"accumulatorId\", \"execution_id\").count().orderBy(-F.col(\"count\"), F.col(\"accumulatorId\")).toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if debug_me:\n", " accumulator_updates.orderBy(\"accumulator\").toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sql_info_df = with_appmeta(sql_info(metrics))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metric_meta = spark.createDataFrame(data=eventlog.metric_metas)\n", "task_metrics = tidy_tasks(metrics)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tasks_to_plans = with_appmeta(task_metrics.join(accumulable_nodes, \"accumulatorId\").join(plan_nodes, \"plan_node\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import altair as alt\n", "alt.data_transformers.disable_max_rows()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Shuffle metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "task_byte_metrics = tidy_tasks(metrics).join(\n", " metric_meta.withColumnRenamed(\"MetricName\", \"Metric Name\"), \n", " \"Metric Name\", \n", " how=\"leftouter\"\n", ").where(F.col(\"unit\") == \"bytes\").groupBy(\"Stage ID\", \"Task ID\", \"Metric Name\").sum(\"Metric Value\").withColumnRenamed(\"sum(Metric Value)\", \"Metric Value\").toPandas()\n", "\n", "task_shuffle_metrics = task_byte_metrics[task_byte_metrics['Metric Name'].str.contains('internal.metrics.shuffle')].sort_values('Task ID')\n", "shuffle_replacer = lambda match: \"Shuffle %s\" % match.group('metric')\n", "task_shuffle_metrics['Metric Name'] = task_shuffle_metrics['Metric 
Name'].str.replace(r'internal\\.metrics\\.shuffle\\.(?P<op>read|write).(?P<metric>.*)$', shuffle_replacer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "stage_and_task_charts(task_shuffle_metrics, \"bytes\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Executor time metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "task_metrics = tidy_tasks(metrics).join(\n", " metric_meta.withColumnRenamed(\"MetricName\", \"Metric Name\"), \n", " \"Metric Name\",\n", " how=\"leftouter\"\n", ").withColumn(\"Metric Value\", F.col(\"Metric Value\").cast(\"float\"))\n", "\n", "task_ms_metrics = task_metrics.where(F.col(\"unit\") == \"ms\").groupBy(\"Stage ID\", \"Task ID\", \"Metric Name\").sum(\"Metric Value\").withColumnRenamed(\"sum(Metric Value)\", \"Metric Value\")\n", "task_ns_metrics = task_metrics.where(F.col(\"unit\") == \"ns\").groupBy(\"Stage ID\", \"Task ID\", \"Metric Name\").sum(\"Metric Value\").withColumnRenamed(\"sum(Metric Value)\", \"Metric Value\").withColumn(\"Metric Value\", F.col(\"Metric Value\").cast(\"float\") / 1000000)\n", "\n", "task_time_metrics = task_ms_metrics.union(task_ns_metrics).toPandas()\n", "\n", "stage_and_task_charts(task_time_metrics[~task_time_metrics[\"Metric Name\"].str.contains(\"internal.metrics\")])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "task_metrics.where(F.col(\"unit\").isin(\"ms\", \"ns\")).select(\"Metric Name\", \"unit\").distinct().toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stage_and_task_charts(\n", " task_time_metrics[task_time_metrics[\"Metric Name\"].str.contains(\"CPU\", case=False) | \n", " task_time_metrics[\"Metric Name\"].str.contains(\"GPU\", case=False) |\n", " task_time_metrics[\"Metric Name\"].str.contains(\"JVM GC\", case=False) |\n", 
task_time_metrics[\"Metric Name\"].str.contains(\"JVMGC\", case=False)\n", " ])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Plotting wall-clock vs CPU time with layered charts\n", "\n", "This gives us some sense of the relationship between CPU time and system time." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cputime = task_time_metrics[task_time_metrics['Metric Name'].str.contains('executorCpuTime')]\n", "runtime = task_time_metrics[task_time_metrics['Metric Name'].str.contains('executorRunTime')]\n", "layered_stage_and_task_charts([runtime, cputime])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Memory and spill metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "stage_and_task_charts(task_byte_metrics[task_byte_metrics['Metric Name'].str.contains(' memory', case=False) | task_byte_metrics['Metric Name'].str.contains('size') | task_byte_metrics['Metric Name'].str.contains('Bytes Spilled')], \"bytes\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stage_and_task_charts(task_byte_metrics, \"bytes\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Task metrics and metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "task_metrics_table, task_meta_table = split_metrics(task_metrics)\n", "task_all_spark = task_metrics_table.join(task_meta_table, [\"Task ID\", \"Stage ID\", \"Application ID\", \"Application Name\"]).join(jobs_to_stages, \"Stage ID\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Query plan node metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plan_metrics = plan_nodes.join(accumulable_nodes, [\"plan_node\", \"Application ID\", \"Application Name\"]).join(task_all_spark, [\"accumulatorId\", 
\"Application ID\", \"Application Name\"])\n", "\n", "plan_metrics_full = plan_metrics_rollup(plan_metrics)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "# Configuration information" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "configs = meltconfig(metrics, [\"SparkListenerEnvironmentUpdate\",\"SparkListenerJobStart\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "configs.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exporting tabular data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "conn = sqlite3.Connection(output_file)\n", "wide_conn = sqlite3.Connection(wide_output_file)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if wide_output_file != output_file:\n", " with_appmeta(configs).toPandas().to_sql(\"configs\", wide_conn, index=False, if_exists='append')\n", " \n", "with_appmeta(configs).toPandas().to_sql(\"configs\", conn, index=False, if_exists='append')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "task_metrics_table.join(jobs_to_stages, \"Stage ID\").toPandas().to_sql('task_metrics', conn, index=False, if_exists='append')\n", "task_meta_table.join(jobs_to_stages, \"Stage ID\").toPandas().to_sql('task_meta', conn, index=False, if_exists='append')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "task_all = task_all_spark.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "project_columns = ['Application ID', 'Application Name', 'Attempt', 'Executor ID', 'Failed', 'Finish Time', 'Getting Result Time', 'Host', 'Index', 'Killed', 'Launch Time', 'Locality', 'Metric Name', 'Metric Value', 'Speculative', 'Job ID', 'Stage ID', 'Task 
ID']\n", "\n", "index_columns = ['Application ID','Application Name','Attempt','Executor ID','Failed','Finish Time','Getting Result Time','Host','Index','Killed','Launch Time','Locality','Speculative','Job ID', 'Stage ID','Task ID']\n", "\n", "wide_tasks = task_all[\n", " project_columns\n", "].pivot_table(index=index_columns, columns=\"Metric Name\", values=\"Metric Value\").reset_index().rename_axis(None, axis=1)\n", "\n", "# wide_tasks.to_sql('wide_tasks', conn, index=False, if_exists='append')\n", "\n", "if wide_output_file != output_file:\n", " safe_write(wide_tasks, 'wide_tasks', wide_conn, index=False)\n", "\n", "safe_write(wide_tasks, 'wide_tasks', conn, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exporting query plans " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ppn = plan_nodes.toPandas()\n", "pan = accumulable_nodes.toPandas()\n", "sqli = with_appmeta(sql_info_df).toPandas()\n", "\n", "sqli.to_sql('sql_info', conn, index=False, if_exists='append')\n", "ppn.to_sql('plans', conn, index=False, if_exists='append')\n", "pan.to_sql('accumulables', conn, index=False, if_exists='append')\n", "\n", "if wide_output_file != output_file:\n", " sqli.to_sql('sql_info', wide_conn, index=False, if_exists='append')\n", " ppn.to_sql('plans', wide_conn, index=False, if_exists='append')\n", " pan.to_sql('accumulables', wide_conn, index=False, if_exists='append')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metric_names = metric_names_for(plan_metrics)\n", "wide_plan_metrics = plan_metrics_full.groupBy([\"plan_node\", \"accumulatorId\", \"Application ID\", \"Application Name\", \"Task ID\", \"Stage ID\", \"Job ID\"]).pivot(\"Metric Name\", metric_names).agg(F.sum(\"Metric Value\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pmf = plan_metrics_full.toPandas()\n", "wpm = 
wide_plan_metrics.toPandas()\n", "\n", "if wide_output_file != output_file:\n", " safe_write(pmf, 'plan_metrics', wide_conn, index=False)\n", " safe_write(wpm, 'wide_plans', wide_conn, index=False)\n", " \n", "safe_write(pmf, 'plan_metrics', conn, index=False)\n", "safe_write(wpm, 'wide_plans', conn, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "safe_write(plan_nodes.toPandas(), 'plan_nodes', conn, index=False)\n", "safe_write(accumulable_nodes.toPandas(), 'accumulable_nodes', conn, index=False)\n", "safe_write(metadata_nodes.toPandas(), 'metadata_nodes', conn, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Miscellaneous metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for table in [job_info, stage_meta, stage_parents, stage_rddmeta, stage_rddparents, app_timeline, sql_timeline, driver_accumulator_updates]:\n", " raw = table(metrics)\n", " the_df = with_appmeta(raw).toPandas()\n", " the_df.to_sql(table.__name__, conn, index=False, if_exists='append')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if wide_output_file != output_file:\n", " wide_conn.execute('CREATE INDEX IF NOT EXISTS accumulable_apps on accumulables ([Application ID])')\n", " wide_conn.execute('CREATE INDEX IF NOT EXISTS plan_apps on plans ([Application ID])')\n", " wide_conn.execute('CREATE INDEX IF NOT EXISTS plan_metrics_apps on plan_metrics ([Application ID])')\n", " wide_conn.execute('CREATE INDEX IF NOT EXISTS wide_tasks_task on wide_tasks ([Application ID], [Task ID])')\n", " wide_conn.execute('CREATE INDEX IF NOT EXISTS wide_tasks_app on wide_tasks ([Application ID])')\n", " \n", "conn.execute('CREATE INDEX IF NOT EXISTS wide_tasks_task on wide_tasks ([Application ID], [Task ID])')\n", "conn.execute('CREATE INDEX IF NOT EXISTS wide_tasks_app on wide_tasks ([Application ID])')\n", 
"conn.execute('CREATE INDEX IF NOT EXISTS accumulable_apps on accumulables ([Application ID])')\n", "conn.execute('CREATE INDEX IF NOT EXISTS task_metric_apps on task_metrics ([Application ID])')\n", "conn.execute('CREATE INDEX IF NOT EXISTS task_metric_names on task_metrics ([Metric Name])')\n", "conn.execute('CREATE INDEX IF NOT EXISTS task_metric_kinds on task_metrics ([kind])')\n", "conn.execute('CREATE INDEX IF NOT EXISTS task_metric_agg on task_metrics ([Task ID], [Metric Name])')\n", "conn.execute('CREATE INDEX IF NOT EXISTS stage_metric_agg on task_metrics ([Stage ID], [Metric Name])')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conn.close()\n", "wide_conn.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if not interactive:\n", " spark.stop()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.8" } }, "nbformat": 4, "nbformat_minor": 4 }