{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "clusterdata_analysis_colab.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyN3LZGXsYYQ/HRBwFP0rm4Q", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "-qcDkhOIjj8a", "colab_type": "text" }, "source": [ "# Google trace analysis Colab\n", "\n", "This Colab provides several example queries and graphs using [Altair](https://altair-viz.github.io/) for the 2019 Google cluster trace. Further examples will be added over time.\n", "\n", "**Important:** in order to run the queries, you will need to:\n", "\n", "1. Use the [Cloud Resource Manager](https://console.cloud.google.com/cloud-resource-manager) to create a Cloud Platform project if you do not already have one.\n", "2. [Enable billing](https://support.google.com/cloud/answer/6293499#enable-billing) for the project.\n", "3. 
[Enable BigQuery](https://console.cloud.google.com/flows/enableapi?apiid=bigquery) API for the project.\n" ] }, { "cell_type": "code", "metadata": { "id": "Vcjo13Kejgij", "colab_type": "code", "colab": {} }, "source": [ "#@title Please input your project id\n", "import pandas as pd\n", "import numpy as np\n", "import altair as alt\n", "from google.cloud import bigquery\n", "# Provide credentials to the runtime\n", "from google.colab import auth\n", "from google.cloud.bigquery import magics\n", "\n", "auth.authenticate_user()\n", "print('Authenticated')\n", "project_id = '' #@param {type: \"string\"}\n", "# Set the default project id for %bigquery magic\n", "magics.context.project = project_id\n", "\n", "# Use the client to run queries constructed by the helper functions below.\n", "client = bigquery.Client(project=project_id)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "NFUPuLiajrC8", "colab_type": "text" }, "source": [ "# Basic queries\n", "\n", "This section shows the most basic way of querying the trace: using the [bigquery magic](https://googleapis.dev/python/bigquery/latest/magics.html)." ] }, { "cell_type": "code", "metadata": { "id": "3xyBH9oQjr1w", "colab_type": "code", "colab": {} }, "source": [ "%%bigquery\n", "SELECT capacity.cpus AS cpu_cap, \n", "capacity.memory AS memory_cap, \n", "COUNT(DISTINCT machine_id) AS num_machines\n", "FROM `google.com:google-cluster-data`.clusterdata_2019_a.machine_events\n", "GROUP BY 1,2" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SzIHMxy2jvsM", "colab_type": "code", "colab": {} }, "source": [ "%%bigquery\n", "SELECT COUNT(DISTINCT collection_id) AS collections FROM \n", "`google.com:google-cluster-data`.clusterdata_2019_a.collection_events;" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "gm-gaRS-jwZj", "colab_type": "text" }, "source": [ "# Cell-level resource usage time 
series\n", "\n", "This query takes a cell as input and plots a resource-usage time series for every hour of the trace, broken down by tier." ] }, { "cell_type": "code", "metadata": { "id": "ekhDGYOAjy54", "colab_type": "code", "colab": {} }, "source": [ "#@title Select a cell to plot its cell-level usage time series\n", "\n", "def query_cell_capacity(cell):\n", " return '''\n", "SELECT SUM(cpu_cap) AS cpu_capacity,\n", " SUM(memory_cap) AS memory_capacity\n", "FROM (\n", " SELECT machine_id, MAX(capacity.cpus) AS cpu_cap,\n", " MAX(capacity.memory) AS memory_cap\n", " FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.machine_events\n", " GROUP BY 1\n", ")\n", " '''.format(cell=cell)\n", "\n", "def query_per_instance_usage_priority(cell):\n", " return '''\n", "SELECT u.time AS time,\n", " u.collection_id AS collection_id,\n", " u.instance_index AS instance_index,\n", " e.priority AS priority,\n", " CASE\n", " WHEN e.priority BETWEEN 0 AND 99 THEN '1_free'\n", " WHEN e.priority BETWEEN 100 AND 115 THEN '2_beb'\n", " WHEN e.priority BETWEEN 116 AND 119 THEN '3_mid'\n", " ELSE '4_prod'\n", " END AS tier,\n", " u.cpu_usage AS cpu_usage,\n", " u.memory_usage AS memory_usage\n", "FROM (\n", " SELECT start_time AS time,\n", " collection_id,\n", " instance_index,\n", " machine_id,\n", " average_usage.cpus AS cpu_usage,\n", " average_usage.memory AS memory_usage\n", " FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_usage\n", " WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)\n", " AND (end_time - start_time) >= (5 * 60 * 1e6)\n", ") AS u JOIN (\n", " SELECT collection_id, instance_index, machine_id,\n", " MAX(priority) AS priority\n", " FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_events\n", " WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)\n", " GROUP BY 1, 2, 3\n", ") AS e ON u.collection_id = e.collection_id\n", " AND u.instance_index = e.instance_index\n", " AND 
u.machine_id = e.machine_id\n", " '''.format(cell=cell)\n", "\n", "def query_per_tier_utilization_time_series(cell, cpu_capacity, memory_capacity):\n", " return '''\n", "SELECT CAST(FLOOR(time/(1e6 * 60 * 60)) AS INT64) AS hour_index,\n", " tier,\n", " -- Divide by 12 because each hour contains 12 five-minute usage windows.\n", " SUM(cpu_usage) / (12 * {cpu_capacity}) AS avg_cpu_usage,\n", " SUM(memory_usage) / (12 * {memory_capacity}) AS avg_memory_usage\n", "FROM ({table})\n", "GROUP BY 1, 2 ORDER BY hour_index\n", " '''.format(table=query_per_instance_usage_priority(cell),\n", " cpu_capacity=cpu_capacity, memory_capacity=memory_capacity)\n", " \n", "def run_query_utilization_per_tier_time_series(cell):\n", " cell_cap = client.query(query_cell_capacity(cell)).to_dataframe()\n", " query = query_per_tier_utilization_time_series(\n", " cell,\n", " cell_cap['cpu_capacity'][0],\n", " cell_cap['memory_capacity'][0])\n", " time_series = client.query(query).to_dataframe()\n", " return time_series\n", "\n", "cell = 'c' #@param ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']\n", "hourly_usage = run_query_utilization_per_tier_time_series(cell)\n", "\n", "# CPU graph\n", "cpu = alt.Chart(hourly_usage).mark_area().encode(\n", " alt.X('hour_index:N'),\n", " alt.Y('avg_cpu_usage:Q'),\n", " color=alt.Color('tier', legend=alt.Legend(orient=\"left\")),\n", " order=alt.Order('tier', sort='descending'),\n", " tooltip=['hour_index', 'tier', 'avg_cpu_usage']\n", " )\n", "cpu.encoding.x.title = \"Hour\"\n", "cpu.encoding.y.title = \"Average CPU usage\"\n", "cpu.display()\n", "\n", "# Memory graph\n", "memory = alt.Chart(hourly_usage).mark_area().encode(\n", " alt.X('hour_index:N'),\n", " alt.Y('avg_memory_usage:Q'),\n", " color=alt.Color('tier', legend=alt.Legend(orient=\"left\")),\n", " order=alt.Order('tier', sort='descending'),\n", " tooltip=['hour_index', 'tier', 'avg_memory_usage']\n", " )\n", "memory.encoding.x.title = \"Hour\"\n", "memory.encoding.y.title = \"Average memory usage\"\n", "memory.display()" ], "execution_count": null, "outputs": [] }, { "cell_type": 
"markdown", "metadata": { "id": "qz9m4P5hj2bv", "colab_type": "text" }, "source": [ "# Per-machine resource usage distribution\n", "\n", "This query takes a cell as input and plots per-machine resource utilization CDFs for CPU and memory." ] }, { "cell_type": "code", "metadata": { "id": "fgSP3kvyj25-", "colab_type": "code", "colab": {} }, "source": [ "#@title Select a cell and plot its per-machine resource utilization CDFs\n", "\n", "# Functions to plot CDFs using Altair\n", "\n", "# - data: a dataframe where each row of column qcol holds one quantile array;\n", "# if name is given, that column provides the output column names.\n", "def pick_quantiles_from_tall_dataframe(data, qcol, name=\"\"):\n", " quantiles = pd.DataFrame([x for x in data[qcol]]).transpose()\n", " if name != \"\":\n", " quantiles.columns = data[name]\n", " return quantiles\n", "\n", "# - data: a dataframe with one row and one or more columns of quantiles (results\n", "# returned from APPROX_QUANTILES)\n", "# - qcols: the names of the quantile columns to extract from data.\n", "# - names: optional display names for the returned columns (defaults to qcols).\n", "def pick_quantiles_from_wide_dataframe(data, qcols, names=[]):\n", " quantiles = {}\n", " for i, qcol in enumerate(qcols):\n", " col_name = names[i] if i < len(names) else qcol\n", " quantiles[col_name] = data[qcol][0]\n", " return pd.DataFrame(quantiles)\n", "\n", "# - quantiles: a dataframe where each column contains the quantiles of one\n", "# data set. The index (i.e. row names) of the dataframe is the quantile. 
The\n", "# column names are the names of the data sets.\n", "def plot_cdfs(quantiles, xlab=\"Value\", ylab=\"CDF\",\n", " legend_title=\"dataset\", labels=[],\n", " interactive=False,\n", " title=''):\n", " dfs = []\n", " label = legend_title\n", " yval = range(quantiles.shape[0])\n", " step = 1.0/(len(quantiles)-1)\n", " yval = [y * step for y in yval]\n", " while label == xlab or label == ylab:\n", " label += '_'\n", " for col_idx, col in enumerate(quantiles.columns):\n", " col_label = col\n", " if col_idx < len(labels):\n", " col_label = labels[col_idx]\n", " dfs.append(pd.DataFrame({\n", " label: col_label,\n", " xlab: quantiles[col],\n", " ylab: yval\n", " }))\n", " cdfs = pd.concat(dfs)\n", " lines = alt.Chart(cdfs).mark_line().encode(\n", " # Both axes of a CDF are continuous real values, so encode them as quantitative\n", " x=xlab+\":Q\",\n", " y=ylab+\":Q\",\n", " color=label+\":N\"\n", " ).properties(\n", " title=title\n", " )\n", " if not interactive:\n", " return lines\n", " # Create a selection that chooses the nearest point and selects based on y-value\n", " nearest = alt.selection(type='single', nearest=True, on='mouseover',\n", " fields=[ylab], empty='none')\n", " # Transparent selectors across the chart. 
This is what tells us\n", " # the y-value of the cursor\n", " selectors = alt.Chart(cdfs).mark_point().encode(\n", " y=ylab+\":Q\",\n", " opacity=alt.value(0),\n", " ).properties(\n", " selection=nearest\n", " )\n", "\n", " # Draw text labels near the points, and highlight based on selection\n", " text = lines.mark_text(align='left', dx=5, dy=-5).encode(\n", " text=alt.condition(nearest,\n", " alt.Text(xlab+\":Q\", format=\".2f\"),\n", " alt.value(' '))\n", " )\n", "\n", " # Draw a rule at the location of the selection\n", " rules = alt.Chart(cdfs).mark_rule(color='gray').encode(\n", " y=ylab+\":Q\",\n", " ).transform_filter(\n", " nearest.ref()\n", " )\n", " # Draw points on the line, and highlight based on selection\n", " points = lines.mark_point().encode(\n", " opacity=alt.condition(nearest, alt.value(1), alt.value(0))\n", " )\n", " # Put the five layers into a chart and bind the data\n", " return alt.layer(lines, selectors, rules, text, points).interactive(\n", " bind_y=False)\n", " \n", "# Functions to create the query\n", "\n", "def query_machine_capacity(cell):\n", " return '''\n", "SELECT machine_id, MAX(capacity.cpus) AS cpu_cap,\n", " MAX(capacity.memory) AS memory_cap\n", "FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.machine_events\n", "GROUP BY 1\n", " '''.format(cell=cell)\n", "\n", "def query_top_level_instance_usage(cell):\n", " return '''\n", "-- Bucket start times into five-minute (300 s) windows.\n", "SELECT CAST(FLOOR(start_time/(1e6 * 300)) * (1e6 * 300) AS INT64) AS time,\n", " collection_id,\n", " instance_index,\n", " machine_id,\n", " average_usage.cpus AS cpu_usage,\n", " average_usage.memory AS memory_usage\n", "FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_usage\n", "WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)\n", " AND (end_time - start_time) >= (5 * 60 * 1e6)\n", " '''.format(cell=cell)\n", "\n", "def query_machine_usage(cell):\n", " return '''\n", "SELECT u.time AS time,\n", " u.machine_id AS machine_id,\n", " SUM(u.cpu_usage) AS 
cpu_usage,\n", " SUM(u.memory_usage) AS memory_usage,\n", " MAX(m.cpu_cap) AS cpu_capacity,\n", " MAX(m.memory_cap) AS memory_capacity\n", "FROM ({instance_usage}) AS u JOIN\n", " ({machine_capacity}) AS m\n", "ON u.machine_id = m.machine_id\n", "GROUP BY 1, 2\n", " '''.format(instance_usage=query_top_level_instance_usage(cell),\n", " machine_capacity=query_machine_capacity(cell))\n", " \n", "def query_machine_utilization_distribution(cell):\n", " return '''\n", "-- Clamp utilization at 1.0, since measured usage can exceed capacity.\n", "SELECT APPROX_QUANTILES(IF(cpu_usage > cpu_capacity, 1.0, cpu_usage / cpu_capacity), 100) AS cpu_util_dist,\n", " APPROX_QUANTILES(IF(memory_usage > memory_capacity, 1.0, memory_usage / memory_capacity), 100) AS memory_util_dist\n", "FROM ({table})\n", " '''.format(table=query_machine_usage(cell))\n", "\n", "cell = 'd' #@param ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']\n", "query = query_machine_utilization_distribution(cell)\n", "machine_util_dist = client.query(query).to_dataframe()\n", "plot_cdfs(pick_quantiles_from_wide_dataframe(machine_util_dist, ['cpu_util_dist', 'memory_util_dist'], ['CPU', 'Memory']), xlab='x - resource utilization (fraction)', ylab=\"Probability (resource utilization < x)\", interactive=True)" ], "execution_count": null, "outputs": [] } ] }