{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "name": "Vega + Comet.ipynb",
   "provenance": [],
   "collapsed_sections": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "metadata": {
    "id": "Tx5C-o8hy0y6"
   },
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q comet_ml"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "vIofsUO4zQdh"
   },
   "source": [
    "import comet_ml\n",
    "comet_ml.login()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "SuF1W-4lAvn-"
   },
   "source": [
    "from sklearn.datasets import load_wine as load_data"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "Qd8TTfwrAzWA"
   },
   "source": [
    "dataset = load_data()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "WlVUGqViA9GD"
   },
   "source": [
    "X, y = dataset.data, dataset.target\n",
    "featurecols = dataset.feature_names"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "b6OnxD2PCAqr"
   },
   "source": [
    "import pandas as pd\n",
    "\n",
    "# One table: the feature matrix under its column names, plus the dataset\n",
    "# target stored in a \"target\" column.\n",
    "df = pd.DataFrame(X, columns=featurecols).assign(target=y)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "2KsTEoywGFt3"
   },
   "source": [
    "df.head()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "ork4vYk1bA1Z"
   },
   "source": [
    "# Log the assembled table to the \"comet-vega\" project as a run tagged \"dataset\".\n",
    "experiment = comet_ml.start(project_name=\"comet-vega\")\n",
    "try:\n",
    "    experiment.add_tag(\"dataset\")\n",
    "    # NOTE(review): orient=\"records\" is an extra keyword; presumably it is\n",
    "    # forwarded to pandas' JSON writer - confirm against the Comet docs.\n",
    "    experiment.log_table(\"wine.json\", df, headers=False, orient=\"records\")\n",
    "finally:\n",
    "    # Close the run even when tagging or logging raises, so no experiment is\n",
    "    # left open in the notebook kernel.\n",
    "    experiment.end()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "shnDFgEdiMkw"
   },
   "source": [
    "df.shape"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "VTdFWCmfWjc4"
   },
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.preprocessing import OneHotEncoder"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "XfCq1DRajftB"
   },
   "source": [
    "y.shape"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "EEpBfvyPavrv"
   },
   "source": [
    "RANDOM_STATE = 1\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=RANDOM_STATE\n",
    ")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "WawyXybEjvyW"
   },
   "source": [
    "y_test.shape"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "reLHUCNJWwfp"
   },
   "source": [
    "optimizer_config = {\n",
    "    # We pick the random search algorithm:\n",
    "    \"algorithm\": \"random\",\n",
    "    # Declare your hyperparameters in the Vizier-inspired format:\n",
    "    \"parameters\": {\n",
    "        \"n_estimators\": {\"type\": \"discrete\", \"values\": [10, 100, 500]},\n",
    "        \"max_depth\": {\"type\": \"discrete\", \"values\": [4, 6, 8]},\n",
    "    },\n",
    "    # Declare what we will be optimizing, and how:\n",
    "    \"spec\": {\n",
    "        \"metric\": \"accuracy\",\n",
    "        \"objective\": \"maximize\",\n",
    "    },\n",
    "}\n",
    "optimizer = comet_ml.Optimizer(optimizer_config)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "Y6ghfoVob8Gx"
   },
   "source": [
    "def create_feature_importance_df(model, feature_names):\n",
    "    \"\"\"Pair each feature name with the model's importance score for it.\n",
    "\n",
    "    Args:\n",
    "        model: Object exposing a ``feature_importances_`` attribute.\n",
    "        feature_names: Feature labels; assumed to be ordered like the\n",
    "            importances (not checked here).\n",
    "\n",
    "    Returns:\n",
    "        A DataFrame with a ``feature_name`` column and a\n",
    "        ``feature_importance`` column.\n",
    "    \"\"\"\n",
    "    scores = model.feature_importances_\n",
    "\n",
    "    table = pd.DataFrame()\n",
    "    table[\"feature_name\"] = feature_names\n",
    "    table[\"feature_importance\"] = scores\n",
    "    return table"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "oMlJU0r3ZK2a"
   },
   "source": [
    "# Train and log one random forest per experiment handed out by the optimizer;\n",
    "# each experiment supplies its own max_depth / n_estimators values.\n",
    "for experiment in optimizer.get_experiments(project_name=\"comet-vega\"):\n",
    "    model = RandomForestClassifier(\n",
    "        random_state=RANDOM_STATE,\n",
    "        max_depth=experiment.get_parameter(\"max_depth\"),\n",
    "        n_estimators=experiment.get_parameter(\"n_estimators\"),\n",
    "    )\n",
    "\n",
    "    model.fit(X_train, y_train)\n",
    "    predictions = model.predict(X_test)\n",
    "\n",
    "    # The report mixes nested dicts with scalar entries: dicts are logged in\n",
    "    # bulk under a per-key prefix, scalars are logged under their own key.\n",
    "    report = classification_report(y_test, predictions, output_dict=True)\n",
    "    for k, v in report.items():\n",
    "        if isinstance(v, dict):\n",
    "            experiment.log_metrics(v, prefix=f\"label_{k}\")\n",
    "        else:\n",
    "            experiment.log_metric(k, v)\n",
    "\n",
    "    feature_importance = create_feature_importance_df(model, featurecols)\n",
    "    experiment.log_table(\n",
    "        \"importance.json\", feature_importance, headers=False, orient=\"records\"\n",
    "    )\n",
    "\n",
    "    # End each run explicitly, as the dataset-logging cell does, so that no\n",
    "    # experiment is left open in the long-lived notebook kernel.\n",
    "    experiment.end()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "Q6OVXUrvqyCk"
   },
   "source": [
    ""
   ],
   "execution_count": null,
   "outputs": []
  }
 ]
}