{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Example of cross validation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sklearn.model_selection\n", "import sklearn.datasets\n", "import sklearn.metrics\n", "import autosklearn.classification" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0.980000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),\n", "(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'liblinear_svc', 'imputation:strategy': 'median', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'polynomial', 'rescaling:__choice__': 'minmax', 'classifier:liblinear_svc:C': 2.4244459875201874, 'classifier:liblinear_svc:dual': 'False', 'classifier:liblinear_svc:fit_intercept': 'True', 'classifier:liblinear_svc:intercept_scaling': 1, 'classifier:liblinear_svc:loss': 'squared_hinge', 'classifier:liblinear_svc:multi_class': 'ovr', 'classifier:liblinear_svc:penalty': 'l2', 'classifier:liblinear_svc:tol': 0.0014473335587607684, 'preprocessor:polynomial:degree': 3, 'preprocessor:polynomial:include_bias': 'False', 'preprocessor:polynomial:interaction_only': 'True'},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "]\n", "auto-sklearn results:\n", " Dataset name: digits\n", " Metric: accuracy\n", " Best validation score: 0.988764\n", " Number of target algorithm runs: 24\n", " Number of successful target algorithm runs: 0\n", " Number of crashed target algorithm runs: 0\n", " Number of target algorithms that exceeded the memory limit: 0\n", " Number of target algorithms that exceeded the time limit: 0\n", "\n", "Accuracy score 0.986666666667\n" ] } ], "source": [ "def main():\n", " X, y = sklearn.datasets.load_digits(return_X_y=True)\n", " X_train, X_test, y_train, y_test = \\\n", " sklearn.model_selection.train_test_split(X, y, random_state=1)\n", "\n", " automl = autosklearn.classification.AutoSklearnClassifier(\n", " time_left_for_this_task=120, per_run_time_limit=30,\n", " tmp_folder='/tmp/autoslearn_sequential_example_tmp',\n", " output_folder='/tmp/autosklearn_sequential_example_out',\n", " # Do not construct ensembles in parallel to avoid using more than one\n", " # core at a time. 
The ensemble will be constructed after auto-sklearn\n", " # finished fitting all machine learning models.\n", " ensemble_size=0, delete_tmp_folder_after_terminate=False)\n", " automl.fit(X_train, y_train, dataset_name='digits')\n", " # This call to fit_ensemble uses all models trained in the previous call\n", " # to fit to build an ensemble which can be used with automl.predict()\n", " automl.fit_ensemble(y_train, ensemble_size=50)\n", "\n", " print(automl.show_models())\n", " predictions = automl.predict(X_test)\n", " print(automl.sprint_statistics())\n", " print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n", "\n", "\n", "if __name__ == '__main__':\n", " main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example feature types " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "################################################################################\n", " To run this example you need to install openml-python:\n", " pip install git+https://github.com/renatopp/liac-arff\n", " pip install requests xmltodict\n", " pip install git+https://github.com/openml/openml-python@develop --no-deps\n", "################################################################################\n" ] }, { "ename": "ModuleNotFoundError", "evalue": "No module named 'openml'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mopenml\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m print(\"#\"*80 + \"\"\"\n\u001b[1;32m 5\u001b[0m \u001b[0mTo\u001b[0m \u001b[0mrun\u001b[0m \u001b[0mthis\u001b[0m \u001b[0mexample\u001b[0m \u001b[0myou\u001b[0m \u001b[0mneed\u001b[0m \u001b[0mto\u001b[0m \u001b[0minstall\u001b[0m \u001b[0mopenml\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openml'" ] } ], "source": [ "try:\n", " import openml\n", "except ImportError:\n", " print(\"#\"*80 + \"\"\"\n", " To run this example you need to install openml-python:\n", " pip install git+https://github.com/renatopp/liac-arff\n", " pip install requests xmltodict\n", " pip install git+https://github.com/openml/openml-python@develop --no-deps\\n\"\"\" +\n", " \"#\"*80)\n", " raise\n", "\n", "\n", "def main():\n", " # Load adult dataset from openml.org, see https://www.openml.org/t/2117\n", " openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de'\n", "\n", " task = openml.tasks.get_task(2117)\n", " train_indices, test_indices = task.get_train_test_split_indices()\n", " X, y = task.get_X_and_y()\n", "\n", " X_train = X[train_indices]\n", " y_train = y[train_indices]\n", " X_test = X[test_indices]\n", " y_test = y[test_indices]\n", "\n", " dataset = task.get_dataset()\n", " _, _, categorical_indicator = dataset.\\\n", " get_data(target=task.target_name, return_categorical_indicator=True)\n", "\n", " # Create feature type list from 
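the boolean\n", "    # categorical_indicator (True -> 'Categorical', False -> 'Numerical');\n", "    # auto-sklearn can then apply categorical preprocessing such as one-hot\n", "    # encoding to those columns. Create the list from the 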
openml.org indicator and run autosklearn\n", " feat_type = ['Categorical' if ci else 'Numerical'\n", " for ci in categorical_indicator]\n", "\n", " cls = autosklearn.classification.\\\n", " AutoSklearnClassifier(time_left_for_this_task=120,\n", " per_run_time_limit=30)\n", " cls.fit(X_train, y_train, feat_type=feat_type)\n", "\n", " predictions = cls.predict(X_test)\n", " print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n", "\n", "\n", "if __name__ == \"__main__\":\n", " main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### After doing the following" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#pip install git+https://github.com/renatopp/liac-arff\n", "#pip install requests xmltodict\n", "# pip install git+https://github.com/openml/openml-python@develop --no-deps" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[WARNING] [2017-08-05 17:10:17,667:smac.intensification.intensification.Intensifier] Challenger was the same as the current incumbent; Skipping challenger\n", "[WARNING] [2017-08-05 17:10:17,667:smac.intensification.intensification.Intensifier] Challenger was the same as the current incumbent; Skipping challenger\n", "[WARNING] [2017-08-05 17:10:17,676:smac.intensification.intensification.Intensifier] Challenger was the same as the current incumbent; Skipping challenger\n", "[WARNING] [2017-08-05 17:10:17,676:smac.intensification.intensification.Intensifier] Challenger was the same as the current incumbent; Skipping challenger\n", "Accuracy score 0.851523236334\n" ] } ], "source": [ "try:\n", " import openml\n", "except ImportError:\n", " print(\"#\"*80 + \"\"\"\n", " To run this example you need to install openml-python:\n", " pip install git+https://github.com/renatopp/liac-arff\n", " pip install requests xmltodict\n", " pip install git+https://github.com/openml/openml-python@develop --no-deps\\n\"\"\" +\n", " \"#\"*80)\n", " raise\n", "\n", "\n", "def main():\n", " # Load adult dataset from openml.org, see https://www.openml.org/t/2117\n", " openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de'\n", "\n", " task = openml.tasks.get_task(2117)\n", " train_indices, test_indices = task.get_train_test_split_indices()\n", " X, y = task.get_X_and_y()\n", "\n", " X_train = X[train_indices]\n", " y_train = y[train_indices]\n", " X_test = X[test_indices]\n", " y_test = y[test_indices]\n", "\n", " dataset = task.get_dataset()\n", " _, _, categorical_indicator = dataset.\\\n", " get_data(target=task.target_name, return_categorical_indicator=True)\n", "\n", " # Create feature type list from openml.org indicator and run autosklearn\n", " feat_type = ['Categorical' if ci else 'Numerical'\n", " for ci in categorical_indicator]\n", "\n", " cls = autosklearn.classification.\\\n", " AutoSklearnClassifier(time_left_for_this_task=120,\n", " per_run_time_limit=30)\n", " cls.fit(X_train, y_train, feat_type=feat_type)\n", "\n", " predictions = cls.predict(X_test)\n", " print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n", "\n", "\n", "if __name__ == \"__main__\":\n", " main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example of holdout " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You are already timing task: index_run5\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "[(0.960000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),\n", "(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'standardize', 'classifier:gradient_boosting:learning_rate': 0.03627152792976942, 'classifier:gradient_boosting:loss': 'deviance', 'classifier:gradient_boosting:max_depth': 10, 'classifier:gradient_boosting:max_features': 4.211238636565405, 'classifier:gradient_boosting:max_leaf_nodes': 'None', 'classifier:gradient_boosting:min_samples_leaf': 15, 'classifier:gradient_boosting:min_samples_split': 16, 'classifier:gradient_boosting:min_weight_fraction_leaf': 0.0, 'classifier:gradient_boosting:n_estimators': 340, 'classifier:gradient_boosting:subsample': 0.6289005711340923, 'one_hot_encoding:minimum_fraction': 0.0002148748655476835},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'k_nearest_neighbors', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 'classifier:k_nearest_neighbors:n_neighbors': 2, 'classifier:k_nearest_neighbors:p': 1, 'classifier:k_nearest_neighbors:weights': 'distance', 'one_hot_encoding:minimum_fraction': 0.3530578080502024},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "]\n", "auto-sklearn results:\n", " Dataset name: digits\n", " Metric: accuracy\n", " Best validation score: 0.982022\n", " Number of target algorithm runs: 23\n", " Number of successful target algorithm runs: 0\n", " Number of crashed target algorithm runs: 0\n", " Number of target algorithms that exceeded the memory limit: 0\n", " Number of target algorithms that exceeded the time limit: 0\n", "\n", "Accuracy score 0.991111111111\n" ] } ], "source": [ "def main():\n", " X, y = sklearn.datasets.load_digits(return_X_y=True)\n", " X_train, X_test, y_train, y_test = \\\n", " sklearn.model_selection.train_test_split(X, y, random_state=1)\n", "\n", " automl = autosklearn.classification.AutoSklearnClassifier(\n", " time_left_for_this_task=120, per_run_time_limit=30,\n", " tmp_folder='/tmp/autoslearn_holdout_example_tmp',\n", " output_folder='/tmp/autosklearn_holdout_example_out',\n", " disable_evaluator_output=False)\n", " automl.fit(X_train, y_train, dataset_name='digits')\n", "\n", " # Print the final ensemble constructed by auto-sklearn.\n", " print(automl.show_models())\n", " predictions = automl.predict(X_test)\n", " # Print statistics about the auto-sklearn run such as number of\n", " # iterations, number of models failed with a time out.\n", " print(automl.sprint_statistics())\n", " print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n", "\n", "\n", "if __name__ == '__main__':\n", " main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example metrics " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": 
[ "Available CLASSIFICATION metrics autosklearn.metrics.*:\n", "\t*accuracy\n", "\t*balanced_accuracy\n", "\t*roc_auc\n", "\t*average_precision\n", "\t*log_loss\n", "\t*pac_score\n", "\t*precision\n", "\t*precision_macro\n", "\t*precision_micro\n", "\t*precision_samples\n", "\t*precision_weighted\n", "\t*recall\n", "\t*recall_macro\n", "\t*recall_micro\n", "\t*recall_samples\n", "\t*recall_weighted\n", "\t*f1\n", "\t*f1_macro\n", "\t*f1_micro\n", "\t*f1_samples\n", "\t*f1_weighted\n", "Available REGRESSION autosklearn.metrics.*:\n", "\t*r2\n", "\t*mean_squared_error\n", "\t*mean_absolute_error\n", "\t*median_absolute_error\n", "################################################################################\n", "Use predefined accuracy metric\n", "[WARNING] [2017-08-05 17:16:44,323:smac.intensification.intensification.Intensifier] Challenger was the same as the current incumbent; Skipping challenger\n", "[WARNING] [2017-08-05 17:16:44,323:smac.intensification.intensification.Intensifier] Challenger was the same as the current incumbent; Skipping challenger\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "You are already timing task: index_run2\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy score 0.965035 using accuracy\n", "################################################################################\n", "Use self defined accuracy accuracy metric\n", "[WARNING] [2017-08-05 17:17:37,674:AutoMLSMBO(1)::d6d58dae5b02e07797da6d4d126ac9b6] Could not find meta-data directory /Users/tarrysingh/anaconda/lib/python3.6/site-packages/autosklearn/metalearning/files/accu_binary.classification_dense\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy score 0.951049 using accu\n", "################################################################################\n", "Use self defined accuracy with additional argument\n", "[WARNING] [2017-08-05 17:18:35,228:AutoMLSMBO(1)::d6d58dae5b02e07797da6d4d126ac9b6] Could not find meta-data directory /Users/tarrysingh/anaconda/lib/python3.6/site-packages/autosklearn/metalearning/files/accu_add_binary.classification_dense\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy score 0.951049 using accu_add\n" ] } ], "source": [ "import numpy as np\n", "import sklearn.model_selection\n", "import sklearn.datasets\n", "import sklearn.metrics\n", "import autosklearn.classification\n", "import autosklearn.metrics\n", "\n", "\n", "\n", "def accuracy(solution, prediction):\n", " # function 
defining accuracy\n", " return np.mean(solution == prediction)\n", "\n", "\n", "def accuracy_wk(solution, prediction, dummy):\n", " # function defining accuracy and accepting an additional argument\n", " assert dummy is None\n", " return np.mean(solution == prediction)\n", "\n", "\n", "def main():\n", "\n", " X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", " X_train, X_test, y_train, y_test = \\\n", " sklearn.model_selection.train_test_split(X, y, random_state=1)\n", "\n", " # Print a list of available metrics\n", " print(\"Available CLASSIFICATION metrics autosklearn.metrics.*:\")\n", " print(\"\\t*\" + \"\\n\\t*\".join(autosklearn.metrics.CLASSIFICATION_METRICS))\n", "\n", " print(\"Available REGRESSION autosklearn.metrics.*:\")\n", " print(\"\\t*\" + \"\\n\\t*\".join(autosklearn.metrics.REGRESSION_METRICS))\n", "\n", " # First example: Use predefined accuracy metric\n", " print(\"#\"*80)\n", " print(\"Use predefined accuracy metric\")\n", " cls = autosklearn.classification.\\\n", " AutoSklearnClassifier(time_left_for_this_task=60,\n", " per_run_time_limit=30, seed=1)\n", " cls.fit(X_train, y_train, metric=autosklearn.metrics.accuracy)\n", "\n", " predictions = cls.predict(X_test)\n", " print(\"Accuracy score {:g} using {:s}\".\n", " format(sklearn.metrics.accuracy_score(y_test, predictions),\n", " cls._automl._automl._metric.name))\n", "\n", " # Second example: Use own accuracy metric\n", " print(\"#\"*80)\n", " print(\"Use self defined accuracy accuracy metric\")\n", " accuracy_scorer = autosklearn.metrics.make_scorer(name=\"accu\",\n", " score_func=accuracy,\n", " greater_is_better=True,\n", " needs_proba=False,\n", " needs_threshold=False)\n", " cls = autosklearn.classification.\\\n", " AutoSklearnClassifier(time_left_for_this_task=60,\n", " per_run_time_limit=30, seed=1)\n", " cls.fit(X_train, y_train, metric=accuracy_scorer)\n", "\n", " predictions = cls.predict(X_test)\n", " print(\"Accuracy score {:g} using {:s}\".\n", " format(sklearn.metrics.accuracy_score(y_test, predictions),\n", " cls._automl._automl._metric.name))\n", "\n", " # Third example: Use own accuracy metric with additional argument\n", " print(\"#\"*80)\n", " print(\"Use self defined accuracy with additional argument\")\n", " accuracy_scorer = autosklearn.metrics.make_scorer(name=\"accu_add\",\n", " score_func=accuracy_wk,\n", " greater_is_better=True,\n", " needs_proba=False,\n", " needs_threshold=False,\n", " dummy=None)\n", " cls = autosklearn.classification.\\\n", " AutoSklearnClassifier(time_left_for_this_task=60,\n", " per_run_time_limit=30, seed=1)\n", " cls.fit(X_train, y_train, metric=accuracy_scorer)\n", "\n", " predictions = cls.predict(X_test)\n", " print(\"Accuracy score {:g} using {:s}\".\n", " format(sklearn.metrics.accuracy_score(y_test, predictions),\n", " cls._automl._automl._metric.name))\n", "\n", "\n", "if __name__ == \"__main__\":\n", " main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example in parallel " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting to build an ensemble!\n", "[(0.900000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),\n", "(0.050000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'k_nearest_neighbors', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 
'classifier:k_nearest_neighbors:n_neighbors': 2, 'classifier:k_nearest_neighbors:p': 1, 'classifier:k_nearest_neighbors:weights': 'distance', 'one_hot_encoding:minimum_fraction': 0.3530578080502024},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "(0.050000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'passive_aggressive', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'random_trees_embedding', 'rescaling:__choice__': 'normalize', 'classifier:passive_aggressive:C': 0.10437045130506178, 'classifier:passive_aggressive:fit_intercept': 'True', 'classifier:passive_aggressive:loss': 'hinge', 'classifier:passive_aggressive:n_iter': 25, 'preprocessor:random_trees_embedding:max_depth': 7, 'preprocessor:random_trees_embedding:max_leaf_nodes': 'None', 'preprocessor:random_trees_embedding:min_samples_leaf': 2, 'preprocessor:random_trees_embedding:min_samples_split': 6, 'preprocessor:random_trees_embedding:min_weight_fraction_leaf': 1.0, 'preprocessor:random_trees_embedding:n_estimators': 64},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "]\n", "Accuracy score 0.991111111111\n" ] } ], "source": [ "import multiprocessing\n", "import shutil\n", "from autosklearn.metrics import accuracy\n", "from autosklearn.classification import AutoSklearnClassifier\n", "from autosklearn.constants import *\n", "\n", "tmp_folder = '/tmp/autosklearn_parallel_example_tmp'\n", "output_folder = '/tmp/autosklearn_parallel_example_out'\n", "\n", "\n", "for dir in [tmp_folder, output_folder]:\n", " try:\n", " shutil.rmtree(dir)\n", " except OSError as e:\n", " pass\n", "\n", "\n", "def spawn_classifier(seed, dataset_name):\n", " \"\"\"Spawn a subprocess.\n", " auto-sklearn does not take care of spawning worker processes. This\n", " function, which is called several times in the main block is a new\n", " process which runs one instance of auto-sklearn.\n", " \"\"\"\n", "\n", " # Use the initial configurations from meta-learning only in one out of\n", " # the four processes spawned. This prevents auto-sklearn from evaluating\n", " # the same configurations in four processes.\n", " if seed == 0:\n", " initial_configurations_via_metalearning = 25\n", " else:\n", " initial_configurations_via_metalearning = 0\n", "\n", " # Arguments which are different to other runs of auto-sklearn:\n", " # 1. all classifiers write to the same output directory\n", " # 2. shared_mode is set to True, this enables sharing of data between\n", " # models.\n", " # 3. 
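ensemble_size is set to 0 so each worker only searches for models (the\n", "    #    ensemble is built separately once all runs finish), and\n", "    # 4. 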
all instances of the AutoSklearnClassifier must have a different seed!\n", " automl = AutoSklearnClassifier(\n", " time_left_for_this_task=60, # sec., how long should this seed fit\n", " # process run\n", " per_run_time_limit=15, # sec., each model may only take this long before it's killed\n", " ml_memory_limit=1024, # MB, memory limit imposed on each call to a ML algorithm\n", " shared_mode=True, # tmp folder will be shared between seeds\n", " tmp_folder=tmp_folder,\n", " output_folder=output_folder,\n", " delete_tmp_folder_after_terminate=False,\n", " ensemble_size=0, # ensembles will be built when all optimization runs are finished\n", " initial_configurations_via_metalearning=initial_configurations_via_metalearning,\n", " seed=seed)\n", " automl.fit(X_train, y_train, dataset_name=dataset_name)\n", "\n", "if __name__ == '__main__':\n", " \n", " X, y = sklearn.datasets.load_digits(return_X_y=True)\n", " X_train, X_test, y_train, y_test = \\\n", " sklearn.model_selection.train_test_split(X, y, random_state=1)\n", "\n", " processes = []\n", " for i in range(4): # set this at roughly half of your cores\n", " p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))\n", " p.start()\n", " processes.append(p)\n", " for p in processes:\n", " p.join()\n", "\n", " print('Starting to build an ensemble!')\n", " automl = AutoSklearnClassifier(time_left_for_this_task=15,\n", " per_run_time_limit=15,\n", " ml_memory_limit=1024,\n", " shared_mode=True,\n", " ensemble_size=50,\n", " ensemble_nbest=200,\n", " tmp_folder=tmp_folder,\n", " output_folder=output_folder,\n", " initial_configurations_via_metalearning=0,\n", " seed=1)\n", "\n", " # Both the ensemble_size and ensemble_nbest parameters can be changed now if\n", " # necessary\n", " automl.fit_ensemble(y_train,\n", " task=MULTICLASS_CLASSIFICATION,\n", " metric=accuracy,\n", " precision='32',\n", " dataset_name='digits',\n", " ensemble_size=20,\n", " ensemble_nbest=50)\n", "\n", " predictions = automl.predict(X_test)\n", " print(automl.show_models())\n", " print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Regression example " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[WARNING] [2017-08-05 17:21:45,349:AutoMLSMBO(1)::boston] Could not find meta-data directory /Users/tarrysingh/anaconda/lib/python3.6/site-packages/autosklearn/metalearning/files/r2_regression_dense\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run2\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run3\n", "You are already timing task: index_run4\n", "You are already timing task: index_run5\n", "You are already timing task: index_run5\n", "You are already timing task: index_run5\n", "You are already timing task: index_run5\n", "You are already timing task: index_run6\n", "You are already timing task: index_run6\n", "You are already timing task: index_run6\n", "You are already timing task: index_run7\n", "You are already timing task: index_run7\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[(0.660000, SimpleRegressionPipeline({'imputation:strategy': 'median', 
'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'feature_agglomeration', 'regressor:__choice__': 'random_forest', 'rescaling:__choice__': 'standardize', 'one_hot_encoding:minimum_fraction': 0.010836306032657955, 'preprocessor:feature_agglomeration:affinity': 'euclidean', 'preprocessor:feature_agglomeration:linkage': 'ward', 'preprocessor:feature_agglomeration:n_clusters': 25, 'preprocessor:feature_agglomeration:pooling_func': 'mean', 'regressor:random_forest:bootstrap': 'False', 'regressor:random_forest:criterion': 'mse', 'regressor:random_forest:max_depth': 'None', 'regressor:random_forest:max_features': 4.418965161789183, 'regressor:random_forest:max_leaf_nodes': 'None', 'regressor:random_forest:min_samples_leaf': 2, 'regressor:random_forest:min_samples_split': 14, 'regressor:random_forest:min_weight_fraction_leaf': 0.0, 'regressor:random_forest:n_estimators': 100},\n", "dataset_properties={\n", " 'task': 4,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': False,\n", " 'target_type': 'regression',\n", " 'signed': False})),\n", "(0.200000, SimpleRegressionPipeline({'imputation:strategy': 'most_frequent', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'random_trees_embedding', 'regressor:__choice__': 'ridge_regression', 'rescaling:__choice__': 'standardize', 'one_hot_encoding:minimum_fraction': 0.00017301224964059824, 'preprocessor:random_trees_embedding:max_depth': 6, 'preprocessor:random_trees_embedding:max_leaf_nodes': 'None', 'preprocessor:random_trees_embedding:min_samples_leaf': 4, 'preprocessor:random_trees_embedding:min_samples_split': 17, 'preprocessor:random_trees_embedding:min_weight_fraction_leaf': 1.0, 'preprocessor:random_trees_embedding:n_estimators': 56, 'regressor:ridge_regression:alpha': 4.110147069014959, 'regressor:ridge_regression:fit_intercept': 'True', 'regressor:ridge_regression:tol': 0.003182277286417395},\n", "dataset_properties={\n", " 'task': 4,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': False,\n", " 'target_type': 'regression',\n", " 'signed': False})),\n", "(0.140000, SimpleRegressionPipeline({'imputation:strategy': 'most_frequent', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'random_trees_embedding', 'regressor:__choice__': 'ridge_regression', 'rescaling:__choice__': 'standardize', 'preprocessor:random_trees_embedding:max_depth': 6, 'preprocessor:random_trees_embedding:max_leaf_nodes': 'None', 'preprocessor:random_trees_embedding:min_samples_leaf': 18, 'preprocessor:random_trees_embedding:min_samples_split': 20, 'preprocessor:random_trees_embedding:min_weight_fraction_leaf': 1.0, 'preprocessor:random_trees_embedding:n_estimators': 86, 'regressor:ridge_regression:alpha': 4.152939579336265e-05, 'regressor:ridge_regression:fit_intercept': 'True', 'regressor:ridge_regression:tol': 0.00038470779387244015},\n", "dataset_properties={\n", " 'task': 4,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': False,\n", " 'target_type': 'regression',\n", " 'signed': False})),\n", "]\n", "R2 score: 0.86442782432\n" ] } ], "source": [ "import autosklearn.regression\n", "\n", "\n", "def main():\n", " X, y = sklearn.datasets.load_boston(return_X_y=True)\n", " feature_types = (['numerical'] * 3) + ['categorical'] + (['numerical'] * 9)\n", " X_train, X_test, y_train, y_test = \\\n", " sklearn.model_selection.train_test_split(X, y, random_state=1)\n", "\n", " automl = autosklearn.regression.AutoSklearnRegressor(\n", " 
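# (time_left_for_this_task is the total optimization budget in seconds,\n", "        # per_run_time_limit caps each individual model fit; small values here keep\n", "        # the example fast.)\n", "        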
time_left_for_this_task=120, per_run_time_limit=30,\n", " tmp_folder='/tmp/autoslearn_regression_example_tmp',\n", " output_folder='/tmp/autosklearn_regression_example_out')\n", " automl.fit(X_train, y_train, dataset_name='boston',\n", " feat_type=feature_types)\n", "\n", " print(automl.show_models())\n", " predictions = automl.predict(X_test)\n", " print(\"R2 score:\", sklearn.metrics.r2_score(y_test, predictions))\n", "\n", "\n", "if __name__ == '__main__':\n", " main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ... and Finally Sequential " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0.960000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),\n", "(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'standardize', 'classifier:gradient_boosting:learning_rate': 0.03627152792976942, 'classifier:gradient_boosting:loss': 'deviance', 'classifier:gradient_boosting:max_depth': 10, 'classifier:gradient_boosting:max_features': 4.211238636565405, 'classifier:gradient_boosting:max_leaf_nodes': 'None', 'classifier:gradient_boosting:min_samples_leaf': 15, 'classifier:gradient_boosting:min_samples_split': 16, 'classifier:gradient_boosting:min_weight_fraction_leaf': 0.0, 'classifier:gradient_boosting:n_estimators': 340, 'classifier:gradient_boosting:subsample': 0.6289005711340923, 'one_hot_encoding:minimum_fraction': 0.0002148748655476835},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'k_nearest_neighbors', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 'classifier:k_nearest_neighbors:n_neighbors': 2, 'classifier:k_nearest_neighbors:p': 1, 'classifier:k_nearest_neighbors:weights': 'distance', 'one_hot_encoding:minimum_fraction': 0.3530578080502024},\n", "dataset_properties={\n", " 'task': 2,\n", " 'sparse': False,\n", " 'multilabel': False,\n", " 'multiclass': True,\n", " 'target_type': 'classification',\n", " 'signed': False})),\n", "]\n", "auto-sklearn results:\n", " Dataset name: digits\n", " Metric: accuracy\n", " Best validation score: 0.982022\n", " Number of target algorithm runs: 23\n", " Number of successful target algorithm runs: 0\n", " Number of crashed target algorithm runs: 0\n", " Number of target algorithms that exceeded the memory limit: 0\n", " Number of target algorithms that exceeded the time limit: 0\n", "\n", "Accuracy score 0.991111111111\n" ] } ], "source": [ "def main():\n", " X, y = sklearn.datasets.load_digits(return_X_y=True)\n", " X_train, X_test, y_train, y_test = \\\n", " sklearn.model_selection.train_test_split(X, y, random_state=1)\n", "\n", " automl = autosklearn.classification.AutoSklearnClassifier(\n", " time_left_for_this_task=120, per_run_time_limit=30,\n", " tmp_folder='/tmp/autoslearn_sequential_example_tmp',\n", " output_folder='/tmp/autosklearn_sequential_example_out',\n", " # Do not construct ensembles in parallel to avoid using more than one\n", " # core at 
a time. The ensemble will be constructed after auto-sklearn\n", " # finished fitting all machine learning models.\n", " ensemble_size=0, delete_tmp_folder_after_terminate=False)\n", " automl.fit(X_train, y_train, dataset_name='digits')\n", " # This call to fit_ensemble uses all models trained in the previous call\n", " # to fit to build an ensemble which can be used with automl.predict()\n", " automl.fit_ensemble(y_train, ensemble_size=50)\n", "\n", " print(automl.show_models())\n", " predictions = automl.predict(X_test)\n", " print(automl.sprint_statistics())\n", " print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n", "\n", "\n", "if __name__ == '__main__':\n", " main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }