{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>classifier</th>\n",
       "      <th>parameters</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>macrof1</th>\n",
       "      <th>bal_accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>led24</td>\n",
       "      <td>BernoulliNB</td>\n",
       "      <td>preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...</td>\n",
       "      <td>0.722812</td>\n",
       "      <td>0.721038</td>\n",
       "      <td>0.84569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>led24</td>\n",
       "      <td>BernoulliNB</td>\n",
       "      <td>preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...</td>\n",
       "      <td>0.722812</td>\n",
       "      <td>0.721038</td>\n",
       "      <td>0.84569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>led24</td>\n",
       "      <td>BernoulliNB</td>\n",
       "      <td>preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...</td>\n",
       "      <td>0.722812</td>\n",
       "      <td>0.721038</td>\n",
       "      <td>0.84569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>led24</td>\n",
       "      <td>BernoulliNB</td>\n",
       "      <td>preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...</td>\n",
       "      <td>0.722812</td>\n",
       "      <td>0.721038</td>\n",
       "      <td>0.84569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>led24</td>\n",
       "      <td>BernoulliNB</td>\n",
       "      <td>preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...</td>\n",
       "      <td>0.722812</td>\n",
       "      <td>0.721038</td>\n",
       "      <td>0.84569</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  dataset   classifier                                         parameters  \\\n",
       "0   led24  BernoulliNB  preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...   \n",
       "1   led24  BernoulliNB  preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...   \n",
       "2   led24  BernoulliNB  preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...   \n",
       "3   led24  BernoulliNB  preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...   \n",
       "4   led24  BernoulliNB  preprocessor=Binarizer,alpha=0.0,fit_prior=Tru...   \n",
       "\n",
       "   accuracy   macrof1  bal_accuracy  \n",
       "0  0.722812  0.721038       0.84569  \n",
       "1  0.722812  0.721038       0.84569  \n",
       "2  0.722812  0.721038       0.84569  \n",
       "3  0.722812  0.721038       0.84569  \n",
       "4  0.722812  0.721038       0.84569  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Passing `names` makes pandas treat the first line of the file as data,\n",
    "# i.e. the TSV is read as having no header row.\n",
    "RESULT_COLUMNS = ['dataset', 'classifier', 'parameters',\n",
    "                  'accuracy', 'macrof1', 'bal_accuracy']\n",
    "\n",
    "data = (\n",
    "    pd.read_csv('sklearn-preprocessor-benchmark-data.tsv.gz', sep='\\t', names=RESULT_COLUMNS)\n",
    "    # Blank out missing `parameters` only: the groupby on\n",
    "    # ['classifier', 'parameters'] used later would drop rows whose key is NaN.\n",
    "    # A blanket fillna('') would also replace any missing metric with an empty\n",
    "    # string and turn that numeric column into object dtype.\n",
    "    .fillna({'parameters': ''})\n",
    ")\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "166"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every distinct dataset name that appears anywhere in the results table.\n",
    "dataset_names = data['dataset'].unique()\n",
    "all_datasets = set(dataset_names)\n",
    "len(all_datasets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 672256/672256 [03:26<00:00, 3255.35it/s] \n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Maps 'classifier-parameters' to the datasets that configuration has no result row for.\n",
    "# Configurations with a row for every dataset get no entry.\n",
    "missing_evaluations = {}\n",
    "\n",
    "config_groups = data.groupby(['classifier', 'parameters'])\n",
    "for (classifier, parameters), config_rows in tqdm(config_groups):\n",
    "    evaluated = set(config_rows['dataset'].unique())\n",
    "    not_evaluated = list(all_datasets - evaluated)\n",
    "    if not_evaluated:\n",
    "        missing_evaluations['-'.join((classifier, parameters))] = not_evaluated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(672256, 670749)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Left: number of distinct (classifier, parameters) configurations.\n",
    "# Right: number of entries in missing_evaluations, i.e. configurations lacking a result on at least one dataset.\n",
    "max_accuracy_per_config = data.groupby(['classifier', 'parameters'])['accuracy'].max().reset_index()\n",
    "(len(max_accuracy_per_config), len(missing_evaluations))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "from itertools import chain\n",
    "\n",
    "# For each dataset, the number of classifier/parameter configurations that have\n",
    "# no result for it: one tally per appearance in a missing-datasets list.\n",
    "dataset_counts = Counter(chain.from_iterable(missing_evaluations.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mnist 670368 (70000, 785)\n",
      "kddcup 664716 (494020, 42)\n",
      "poker 651044 (1025009, 11)\n",
      "mushroom 630647 (8124, 23)\n",
      "mfeat-morphological 613486 (2000, 7)\n",
      "fars 558980 (100968, 30)\n",
      "mfeat-karhunen 538282 (2000, 65)\n",
      "GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1 534425 (1600, 1001)\n",
      "clean2 523655 (6598, 169)\n",
      "mfeat-pixel 516721 (2000, 241)\n",
      "mfeat-factors 515980 (2000, 217)\n",
      "connect-4 501952 (67557, 43)\n",
      "dna 499840 (3186, 181)\n",
      "letter 496578 (20000, 17)\n",
      "sleep 479563 (105908, 14)\n",
      "waveform-21 472548 (5000, 22)\n",
      "parity5+5 464801 (1124, 11)\n",
      "shuttle 423744 (58000, 10)\n",
      "texture 419133 (5500, 41)\n",
      "optdigits 416683 (5620, 65)\n",
      "lymphography 412790 (148, 19)\n",
      "kr-vs-kp 411913 (3196, 37)\n",
      "hypothyroid 408873 (3163, 26)\n",
      "analcatdata_authorship 407422 (841, 71)\n",
      "allhypo 397289 (3770, 30)\n",
      "satimage 395571 (6435, 37)\n",
      "allrep 394608 (3772, 30)\n",
      "krkopt 394021 (28056, 7)\n",
      "churn 393835 (5000, 21)\n",
      "german 391877 (1000, 21)\n",
      "solar-flare_2 391625 (1066, 13)\n",
      "waveform-40 390714 (5000, 41)\n",
      "twonorm 375651 (7400, 21)\n",
      "horse-colic 372328 (368, 23)\n",
      "led7 363254 (3200, 8)\n",
      "pendigits 342452 (10992, 17)\n",
      "clean1 318285 (476, 169)\n",
      "prnn_crabs 306492 (200, 8)\n",
      "GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001 303088 (1600, 21)\n",
      "molecular-biology_promoters 299878 (106, 59)\n",
      "ionosphere 291056 (351, 35)\n",
      "coil2000 211389 (9822, 86)\n",
      "monk2 177503 (601, 7)\n",
      "liver-disorder 173703 (345, 7)\n",
      "Hill_Valley_with_noise 164850 (1212, 101)\n",
      "cmc 162261 (1473, 10)\n",
      "adult 161522 (48842, 15)\n",
      "collins 160129 (485, 24)\n",
      "ecoli 158180 (327, 8)\n",
      "mfeat-fourier 155190 (2000, 77)\n",
      "Hill_Valley_without_noise 150347 (1212, 101)\n",
      "glass 147097 (205, 10)\n",
      "mfeat-zernike 146042 (2000, 48)\n",
      "movement_libras 143856 (360, 91)\n",
      "magic 142839 (19020, 11)\n",
      "cloud 141903 (108, 8)\n",
      "phoneme 139319 (5404, 6)\n",
      "agaricus-lepiota 138797 (8145, 23)\n",
      "page-blocks 138579 (5473, 11)\n",
      "calendarDOW 136917 (399, 33)\n",
      "cleveland-nominal 136461 (303, 8)\n",
      "contraceptive 135137 (1473, 10)\n",
      "GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1 134632 (1600, 21)\n",
      "iris 134621 (150, 5)\n",
      "monk1 134079 (556, 7)\n",
      "cars 131878 (392, 9)\n",
      "GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1 131467 (1600, 21)\n",
      "nursery 129663 (12958, 9)\n",
      "banana 129433 (5300, 3)\n",
      "splice 125856 (3188, 61)\n",
      "hayes-roth 125658 (160, 5)\n",
      "flare 125588 (1066, 11)\n",
      "balance-scale 121579 (625, 5)\n",
      "ann-thyroid 120470 (7200, 22)\n",
      "GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001 118885 (1600, 21)\n",
      "analcatdata_dmft 118871 (797, 5)\n",
      "led24 117122 (3200, 25)\n",
      "spambase 116508 (4601, 58)\n",
      "confidence 116214 (72, 4)\n",
      "analcatdata_happiness 116148 (60, 4)\n",
      "yeast 115376 (1479, 9)\n",
      "ring 113427 (7400, 21)\n",
      "parity5 112139 (32, 6)\n",
      "mux6 112082 (128, 7)\n",
      "allbp 111211 (3772, 30)\n",
      "cars1 109223 (392, 8)\n",
      "car-evaluation 108812 (1728, 22)\n",
      "wine-quality-white 103272 (4893, 12)\n",
      "chess 103123 (3196, 37)\n",
      "credit-a 102427 (690, 16)\n",
      "dermatology 100549 (366, 35)\n",
      "promoters 99695 (106, 59)\n",
      "dis 98801 (3772, 30)\n",
      "car 95034 (1728, 7)\n",
      "bupa 93325 (345, 7)\n",
      "flags 93218 (178, 44)\n",
      "analcatdata_germangss 92437 (400, 6)\n",
      "new-thyroid 91091 (215, 6)\n",
      "tae 90964 (151, 6)\n",
      "prnn_fglass 90952 (205, 10)\n",
      "cleveland 88380 (303, 14)\n",
      "GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1 86582 (1600, 21)\n",
      "haberman 86048 (306, 4)\n",
      "segmentation 85367 (2310, 20)\n",
      "colic 81253 (368, 23)\n",
      "soybean 80745 (675, 36)\n",
      "allhyper 80499 (3771, 30)\n",
      "vowel 79967 (990, 14)\n",
      "wine-quality-red 79357 (1599, 12)\n",
      "analcatdata_aids 74792 (50, 5)\n",
      "postoperative-patient-data 74781 (88, 9)\n",
      "analcatdata_boxing1 74751 (120, 4)\n",
      "breast-w 67897 (699, 10)\n",
      "breast-cancer-wisconsin 66270 (569, 31)\n",
      "analcatdata_asbestos 65835 (83, 4)\n",
      "heart-c 64867 (303, 14)\n",
      "wine-recognition 59537 (178, 14)\n",
      "glass2 56976 (163, 10)\n",
      "vehicle 56753 (846, 19)\n",
      "auto 56743 (202, 26)\n",
      "solar-flare_1 56743 (315, 13)\n",
      "heart-statlog 54169 (270, 14)\n",
      "analcatdata_boxing2 48663 (132, 4)\n",
      "analcatdata_creditscore 47667 (100, 7)\n",
      "analcatdata_japansolvent 44911 (52, 10)\n",
      "analcatdata_bankruptcy 37644 (50, 7)\n",
      "titanic 37597 (2201, 4)\n",
      "prnn_synth 37585 (250, 3)\n",
      "lupus 37451 (87, 4)\n",
      "irish 37416 (500, 6)\n",
      "saheart 37411 (462, 10)\n",
      "appendicitis 37410 (106, 8)\n",
      "profb 37408 (672, 10)\n",
      "pima 37404 (768, 9)\n",
      "diabetes 37403 (768, 9)\n",
      "tic-tac-toe 37403 (958, 10)\n",
      "xd6 37402 (973, 10)\n",
      "analcatdata_lawsuit 37402 (264, 5)\n",
      "threeOf9 37402 (512, 10)\n",
      "breast-cancer 37402 (286, 10)\n",
      "biomed 37399 (209, 9)\n",
      "monk3 37399 (554, 7)\n",
      "corral 37393 (160, 7)\n",
      "buggyCrx 32352 (690, 16)\n",
      "cleve 30951 (303, 14)\n",
      "heart-h 30002 (294, 14)\n",
      "hepatitis 26724 (155, 20)\n",
      "backache 24681 (180, 33)\n",
      "australian 24648 (690, 15)\n",
      "analcatdata_fraud 22190 (42, 12)\n",
      "analcatdata_cyyoung9302 16509 (92, 11)\n",
      "credit-g 2840 (1000, 21)\n",
      "hungarian 2839 (294, 14)\n",
      "mofn-3-7-10 720 (1324, 11)\n",
      "analcatdata_cyyoung8092 94 (97, 11)\n",
      "labor 77 (57, 17)\n",
      "wdbc 56 (569, 31)\n",
      "crx 55 (690, 16)\n",
      "schizo 48 (340, 15)\n",
      "spect 48 (267, 23)\n",
      "breast 46 (699, 11)\n",
      "spectf 45 (349, 45)\n",
      "house-votes-84 43 (435, 17)\n",
      "vote 43 (435, 17)\n",
      "tokyo1 26 (959, 45)\n",
      "sonar 24 (208, 61)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Directory holding one tab-separated, gzip-compressed NAME.csv.gz file per dataset.\n",
    "# Point this at your own copy of the datasets (the `~` prefix is passed to pandas unchanged).\n",
    "DATASET_DIR = '~/Downloads'\n",
    "\n",
    "# Most frequently missing datasets first; each line shows the dataset name, how many\n",
    "# configurations lack a result for it, and the (rows, columns) shape of its file.\n",
    "for dataset_name in sorted(dataset_counts, key=dataset_counts.get, reverse=True):\n",
    "    dataset_path = '{}/{}.csv.gz'.format(DATASET_DIR, dataset_name)\n",
    "    dataset_frame = pd.read_csv(dataset_path, sep='\\t', compression='gzip')\n",
    "    print(dataset_name, dataset_counts[dataset_name], dataset_frame.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}