{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using the Crystal Structure Representation of Ward et al.\n",
    "Recreate the Voronoi-tessellation-based machine learning approach of [Ward et al.](https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104). Builds a model to predict the formation enthalpy based on the crystal structure of a material, using the [FLLA dataset](https://onlinelibrary.wiley.com/doi/abs/10.1002/qua.24917). \n",
    "\n",
    "Note: Requires approximately 2 CPU-hours to run. \n",
    "\n",
    "*Last Tested Version of matminer*: 0.4.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from matplotlib import pyplot as plt\n",
    "from matminer.datasets import load_dataset\n",
    "from matminer.featurizers.base import MultipleFeaturizer\n",
    "from matminer.featurizers.composition import ElementProperty, Stoichiometry, ValenceOrbital, IonProperty\n",
    "from matminer.featurizers.structure import (SiteStatsFingerprint, StructuralHeterogeneity,\n",
    "                                            ChemicalOrdering, StructureComposition, MaximumPackingEfficiency)\n",
    "from matminer.featurizers.conversions import DictToObject\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.model_selection import ShuffleSplit, train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from scipy import stats\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create the featurizer\n",
    "Ward et al. use a variety of different featurizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "featurizer = MultipleFeaturizer([\n",
    "    SiteStatsFingerprint.from_preset(\"CoordinationNumber_ward-prb-2017\"),\n",
    "    StructuralHeterogeneity(),\n",
    "    ChemicalOrdering(),\n",
    "    MaximumPackingEfficiency(),\n",
    "    SiteStatsFingerprint.from_preset(\"LocalPropertyDifference_ward-prb-2017\"),\n",
    "    StructureComposition(Stoichiometry()),\n",
    "    StructureComposition(ElementProperty.from_preset(\"magpie\")),\n",
    "    StructureComposition(ValenceOrbital(props=['frac'])),\n",
    "    StructureComposition(IonProperty(fast=True))\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load in a test dataset\n",
    "Get the dataset from Faber 2015"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 3938 entries\n",
      "CPU times: user 3.99 s, sys: 160 ms, total: 4.15 s\n",
      "Wall time: 6.93 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Note: If this is your first time loading the flla dataset, it will be downloaded from an online dataset repository\n",
    "data = load_dataset(\"flla\")\n",
    "print('Loaded {} entries'.format(len(data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "258688ef88bd4334b19db447c9ae527f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='DictToObject', max=3938), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "dto = DictToObject(target_col_id='structure', overwrite_data=True)\n",
    "data = dto.featurize_dataframe(data, 'structure')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of features: 273\n",
      "Number of sites in structure: 2\n",
      "CPU times: user 200 ms, sys: 10.9 ms, total: 211 ms\n",
      "Wall time: 387 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print('Total number of features:', len(featurizer.featurize(data['structure'][0])))\n",
    "print('Number of sites in structure:', len(data['structure'][0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ward et al. report 100ms for their average featurization time. At least for this structure, we have a similar runtime."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Featurize the entire test set\n",
    "Running the calculations in parallel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "62f2372432b849a196117e13dab56558",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=3938), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/daniel/.conda/envs/matminer/lib/python3.6/site-packages/pymatgen/core/periodic_table.py:409: UserWarning:\n",
      "\n",
      "No electronegativity for Ne. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.\n",
      "\n",
      "/Users/daniel/.conda/envs/matminer/lib/python3.6/site-packages/pymatgen/core/periodic_table.py:409: UserWarning:\n",
      "\n",
      "No electronegativity for He. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CPU times: user 4.52 s, sys: 1.09 s, total: 5.61 s\n",
      "Wall time: 53min 54s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "X = featurizer.featurize_many(data['structure'], ignore_errors=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert `X` to a full array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input data shape: (3938, 273)\n"
     ]
    }
   ],
   "source": [
    "X = np.array(X)\n",
    "print('Input data shape:', X.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check how many tessellations failed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number failed: 14/3938\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "failed = np.any(pd.isnull(X), axis=1)\n",
    "print('Number failed: {}/{}'.format(np.sum(failed), len(failed)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train an ML Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Pipeline([\n",
    "    ('imputer', SimpleImputer()), # For the failed structures\n",
    "    ('model', RandomForestRegressor(n_estimators=150, n_jobs=-1))\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train model on whole dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 26s, sys: 2.2 s, total: 1min 28s\n",
      "Wall time: 37.4 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',\n",
       "       verbose=0)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "           max_features='auto', max_leaf_nodes=None,\n",
       "           min_impurity_decrease=0.0, min_impuri...mators=150, n_jobs=-1,\n",
       "           oob_score=False, random_state=None, verbose=0, warm_start=False))])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.fit(X, data['formation_energy_per_atom'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Evaluate the MAE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/daniel/.conda/envs/matminer/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1678: FutureWarning:\n",
      "\n",
      "From version 0.21, test_size will always complement train_size unless both are specified.\n",
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e400c737c6b147c9b5a0aec779428774",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "maes = []\n",
    "for train_ids, test_ids in tqdm(ShuffleSplit(train_size=3000, n_splits=20).split(X)):\n",
    "    # Split off the datasets\n",
    "    train_X = X[train_ids, :]\n",
    "    train_y = data['formation_energy_per_atom'].iloc[train_ids]\n",
    "    test_X = X[test_ids, :]\n",
    "    test_y = data['formation_energy_per_atom'].iloc[test_ids]\n",
    "    \n",
    "    # Train the model\n",
    "    model.fit(train_X, train_y)\n",
    "    \n",
    "    # Run the model, compute MAE\n",
    "    predict_y = model.predict(test_X)\n",
    "    maes.append(np.abs(test_y - predict_y).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MAE: 0.170+/-0.002 eV/atom\n"
     ]
    }
   ],
   "source": [
    "print('MAE: {:.3f}+/-{:.3f} eV/atom'.format(np.mean(maes), stats.sem(maes)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Finding*: 0.17 eV/atom is in close agreement to what Ward et al. report in their reproduction of this test using OQMD data and Magpie to compute features."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matminer python",
   "language": "python",
   "name": "matminer"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}