{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get all MP oxide CIFs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook walks through a use case of getting a property (in this case, a material's structure in CIF format) for tens of thousands of materials in the MP database for offline analysis. Two helpful ideas presented here are to (1) chunk your API requests and (2) save the IDs for materials you have so that incremental updates to your offline collection can be done efficiently.\n",
    "\n",
    "It is assumed that you have an MP API key set as your \"MAPI_KEY\" environment variable. If not, be sure to paste in your API key as a string argument to the `MPRester()` call in the first cell below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "from itertools import izip_longest\n",
    "\n",
    "# An optional utility to display a progress bar\n",
    "# for long-running loops. `pip install tqdm`.\n",
    "from tqdm import tqdm\n",
    "\n",
    "from pymatgen.ext.matproj import MPRester\n",
    "\n",
    "mpr = MPRester()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "entries = mpr.query({\"elements\": \"O\", \"nelements\": {\"$gte\": 2}}, [\"material_id\"])\n",
    "oxide_mp_ids = [e['material_id'] for e in entries]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38371"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(oxide_mp_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# A utility function to \"chunk\" our queries\n",
    "\n",
    "def grouper(iterable, n, fillvalue=None):\n",
    "    \"Collect data into fixed-length chunks or blocks\"\n",
    "    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx\n",
    "    args = [iter(iterable)] * n\n",
    "    return izip_longest(fillvalue=fillvalue, *args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 39/39 [00:20<00:00,  1.97it/s]\n"
     ]
    }
   ],
   "source": [
    "data = []\n",
    "mpid_groups = [g for g in grouper(oxide_mp_ids, 1000)]\n",
    "for group in tqdm(mpid_groups):\n",
    "    # The last group may have fewer than 1000 actual ids,\n",
    "    # so filter the `None`s out.\n",
    "    mpid_list = filter(None, group)\n",
    "    entries = mpr.query({\"material_id\": {\"$in\": mpid_list}}, [\"material_id\", \"cif\"])\n",
    "    data.extend(entries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 38371/38371 [00:03<00:00, 12502.72it/s]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "if not os.path.exists('mp_oxide_cifs'):\n",
    "    os.mkdir('mp_oxide_cifs')\n",
    "\n",
    "for d in tqdm(data):\n",
    "    with open(\"mp_oxide_cifs/{}.cif\".format(d[\"material_id\"]), 'w') as f:\n",
    "        f.write(d[\"cif\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save IDs for saved structures, so you can\n",
    "# efficiently update later.\n",
    "\n",
    "with open('oxide_mp_ids.json', 'w') as f:\n",
    "    json.dump(oxide_mp_ids, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Structure Summary\n",
       "Lattice\n",
       "    abc : 3.7931997599999998 3.7931997599999998 3.7931997599999998\n",
       " angles : 90.0 90.0 90.0\n",
       " volume : 54.577940461944955\n",
       "      A : 3.7931997599999998 0.0 2.3226649723052543e-16\n",
       "      B : -2.3226649723052543e-16 3.7931997599999998 2.3226649723052543e-16\n",
       "      C : 0.0 0.0 3.7931997599999998\n",
       "PeriodicSite: Re (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]\n",
       "PeriodicSite: O (1.8966, 0.0000, 0.0000) [0.5000, 0.0000, 0.0000]\n",
       "PeriodicSite: O (0.0000, 0.0000, 1.8966) [0.0000, 0.0000, 0.5000]\n",
       "PeriodicSite: O (-0.0000, 1.8966, 0.0000) [0.0000, 0.5000, 0.0000]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Do fun things with pymatgen\n",
    "\n",
    "from pymatgen.core import Structure\n",
    "\n",
    "Structure.from_file('mp_oxide_cifs/mp-190.cif')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Update collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "entries = mpr.query({\"elements\": \"O\", \"nelements\": {\"$gte\": 2}}, [\"material_id\"])\n",
    "oxide_mp_ids = [e['material_id'] for e in entries]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Figure out what you don't have\n",
    "\n",
    "new_mp_ids = []\n",
    "with open('oxide_mp_ids.json', 'r') as f:\n",
    "    old_mp_ids = json.load(f)\n",
    "    new_mp_ids = list(set(oxide_mp_ids) - set(old_mp_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "new_data = mpr.query({\"material_id\": {\"$in\": new_mp_ids}}, [\"material_id\", \"cif\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open('oxide_mp_ids.json', 'w') as f:\n",
    "    json.dump(oxide_mp_ids, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}