{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and Process models\n",
    "\n",
    "This script will load the M models in the collection using cobrapy, and convert them to a normalized format. They will also be exported to the \"mat\" format used by the COBRA toolbox.\n",
    "\n",
    "This requires [cobrapy](https://opencobra.github.io/cobrapy) version 0.4.0b1 or later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "import re\n",
    "from itertools import chain\n",
    "\n",
    "import sympy\n",
    "import scipy\n",
    "import scipy.io\n",
    "\n",
    "import cobra\n",
    "\n",
    "from read_excel import read_excel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def open_exchanges(model, amount=10):\n",
    "    for reaction in model.reactions:\n",
    "        if len(reaction.metabolites) == 1:\n",
    "            # Ensure we are not creating any new sinks\n",
    "            if reaction.metabolites.values()[0] > 0:\n",
    "                reaction.upper_bound = max(reaction.upper_bound, amount)\n",
    "            else:\n",
    "                reaction.lower_bound = min(reaction.lower_bound, -amount)\n",
    "\n",
    "def add_exchanges(model, extracellular_suffix=\"[e]\", uptake_amount=10):\n",
    "    for metabolite in model.metabolites:\n",
    "        if str(metabolite).endswith(extracellular_suffix):\n",
    "            if len(metabolite.reactions) == 0:\n",
    "                print \"no reactions for \" + metabolite.id\n",
    "                continue\n",
    "            if min(len(i.metabolites) for i in metabolite.reactions) > 1:\n",
    "                EX_reaction = cobra.Reaction(\"EX_\" + metabolite.id)\n",
    "                EX_reaction.add_metabolites({metabolite: 1})\n",
    "                m.add_reaction(EX_reaction)\n",
    "                EX_reaction.upper_bound = uptake_amount\n",
    "                EX_reaction.lower_bound = -uptake_amount"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SBML models\n",
    "\n",
    "These models will be read in using [libSBML](http://sbml.org/Software/libSBML) through cobrapy. Some models will need their exchanges opened."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "legacy_SBML = {\"T_Maritima\", \"iNJ661m\", \"iSR432\", \"iTH366\"}\n",
    "open_boundaries = {\"iRsp1095\", \"iWV1314\", \"iFF708\", \"iZM363\"}\n",
    "\n",
    "models = cobra.DictList()\n",
    "for i in sorted(os.listdir(\"sbml\")):\n",
    "    if not i.endswith(\".xml\"):\n",
    "        continue\n",
    "    model_id = i[:-4]\n",
    "    filepath = os.path.join(\"sbml\", i)\n",
    "    with warnings.catch_warnings():\n",
    "        warnings.simplefilter(\"ignore\")\n",
    "        m = cobra.io.read_legacy_sbml(filepath) if model_id in legacy_SBML \\\n",
    "            else cobra.io.read_sbml_model(filepath)\n",
    "    m.id = m.description = model_id.replace(\".\", \"_\")\n",
    "\n",
    "    if m.id in open_boundaries:\n",
    "        open_exchanges(m)\n",
    "\n",
    "    models.append(m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Models available in COBRA Toolbox \"mat\" format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i in sorted(os.listdir(\"mat\")):\n",
    "    if not i.endswith(\".mat\"):\n",
    "        continue\n",
    "    m = cobra.io.load_matlab_model(os.path.join(\"mat\", i))\n",
    "    m.id = i[:-4]\n",
    "    \n",
    "    if m.id in open_boundaries:\n",
    "        open_exchanges(m)\n",
    "\n",
    "    models.append(m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Some models are only available as Microsoft Excel files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iJS747.xls\",\n",
    "               verbose=False, rxn_sheet_header=7)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iRM588.xls\",\n",
    "               verbose=False, rxn_sheet_header=5)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iSO783.xls\", verbose=False, rxn_sheet_header=2)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iCR744.xls\", rxn_sheet_header=4, verbose=False)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iNV213.xls\", rxn_str_key=\"Reaction Formula\", verbose=False)\n",
    "# remove boundary metabolites\n",
    "for met in list(m.metabolites):\n",
    "    if met.id.endswith(\"[b]\"):\n",
    "        met.remove_from_model()\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cobra/core/Reaction.py:98 \u001b[1;31mUserWarning\u001b[0m: malformed gene_reaction_rule '(PICST_57189 or PICST_52197or PICST_32180 or PICST_68755 or PICST_11039 or PICST_78721 or PICST_37009 or PICST_41966 or PICST_67826 or PICST_52358 or PICST_28502)' for <Reaction SS1034 at 0x7f6d192ddc50>\n"
     ]
    }
   ],
   "source": [
    "m = read_excel(\"xls/iTL885.xls\", verbose=False,\n",
    "               rxn_id_key=\"Rxn name\", rxn_gpr_key=\"Gene-reaction association\", met_sheet_name=\"ignore\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iWZ663.xls\", verbose=False,\n",
    "               rxn_id_key=\"auto\", rxn_name_key=\"Reaction name\", rxn_gpr_key=\"Local gene\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iOR363.xls\", verbose=False)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iMA945.xls\", verbose=False)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iPP668.xls\", verbose=False)\n",
    "add_exchanges(m)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iVM679.xls\", verbose=False, met_sheet_name=\"ignore\",\n",
    "               rxn_id_key=\"Name\", rxn_name_key=\"Description\", rxn_str_key=\"Reaction\")\n",
    "open_exchanges(m)\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iTY425.xls\", rxn_sheet_header=1,\n",
    "               rxn_sheet_name=\"S8\", rxn_id_key=\"Number\", rxn_str_key=\"Reaction\", verbose=False)\n",
    "add_exchanges(m, \"xt\")\n",
    "# Protein production reaction does not prdocue \"PROTEIN\" metabolite\n",
    "m.reactions.R511.add_metabolites({m.metabolites.PROTEIN: 1})\n",
    "m.id = m.id + \"_fixed\"\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iSS724.xls\", rxn_str_key=\"Reactions\",\n",
    "               rxn_sheet_header=1, met_sheet_header=1, rxn_id_key=\"Name\",\n",
    "               verbose=False)\n",
    "add_exchanges(m, \"xt\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iCS400.xls\", rxn_sheet_name=\"Complete Rxn List\",\n",
    "               rxn_sheet_header=2, rxn_str_key=\"Reaction\",\n",
    "               rxn_id_key=\"Name\", verbose=False)\n",
    "add_exchanges(m, \"xt\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iLL672.xls\",\n",
    "               rxn_id_key=\"auto\", met_sheet_name=\"Appendix 3 iLL672 metabolites\",\\\n",
    "               rxn_str_key=\"REACTION\", rxn_gpr_key=\"skip\", verbose=False,\n",
    "               rxn_sheet_name='Appendix 3 iLL672 reactions')\n",
    "m.reactions[-1].objective_coefficient = 1\n",
    "m.metabolites.BM.remove_from_model()\n",
    "add_exchanges(m, \"xt\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plus_re = re.compile(\"(?<=\\S)\\+\")  # substitute H+ with H, etc.\n",
    "m = read_excel(\"xls/iMH551.xls\", rxn_sheet_name=\"GPR Annotation\", rxn_sheet_header=4,\n",
    "               rxn_id_key=\"auto\", rxn_str_key=\"REACTION\", rxn_gpr_key=\"skip\",\n",
    "               rxn_name_key=\"ENZYME\", rxn_skip_rows=[625, 782, 787], verbose=False,\n",
    "               rxn_sheet_converters={\"REACTION\": lambda x: plus_re.sub(\"\", x)})\n",
    "for met in m.metabolites:\n",
    "    if met.id.endswith(\"(extracellular)\"):\n",
    "        met.id = met.id[:-15] + \"_e\"\n",
    "m.repair()\n",
    "add_exchanges(m, \"_e\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iCS291.xls\", rxn_sheet_name=\"Sheet1\",\n",
    "               rxn_str_key=\"Reaction\",\n",
    "               rxn_sheet_header=5, rxn_id_key=\"Name\",\n",
    "               verbose=False)\n",
    "add_exchanges(m, \"xt\")\n",
    "# BIOMASS is just all model metabolites in the Demands list\n",
    "m.add_reaction(cobra.Reaction(\"BIOMASS\"))\n",
    "# taken from Table 1 in publication\n",
    "biomass_mets = {}\n",
    "for i in {\"ALA\", \"ARG\", \"ASN\", \"ASP\", \"CYS\", \"GLU\", \"GLN\", \"GLY\",\n",
    "          \"HIS\", \"ILE\", \"LEU\", \"LYS\", \"MET\", \"PHE\", \"PRO\", \"SER\",\n",
    "          \"THR\", \"TRP\", \"TYR\", \"VAL\", \"PTRC\", \"SPMD\", \"ATP\", \"GTP\",\n",
    "          \"CTP\", \"UTP\", \"DATP\", \"DGTP\", \"DCTP\", \"DTTP\", \"PS\", \"PE\",\n",
    "          \"PG\", \"PEPTIDO\", \"LPS\", \"OPP\", \"UDPP\", \"NAD\", \"NADP\", \"FAD\",\n",
    "          \"COA\", \"ACP\", \"PTH\", \"THIAMIN\", \"MTHF\", \"MK\", \"DMK\"\n",
    "         }:\n",
    "    biomass_mets[m.metabolites.get_by_id(i)] = -1\n",
    "    dm = cobra.Reaction(\"DM_\" + i)\n",
    "    m.add_reaction(dm)\n",
    "    dm.add_metabolites({m.metabolites.get_by_id(i): -1})\n",
    "m.reactions.BIOMASS.add_metabolites(biomass_mets)\n",
    "m.change_objective(\"BIOMASS\")\n",
    "add_exchanges(m, \"xt\")\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m = read_excel(\"xls/iYO844.xls\", rxn_sheet_name=\"Reaction and locus\", verbose=False, rxn_gpr_key=\"Locus name\",\n",
    "           rxn_str_key=u'Equation (note [c] and [e] at the beginning refer to the compartment \\n'\n",
    "                       'the reaction takes place in, cytosolic and extracellular respectively)')\n",
    "add_exchanges(m)\n",
    "\n",
    "# create the biomass reaction from supplementary data table\n",
    "# http://www.jbc.org/content/suppl/2007/06/29/M703759200.DC1/Biomass_composition.doc\n",
    "r = cobra.Reaction(\"biomass\")\n",
    "r.objective_coefficient = 1.\n",
    "m.add_reaction(r)\n",
    "r.reaction = (\"408.3 gly[c] + 266.9 ala-L[c] + 306.7 val-L[c] + 346.4 leu-L[c] + 269.9 ile-L[c] + \"\n",
    "              \"216.2 ser-L[c] + 186.3 thr-L[c] + 175.9 phe-L[c] + 110.8 tyr-L[c] + 54.3 trp-L[c] + \"\n",
    "              \"56.7 cys-L[c] + 113.3 met-L[c] + 323.1 lys-L[c] + 193.0 arg-L[c] + 81.7 his-L[c] + \"\n",
    "              \"148.0 asp-L[c] +  260.4 glu-L[c] + 148.0 asp-L[c] + 260.3 gln-L[c] + 160.6 pro-L[c] + \"\n",
    "              \"62.7 gtp[c] + 38.9 ctp[c] + 41.5 utp[c] + 23.0 datp[c] + 17.4 dgtp[c] + 17.4 dctp[c] + \"\n",
    "              \"22.9 dttp[c] + 0.085750 m12dg_BS[c] + 0.110292 d12dg_BS[c] + 0.065833 t12dg_BS[c] + \"\n",
    "              \"0.004642 cdlp_BS[c] + 0.175859 pgly_BS[c] + 0.022057 lysylpgly_BS[c] + 0.559509 psetha_BS[c] + \"\n",
    "              \"0.006837 lipo1-24_BS[c] + 0.006123 lipo2-24_BS[c] + 0.018162 lipo3-24_BS[c] + \"\n",
    "              \"0.014676 lipo4-24_BS[c] + 101.82 peptido_BS[c] + 3.62 gtca1-45_BS[c] + 2.35 gtca2-45_BS[c] + \"\n",
    "              \"1.82 gtca3-45_BS[c] + 3.11 tcam_BS[c] + 706.3 k[c] + 101.7 mg2[c] + 3.4 fe3[c] + 3.2 ca2[c] + \"\n",
    "              \"0.9 ppi[c] + 0.3 mql7[c] + 0.4 10fthf[c] + 16.2 nad[c] + 4.7 amp[c]  + 2.6 adp[c] + 1.0 cmp[c]  + \"\n",
    "              \"0.9 nadp[c] + 0.5 ctp[c]  + 0.5 gmp[c] + 0.4 gtp[c] + 0.3 cdp[c] + 0.2 nadph[c] + 0.2 gdp[c] + \"\n",
    "              \"105053.5 atp[c] + 105000 h2o[c] --> 104985.6 pi[c] + 104997.4 adp[c] + 105000 h[c]\")\n",
    "# units are in mg for this reaction, so scale to grams\n",
    "r *= 0.001\n",
    "models.append(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "models.sort()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Determine Objective Reactions\n",
    "\n",
    "Some of these models do not specify an objective (or biomass) reaction. These will be automatically detected if possible, or set from a manually curated list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# regular expression to detect \"biomass\"\n",
    "biomass_re = re.compile(\"biomass\", re.IGNORECASE)\n",
    "\n",
    "# manually identified objective reactions\n",
    "curated_objectives = {\"VvuMBEL943\": \"R806\",\n",
    "                      \"iAI549\": \"BIO_CBDB1_DM_855\",\n",
    "                      \"mus_musculus\": \"BIO028\",\n",
    "                      \"iRsp1095\": \"RXN1391\",\n",
    "                      \"iLC915\": \"r1133\",\n",
    "                      \"PpaMBEL1254\": \"R01288\",\n",
    "                      \"AbyMBEL891\": \"R761\",\n",
    "                      \"iAbaylyiV4\": \"GROWTH_DASH_RXN\",\n",
    "                      \"iOG654\": \"RM00001\",\n",
    "                      \"iOR363\": \"OF14e_Retli\",\n",
    "                      \"iRM588\": \"agg_GS13m\",\n",
    "                      \"iJS747\": \"agg_GS13m_2\",\n",
    "                      \"iTL885\": \"SS1240\",\n",
    "                      \"iMH551\": \"R0227\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "autodetected biomass metabolite 'BIOMASS_c' for model 'GSMN_TB'\n",
      "autodetected objective reaction 'Biomass' for model 'PpuMBEL1071'\n",
      "autodetected objective reaction 'RXNBiomass' for model 'SpoMBEL1693'\n",
      "autodetected objective reaction 'BIOMASS_LM3' for model 'iAC560'\n",
      "autodetected biomass metabolite '140' for model 'iAO358'\n",
      "autodetected biomass metabolite 'PROT_LPL_v60[c]' for model 'iBT721_fixed'\n",
      "autodetected objective reaction 'BIO_Rfer3' for model 'iCR744'\n",
      "autodetected objective reaction 'BIOMASS' for model 'iCS400'\n",
      "autodetected biomass metabolite 'BIOMASS' for model 'iMA871'\n",
      "autodetected objective reaction 'ST_biomass_core' for model 'iMA945'\n",
      "autodetected objective reaction 'biomass_mm_1_no_glygln' for model 'iMM1415'\n",
      "autodetected objective reaction 'biomass_STR' for model 'iMP429_fixed'\n",
      "autodetected objective reaction 'R_biomass_target' for model 'iNV213'\n",
      "autodetected objective reaction 'BIOMASS' for model 'iPP668'\n",
      "autodetected objective reaction 'Biomass_Chlamy_auto' for model 'iRC1080'\n",
      "autodetected objective reaction 'ST_Biomass_Final' for model 'iRR1083'\n",
      "autodetected objective reaction 'Biomass' for model 'iSO783'\n",
      "autodetected objective reaction 'biomass_target' for model 'iSR432'\n",
      "autodetected biomass metabolite 'BIOMASS' for model 'iSS724'\n",
      "autodetected biomass metabolite 'm828' for model 'iSS884'\n",
      "autodetected biomass metabolite 'BIOMASS' for model 'iTY425_fixed'\n",
      "autodetected objective reaction 'Biomass' for model 'iVM679'\n",
      "autodetected biomass metabolite 'Biomass' for model 'iWV1314'\n",
      "autodetected objective reaction 'R0822' for model 'iWZ663'\n"
     ]
    }
   ],
   "source": [
    "for m in models:\n",
    "    if len(m.reactions.query(lambda x: x > 0, \"objective_coefficient\")):\n",
    "        continue\n",
    "    if m.id in curated_objectives:\n",
    "        m.change_objective(curated_objectives[m.id])\n",
    "        continue\n",
    "    \n",
    "    # look for reactions with \"biomass\" in the id or name\n",
    "    possible_objectives = m.reactions.query(biomass_re)\n",
    "    if len(possible_objectives) == 0:\n",
    "        possible_objectives = m.reactions.query(biomass_re, \"name\")\n",
    "    \n",
    "    # In some cases, a biomass \"metabolite\" is produced, whose production\n",
    "    # should be the objective function.\n",
    "    possible_biomass_metabolites = m.metabolites.query(biomass_re)\n",
    "    if len(possible_biomass_metabolites) == 0:\n",
    "        possible_biomass_metabolites = m.metabolites.query(biomass_re, \"name\")\n",
    "\n",
    "    if len(possible_biomass_metabolites) > 0:\n",
    "        biomass_met = possible_biomass_metabolites[0]\n",
    "        r = cobra.Reaction(\"added_biomass_sink\")\n",
    "        r.objective_coefficient = 1\n",
    "        r.add_metabolites({biomass_met: -1})\n",
    "        m.add_reaction(r)\n",
    "        print (\"autodetected biomass metabolite '%s' for model '%s'\" %\n",
    "              (biomass_met.id, m.id))\n",
    "\n",
    "    elif len(possible_objectives) > 0:\n",
    "        print(\"autodetected objective reaction '%s' for model '%s'\" %\n",
    "              (possible_objectives[0].id, m.id))\n",
    "        m.change_objective(possible_objectives[0])\n",
    "\n",
    "    else:\n",
    "        print(\"no objective found for \" + m.id)\n",
    "\n",
    "# Ensure the biomass objective flux is unconstrained\n",
    "for m in models:\n",
    "    for reaction in m.reactions.query(lambda x: x > 0, \"objective_coefficient\"):\n",
    "        reaction.lower_bound = min(reaction.lower_bound, 0)\n",
    "        reaction.upper_bound = max(reaction.upper_bound, 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fixes of various encoding bugs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### General"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "GSMN_TB does not use the convention of extracellular metabolites with exchanges. Although the model still solves with this formulation, this is still normalized here. This process does not change the mathematical structure of the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "h_c = models.GSMN_TB.metabolites.H_c\n",
    "for r in models.GSMN_TB.reactions:\n",
    "    if len(r.metabolites) == 2 and h_c in r.metabolites:\n",
    "        met = [i for i in r.metabolites if i is not h_c][0]\n",
    "        EX_met = cobra.Metabolite(met.id[:-1] + \"e\")\n",
    "        r.add_metabolites({EX_met: -r.metabolites[met]})\n",
    "        if \"EX_\" + EX_met.id not in models.GSMN_TB.reactions:\n",
    "            exchange = cobra.Reaction(\"EX_\" + EX_met.id)\n",
    "            exchange.add_metabolites({EX_met: -1})\n",
    "            exchange.lower_bound = -1000000.0\n",
    "            exchange.upper_bound = 1000000.0\n",
    "            models.GSMN_TB.add_reaction(exchange)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reaction and Metabolites"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### id's"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# reaction id's with spaces in them\n",
    "models.iJS747.reactions.get_by_id(\"HDH [deleted 01/16/2007  12:02:30 PM]\").id = \"HDH_del\"\n",
    "models.iJS747.reactions.get_by_id(\"HIBD [deleted 03/21/2007  01:06:12 PM]\").id = \"HIBD_del\"\n",
    "models.iAC560.reactions.get_by_id(\"GLUDx [m]\").id = \"GLUDx[m]\"\n",
    "for r in models.iOR363.reactions:\n",
    "    if \" \" in r.id:\n",
    "        r.id = r.id.split()[0]\n",
    "\n",
    "models.textbook.reactions.query(\"Biomass\")[0].id = \"Biomass_Ecoli_core\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the convention underscore + compartment i.e. _c instead of [c] (c) etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "SQBKT_re = re.compile(\"\\[([a-z])\\]$\")\n",
    "\n",
    "def fix_brackets(id_str, compiled_re):\n",
    "    result = compiled_re.findall(id_str)\n",
    "    if len(result) > 0:\n",
    "        return compiled_re.sub(\"_\" + result[0], id_str)\n",
    "    else:\n",
    "        return id_str\n",
    "\n",
    "for r in models.iRS1597.reactions:\n",
    "    r.id = fix_brackets(r.id, re.compile(\"_LSQBKT_([a-z])_RSQBKT_$\"))\n",
    "\n",
    "for m_id in [\"iJS747\", \"iRM588\", \"iSO783\", \"iCR744\", \"iNV213\", \"iWZ663\", \"iOR363\", \"iMA945\", \"iPP668\",\n",
    "             \"iTL885\", \"iVM679\", \"iYO844\", \"iZM363\"]:\n",
    "    for met in models.get_by_id(m_id).metabolites:\n",
    "        met.id = fix_brackets(met.id, SQBKT_re)\n",
    "\n",
    "for met in models.S_coilicolor_fixed.metabolites:\n",
    "    if met.id.endswith(\"_None_\"):\n",
    "        met.id = met.id[:-6]\n",
    "\n",
    "# Some models only have intra and extracellular metabolites, but don't use _c and _e.\n",
    "for m_id in [\"iCS291\", \"iCS400\", \"iTY425_fixed\", \"iSS724\"]:\n",
    "    for metabolite in models.get_by_id(m_id).metabolites:\n",
    "        if metabolite.id.endswith(\"xt\"):\n",
    "            metabolite.id = metabolite.id[:-2] + \"_e\"\n",
    "        elif len(metabolite.id) < 2 or metabolite.id[-2] != \"_\":\n",
    "            metabolite.id = metabolite.id + \"_c\"\n",
    "\n",
    "# Exchange reactions should have the id of the metabolite after with the same convention\n",
    "for m_id in [\"iAF1260\", \"iJO1366\", \"iAF692\", \"iJN746\", \"iRC1080\", \"textbook\", \"iNV213\",\n",
    "             \"iIT341\", \"iJN678\", \"iJR904\", \"iND750\", \"iNJ661\", \"iPS189_fixed\", \"iSB619\",\n",
    "             \"iZM363\", \"iMH551\"]:\n",
    "    for r in models.get_by_id(m_id).reactions:\n",
    "        if len(r.metabolites) != 1:\n",
    "            continue\n",
    "        if r.id.startswith(\"EX_\"):\n",
    "            r.id = \"EX_\" + list(r.metabolites.keys())[0].id\n",
    "        if r.id.startswith(\"DM_\"):\n",
    "            r.id = \"DM_\" + list(r.metabolites.keys())[0].id\n",
    "\n",
    "for m in models:\n",
    "    m.repair()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metabolite Formulas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for model in models:\n",
    "    for metabolite in model.metabolites:\n",
    "        if metabolite.formula is None:\n",
    "            metabolite.formula = \"\"\n",
    "            continue\n",
    "        if str(metabolite.formula).lower() == \"none\":\n",
    "            metabolite.formula = \"\"\n",
    "            continue\n",
    "        # some characters should not be in a formula\n",
    "        if \"(\" in metabolite.formula or \\\n",
    "                \")\" in metabolite.formula or \\\n",
    "                \".\" in metabolite.formula:\n",
    "            metabolite.formula = \"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metabolite Compartments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "compartments = {\n",
    "    'c': 'Cytoplasm',\n",
    "    'e': 'Extracellular',\n",
    "    'p': 'Periplasm',\n",
    "    'm': 'Mitochondria',\n",
    "    'g': 'Golgi',\n",
    "    'n': \"Nucleus\",\n",
    "    'r': \"Endoplasmic reticulum\",\n",
    "    'x': \"Peroxisome\",\n",
    "    'v': \"Vacuole\",\n",
    "    \"h\": \"Chloroplast\",\n",
    "    \"x\": \"Glyoxysome\",\n",
    "    \"s\": \"Eyespot\",\n",
    "    \"default\": \"No Compartment\"}\n",
    "\n",
    "for model in models:\n",
    "    for metabolite in model.metabolites:\n",
    "        if metabolite.compartment is None or len(metabolite.compartment.strip()) == 0 or metabolite.compartment == \"[\":\n",
    "            if len(metabolite.id) > 2 and metabolite.id[-2] == \"_\" and metabolite.id[-1].isalpha():\n",
    "                metabolite.compartment = metabolite.id[-1]\n",
    "            else:\n",
    "                metabolite.compartment = \"default\"\n",
    "        if metabolite.compartment not in model.compartments:\n",
    "            model.compartments[metabolite.compartment] = compartments.get(metabolite.compartment, metabolite.compartment)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metabolite and Reaction Names\n",
    "Names which start with numbers don't need to be escaped with underscores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for model in models:\n",
    "    for x in chain(model.metabolites, model.reactions):\n",
    "        if x.name is not None and x.name.startswith(\"_\"):\n",
    "            x.name = x.name.lstrip(\"_\")\n",
    "        if x.name is not None:\n",
    "            x.name = x.name.strip()\n",
    "        if x.name is None:\n",
    "            x.name = x.id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### MISC fixes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "models.iMM1415.reactions.EX_lnlc_dup_e.remove_from_model()\n",
    "models.iMM1415.reactions.EX_retpalm_e.remove_from_model(remove_orphans=True)\n",
    "\n",
    "# these reaction names are reaction strings\n",
    "for r in models.iCac802.reactions:\n",
    "    r.name = \"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fix Genes and GPR's"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A lot of genes have characters which won't work in their names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# nonbreaking spaces\n",
    "models.iCB925.reactions.FDXNRy.gene_reaction_rule = '( Cbei_0661 or Cbei_2182 )'\n",
    "for r in models.iCB925.reactions:\n",
    "    if \"\\xa0\" in r.gene_reaction_rule:\n",
    "        r.gene_reaction_rule = r.gene_reaction_rule.replace(\"\\xc2\", \" \").replace(\"\\xa0\", \" \")\n",
    "for g in list(models.iCB925.genes):\n",
    "    if len(g.reactions) == 0:\n",
    "        models.iCB925.genes.remove(g)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some GPR's are not valid boolean expressions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "multiple_ors = re.compile(\"(\\s*or\\s+){2,}\")\n",
    "multiple_ands = re.compile(\"(\\s*and\\s+){2,}\")\n",
    "\n",
    "for model_id in [\"iRS1563\", \"iRS1597\", \"iMM1415\"]:\n",
    "    model = models.get_by_id(model_id)\n",
    "    for reaction in model.reactions:\n",
    "        gpr = reaction.gene_reaction_rule\n",
    "        gpr = multiple_ors.sub(\" or \", gpr)\n",
    "        gpr = multiple_ands.sub(\" and \", gpr)\n",
    "        if \"[\" in gpr:\n",
    "            gpr = gpr.replace(\"[\", \"(\").replace(\"]\", \")\")\n",
    "        if gpr.endswith(\" or\"):\n",
    "            gpr = gpr[:-3]\n",
    "        if gpr.count(\"(\") != gpr.count(\")\"):\n",
    "            gpr = \"\"  # mismatched parenthesis somewhere\n",
    "        reaction.gene_reaction_rule = gpr\n",
    "    for gene in list(model.genes):\n",
    "        if gene.id.startswith(\"[\") or gene.id.endswith(\"]\"):\n",
    "            if len(gene.reactions) == 0:\n",
    "                model.genes.remove(gene.id)\n",
    "                \n",
    "# Some models are missing spaces between the ands/ors in some of their GPR's\n",
    "for m_id in [\"iJN678\", \"iTL885\"]:\n",
    "    for r in models.get_by_id(m_id).reactions:\n",
    "        r.gene_reaction_rule = r.gene_reaction_rule.replace(\"and\", \" and \").replace(\"or\", \" or \")\n",
    "\n",
    "models.iCac802.reactions.R0095.gene_reaction_rule = \\\n",
    "    models.iCac802.reactions.R0095.gene_reaction_rule.replace(\" AND \", \" and \")\n",
    "\n",
    "\n",
    "# make sbml3 output deterministic by sorting genes\n",
    "for m in models:\n",
    "    m.genes.sort()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ensure all ID's are SBML compliant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for m in models:\n",
    "    cobra.manipulation.escape_ID(m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export Models"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SBML 3\n",
    "Export the models to the use the fbc version 2 (draft RC6) extension to SBML level 3 version 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for model in models:\n",
    "    cobra.io.write_sbml_model(model, \"sbml3/%s.xml\" % model.id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### mat\n",
    "Save all the models into a single mat file. In addition to the usual fields in the \"mat\" struct, we will also include S_num and S_denom, which are the numerator and denominator of the stoichiometric coefficients encoded as rational numbers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def convert_to_rational(value):\n",
    "    return sympy.Rational(\"%.15g\" % value)\n",
    "\n",
    "\n",
    "def construct_S_num_denom(model):\n",
    "    \"\"\"convert model to two S matrices\n",
    "\n",
    "    they encode the numerator and denominator of stoichiometric\n",
    "    coefficients encoded as rational numbers\n",
    "\n",
    "    \"\"\"\n",
    "    # intialize to 0\n",
    "    dimensions = (len(model.metabolites), len(model.reactions))\n",
    "    S_num = scipy.sparse.lil_matrix(dimensions)\n",
    "    S_denom = scipy.sparse.lil_matrix(dimensions)\n",
    "    # populate with stoichiometry\n",
    "    for i, r in enumerate(model.reactions):\n",
    "        for met, value in r._metabolites.iteritems():\n",
    "            rational_value = convert_to_rational(value)\n",
    "            num, denom = (rational_value.p, rational_value.q)\n",
    "            S_num[model.metabolites.index(met), i] = num\n",
    "            S_denom[model.metabolites.index(met), i] = denom\n",
    "    return S_num, S_denom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_model_dict = {}\n",
    "for model in models:\n",
    "    model_dict = cobra.io.mat.create_mat_dict(model)\n",
    "    model_dict[\"S_num\"], model_dict[\"S_denom\"] = construct_S_num_denom(model)\n",
    "    all_model_dict[model.id] = model_dict\n",
    "scipy.io.savemat(\"all_models.mat\", all_model_dict, oned_as=\"column\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}