{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import rdkit\n", "import rdkit.Chem\n", "import rdkit.Chem.inchi\n", "import rdkit.Chem.AllChem\n", "import rdkit.DataStructs\n", "import pandas\n", "import sqlite3" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Open the SQLite database file and create a cursor on that connection\n", "connection = sqlite3.connect('data/l1000.db')\n", "cursor = connection.cursor()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Load every row of `perts` whose pert_type is 'trt_cp' and whose inchi_string is not NULL\n", "query = \"\"\"\n", "SELECT * FROM perts\n", "WHERE pert_type == 'trt_cp'\n", "AND inchi_string NOTNULL;\n", "\"\"\"\n", "pert_df = pandas.read_sql(sql=query, con=connection)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_uidpert_idpert_inamepert_typenum_goldnum_instnum_sigin_summlyinchi_stringinchi_keypubchem_cid
012BRD-K68741898BRD-K68741898trt_cp0310InChI=1S/C21H37N5O5/c1-15(2)13-31-21(29)24(5)1...MNAJUJYQFCFYAB-YQVWRLOYSA-N44505553
113BRD-A05457250BAY-K8644trt_cp12380InChI=1S/C16H15F3N2O4/c1-8-12(15(22)25-3)13(14...ZFLWDHHVRRZMEI-UHFFFAOYSA-N2303
214BRD-K72034655peucedanintrt_cp23380InChI=1S/C15H14O4/c1-8(2)14-15(17-3)10-6-9-4-5...YQBNJPACAUPNLV-UHFFFAOYSA-N8616
315BRD-K02458594KU-C103869trt_cp244130InChI=1S/C15H13NO/c1-11-6-2-5-9-14(11)16-10-12...YCUIAYUVYLNFFS-UHFFFAOYSA-N21785456
416BRD-K18814832BRD-K18814832trt_cp149130InChI=1S/C27H23ClN2O3S/c1-18-12-13-19(2)25(16-...ZQJTYJZLKBRKPC-UHFFFAOYSA-N2228302
\n", "
" ], "text/plain": [ " pert_uid pert_id pert_iname pert_type num_gold num_inst \\\n", "0 12 BRD-K68741898 BRD-K68741898 trt_cp 0 3 \n", "1 13 BRD-A05457250 BAY-K8644 trt_cp 1 23 \n", "2 14 BRD-K72034655 peucedanin trt_cp 2 33 \n", "3 15 BRD-K02458594 KU-C103869 trt_cp 2 44 \n", "4 16 BRD-K18814832 BRD-K18814832 trt_cp 1 49 \n", "\n", " num_sig in_summly inchi_string \\\n", "0 1 0 InChI=1S/C21H37N5O5/c1-15(2)13-31-21(29)24(5)1... \n", "1 8 0 InChI=1S/C16H15F3N2O4/c1-8-12(15(22)25-3)13(14... \n", "2 8 0 InChI=1S/C15H14O4/c1-8(2)14-15(17-3)10-6-9-4-5... \n", "3 13 0 InChI=1S/C15H13NO/c1-11-6-2-5-9-14(11)16-10-12... \n", "4 13 0 InChI=1S/C27H23ClN2O3S/c1-18-12-13-19(2)25(16-... \n", "\n", " inchi_key pubchem_cid \n", "0 MNAJUJYQFCFYAB-YQVWRLOYSA-N 44505553 \n", "1 ZFLWDHHVRRZMEI-UHFFFAOYSA-N 2303 \n", "2 YQBNJPACAUPNLV-UHFFFAOYSA-N 8616 \n", "3 YCUIAYUVYLNFFS-UHFFFAOYSA-N 21785456 \n", "4 ZQJTYJZLKBRKPC-UHFFFAOYSA-N 2228302 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pert_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "rows = list()\n", "\n", "for i, series in pert_df.iterrows():\n", " # check InChI Keys\n", " inchi = series.inchi_string\n", " inchi_key = rdkit.Chem.inchi.InchiToInchiKey(inchi)\n", " assert inchi_key == series.inchi_key\n", " \n", " # molecule\n", " mol = rdkit.Chem.MolFromInchi(inchi)\n", " \n", " # fingerprint\n", " fingerprint = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)\n", " \n", " rows.append([series.pert_uid, inchi_key, inchi, mol, fingerprint])\n", "\n", "inchi_df = pandas.DataFrame(rows, columns=['pert_uid', 'inchi_key', 'inchi_string', 'mol', 'fingerprint'])\n", "inchi_df.sort('inchi_key', inplace=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": 
[ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_uidinchi_keyinchi_stringmolfingerprint
908139929AAALVYBICLMAMA-UHFFFAOYSA-NInChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...<rdkit.Chem.rdchem.Mol object at 0x7fdfc0ee07b8><rdkit.DataStructs.cDataStructs.UIntSparseIntV...
26649777AACFPJSJOWQNBN-UHFFFAOYSA-NInChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-...<rdkit.Chem.rdchem.Mol object at 0x7fdfc18cf668><rdkit.DataStructs.cDataStructs.UIntSparseIntV...
7892195AADCDMQTJNYOSS-LBPRGKRZSA-NInChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)...<rdkit.Chem.rdchem.Mol object at 0x7fdfc1917518><rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1646247318AADVJQLQUVDEBP-GQIGUUNPSA-NInChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...<rdkit.Chem.rdchem.Mol object at 0x7fdfc0860828><rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1963750497AADVJQLQUVDEBP-GUXCAODWSA-NInChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...<rdkit.Chem.rdchem.Mol object at 0x7fdfc06fdb38><rdkit.DataStructs.cDataStructs.UIntSparseIntV...
\n", "
" ], "text/plain": [ " pert_uid inchi_key \\\n", "9081 39929 AAALVYBICLMAMA-UHFFFAOYSA-N \n", "2664 9777 AACFPJSJOWQNBN-UHFFFAOYSA-N \n", "789 2195 AADCDMQTJNYOSS-LBPRGKRZSA-N \n", "16462 47318 AADVJQLQUVDEBP-GQIGUUNPSA-N \n", "19637 50497 AADVJQLQUVDEBP-GUXCAODWSA-N \n", "\n", " inchi_string \\\n", "9081 InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-... \n", "2664 InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-... \n", "789 InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)... \n", "16462 InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... \n", "19637 InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... \n", "\n", " mol \\\n", "9081 \n", "2664 \n", "789 \n", "16462 \n", "19637 \n", "\n", " fingerprint \n", "9081 \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_id_0pert_id_1chemical
0BRD-K68741898BRD-K687418981.0000
1BRD-K68741898BRD-A054572500.1589
2BRD-A05457250BRD-K687418980.1589
3BRD-A05457250BRD-A054572501.0000
\n", "" ], "text/plain": [ " pert_id_0 pert_id_1 chemical\n", "0 BRD-K68741898 BRD-K68741898 1.0000\n", "1 BRD-K68741898 BRD-A05457250 0.1589\n", "2 BRD-A05457250 BRD-K68741898 0.1589\n", "3 BRD-A05457250 BRD-A05457250 1.0000" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Smoke test: similarities for a two-compound input\n", "pert_ids = [\n", "    'BRD-K68741898',\n", "    'BRD-A05457250',\n", "]\n", "get_similarities(pert_ids, connection)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_id_0pert_id_1chemical
1209995BRD-A25143711BRD-K999469020.2205
1209996BRD-A25143711BRD-K864658140.2041
1209997BRD-A25143711BRD-A777227530.0650
1209998BRD-A25143711BRD-K027156880.1786
1209999BRD-A25143711BRD-A251437111.0000
\n", "
" ], "text/plain": [ " pert_id_0 pert_id_1 chemical\n", "1209995 BRD-A25143711 BRD-K99946902 0.2205\n", "1209996 BRD-A25143711 BRD-K86465814 0.2041\n", "1209997 BRD-A25143711 BRD-A77722753 0.0650\n", "1209998 BRD-A25143711 BRD-K02715688 0.1786\n", "1209999 BRD-A25143711 BRD-A25143711 1.0000" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# test scalability\n", "pert_ids = pert_df.pert_id[:1100]\n", "large_df = get_similarities(pert_ids, connection)\n", "large_df.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_uid_0pert_uid_1chemical
039929399291.0000
13992997770.2689
23992921950.1324
339929473180.2078
439929504970.2078
\n", "
" ], "text/plain": [ " pert_uid_0 pert_uid_1 chemical\n", "0 39929 39929 1.0000\n", "1 39929 9777 0.2689\n", "2 39929 2195 0.1324\n", "3 39929 47318 0.2078\n", "4 39929 50497 0.2078" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# head of similarities table\n", "pandas.read_sql('SELECT * FROM similarities LIMIT 5', connection)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_uidpert_idpert_inamepert_typenum_goldnum_instnum_sigin_summlyinchi_stringinchi_keypubchem_cid
01CSS001-ATTGCATATTGCATtrt_sh.css0070NoneNoneNone
12CSS001-GAGGATAGAGGATAtrt_sh.css0010NoneNoneNone
23CSS001-TCAATGATCAATGAtrt_sh.css0070NoneNoneNone
34CSS001-TCAGTTCTCAGTTCtrt_sh.css0070NoneNoneNone
45CSS001-TCCATCATCCATCAtrt_sh.css0010NoneNoneNone
\n", "
" ], "text/plain": [ " pert_uid pert_id pert_iname pert_type num_gold num_inst \\\n", "0 1 CSS001-ATTGCAT ATTGCAT trt_sh.css 0 0 \n", "1 2 CSS001-GAGGATA GAGGATA trt_sh.css 0 0 \n", "2 3 CSS001-TCAATGA TCAATGA trt_sh.css 0 0 \n", "3 4 CSS001-TCAGTTC TCAGTTC trt_sh.css 0 0 \n", "4 5 CSS001-TCCATCA TCCATCA trt_sh.css 0 0 \n", "\n", " num_sig in_summly inchi_string inchi_key pubchem_cid \n", "0 7 0 None None None \n", "1 1 0 None None None \n", "2 7 0 None None None \n", "3 7 0 None None None \n", "4 1 0 None None None " ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# head of perts table\n", "pandas.read_sql('SELECT * FROM perts LIMIT 5', connection)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }