{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import rdkit\n",
"import rdkit.Chem\n",
"import rdkit.Chem.inchi\n",
"import rdkit.Chem.AllChem\n",
"import rdkit.DataStructs\n",
"import pandas\n",
"import sqlite3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Establish database connection\n",
"connection = sqlite3.connect('data/l1000.db')\n",
"cursor = connection.cursor()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT * FROM perts\n",
"WHERE pert_type == 'trt_cp'\n",
"AND inchi_string NOTNULL;\n",
"\"\"\"\n",
"pert_df = pandas.read_sql(query, connection)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_uid | \n",
" pert_id | \n",
" pert_iname | \n",
" pert_type | \n",
" num_gold | \n",
" num_inst | \n",
" num_sig | \n",
" in_summly | \n",
" inchi_string | \n",
" inchi_key | \n",
" pubchem_cid | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 12 | \n",
" BRD-K68741898 | \n",
" BRD-K68741898 | \n",
" trt_cp | \n",
" 0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" InChI=1S/C21H37N5O5/c1-15(2)13-31-21(29)24(5)1... | \n",
" MNAJUJYQFCFYAB-YQVWRLOYSA-N | \n",
" 44505553 | \n",
"
\n",
" \n",
" 1 | \n",
" 13 | \n",
" BRD-A05457250 | \n",
" BAY-K8644 | \n",
" trt_cp | \n",
" 1 | \n",
" 23 | \n",
" 8 | \n",
" 0 | \n",
" InChI=1S/C16H15F3N2O4/c1-8-12(15(22)25-3)13(14... | \n",
" ZFLWDHHVRRZMEI-UHFFFAOYSA-N | \n",
" 2303 | \n",
"
\n",
" \n",
" 2 | \n",
" 14 | \n",
" BRD-K72034655 | \n",
" peucedanin | \n",
" trt_cp | \n",
" 2 | \n",
" 33 | \n",
" 8 | \n",
" 0 | \n",
" InChI=1S/C15H14O4/c1-8(2)14-15(17-3)10-6-9-4-5... | \n",
" YQBNJPACAUPNLV-UHFFFAOYSA-N | \n",
" 8616 | \n",
"
\n",
" \n",
" 3 | \n",
" 15 | \n",
" BRD-K02458594 | \n",
" KU-C103869 | \n",
" trt_cp | \n",
" 2 | \n",
" 44 | \n",
" 13 | \n",
" 0 | \n",
" InChI=1S/C15H13NO/c1-11-6-2-5-9-14(11)16-10-12... | \n",
" YCUIAYUVYLNFFS-UHFFFAOYSA-N | \n",
" 21785456 | \n",
"
\n",
" \n",
" 4 | \n",
" 16 | \n",
" BRD-K18814832 | \n",
" BRD-K18814832 | \n",
" trt_cp | \n",
" 1 | \n",
" 49 | \n",
" 13 | \n",
" 0 | \n",
" InChI=1S/C27H23ClN2O3S/c1-18-12-13-19(2)25(16-... | \n",
" ZQJTYJZLKBRKPC-UHFFFAOYSA-N | \n",
" 2228302 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_uid pert_id pert_iname pert_type num_gold num_inst \\\n",
"0 12 BRD-K68741898 BRD-K68741898 trt_cp 0 3 \n",
"1 13 BRD-A05457250 BAY-K8644 trt_cp 1 23 \n",
"2 14 BRD-K72034655 peucedanin trt_cp 2 33 \n",
"3 15 BRD-K02458594 KU-C103869 trt_cp 2 44 \n",
"4 16 BRD-K18814832 BRD-K18814832 trt_cp 1 49 \n",
"\n",
" num_sig in_summly inchi_string \\\n",
"0 1 0 InChI=1S/C21H37N5O5/c1-15(2)13-31-21(29)24(5)1... \n",
"1 8 0 InChI=1S/C16H15F3N2O4/c1-8-12(15(22)25-3)13(14... \n",
"2 8 0 InChI=1S/C15H14O4/c1-8(2)14-15(17-3)10-6-9-4-5... \n",
"3 13 0 InChI=1S/C15H13NO/c1-11-6-2-5-9-14(11)16-10-12... \n",
"4 13 0 InChI=1S/C27H23ClN2O3S/c1-18-12-13-19(2)25(16-... \n",
"\n",
" inchi_key pubchem_cid \n",
"0 MNAJUJYQFCFYAB-YQVWRLOYSA-N 44505553 \n",
"1 ZFLWDHHVRRZMEI-UHFFFAOYSA-N 2303 \n",
"2 YQBNJPACAUPNLV-UHFFFAOYSA-N 8616 \n",
"3 YCUIAYUVYLNFFS-UHFFFAOYSA-N 21785456 \n",
"4 ZQJTYJZLKBRKPC-UHFFFAOYSA-N 2228302 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pert_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rows = list()\n",
"\n",
"for i, series in pert_df.iterrows():\n",
" # check InChI Keys\n",
" inchi = series.inchi_string\n",
" inchi_key = rdkit.Chem.inchi.InchiToInchiKey(inchi)\n",
" assert inchi_key == series.inchi_key\n",
" \n",
" # molecule\n",
" mol = rdkit.Chem.MolFromInchi(inchi)\n",
" \n",
" # fingerprint\n",
" fingerprint = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)\n",
" \n",
" rows.append([series.pert_uid, inchi_key, inchi, mol, fingerprint])\n",
"\n",
"inchi_df = pandas.DataFrame(rows, columns=['pert_uid', 'inchi_key', 'inchi_string', 'mol', 'fingerprint'])\n",
"inchi_df.sort('inchi_key', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_uid | \n",
" inchi_key | \n",
" inchi_string | \n",
" mol | \n",
" fingerprint | \n",
"
\n",
" \n",
" \n",
" \n",
" 9081 | \n",
" 39929 | \n",
" AAALVYBICLMAMA-UHFFFAOYSA-N | \n",
" InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-... | \n",
" <rdkit.Chem.rdchem.Mol object at 0x7fdfc0ee07b8> | \n",
" <rdkit.DataStructs.cDataStructs.UIntSparseIntV... | \n",
"
\n",
" \n",
" 2664 | \n",
" 9777 | \n",
" AACFPJSJOWQNBN-UHFFFAOYSA-N | \n",
" InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-... | \n",
" <rdkit.Chem.rdchem.Mol object at 0x7fdfc18cf668> | \n",
" <rdkit.DataStructs.cDataStructs.UIntSparseIntV... | \n",
"
\n",
" \n",
" 789 | \n",
" 2195 | \n",
" AADCDMQTJNYOSS-LBPRGKRZSA-N | \n",
" InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)... | \n",
" <rdkit.Chem.rdchem.Mol object at 0x7fdfc1917518> | \n",
" <rdkit.DataStructs.cDataStructs.UIntSparseIntV... | \n",
"
\n",
" \n",
" 16462 | \n",
" 47318 | \n",
" AADVJQLQUVDEBP-GQIGUUNPSA-N | \n",
" InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... | \n",
" <rdkit.Chem.rdchem.Mol object at 0x7fdfc0860828> | \n",
" <rdkit.DataStructs.cDataStructs.UIntSparseIntV... | \n",
"
\n",
" \n",
" 19637 | \n",
" 50497 | \n",
" AADVJQLQUVDEBP-GUXCAODWSA-N | \n",
" InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... | \n",
" <rdkit.Chem.rdchem.Mol object at 0x7fdfc06fdb38> | \n",
" <rdkit.DataStructs.cDataStructs.UIntSparseIntV... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_uid inchi_key \\\n",
"9081 39929 AAALVYBICLMAMA-UHFFFAOYSA-N \n",
"2664 9777 AACFPJSJOWQNBN-UHFFFAOYSA-N \n",
"789 2195 AADCDMQTJNYOSS-LBPRGKRZSA-N \n",
"16462 47318 AADVJQLQUVDEBP-GQIGUUNPSA-N \n",
"19637 50497 AADVJQLQUVDEBP-GUXCAODWSA-N \n",
"\n",
" inchi_string \\\n",
"9081 InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-... \n",
"2664 InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-... \n",
"789 InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)... \n",
"16462 InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... \n",
"19637 InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... \n",
"\n",
" mol \\\n",
"9081 \n",
"2664 \n",
"789 \n",
"16462 \n",
"19637 \n",
"\n",
" fingerprint \n",
"9081 \n",
"\n",
" \n",
" \n",
" | \n",
" pert_id_0 | \n",
" pert_id_1 | \n",
" chemical | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" BRD-K68741898 | \n",
" BRD-K68741898 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 1 | \n",
" BRD-K68741898 | \n",
" BRD-A05457250 | \n",
" 0.1589 | \n",
"
\n",
" \n",
" 2 | \n",
" BRD-A05457250 | \n",
" BRD-K68741898 | \n",
" 0.1589 | \n",
"
\n",
" \n",
" 3 | \n",
" BRD-A05457250 | \n",
" BRD-A05457250 | \n",
" 1.0000 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" pert_id_0 pert_id_1 chemical\n",
"0 BRD-K68741898 BRD-K68741898 1.0000\n",
"1 BRD-K68741898 BRD-A05457250 0.1589\n",
"2 BRD-A05457250 BRD-K68741898 0.1589\n",
"3 BRD-A05457250 BRD-A05457250 1.0000"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# test simple case\n",
"pert_ids = ['BRD-K68741898', 'BRD-A05457250']\n",
"get_similarities(pert_ids, connection)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_id_0 | \n",
" pert_id_1 | \n",
" chemical | \n",
"
\n",
" \n",
" \n",
" \n",
" 1209995 | \n",
" BRD-A25143711 | \n",
" BRD-K99946902 | \n",
" 0.2205 | \n",
"
\n",
" \n",
" 1209996 | \n",
" BRD-A25143711 | \n",
" BRD-K86465814 | \n",
" 0.2041 | \n",
"
\n",
" \n",
" 1209997 | \n",
" BRD-A25143711 | \n",
" BRD-A77722753 | \n",
" 0.0650 | \n",
"
\n",
" \n",
" 1209998 | \n",
" BRD-A25143711 | \n",
" BRD-K02715688 | \n",
" 0.1786 | \n",
"
\n",
" \n",
" 1209999 | \n",
" BRD-A25143711 | \n",
" BRD-A25143711 | \n",
" 1.0000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_id_0 pert_id_1 chemical\n",
"1209995 BRD-A25143711 BRD-K99946902 0.2205\n",
"1209996 BRD-A25143711 BRD-K86465814 0.2041\n",
"1209997 BRD-A25143711 BRD-A77722753 0.0650\n",
"1209998 BRD-A25143711 BRD-K02715688 0.1786\n",
"1209999 BRD-A25143711 BRD-A25143711 1.0000"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# test scalability\n",
"pert_ids = pert_df.pert_id[:1100]\n",
"large_df = get_similarities(pert_ids, connection)\n",
"large_df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_uid_0 | \n",
" pert_uid_1 | \n",
" chemical | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 39929 | \n",
" 39929 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 1 | \n",
" 39929 | \n",
" 9777 | \n",
" 0.2689 | \n",
"
\n",
" \n",
" 2 | \n",
" 39929 | \n",
" 2195 | \n",
" 0.1324 | \n",
"
\n",
" \n",
" 3 | \n",
" 39929 | \n",
" 47318 | \n",
" 0.2078 | \n",
"
\n",
" \n",
" 4 | \n",
" 39929 | \n",
" 50497 | \n",
" 0.2078 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_uid_0 pert_uid_1 chemical\n",
"0 39929 39929 1.0000\n",
"1 39929 9777 0.2689\n",
"2 39929 2195 0.1324\n",
"3 39929 47318 0.2078\n",
"4 39929 50497 0.2078"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# head of similarities table\n",
"pandas.read_sql('SELECT * FROM similarities LIMIT 5', connection)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_uid | \n",
" pert_id | \n",
" pert_iname | \n",
" pert_type | \n",
" num_gold | \n",
" num_inst | \n",
" num_sig | \n",
" in_summly | \n",
" inchi_string | \n",
" inchi_key | \n",
" pubchem_cid | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" CSS001-ATTGCAT | \n",
" ATTGCAT | \n",
" trt_sh.css | \n",
" 0 | \n",
" 0 | \n",
" 7 | \n",
" 0 | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" CSS001-GAGGATA | \n",
" GAGGATA | \n",
" trt_sh.css | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" CSS001-TCAATGA | \n",
" TCAATGA | \n",
" trt_sh.css | \n",
" 0 | \n",
" 0 | \n",
" 7 | \n",
" 0 | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" CSS001-TCAGTTC | \n",
" TCAGTTC | \n",
" trt_sh.css | \n",
" 0 | \n",
" 0 | \n",
" 7 | \n",
" 0 | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" CSS001-TCCATCA | \n",
" TCCATCA | \n",
" trt_sh.css | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_uid pert_id pert_iname pert_type num_gold num_inst \\\n",
"0 1 CSS001-ATTGCAT ATTGCAT trt_sh.css 0 0 \n",
"1 2 CSS001-GAGGATA GAGGATA trt_sh.css 0 0 \n",
"2 3 CSS001-TCAATGA TCAATGA trt_sh.css 0 0 \n",
"3 4 CSS001-TCAGTTC TCAGTTC trt_sh.css 0 0 \n",
"4 5 CSS001-TCCATCA TCCATCA trt_sh.css 0 0 \n",
"\n",
" num_sig in_summly inchi_string inchi_key pubchem_cid \n",
"0 7 0 None None None \n",
"1 1 0 None None None \n",
"2 7 0 None None None \n",
"3 7 0 None None None \n",
"4 1 0 None None None "
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# head of perts table\n",
"pandas.read_sql('SELECT * FROM perts LIMIT 5', connection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}