{ "cells": [ { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import itertools\n", "import gzip\n", "\n", "import pandas\n", "import rdkit.Chem\n", "import rdkit.Chem.AllChem\n", "import rdkit.DataStructs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2015-05-18 14:03:23-- http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip\n", "Resolving www.drugbank.ca (www.drugbank.ca)... 162.243.242.197\n", "Connecting to www.drugbank.ca (www.drugbank.ca)|162.243.242.197|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 4362948 (4.2M) [application/zip]\n", "Server file no newer than local file ‘download/all.sdf.zip’ -- not retrieving.\n", "\n", "Archive: download/all.sdf.zip\n", " inflating: download/all.sdf \n" ] } ], "source": [ "# Download DrugBank SDF file of structures\n", "! wget --timestamping --directory-prefix download http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip\n", "! unzip -d download download/all.sdf.zip\n", "! rm download/all.sdf.zip" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "6743" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read SDF File\n", "supplier = rdkit.Chem.SDMolSupplier('download/all.sdf')\n", "molecules = [mol for mol in supplier if mol is not None]\n", "len(molecules)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Calculate fingerprints\n", "fingerprints = dict()\n", "for mol in molecules:\n", " drugbank_id = mol.GetProp('DATABASE_ID')\n", " fingerprint = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)\n", " fingerprints[drugbank_id] = fingerprint" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Calculate pairwise compound similarities\n", "similarity_rows = list()\n", "for (id0, fp0), (id1, fp1) in itertools.combinations(fingerprints.items(), 2):\n", " similarity = rdkit.DataStructs.DiceSimilarity(fp0, fp1)\n", " similarity = round(similarity, 4)\n", " similarity_rows.append([id0, id1, similarity])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
compound0compound1similarity
0 DB05107 DB08426 0.0966
1 DB05107 DB05105 0.0441
2 DB05107 DB05104 0.1457
3 DB05107 DB08423 0.1268
4 DB05107 DB05102 0.1000
\n", "

5 rows × 3 columns

\n", "
" ], "text/plain": [ " compound0 compound1 similarity\n", "0 DB05107 DB08426 0.0966\n", "1 DB05107 DB05105 0.0441\n", "2 DB05107 DB05104 0.1457\n", "3 DB05107 DB08423 0.1268\n", "4 DB05107 DB05102 0.1000\n", "\n", "[5 rows x 3 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a DataFrame of pairwise similarities\n", "similarity_df = pandas.DataFrame(similarity_rows, columns=['compound0', 'compound1', 'similarity'])\n", "with gzip.open('data/similarity.tsv.gz', 'w') as write_file:\n", " similarity_df.to_csv(write_file, sep='\\t', index=False)\n", "similarity_df.head()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.05267679727458775" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Save a similarity tsv with only compounds in our slim drugbank set\n", "drugbank_slim_df = pandas.read_table('data/drugbank-slim.tsv')\n", "slim_ids = set(drugbank_slim_df.drugbank_id)\n", "similarity_slim_df = similarity_df[similarity_df.compound0.isin(slim_ids) & similarity_df.compound1.isin(slim_ids)]\n", "with gzip.open('data/similarity-slim.tsv.gz', 'w') as write_file:\n", " similarity_slim_df.to_csv(write_file, sep='\\t', index=False)\n", "float(len(similarity_slim_df)) / len(similarity_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": [ "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD/CAYAAADrE0HrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n", "AAALEgAACxIB0t1+/AAAGX1JREFUeJzt3X2QVed92PHvouUtYhenyBJNCjUEyeNkhhAcRVSMASeM\n", "Uhu7RW41KJppbc+Iieq20ymZcRq1gc1o0pckbrFTgVqrbpWM+6IIJyizZaEQYWAlsagRQz2WX4hp\n", "y0zbaSxH7FYjXtbe/vF7Vvfs8tw32L333Hu/n5nLPvd3nz3nuZe953ee85xzHpAkSZIkSZIkSZIk\n", "SZIkSZI6ziLgd4CXga8CPwmsA84Ap4ADQF+quxs4B7wC7EixpcChVHcYuCvFNwGvpuXsLaxvH3AW\n", "GAXun483JEmaG38beCaV7wP+GDgMbEmxg8BOYCVwAVgIDKbyImAPlQSwC9ifyueBNak8DGwANgIn\n", "UmwVMDbn70aS1LAFdV7/cWAklb8F/Cjws0SPAOAIsJ3Y2x8FbgDjwEVgPbC58Psjqe4AkTwupfjR\n", "FN8MHEuxy0A/sOLW3pYk6XbVSxDngY+l8ibgvcAPFV6fAJYTvYYrVeLjNWKNLEOS1Ab1EsSXiI35\n", "aeJQ0jeB7xVeHwTeSnUGCvGBTDwXa2QZkqQ2qJcgfgb4I+BDwAvA/yEGrLem1z9CHG4aS3UWE3v9\n", "HwC+Rhx2+uisuhPAdWAtMcD9UIqPAj+fYqtT24rJaNpFYMqHDx8+fDT1uMgc+3PAfyGSwh8BPwbc\n", "C5xMsWepnMX0OJEoXgMeTrGlwPNED+Q4cHeKP0Cc7TQGPFVY3z7i7KYx4MEqbZq6vbfUVYba3YAS\n", "GWp3A0pkqN0NKJGhdjegRHpi29kTb7JBQ+1uQIkMtbsBJTLU7gaUyFC7G1AiTW876x1ikiT1KBNE\n", "ZzvZ7gaUyMl2N6BETra7ASVyst0NUGt5iEmSmuchJknS3DBBSJKyTBCSpCwThCQpywRRGv3jvHvF\n", "Y/94vdqSpJuV/Cym4oa+qY39FEylR9nfo6QO1PR2pa9+ldKZotztnpr5/9D37j+N/17DvyNJjWp6\n", "2+khppbyMJKkztGJe6kd1oNYCEzOfBnI9BLsQUiaT/YgymeSypBEUT/MGKuQpHLpb3cDetd04phm\n", "h0FSudiDkCRlmSBumQPOkrpbJx7XKMsgdbVB5cxprlN1yrnXSvEeJXWPOR+kXgB8CThDzBv9fmBd\n", "4fmBwgp3A+eIqUR3pNhS4FCqOwzcleKbiKlFzwB7C+vbB5wl5qe+v5k3Iklqrb8M/KdU3k5s7A8D\n", "W1LsILATWAlcIM7pHEzlRcAeKglgF7A/lc8Da1J5GNgAbAROpNgqYl7qnLKc8VPtyudCfPq1euXc\n", "a5I0p+Z8Poh3gOVEL2E5cB34INEjADhCJI77ib3+G8A4cBFYD2wGRlLdkVR3gEgel1L8aIpvBo6l\n", "2GXiDKsVzb4hSdLcqHea6yiwBPgGsbH+OJXeA8AEkTgGgStV4uM1YtPxtcBV4M3MMoqxkuoHJt3r\n", "l9RV6iWIzxJJ4h8CfwF4iTiMNG0QeIvY4A8U4gOZeC5WXMb1KsvIGSqUT9L2eWeL1zTM9dhy/zhM\n", "Fj6X/gmYHJzjlUjqPtvSY978OvDLqXwncVjoKLA1xZ4BHgHuIcYdFhN7/W+k8h5i4BngUeDpVH6d\n", "6DX0EWMQ9xNjEMdTbDUxTpFTlj31WxxbaKRef/EK60w9SWranG873gP8PnCaOOvoUeBeYo/9ZeBZ\n", "KrvMjxMDy68BD6fYUuD59PvHgbtT/AHibKcx4KnC+val9YwBD1ZpU1k2kPOYIOrVk6Sm9cS2oyxv\n", "sgQJ4pbnnpDUe5rednbixVhTlKPdUzPHHXLlWq/dTr1aF+WV4rORVD5Nbzu91YYkKcsEIUnKMkFI\n", "krJMEDV5x1ZJvasTBzRbOUg9ayC61hShDlJLKjUHqSVJc8MEIUnKMkFIkrJMEJKkLBOEJCnLBCFJ\n", "yjJBSJKyTBCSpCwTRMP6YcattSWpu9WbclTvKk4rCl6wLKnb2YOQJGU1kiA+CbyUHq8C7wAfBM4A\n", "p4ADVHandwPniOlEd6TYUuBQqjsM3JXim9LyzgB7C+vbB5wFRom5qiVJHeBfEnNPHwa2pNhBYCew\n", "ErgALAQGU3kRsIdKAtgF7E/l88CaVB4GNgAbgRMptoqYm3q2Vh7/b/FUok3PSe181ZIa1fT2oZlD\n", "TD8N/DjwLNGDOJXiR4DtxN7+KHADGAcuAuuBzcBIqjuS6g4QyeNSih9N8c3AsRS7TIyRrGjyPTXJ\n", "eZ0lKaeZBPEk8GupXByhnQCWE72GK1Xi4zVijSxjHk0OzDw5aXJgftcnSZ2h0bOY3gPcB3w1Pf9B\n", "4bVB4C1ig1/cuA5k4rlYcRnXqyxjtqFC+WR6zJF+YNJDNZI63bb0mHd/Bfh84fmLwNZUfgZ4BLiH\n", "GHdYTOz1v5HKe4iBZ4BHgadT+XVgLdEbGSYOUW0EjqfYamKcYra53niXaGzBMQhJ86bp7UOjPYj7\n", "gD8pPP8l4IvEOMLXgRfSyr8AnCYOXT0JXCMGsZ9L8WvAY2kZTwBfBu4gxiDOpfhp4iyoBcBnmn1D\n", "3c8ejqTW6MSrvZqeNu9m/eMzxxrme4rQVtSbft6R/6eS5l/T284evVCuODDdTYq3A/FsLEm3x1tt\n", "dJXi7UD6PBtL0m3p0R6EJKkeE4QkKcsE0bUcj5B0ezp1DOIn0s93gO+0syHl5XiEpNvTiadETsHq\n", "tEd8eRlMrSB/tXWdZbT7tNT5Os212jI68v9a0txp+jTXDu1B/I/B+LnsKrzdoe9BksrNMQhJUpYJ\n", "QpKUZYKQJGX1UIKYMTGQJKmOHkoQ3Xr/JUmaHz2UICRJzTBBSJKyTBCSpCwThCQpq5EE8SvAy8SU\n", "oJ8E1gFngFPAASqXbu9OdV4BdqTYUuBQqjsM3JXim4BX03L2Fta1DzgLjBJzVEuSSmob8GIq3wn8\n", "GnAY2JJiB4GdwErgArAQGEzlRcAeKglgF7A/lc8Da1J5GNgAbAROpNgqYKxKm6ZgKj3ufIdK0qmn\n", "8HvVyrVeK3u9esuQ1OOa3g7U60E8BPw34A+APySSxQeJHgHAEWA7sbc/CtwAxoGLwHpgMzCS6o6k\n", "ugNE8riU4kdTfDNwLMUuE/eJWtHsG1JO8dbf3v5bUmPq3ejuvcTe/MeAtUSSKN4NcAJYTvQarlSJ\n", "j9eITcfXAleBNzPLKMZ0S4q3/oaZt//uH49rRAD6J2BysJUtk1Re9RLEd4E3iC3Mt4iN+I8WXh8k\n", "brU9TvQMpg1k4rlYcRnXqywjYyj9vN5P9DwO13kfqmr6AkJw3gipq2xLj3mzg8phnx8Bvk1sjLem\n", "2DPAI8A9xLjDYmKv/41U3kMMPAM8Cjydyq8TvYY+YgzifmIM4niKrSbGKXIcg5iTZVT7XCR1qaa/\n", "3/V6EMPEgPQYMV7xGeC/A18kxhG+DryQVvwF4HSq9yRwjRjEfi7FrwGPpeU+AXwZuIMYgziX4qeJ\n", "s6Cm1yVJapNOnGVsqpIIl12Ft1cRh8Ka+L0yzADX6hnlZtd79/9+1ufSkX8TkuqbosnvtxfKSZKy\n", "TBCSpCwThCQpq8sSRHFSIC8Gk6TbUe8spg7jOf2SNFe6rAchSZorXZwgZt9/SJLUjC5OENP3HzI/\n", "3GxG8pSkrC5OEKqumDwlKc8EIUnKMkFIkrJMEJKkLBOEJCnLBCFJyjJBSJKyTBAqmH1xofezknpZ\n", "ownij4GX0uPfAOuAM8Ap4ACVSSh2E7PDvUJMVwqwFDiU6g5TmSJ0E/BqWs7ewrr2AWeBUWIqUrXM\n", "7IsLJ72flaSalhAJouhFYipSiGlFdwIriXmpFwKDqbyImJd6OgHsAvan8nlgTSoPAxuIealPpNgq\n", "YqrT2WrNSV2CuaE7ZU7qpueultTZmv4+N9KD+Engh4i5o08Qe/4biR4BwBFgO7G3PwrcAMaBi8B6\n", "YDMwkuqOpLoDRPK4lOJHU3wzcCzFLhPHPFY0+6YkSbevkdt9vw38JnFo6V4qG/tpE8ByotdwpUp8\n", "vEZsOr4WuAq8mVlGMSZJaoFGEsS3iN4AwLeJjfVPFV4fBN4iNvjFY9YDmXguVlzG9SrLkCS1WCOH\n", "mD4NfC6Vf4TYaB8DtqbYR4jDTWPAh4DFxF7/B4CvEYedPjqr7gSRDNYSA9wPpfgo8PMptjq173s3\n", "N2koPa73E4elJEkzbaOysRyar5X0A79LbMBPEWMQ9wIngZeBZ6mcxfQ4kSheAx5OsaXA88Bp4Dhw\n", "d4o/QJztNAY8VVjfPuLspjHgwUx7HKR2kFpS85r+PvfVr1I6U5X3uewqvL0K+O7Nr/Ux8/MoPq9W\n", "7uR687WujvwbkXSzKZr8PnuhnCQpywQhScoyQUiSskwQkqQsE4QkKcsEoRqKd3f1zq5Sr2nkSmr1\n", "rOm7uwL0eWdXqcfYg5AkZZkgJElZJghJUpYJQpKUZYKQJGWZINQgT3mVeo2nuapBnvIq9Rp7EJKk\n", "LBOEJCnLBCFJymo0QdwNXAbuA9YBZ4jpRw9QmaFoN3COmEZ0R4otBQ6lusNUpgfdREwregbYW1jP\n", "PuAsMTf1/U2/G0lSSy0Efh/4BvB+4EVgS3rtILATWAlcSHUHU3kRsIdKAtgF7E/l88CaVB4GNgAb\n", "gRMptoqYkzrHOanbvi7nqpY6UNPf20Z6EL9JJIL/nZ5vJHoEAEeA7cTe/ihwAxgHLgLrgc3ASKo7\n", "kuoOEMnjUoofTfHNwLEUu0ycYbWi2TckSZob9RLEp4A/pbLh7mPmpNcTwHKi13ClSny8RqyRZUiS\n", "2qDedRCfJrol24nDQM8B7y28Pgi8RWzwi+fGD2TiuVhxGderLEOS1Ab1EsTWQvkl4AnikNNW4KvA\n", "R4hxgzHg14HFwBLgA8DXiMNOHyUGrz9CHJqaIJLBWuIw00PAEPB94DeA3yLGIBYA38s3ayj9vN5P\n", "HJo6XP+tSlJP2ZYeLfEScRbTvcBJ4GXgWSqHnB4nEsVrwMMpthR4HjgNHCfOhgJ4gDjbaQx4qrCO\n", "fcTZTWPAg1Xa4SB129flILXUgZr+3vbVr1I6U5X3uewqvL0K+O7Nr/Ux8/MoPq9W7uR6rW5TR/7t\n", "SL1siia/t14oJ0nKMkFIkrJMELoFxVt/e/tvqVt5u2/dguKtv8Hbf0vdyR6EJCnLBKE54GxzUjfy\n", "EJPmgLPNSd3IHoQkKcsEIUnKMkFIkrJMEJKkLBOEJCnLBCFJyjJBSJKyTBCSpCwThCQpywQhScpq\n", "JEHcAXwJOENMHfoTwLr0/BRwgMosRbuJ+adfAXak2FLgUKo7TGWK0E3E9KJngL2F9e0DzhLzWd9/\n", "C+9JktQif5WYexpgK3A4Pbak2EFgJ7ASuAAsBAZTeRGwh0oC2AXsT+XzwJpUHgY2ABuBEym2ipib\n", "erbC3MjOSV3ONkkqoaa/m430IA4Dv5jK7wP+DPgg0SMAOAJsJ/b2R4EbwDhwEVgPbAZGUt2RVHeA\n", "SB6XUvxoim8GjqXYZeJmgiuafVOSpNvX6BjE94F/B3we+DIzJ76eAJYTvYYrVeLjNWKNLEOS1GLN\n", "3O77U8A9xGGfJYX4IPAWscEv3up5IBPPxYrLuF5lGbMMpZ/X+4lex+Em3ofmVT8wmbqy/RMwOdjW\n", "5ki9a1t6zKu/AfxKKg8C3yEOCW1NsWeAR4jkcQFYTOz1v5HKe4iBZ4BHgadT+XVgLdEbGSYOUW0E\n", "jqfYamKcYrZZYxD9E8yYH7nsx+d7YQzC8QiphJr+PjbSg3iBOLz0VWIA+u8B3wC+SIwjfD3VmQK+\n", "QJzptAB4ErhGDGI/l+LXgMfScp8gDlfdQSSccyl+mjgLagHwmfrNm1xWed99NWtKkhrXiVvUqUpC\n", "WHYV3l4yM0HkyrVe65Z6ZWxT37v/SGq7KZr8PnqhnCQpywShedQPFMeH+sdr15dUJs2cxSQ1aZJZ\n", "h6IGqtWUVD72ICRJWSYISVKWCUKSlGWCkCRlmSAkSVkmCElSlglCkpRlgpAkZZkgJElZJgi1UPHW\n", "G952Qyo7b7WhFireesPbbkhlZw9CkpRlgpAkZZkgJElZ9RLEQuB3gVPAWeDjwDrgTIodoDJD0W5i\n", "2tBXgB0pthQ4lOoOA3el+Cbg1bScvYX17UvrGSXmqJYkldSngH+eyj8M/E/gMLAlxQ4CO4GVwAUi\n", "oQym8iJgD5UEsAvYn8rngTWpPAxsADYCJ1JsFTBWpU1TMJUed74z83m1cq3XuqVeGdtUq16/EwlJ\n", "rTVVv8pM9c5i+j3ghVReANwgNuSnUuwI8BDwfWKv/0Z6XATWA5uBf5bqjgC/CgwQyeNSih8FtgPX\n", "gGMpdjm1bQXwZrNvSp3AM5qksqt3iOlt4P8RG/XfA/7RrN+ZAJYTvYYrVeLjNWKNLEOS1AaNXAex\n", "CvgK8DTwH4DfKLw2CLxFbPCLe4EDmXguVlzG9SrLyBhKP697HYck5W1Lj3lzD/AG8OFC7EVgayo/\n", "AzyS6l0AFhN7/W+k8h5i4BngUSLJALwOrCUGuIeJAemNwPEUW02MU+Q4BtH2dc1HPUnzbM6/Z58H\n", "/hfwUuGxHjgJvAw8S+UspseJgeXXgIdTbCnwPHCa2PjfneIPEGc7jQFPFda3jzi7aQx4sEqbTBBt\n", "X5cJQupATX/P+upXKZ2pyvtcdhXeXlJ53ke+XOu1bqlXxjY1U68j/xalTjJFk98zL5STJGWZICRJ\n", "WSYISVKWCUIlUJwnwiurpbLwOgKVQPGqavDKaqkc7EFIkrJMEJKkLBOESsi5q6UycAxCJeSdXqUy\n", "sAchScoyQUiSskwQkqQsE4QkKcsEIUnKMkGo5DzlVWoXT3NVyXnKq9Qu9iAkSVmNJogHiOlGAdYB\n", "Z4BTwAEqMxTtBs4RU4nuSLGlwKFUdxi4K8U3EVOLngH2FtazDzgLjBLzVEuSSuyzwAViDmqAF4Et\n", "qXwQ2AmsTHUWAoOpvAjYQyUB7AL2p/J5YE0qDwMbgI3AiRRbRcxLnVOYy9g5qcvdJueulkqk6e9P\n", "Iz2Ii8AnqPQUNhI9AoAjwHZib38UuAGMp99ZD2wGRlLdkVR3gEgel1L8aIpvBo6l2GVifGRFs29I\n", "3cx5I6RWaiRBfIUYKZxWnPR6AlhO9BquVImP14g1sgwpmR6wnn5MOmgtzaNbOYvpB4XyIPAWscEv\n", "flkHMvFcrLiM61WWkTGUfl73LCxJytuWHvPufcTgM8QYxNZUfgZ4BLiHGHdYTOz1v5HKe4iBZ4BH\n", "gadT+XVgLdEbGSYOUW0EjqfYamKcIqdwTNoxiHK3qRX1JDWo6e9LM3vg0wv/JeCLxDjC14EX0mtf\n", "AE4Th62eBK4Rg9jPpfg14LG0jCeALwN3EGMQ51L8NJGIFgCfafbNqJf1j1cOOfVPwORge9sjdb6+\n", "+lVKZ6qSq5ZdhbeXVJ73kS/Xeq1b6pWxTa2o9+7f8NTMeh35ty3Npyma/F54oZwkKcsEIUnKMkGo\n", "g824LkLSHDNBqIMVr4uQNNdMEJKkLBOEJCnLBKEu5D2bpLngrSrUhYqTDIETDUm3xh6EeoDTlkq3\n", "wh6EeoDTlkq3wh6EeozjE1Kj7EGoxzg+ITXKHoR6nOMTUjUmCPW4GVdjD2CykN7lISbpXQ5mS0X2\n", "IKQsB7OlMiaIBcRUpi8DLwE/1t7mqDcVDz3ddPjJxKGeUMYEsZOYzvRB4B8An2tvcySYmTBmlGcd\n", "iuofp/3JY1ub1ltG29rdgE5WxgSxGRhJ5bPAT7exLVIdsw9FTQ6UYNB7WwvXVXbb2t2ATlbGBDEI\n", "FL9M36ec7ZTIHIqq8loxWdx0mKpK2cNZaq8ynsU0TnyZpi0AfjCzys9eiZ/vLGtRm6TbdNMFeoXn\n", "1cqzny8cqDzpT8ucVnzeB0ztq1+v0dduud4ETA6ijtXX7gZkfAL4OPBpYBPwq8COwusXceBakpr1\n", "J8C6djfidvUBB4HR9Livvc2RJEmSJElS56p3wdzHgbH0+uOtbVrL1fssfgF4FThDHJ4r49jSXGn0\n", "Qsp/DfyTVjWqTep9FvcDp4DTwH8kri/qVvU+i4eBc8Q244nWNq0tHiA+h9m6Zrv5CeBLqfwA8AeF\n", "1xYC3waWp/IYcHdLW9datT6LpcTA/ZL0/N8TfwTdqtZnMe0XiS/AP25Vo9qk1mfRB7wOrE3PdwPv\n", "b13TWq7e38Ul4D3M3HZ0q88CF4jvQFHT280yX19Q64K5DxAbxSvADWLPeUtLW9datT6Lq8BfSj8h\n", "zjV8p3VNa7l6F1I+CPwM8K/o7p4U1P4s7gPeBPYAJ4mN4zdb2bgWq/d3cYP4DJZy87nE3eYikTBn\n", "//03vd0sc4KodcHcIPEmp03Q3XsEtT6LKeBPU/nvAncCx1vXtJar9Vn8eWAv8Hfo/uQAtT+Lu4hk\n", "+dvAduDngA+3tHWtVe8C288B/xX4GvCHs+p2m68w84KUaU1vN8ucIGpdMHdl1msDwJ+1qF3tUO/i\n", "wQXAbxEbgb/Wwna1Q63P4q8TG8b/DPwy8BjwN1vautaq9Vm8SewtfpPYWIzQ3betqfVZrCZ2Gv4i\n", "8D7gHuJvpdc0vd0sc4IYBT6aypuIY2rTvgHcC/wwMfC2BXilpa1rrVqfBcThlMXEQNxVulutz+K3\n", "iY3gh4F/SozH/E5LW9datT6L7wDLqAzWfojYe+5WtT6LJUSP4hqRNP4vcbip13TVdjN3wdwvEINt\n", "AB8jBlleA/5WOxrYQrU+i58i/vhfKjx2tqeZLVHv72LaJ+n+Qep6n8WHiePxY8C/aEcDW6jeZ/H3\n", "ibOYTgP/lnLeZmguvY/KIHWvbjclSZIkSZIkSZIkSZIkSZIkSZIkSb3u/wMsYvoUJuwO3gAAAABJ\n", "RU5ErkJggg==\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# histogram of similarities\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "plt.hist(similarity_df.similarity, 100);" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": [ "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD/CAYAAAD8MdEiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n", "AAALEgAACxIB0t1+/AAAETNJREFUeJzt3XuMXNdBx/HvJms7S5k1kUwSEJTEOKkiJOMGjE2tOBuw\n", "IiUmqhMoTisBiRSrBYEUTFQeAmel8JJKIKVqHIoptFULpKSQoCXryMWu310bElkRTajBEkggUYXG\n", "a6o2tpvhj3Omc3c8z92de+/c8/1I0949czxz9mbn/Oaccx8gSZIkSZIkSZIkSZIkSZIkXeGfgYPx\n", "8WfAOuAocBh4ChiL9XYBp4ATwPZYNgE8G+vOAGti+WbgZHydPUP/DSRJi3YNIQiynge2xu29wA7g\n", "BuAMsAKYjNsrgd00O/qdwJNx+2Xgprg9A2wYQtslST1c1UedHwS+DdgPfJ7wTf42wjd8gBeAbcBG\n", "4BhwCZgHzgLrgS3AbKw7G+vWCCFxLpbvj+WSpJyN91Hna8CHCFNCN9Ps1BsuAKsJo4DzHcrnu5Q1\n", "ytcO2HZJ0jLoJwj+lfDtHuDLwOvAOzPPTwJvEDr2Wqa81qa8XVn2NSRJOetnaugh4Im4/d2EDvxF\n", "4I5YdjdhmmgOuB1YRfjWfyvwCmG66J6WuheAi4RRwBhwF82ppqyzQN2HDx8+fPT9aHxxX1bjwKcI\n", "HfVhwhrBzcAh4Diwj+ZRQw8TAuE0cF8smwCeAY4AB4DrYvkmwtFFc8DjHd67vny/xsibLroBJTJd\n", "dANKYrroBpTIdNENKJHK9ZuV+4WWYLroBpTIdNENKInpohtQItNFN6BEBu43+5ka0rIan2fBMG58\n", "vsc/kKSkVXFEUId65tH37zg1xDaNmqmiG1ASU0U3oESmim5AiQzcb471rlKoOuVv46DqC/87jX3r\n", "fyRpGQzcbzo1JEmJMwhGRnZtwXUFSelIbI2ga2e/mHUFSelxjWAEdFsjyDx3xdpBt+ckqcE1AknS\n", "YAyCXCw4d0CSSqXs0wtVmRpqmdZxakjS0Dg1NHrGwdGCpAIZBIW7jDkgqUgGwVC0Xk9IksqrnxvT\n", "aGCXa23WASSplBwRSFLiDAJJSpxBIEmJMwiWjSeNSRpNBsGyaSwQmwOSRotBIEmJMwgkKXEGgSQl\n", "ziCQpMQZBJKUOINgSTxkVNLoMwiWxENGJY0+g0CSEmcQSFLiDAJJSpxBIEmJMwhKa8G9jF2NljQ0\n", "3qGstBr3Mm7wLmeShsMRwUjKjhbG5wtujKQR54hgJGVHC2O1IlsiafQ5IpCkxBkEI691UdmpIkmD\n", "cWpo5F2xqOxUkaSB9DsiuA74T+AWYB1wFDgMPEXzcJZdwCngBLA9lk0Az8a6M8CaWL4ZOBlfZ8+S\n", "fgNJ0tCtAP4WeBV4B/A8sDU+txfYAdwAnIl1J+P2SmA3zY5+J/Bk3H4ZuCluzwAbOrx32Y+fr0M9\n", "PjptL/a5pdSTlLCB+4B+RgQfInT4/x1/vo3wDR/gBWAbsBE4BlwC5oGzwHpgCzAb687GujVCSJyL\n", "5ftj+QhYcNlpO1xJldArCB4EvgK8GH8eY+GZTReA1YRRwPkO5fNdyrLlIyB72WlzQFI19FosfojQ\n", "420jTN98AvjOzPOTwBuEjj27SFlrU96uLPsanUxntg/FhyQpmIqPXBykuUZwRyx7GngPcD1hXWAV\n", "4dv9l+L2buCxWPcB4KNx+yVgLWF0MUOYWmqnbF+7l3l+3zUCSctuqH3AQcJRQzcTvpUfB/bRnCp6\n", "GJgDTgP3xbIJ4BngCHCAcPQRwCbC0UVzwONd3rNsnZpBIKnsBu4Dyn4lszrlamN94T4eo/lzp+3F\n", "PreUeqXaZ5LyNXC/6ZnFPXmDeknVZhD05A3qJVWbQSBJiTMIJClxBoEkJc4gkKTEGQSV420sJQ3G\n", "+xFUjrexlDQYRwSSlDiDIBmtl9B22khS4NRQMhonxjU4bSQpcEQgSYkzCCQpcQaBJCVuBILgbf8b\n", "Hvx00S2RpCoagSB49Vp4oAa8veiWSFIVjUAQfA9Q8xrQkjQkIxAEkqRhMggkKXEGgSQlziCQpMQZ\n", "BG1V5Yb12UtSS1J7BgFw5QXZqnLD+sYlqUf995A0TF50DmhzQbbCWiJJeXNEIEmJMwgkKXEGgSQl\n", "ziCQpMQZBJKUOINAkhJnEEhS4gwCSUqcQSBJiTMIkpW9DtH4fMGNkVQgLzGRrMZ1iADGakW2RFKx\n", "HBFIUuL6CYKrgY8DR4EjwA8A6+LPh4GnaF6lbRdwCjgBbI9lE8Czse4MsCaWbwZOxtfZs8TfQ5I0\n", "RO8G9sXtO4Dn4mNrLNsL7ABuAM4AK4DJuL0S2E2zo98JPBm3XwZuitszwIY2712Heh0euQg8uky/\n", "TzvxfRoP+thejnp5vlevepIqYuDPcz8jgueA98ftG4GvAj9E+IYP8AKwDdgIHAMuAfPAWWA9sAWY\n", "jXVnY90aISTOxfL9sVySlLN+1wi+CfwF8GHg0yy8YP8FYDVhFHC+Q/l8l7JsuSQpZ4McNfQgcD0w\n", "B1yTKZ8E3iB07NmjT2ptytuVZV9DkpSzfkYEPwP8etz+OmF0cJqwXgBwN2GaaA64HVhF+HZ/K/AK\n", "Ybronpa6F4CLwFrC6OIumlNNLaaBE1fHOlN9/VaSlI4pQkfZeAzFBPDXwBeA48C9wM3AofjzPppT\n", "RQ8TAuE0cF/m3z9DOOLoAHBdLN9EOLpoDni8w3sPcbG49T7FLhZLqoTKfZ6HGARl6JwNAknLbuDP\n", "syeUSVLiDAJJSpxBIEmJMwgkKXEGgSQlziCQpMQlFgQLzh3Qt3iTGillid2Y5nKtmQFjXWumxZvU\n", "SClLbEQgSWplEEhS4gwCSUqcQSBJiTMIJClxBoEkJc4gkKTEGQRqkT25zBPMpBQkdkKZesueXAae\n", "YCZVnyMCSUpcxYOg9b7EkqRWFZ8ayl5bCLy+kCRdqeIjAklSLwaBJCXOIJCkxBkEkpQ4g0CSEmcQ\n", "SFLiDAJJSpxBIEmJq2AQLDibWJLUQwWDoHE2sTkgSf2oYBBIkgZhEEhS4gwCSUqcQaAesncs825l\n", "UhVV/DLUWrrsHcu8W5lURY4IJClxBoEkJa5XEKwAPgUcBr4I3AusA47Gsqdo3vZrF3AKOAFsj2UT\n", "wLOx7gywJpZvBk7G19mzDL+HcuF6gZSiB4E/jNvXAv8BPAdsjWV7gR3ADcAZQnBMxu2VwG6aHf1O\n", "4Mm4/TJwU9yeATZ0eP861OvwyEXg0T7bHP9Nvb5wu/XnMtQrY5sGqSephAb+bPYaEXyWZkd+FXAJ\n", "uI3wDR/gBWAbsBE4Fp+fB84C64EtwGysOxvr1gghcS6W74/lA2i9Kb3fTiVpsXoFwdeA/yN03p8F\n", "frPl31wAVhNGAec7lM93KcuWDyB7GYl6/FmStBj9LBZ/L/CPwCeBvwTeyjw3CbxB6NiznXGtTXm7\n", "suxrSJIK0Os8guuBF4FfAA7GspeAO4AvAHcDnwfmgN8BVgHXALcCrxCmi+4hLCLfTZhSugBcBNYS\n", "pofuAqY7N2EaOHF1rHcaONT3bydJ1TcVH0PzYeC/CCHQeKwndMbHgX00jxp6mBAIp4H7YtkE8Axw\n", "BDgAXBfLNxGOLpoDHu/y/p0Wi9ssaLZ7ruwLrmVsk4vF0oir3GfTICh1PUklNPBn0xPKJClxBoEk\n", "Jc4gkKTEGQSSlLiKBEH2GjiSpEFUJAga18w3ByRpUBUJAknSYhkEkpQ4g0CSEmcQSFLiDAJJSpxB\n", "oEVacMhu3ZsDSaOr12WopQ4ah+w2jHlzIGlEOSKQpMQZBJKUOINAkhJnEGiZZBePXTiWRomLxVom\n", "2cVjF46lUeKIQJISZxBIUuIMAklKnEEgSYkzCCQpcQaBJCXOIJCkxBkEkpQ4g0CSEmcQSFLiDAIN\n", "gdcdkkaJ1xrSEHjdIWmUOCKQpMQZBJKUOINAkhJnEEhS4gwCSUqcQaAhyx5K6uGkUhl5+KiGLHso\n", "KXg4qVQ+/Y4INgEH4/Y64ChwGHgKGIvlu4BTwAlgeyybAJ6NdWeANbF8M3Ayvs6exTdfkpSHDwJn\n", "gOPx5+eBrXF7L7ADuCHWWQFMxu2VwG6aHf1O4Mm4/TJwU9yeATZ0eO861OvwyEXg0SvLGw/62C5j\n", "vTK2KY96koZo4M9YPyOCs8D9NL/530b4hg/wArAN2AgcAy4B8/HfrAe2ALOx7mysWyOExLlYvj+W\n", "S5IK0E8QfI4w0dswltm+AKwmjALOdyif71KWLe9h/LdpLjpKkpbJYhaL38psTwJvEDr27CJgrU15\n", "u7Lsa3QwDZy4Gi6vCMsUUyzMIklK2lR8DN2NhEVgCGsEd8Ttp4H3ANcT1gVWEb7dfylu7wYei3Uf\n", "AD4at18C1hJ69BnC1FI72TWCks53u0bgGoFUKgN/xgYZETRe/FeAPyXM8/8L8DfxuT8GjhCmm34D\n", "eJOwmPyJWP4m8L74Gh8APg1cTVgjODVowyVJaXBEULl6441RQd2Ty6ShGOqIQFoG3qtAKhsvMSFJ\n", "iTMIJClxBoEK5L2NpTJwjUAFcr1AKgNHBJKUOINAkhJnEEhS4gwCSUqcQSBJiTMIVBLe21gqioeP\n", "qiS8t7FUFEcEkpQ4g0CSEmcQSFLiDAKVlNchkvLiYrFKyusQSXlxRCBJiTMIJClxBoFG0Pg8rh9I\n", "y8Y1Ao2gyzXXD6Tl44hAkhJnEGgEtF6HSNJycmpII+CK6xAV1RCpkhwRSFLiDAJJSpxBoBHnpSik\n", "pXKNQCPOS1FIS+WIQBXiXc6kxXBEoArxLmfSYjgikKTEGQSqMBeSpX4YBKqwxlRRHaCGoSC15RqB\n", "EuHRRVInjgiUII8ukrIMAiUoO2XktJFUZBBcBTwNHAcOAt9fYFuUNNcSlLYig2AHsBJ4F/BrwBMF\n", "tkWKOoVCazCU4i5pUwW9bxlNFd2AUVZkEGwBZuP2F4EfLrAtUhvdppAad0nrFRhDNZXT+4yCqaIb\n", "MMqKDIJJIPuB+SauWajUFowWOpRfMb1Ub7/d7TmnpJSvIg8fnSd8YBquAt66stqPnYfXVgEr8mmW\n", "tFQLDlXtsN3tuRW1hRXH42u2bo8B9cd61+v2XGH1LsDlSVQKRd7q6X7gXuAhYDPwW8D2ljpncRFZ\n", "kgbxb8C6ohvRrzFgL3AsPm4ptjmSJEmSJEnKX68Ty+4F5uLzD+fbtNz12hfvBU4CRwnTakWu8Qxb\n", "vyccfgz4vbwaVZBe+2IjcBg4AvwV4fycquq1L+4DThH6jA/k27RCbCLsh1Yj12/eD3w8bm8C/i7z\n", "3Argy8DquD0HXJdr6/LVbV9MEBbPr4k/f4bwH7uquu2LhvcT/tB/N69GFaTbvhgDXgLWxp93Ae/I\n", "r2m56/V3cQ74Dhb2HVX1QeAM4TOQNXC/WYbj9rudWHYrofM7D1wifBPemmvr8tVtX3wD+NH4/xCO\n", "xft6fk3LXa8TDt8F/AjwJ1R7ZATd98UtwOvAbuAQoRN8Lc/G5azX38Ulwj6Y4MrjdavmLCEYW//+\n", "B+43yxAE3U4smyT8Mg0XqHbCd9sXdeArcfuXgLcBB/JrWu667YvvAvYAv0j1QwC674s1hFD8CLAN\n", "+HHgzlxbl69eJ6I+AfwT8Arw9y11q+ZzLDxpo2HgfrMMQdDtxLLzLc/VgK/m1K4i9DrJ7irgDwgf\n", "9p/MsV1F6LYvforQAf4D8KvA+4CfzbV1+eq2L14nfPt7jdApzFLty7V02xdvJ3w5+D7gRuB6wt9K\n", "agbuN8sQBMeAe+L2ZsKcV8OrwM3AtYQFsK3AiVxbl69u+wLCNMgqwoLYN6i2bvviI4TO7k7g9wnr\n", "JZ/MtXX56rYv/h34dpqLprcTvg1XVbd9cQ1hhPAmIRz+hzBNlJqR7DfbnVj2XsKiF8BPEBY7TgM/\n", "X0QDc9RtX7yT8Ed+MPPYUUwzc9Hr76Lh56j+YnGvfXEnYb58DvijIhqYo1774pcJRw0dAf6c6t+F\n", "8Uaai8Wp9puSJEmSJEmSJEmSJEmSJEmSJEmSpH79PwOKk9x4WJQhAAAAAElFTkSuQmCC\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# histogram of similarities in slim subset\n", "plt.hist(list(similarity_slim_df.similarity), 100);" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }