{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distribution of publication count for Dmel TF genes\n",
    "\n",
    "For each *Drosophila melanogaster* (Dmel) transcription factor (TF) gene, count the number of distinct *curated* publications (PMIDs), using association data from GO and Monarch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ontobio.golr.golr_associations as ga"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "478"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch all Dmel TF genes\n",
    "DNA_BINDING_TF = 'GO:0003700'\n",
    "DMEL = 'NCBITaxon:7227'\n",
    "tf_genes = ga.get_subjects_for_object(object=DNA_BINDING_TF, subject_taxon=DMEL)\n",
    "len(tf_genes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "140"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the curated publications (PMIDs) of a gene from Monarch and GO\n",
    "def get_pubs_for_gene(g):\n",
    "    \"\"\"Return the set of PMID identifiers found on the associations of gene `g`.\n",
    "\n",
    "    Two searches are issued through `ga.search_associations`: one with the\n",
    "    default arguments (the Monarch query of this notebook) and one with\n",
    "    object_category='function' (the GO query of this notebook). Only\n",
    "    identifiers starting with 'PMID' are kept, and the set removes duplicates.\n",
    "    Associations without a publication/reference list are skipped.\n",
    "    \"\"\"\n",
    "    pubs = set()\n",
    "\n",
    "    # Monarch: each publication is a mapping with an 'id' entry\n",
    "    r = ga.search_associations(subject=g, rows=-1)\n",
    "    for a in r['associations']:\n",
    "        # .get(...) or [] covers both an absent key and an explicit None\n",
    "        for p in a.get('publications') or []:\n",
    "            pub_id = p.get('id')\n",
    "            if pub_id and pub_id.startswith('PMID'):\n",
    "                pubs.add(pub_id)\n",
    "\n",
    "    # GO: each reference is a plain identifier string\n",
    "    r = ga.search_associations(subject=g, rows=-1, object_category='function')\n",
    "    for a in r['associations']:\n",
    "        for ref in a.get('reference') or []:\n",
    "            if ref and ref.startswith('PMID'):\n",
    "                pubs.add(ref)\n",
    "\n",
    "    return pubs\n",
    "\n",
    "len(get_pubs_for_gene(tf_genes[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the (gene, number of publications) pairs for every TF gene.\n",
    "# Each gene triggers two association searches, so this cell may take a while.\n",
    "pairs = [(g, len(get_pubs_for_gene(g))) for g in tf_genes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
"[140, 97, 34, 107, 110]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: publication counts of the first five genes\n",
    "vals = [n_pubs for _, n_pubs in pairs]\n",
    "vals[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['FB:FBgn0085253']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: genes without a single curated publication\n",
    "tf_genes_with_no_pubs = [gene for gene, n_pubs in pairs if n_pubs == 0]\n",
    "tf_genes_with_no_pubs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['FB:FBgn0038626',\n",
       " 'UniProtKB:A0A0B4LH09',\n",
       " 'FB:FBgn0024975',\n",
       " 'FB:FBgn0025185',\n",
       " 'FB:FBgn0028647',\n",
       " 'FB:FBgn0029173',\n",
       " 'FB:FBgn0029928',\n",
       " 'FB:FBgn0030008',\n",
       " 'FB:FBgn0030012',\n",
       " 'FB:FBgn0032694',\n",
       " 'FB:FBgn0033449',\n",
       " 'FB:FBgn0033627',\n",
       " 'FB:FBgn0037317',\n",
       " 'FB:FBgn0039078',\n",
       " 'FB:FBgn0039329',\n",
       " 'FB:FBgn0039937',\n",
       " 'FB:FBgn0052006',\n",
       " 'FB:FBgn0053213',\n",
       " 'FB:FBgn0053557',\n",
       " 'FB:FBgn0085253',\n",
       " 'FB:FBgn0263511',\n",
       " 'UniProtKB:A0A0B4K653',\n",
       " 'UniProtKB:A0A0B4KGA3',\n",
       " 'UniProtKB:A0A0B4KGM5',\n",
       " 'UniProtKB:A0A0B4KGW2',\n",
       " 'UniProtKB:A0A0B4KGW6',\n",
       " 'UniProtKB:A0A0B4KHC8',\n",
       " 'UniProtKB:A0A0B4LGG8']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Genes with fewer than 5 publications\n",
    "[gene for gene, n_pubs in pairs if n_pubs < 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAEKCAYAAAAfGVI8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFEFJREFUeJzt3XuwXWd93vHvg21CEDbY6HCqAEIuo3HrNCC7J24obgo1TklMkZuhjimmKhFVZrgEAqQImrbuNDPVTEsutAlTJTYWYC6qjSMz5lIjcIC0ActG8TWOiWsN9uiGIbbjTEwMv/6xl+LDyblsSWftfbTf72fmzF7XvX5es+3H77vWeleqCklSu54y7gIkSeNlEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIad/K4CxjG6tWra926deMuQ5JOKLfccsu3qmpqqe1OiCBYt24de/bsGXcZknRCSbJvmO3sGpKkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMadEE8W92nd1hsWXHf/totGWIkkjYctAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJalxvQZDkrCR7Z/09kuTtSc5IcmOSe7vP0/uqQZK0tN6CoKruqaoNVbUB+PvAXwDXAVuB3VW1HtjdzUuSxmRUXUMXAH9aVfuAjcCObvkO4OIR1SBJmseoguBS4GPd9HRV7e+mDwDTI6pBkjSP3oMgyVOBVwP/a+66qiqgFthvS5I9SfYcPny45yolqV2jaBH8NHBrVR3s5g8mWQPQfR6ab6eq2l5VM1U1MzU1NYIyJalNowiC1/JktxDA9cCmbnoTsGsENUiSFtBrECRZBVwIfHLW4m3AhUnuBV7RzUuSxqTXN5RV1WPAs+cse4jBXUSSpBXAJ4slqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDWu73cWPyvJNUn+OMndSV6S5IwkNya5t/s8vc8aJEmL67tF8JvAZ6vq7wAvBu4GtgK7q2o9sLublySNSW9BkOSZwE8CVwBU1Xer6s+AjcCObrMdwMV91SBJWlqfLYIzgcPAB5N8PcnvJlkFTFfV/m6bA8B0jzVIkpbQZxCcDJwLfKCqzgEeY043UFUVUPPtnGRLkj1J9hw+fLjHMiWpbX0GwQPAA1X11W7+GgbBcDDJGoDu89B8O1fV9qqaqaqZqampHsuUpLb1FgRVdQD4ZpKzukUXAHcB1wObumWbgF191SBJWtrJPX//W4GrkzwVuA94A4Pw2ZlkM7APuKTnGiRJi+g1CKpqLzAzz6oL+jyuJGl4PlksSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMb1PfroRFu39YZF19+/7aIRVSJJx84WgSQ1ziCQpMYZBJLUOINAkhrnxeJFLHUxWJImgS0CSWpcry2CJPcDjwLfA56oqpkkZwCfANYB9wOXVNV3+qxDkrSwUbQIXl5VG6rqyEvstwK7q2o9sLublySNyTi6hjYCO7rpHcDFY6hBktTpOwgK+HySW5Js6ZZNV9X+bvoAMN1zDZKkRRzVNYIkTwGeUVWPDLnL+VX1YJLnADcm+ePZK6uqktQCx9oCbAFYu3bt0ZQpSToKS7YIknw0yWlJVgF3AHcl+eVhvryqHuw+DwHXAecBB5Os6b57DXBogX23V9VMVc1MTU0N908jSTpqw3QNnd21AC4GPgOcCbx+qZ2SrEpy6pFp4KcYBMn1wKZus03ArmOoW5K0TIbpGjolySkMguB/VNVfLdSdM8c0cF2SI8f5aFV9NsnNwM4km4F9wCXHWLskaRkMEwT/k8H9/n8EfCnJC4AlrxFU1X3Ai+dZ/hBwwdGVKUnqy5JBUFXvB94/a9G+JC/vr6TJsdgQFb6rQNJKMczF4ukkVyT5TDd/Nk/28UuSTnDDXCy
+Cvgc8CPd/J8Ab++rIEnSaA0TBKuraifwfYCqeoLB2EGSpAkwTBA8luTZDJ4SJslPAA/3WpUkaWSGuWvoHQzu/X9hkj8ApoDX9FqVJGlkhrlr6NYk/xg4CwhwT1X9Ve+VSZJGYtixhs5j8P6Ak4Fzk1BVH+qtKknSyCwZBEk+DLwQ2MuTF4kLMAgkaQIM0yKYYTDe0DDDSkiSTjDD3DV0B/C3+i5EkjQew7QIVjMYevprwONHFlbVq3urSpI0MsMEweV9FyFJGp9hbh/9/W7E0fVV9fkkTwdO6r80SdIoDDPo3L8BrmEwHDXAc4Hf67MoSdLoDHOx+M3AS+neQVBV9wLP6bMoSdLoDBMEj1fVd4/MJDmZbtwhSdKJb5iLxb+f5L3ADye5EHgT8Kl+y1o+i70cRpI0XItgK3AYuB34BeDTwK/0WZQkaXSGuWvo+8DvdH9HLclJwB7gwap6VZIzgE8wGLvofuCSqvrOsXy3JOn4DXPX0O1Jbpvz9+Ukv969p2ApbwPunjW/FdhdVeuB3d28JGlMhuka+gxwA/C67u9TDP4P/wCD11guKMnzgIuA3521eCOwo5veAVx8VBVLkpbVMBeLX1FV586avz3JrVV1bpLLltj3N4B/C5w6a9l0Ve3vpg8A0/PtmGQLsAVg7dq1Q5QpSToWw7QITkpy3pGZJD/Ok08WP7HQTkleBRyqqlsW2qYb0XTeW1GrantVzVTVzNTU1BBlSpKOxTAtgjcCVyZ5Rjf/KLA5ySrgvyyy30uBVyf5GeBpwGlJPgIcTLKmqvYnWQMcOo76JUnHackWQVXdXFU/BmwANlTVi7plj1XVzkX2e09VPa+q1gGXAl+oqssYvP94U7fZJmDXcf9TSJKO2bCvqqSqHl6mY24DdibZDOwDLlmm75UkHYOhg+B4VNVNwE3d9EPABaM4riRpaQt2DSX5F93nmaMrR5I0aotdI3hP93ntKAqRJI3HYl1DDyX538CZSa6fu9JXVUrSZFgsCC4CzgU+DLxvNOVIkkZtwSDo3kHwh0n+YVUdPvIcQVX9+ciqkyT1bpgni6eTfB24E7gryS1J/l7PdUmSRmSYINgOvKOqXlBVa4F3dsskSRNgmCBYVVVfPDLTPROwqreKJEkjNcwDZfcl+fcMLhoDXAbc119JkqRRGqZF8PPAFPBJBs8UrO6WSZImwDCvqvwO8IsjqEWSNAbDtAgkSRPMIJCkxhkEktS4JYMgyfOSXJfkcJJDSa7tXkovSZoAw7QIPsjgrWJrgB8BPtUtkyRNgGGCYKqqPlhVT3R/VzG4nVSSNAGGCYKHklyW5KTu7zLgob4LkySNxrAPlF0CHAD2A68B3rDUTkmeluRrSf4oyd1JtnXLz0hyY5J7u8/Tj+cfQJJ0fIZ5oGwfcCwvoXkc+CdV9edJTgG+kuQfAf8M2F1V25JsBbYC7z6G75ckLYMFgyDJf1hkv6qq/7zYF1dVAUfeXXAKcBLwHWAj8LJu+Q4GL7U3CCRpTBbrGnpsnj+AzQz5H+7umsJe4BBwU1XdAUxX1f5ukwPA9LEULklaHou9oeyvX0+Z5FTgbQyuDXycIV9dWVXfAzYkeRbwuSQvn7O+ktR8+ybZAmwBWLt27TCHkyQdg0UvFncXdn8VuI1BaJxbVe+uqkNHc5Cq+jPgBmAGOJhkTff9axi0FubbZ3tVzVTVzNSUd6tKUl8WDIIk/xW4GXgU+LGqurwbiXQoSaa6lgBJfhi4ENjL4OG0Td1mm4Bdx1i7JGkZLHbX0DsZ3PnzK8C/S3JkeRj06py2xHevAXYkeQqDwPlIVd2Y5FZgZ5LNwD4Gt6ZKksZksWsExzUgXVXdBpwzz/KHgAuO57slScvH0UclqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1Lgl31msfqzbesOi6+/fdtGIKpHUOlsEktQ4g0CSGmcQSFLjDAJJapxBIEm
NMwgkqXG9BUGS5yf5YpK7ktyZ5G3d8jOS3Jjk3u7z9L5qkCQtrc/nCJ4A3llVtyY5FbglyY3AvwZ2V9W2JFuBrcC7e6xj4vgMgqTl1FuLoKr2V9Wt3fSjwN3Ac4GNwI5usx3AxX3VIEla2kiuESRZB5wDfBWYrqr93aoDwPQoapAkza/3ISaSPAO4Fnh7VT2S5K/XVVUlqQX22wJsAVi7dm3fZa44S3X/SNJy6bVFkOQUBiFwdVV9slt8MMmabv0a4NB8+1bV9qqaqaqZqampPsuUpKb1eddQgCuAu6vq12atuh7Y1E1vAnb1VYMkaWl9dg29FHg9cHuSvd2y9wLbgJ1JNgP7gEt6rEGStITegqCqvgJkgdUX9HVcSdLR8X0EE+h4LjT7DILUHoeYkKTGGQSS1Di7hnRUFut2sltJOjHZIpCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqco49qxXBkU2k8bBFIUuN6C4IkVyY5lOSOWcvOSHJjknu7z9P7Or4kaTh9tgiuAl45Z9lWYHdVrQd2d/OSpDHqLQiq6kvAt+cs3gjs6KZ3ABf3dXxJ0nBGfY1guqr2d9MHgOmFNkyyJcmeJHsOHz48muokqUFju1hcVQXUIuu3V9VMVc1MTU2NsDJJasuog+BgkjUA3eehER9fkjTHqJ8juB7YBGzrPneN+PiaUIs9gwA+hyAtps/bRz8G/F/grCQPJNnMIAAuTHIv8IpuXpI0Rr21CKrqtQusuqCvY0qSjp5DTOgHLNXFImnyOMSEJDXOFoGWTZ8XbG2pSP2xRSBJjTMIJKlxdg2pecfT7eTzCZoEtggkqXEGgSQ1zq4hNWES7zry1Z5aLrYIJKlxBoEkNc6uIY3MJHbPSJPAFoEkNc4gkKTG2TUkHYfj7e46Ee/uWakvAVqpdZ0IbBFIUuNsEUgrlBfXNSq2CCSpcWMJgiSvTHJPkm8k2TqOGiRJAyPvGkpyEvBbwIXAA8DNSa6vqrtGXYs0bn11/xzvhdNx1bWUSbzguxIuco+jRXAe8I2quq+qvgt8HNg4hjokSYwnCJ4LfHPW/APdMknSGKSqRnvA5DXAK6vqjd3864F/UFVvmbPdFmBLN3sWcM8xHnI18K1j3LclnqeleY6G43la2qjO0QuqamqpjcZx++iDwPNnzT+vW/YDqmo7sP14D5ZkT1XNHO/3TDrP09I8R8PxPC1tpZ2jcXQN3QysT3JmkqcClwLXj6EOSRJjaBFU1RNJ3gJ8DjgJuLKq7hx1HZKkgbE8WVxVnwY+PaLDHXf3UiM8T0vzHA3H87S0FXWORn6xWJK0sjjEhCQ1bqKDwKEs5pfk/iS3J9mbZE+37IwkNya5t/s8fdx1jlqSK5McSnLHrGULnpck7+l+W/ck+afjqXq0FjhHlyd5sPs97U3yM7PWtXiOnp/ki0nuSnJnkrd1y1fsb2lig2DWUBY/DZwNvDbJ2eOtakV5eVVtmHUL21Zgd1WtB3Z38625CnjlnGXznpfut3Qp8KPdPr/d/eYm3VX8zXME8Ovd72lDdw2w5XP0BPDOqjob+Angzd25WLG/pYkNAhzK4mhtBHZ00zuAi8dYy1hU1ZeAb89ZvNB52Qh8vKoer6r/B3yDwW9uoi1wjhbS6jnaX1W3dtOPAnczGD1hxf6WJjkIHMpiYQV8Pskt3RPcANNVtb+bPgBMj6e0FWeh8+Lv6we9NcltXdfRkS6P5s9RknXAOcBXWcG/pUkOAi3s/KrawKDb7M1JfnL2yhrcSubtZHN4Xhb0AeBvAxuA/cD7xlvOypDkGcC1wNur6pHZ61bab2mSg2CooSxaVFUPdp+HgOsYNEMPJlkD0H0eGl+FK8pC58XfV6eqDlbV96rq+8Dv8GS3RrPnKMkpDELg6qr6ZLd4xf6WJjkIHMpiHklWJTn1yDTwU8AdDM7Npm6zTcCu8VS44ix0Xq4
HLk3yQ0nOBNYDXxtDfWN35D9unX/O4PcEjZ6jJAGuAO6uql+btWrF/pYm9p3FDmWxoGngusFvlZOBj1bVZ5PcDOxMshnYB1wyxhrHIsnHgJcBq5M8APxHYBvznJequjPJTuAuBneJvLmqvjeWwkdogXP0siQbGHR13A/8ArR7joCXAq8Hbk+yt1v2Xlbwb8kniyWpcZPcNSRJGoJBIEmNMwgkqXEGgSQ1ziCQpMYZBJpYSSrJ+2bNvyvJ5cvwvT+U5PPdSJs/d5zftW72SJ7SOBgEmmSPAz+bZPUyf+85AN1Im59Y5u+WRs4g0CR7gsErAX9p7oru/8S/0A2UtjvJ2nm2OSPJ73Xb/GGSFyV5DvAR4Me7FsEL5+xzU5Lf7NbdkeS8bvnlSd41a7s7ugHJAE5OcnWSu5Nck+Tp3TbbujHtb0vy35bpnEh/g0GgSfdbwOuSPHPO8v8O7KiqFwFXA++fZ9//BHy92+a9wIe68ZneCHy5axH86Tz7Pb0b1O9NwJVD1HgW8NtV9XeBR4A3JXk2g+EafrQ7/q8O8T3SMTEINNG6UR8/BPzinFUvAT7aTX8YOH+e3c/v1lFVXwCeneS0IQ77sW6fLwGnJXnWEtt/s6r+oJv+SHfch4G/BK5I8rPAXwxxXOmYGARqwW8Am4FVIzre3HFbikE31ex/35622PZV9QSDUTyvAV4FfHa5i5SOMAg08arq28BOBmFwxP9hMCItwOuAL8+z65e7dSR5GfCtuePKL+Dnun3OBx6uqocZDMZ2brf8XODMWduvTfKSbvpfAl/pxrJ/Zvfax18CXjzEcaVjMrGjj0pzvA94y6z5twIfTPLLwGHgDfPsczlwZZLbGHTNbJpnm/n8ZZKvA6cAP98tuxb4V0nuZPC2qj+Ztf09DF4QdCWDESg/ADwT2JXkaUCAdwx5bOmoOfqotIyS3AS8q6r2jLsWaVh2DUlS42wRSFLjbBFIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxv1/hELzORjO/2AAAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Histogram\n", "plt.hist(vals, bins=40)\n", "plt.ylabel('No of genes')\n", "plt.xlabel('No of pubs')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Save results\n", "import csv\n", "with open('gene-pubs.csv', 'w', newline='') as csvfile:\n", " w = csv.writer(csvfile, delimiter=',')\n", " for g,np in pairs:\n", " w.writerow([g,np])\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, 
"nbformat_minor": 2 }