{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "from Bio import SeqIO\n", "import numpy as np\n", "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "baseOutDir='/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/'\n", "\n", "#myDict={'Homo_sapiens':'/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'}" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Map each species name (file name up to the first '.') to the path of its FASTA.\n", "# Only '*.fa' entries are kept so that any other file in the directory cannot\n", "# collide with a species key; sorting makes the iteration order reproducible.\n", "fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\n", "fnames=pd.Series(sorted(f for f in os.listdir(fa_dir) if f.endswith('.fa')))\n", "myDict=pd.Series(data=(fa_dir+fnames).values,\n", "                 index=fnames.str.split('\\.').str[0])" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./../Chip-seq/BuildEmptyPickles.ipynb:27: \"#myDict={'Homo_sapiens':'/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'}\"\n", "./../Chip-seq/BuildEmptyPickles.ipynb:38: \"fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\\n\",\n", "./../Chip-seq/BuildEmptyPickles.ipynb:52: \"!grep -rn clean ./.../*.ipynb \"\n", "./../Chip-seq/BuildGenomes.ipynb:41: \"clean_fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\\n\",\n", "./../Chip-seq/BuildGenomes.ipynb:42: \"myFaNames=pd.Series(os.listdir(clean_fa_dir))\\n\",\n", "./../Chip-seq/BuildGenomes.ipynb:44: \" data=(clean_fa_dir+myFaNames).values)\\n\",\n", "./../DownloadGenome/ExtractCleanFastas.ipynb:24: \"clean_fa_out_dir='/cellar/users/btsui/Data/ensembl/clean/'\"\n", "./../DownloadGenome/ExtractCleanFastas.ipynb:60: \" my_fa_out_dir=clean_fa_out_dir+my_specie+'.fa'\\n\",\n", "./../XGS_WGS/old_MaskingGenomeWithSnp.ipynb:302: \"#!ls /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa\"\n" ] } ], "source": [ "!grep -rn clean ./../*/*.ipynb " ] },
{ "cell_type": "code",
"execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#from \n", "\n", "\n", "### spit out the data" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Write one '<species>.size.tsv' per FASTA: sequence id -> sequence length.\n", "for mySpecie, faDir in myDict.iteritems():\n", "    # Collect the sizes in their own dict: rebinding the name myDict here would\n", "    # replace the species -> FASTA path table that this loop and later cells read.\n", "    chrSizes={}\n", "    with open(faDir, \"rU\") as handle:\n", "        for record in SeqIO.parse(handle, \"fasta\"):\n", "            chrSizes[record.id]=len(record)\n", "    pd.Series(chrSizes).to_csv(baseOutDir+mySpecie+'.size.tsv',sep='\\t')" ] },
{ "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#myDict" ] },
{ "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tmpDf=pd.read_csv('/cellar/users/btsui/Data/Project/Skymap/ChipSeq/EpigenomeRoadmap_inter/GSM493384_UW.CD34.rep1.ChromatinAccessibility.CD34+-DS12274.bed.gz',sep='\\t',header=None)" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.size.csv'" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "baseOutDir+mySpecie+'.size.csv'" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Homo_sapiens,/cellar/users/btsui/Data/Project/KangZhang/refFa/hg19.fa\r\n" ] } ], "source": [ "!cat /cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.size.csv" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## scratch" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "binSize=20\n", "for mySpecie, faDir in myDict.iteritems():\n", "    # For each species, build an all-zero int16 Series indexed by\n", "    # (sequence id, bin start) and pickle it as '<species>.pickle'.\n", "    myChrNames=[]\n", "    myChrBins=[]\n", "    with open(faDir, \"rU\") as handle:\n", "        for record in SeqIO.parse(handle, \"fasta\"):\n", "            nBp=len(record)\n", "            # Bin starts every binSize bp from 0 up to (not including) nBp+100.\n", "            bins=np.arange(0,nBp+100,binSize)\n", "            myChrNames.append(record.id)\n", "            # Allocate the zeros as int16 directly instead of NaN floats that are\n", "            # filled and cast after the concat; the resulting Series is the same.\n", "            myChrBins.append(pd.Series(0,index=bins,dtype=np.int16))\n", "    myMergedS=pd.concat(myChrBins,keys=myChrNames).sort_index()\n", "    outDir=baseOutDir+mySpecie+'.pickle'\n", "    myMergedS.to_pickle(outDir)" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.pickle'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [], "source": [ "base_dir='/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/'\n", "# Other cells in this notebook write '.pickle' files into this directory and\n", "# cat a '.size.csv' from it, so read back only the '.size.tsv' tables.\n", "# fnames is filtered the same way so it stays aligned with myL.\n", "fnames=sorted(f for f in os.listdir(base_dir) if f.endswith('.size.tsv'))\n", "myL=[pd.read_csv(base_dir+fname,sep='\\t',header=None) for fname in fnames]" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#myL[1]" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
01158337067.0
110104305016.0
211107310763.0
31291163125.0
41384240350.0
51484648390.0
61585296676.0
71681724687.0
81775158596.0
91866004023.0
101964057457.0
112137060424.0
122072042655.0
132171599096.0
142261435874.0
152352530062.0
162462714930.0
172542904170.0
182651681464.0
192745407902.0
202846312546.0
212951505224.0
223121430405.0
234120829699.0
245121191424.0
256119458736.0
267112638659.0
278113384836.0
289105708250.0
29MT16338.0
30X148823899.0
\n", "
" ], "text/plain": [ " 0 1\n", "0 1 158337067.0\n", "1 10 104305016.0\n", "2 11 107310763.0\n", "3 12 91163125.0\n", "4 13 84240350.0\n", "5 14 84648390.0\n", "6 15 85296676.0\n", "7 16 81724687.0\n", "8 17 75158596.0\n", "9 18 66004023.0\n", "10 19 64057457.0\n", "11 2 137060424.0\n", "12 20 72042655.0\n", "13 21 71599096.0\n", "14 22 61435874.0\n", "15 23 52530062.0\n", "16 24 62714930.0\n", "17 25 42904170.0\n", "18 26 51681464.0\n", "19 27 45407902.0\n", "20 28 46312546.0\n", "21 29 51505224.0\n", "22 3 121430405.0\n", "23 4 120829699.0\n", "24 5 121191424.0\n", "25 6 119458736.0\n", "26 7 112638659.0\n", "27 8 113384836.0\n", "28 9 105708250.0\n", "29 MT 16338.0\n", "30 X 148823899.0" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ],
"source": [ "# Stack the tables in myL under their file names (fnames) and display the\n", "# rows that came from 'Bos_taurus.size.tsv'.\n", "(pd.concat(myL, keys=fnames)\n", "   .loc['Bos_taurus.size.tsv'])\n", "# disabled follow-up kept from the original cell: .groupby(level=0).size().sort_values()" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#!rm /cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.csv" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 0 }