{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from Bio import SeqIO\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Directory that receives the per-species outputs written by the cells below.\n",
    "baseOutDir='/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/'\n",
    "\n",
    "#myDict={'Homo_sapiens':'/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Build myDict: species name -> path of its cleaned FASTA file.\n",
    "fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\n",
    "# os.listdir order is arbitrary, so sort for a reproducible species order.\n",
    "# Only '.fa' entries are kept so that stray files or sub-directories are never\n",
    "# handed to the FASTA parser. dtype=object keeps .str usable on an empty listing.\n",
    "fnames=pd.Series(sorted(f for f in os.listdir(fa_dir) if f.endswith('.fa')),dtype=object)\n",
    "# A single-character pattern is split literally, so no regex escape is needed.\n",
    "myDict=pd.Series(data=(fa_dir+fnames).values,\n",
    "                 index=fnames.str.split('.').str[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./../Chip-seq/BuildEmptyPickles.ipynb:27: \"#myDict={'Homo_sapiens':'/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'}\"\n",
      "./../Chip-seq/BuildEmptyPickles.ipynb:38: \"fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\\n\",\n",
      "./../Chip-seq/BuildEmptyPickles.ipynb:52: \"!grep -rn clean ./.../*.ipynb \"\n",
      "./../Chip-seq/BuildGenomes.ipynb:41: \"clean_fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\\n\",\n",
      "./../Chip-seq/BuildGenomes.ipynb:42: \"myFaNames=pd.Series(os.listdir(clean_fa_dir))\\n\",\n",
      "./../Chip-seq/BuildGenomes.ipynb:44: \" data=(clean_fa_dir+myFaNames).values)\\n\",\n",
      "./../DownloadGenome/ExtractCleanFastas.ipynb:24: \"clean_fa_out_dir='/cellar/users/btsui/Data/ensembl/clean/'\"\n",
      "./../DownloadGenome/ExtractCleanFastas.ipynb:60: \" my_fa_out_dir=clean_fa_out_dir+my_specie+'.fa'\\n\",\n",
      "./../XGS_WGS/old_MaskingGenomeWithSnp.ipynb:302: \"#!ls /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa\"\n"
     ]
    }
   ],
   "source": [
    "!grep -rn clean ./../*/*.ipynb "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#from \n",
    "\n",
    "\n",
    "### spit out the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#binSize=20\n",
    "# Write one <species>.size.tsv (sequence id <TAB> sequence length) per genome.\n",
    "for mySpecie, faDir in myDict.items():\n",
    "    ###for each specie, generate the data\n",
    "    # chrSizes is deliberately NOT named myDict: rebinding that name would replace\n",
    "    # the species -> FASTA mapping that this loop and later cells iterate over.\n",
    "    chrSizes={}\n",
    "    with open(faDir, \"r\") as handle:\n",
    "        for record in SeqIO.parse(handle, \"fasta\"):\n",
    "            chrSizes[record.id]=len(record)\n",
    "    # header=False keeps the file header-less on every pandas version, matching\n",
    "    # the header=None reader further down.\n",
    "    pd.Series(chrSizes).to_csv(baseOutDir+mySpecie+'.size.tsv',sep='\\t',header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#myDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tmpDf=pd.read_csv('/cellar/users/btsui/Data/Project/Skymap/ChipSeq/EpigenomeRoadmap_inter/GSM493384_UW.CD34.rep1.ChromatinAccessibility.CD34+-DS12274.bed.gz',sep='\\t',header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.size.csv'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "baseOutDir+mySpecie+'.size.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Homo_sapiens,/cellar/users/btsui/Data/Project/KangZhang/refFa/hg19.fa\r\n"
     ]
    }
   ],
   "source": [
    "!cat /cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.size.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## scratch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Pickle, per species, an all-zero int16 Series indexed by (sequence id, bin start).\n",
    "binSize=20\n",
    "# Bases added past each sequence end before binning (value kept from the original code).\n",
    "binPadBp=100\n",
    "for mySpecie, faDir in myDict.items():\n",
    "    ###for each specie, generate the data\n",
    "    myChrNames=[]\n",
    "    myChrBins=[]\n",
    "    with open(faDir, \"r\") as handle:\n",
    "        for record in SeqIO.parse(handle, \"fasta\"):\n",
    "            nBp=len(record)\n",
    "            bins=np.arange(0,nBp+binPadBp,binSize)\n",
    "            myChrNames.append(record.id)\n",
    "            # Allocate the zero-filled int16 track directly instead of building a\n",
    "            # NaN Series and converting it afterwards (same values, far less memory).\n",
    "            myChrBins.append(pd.Series(0,index=bins,dtype=np.int16))\n",
    "    myMergedS=pd.concat(myChrBins,keys=myChrNames).sort_index()\n",
    "    outDir=baseOutDir+mySpecie+'.pickle'\n",
    "    myMergedS.to_pickle(outDir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.pickle'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load every per-species size table back into memory.\n",
    "base_dir='/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/'\n",
    "myL=[]\n",
    "# This directory also receives the .pickle outputs and holds a stale .size.csv,\n",
    "# neither of which is a tab-separated size table, so only *.size.tsv is read.\n",
    "# Sorted so that myL[i] always corresponds to fnames[i].\n",
    "fnames=sorted(f for f in os.listdir(base_dir) if f.endswith('.size.tsv'))\n",
    "for fname in fnames:\n",
    "    myL.append(pd.read_csv(base_dir+fname,sep='\\t',header=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#myL[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | 0 | \n", "1 | \n", "
---|---|---|
0 | \n", "1 | \n", "158337067.0 | \n", "
1 | \n", "10 | \n", "104305016.0 | \n", "
2 | \n", "11 | \n", "107310763.0 | \n", "
3 | \n", "12 | \n", "91163125.0 | \n", "
4 | \n", "13 | \n", "84240350.0 | \n", "
5 | \n", "14 | \n", "84648390.0 | \n", "
6 | \n", "15 | \n", "85296676.0 | \n", "
7 | \n", "16 | \n", "81724687.0 | \n", "
8 | \n", "17 | \n", "75158596.0 | \n", "
9 | \n", "18 | \n", "66004023.0 | \n", "
10 | \n", "19 | \n", "64057457.0 | \n", "
11 | \n", "2 | \n", "137060424.0 | \n", "
12 | \n", "20 | \n", "72042655.0 | \n", "
13 | \n", "21 | \n", "71599096.0 | \n", "
14 | \n", "22 | \n", "61435874.0 | \n", "
15 | \n", "23 | \n", "52530062.0 | \n", "
16 | \n", "24 | \n", "62714930.0 | \n", "
17 | \n", "25 | \n", "42904170.0 | \n", "
18 | \n", "26 | \n", "51681464.0 | \n", "
19 | \n", "27 | \n", "45407902.0 | \n", "
20 | \n", "28 | \n", "46312546.0 | \n", "
21 | \n", "29 | \n", "51505224.0 | \n", "
22 | \n", "3 | \n", "121430405.0 | \n", "
23 | \n", "4 | \n", "120829699.0 | \n", "
24 | \n", "5 | \n", "121191424.0 | \n", "
25 | \n", "6 | \n", "119458736.0 | \n", "
26 | \n", "7 | \n", "112638659.0 | \n", "
27 | \n", "8 | \n", "113384836.0 | \n", "
28 | \n", "9 | \n", "105708250.0 | \n", "
29 | \n", "MT | \n", "16338.0 | \n", "
30 | \n", "X | \n", "148823899.0 | \n", "