{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!ls /cellar/users/btsui/Data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!mkdir /cellar/users/btsui/Data/SRA/microbes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "outMergedDir='/cellar/users/btsui/Data/SRA/microbes/'+'microbes'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import os\n",
    "import math\n",
    "from multiprocessing import Pool\n",
    "\n",
    "## init\n",
    "mySpecie='Homo_sapiens'\n",
    "\n",
    "##change this dir to point to the updated csv\n",
    "full_meta_dir=\"/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv\"\n",
    "inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'\n",
    "tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'\n",
    "\n",
    "## suffix of the per-reference stat files; the text before it is kept as the column label\n",
    "statSuffix='_per_fa_record_stat.txt.gz'\n",
    "\n",
    "## dtype=object keeps the .str accessor usable even when the directory is empty\n",
    "inAllFames=pd.Series(os.listdir(inSrrDir),dtype=object)\n",
    "\n",
    "#os.system('rm '+tmp_dir+'*')\n",
    "\n",
    "## regex=False: the dots in the file name are literal characters, not wildcards\n",
    "perFaStatS=inAllFames[inAllFames.str.contains('per_fa_record_stat.txt.gz',regex=False)].values\n",
    "\n",
    "TEST=False\n",
    "if TEST:\n",
    "    toRunSrrs=perFaStatS[:10]\n",
    "    chunkSize=5\n",
    "    nThread=1\n",
    "else:\n",
    "    toRunSrrs=perFaStatS\n",
    "    chunkSize=1000\n",
    "    nThread=64\n",
    "\n",
    "## get the microbes FA\n",
    "## dtype={0:str}: reference names must stay strings so they compare against ignoreList\n",
    "tmpDf=pd.read_csv(inSrrDir+'ERR1497972_per_fa_record_stat.txt.gz',sep='\\t',header=None,dtype={0:str})\n",
    "ignoreList=['1', '2', '3', '4', '5', '6', '7',\n",
    "            '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',\n",
    "            '19', '20', '21', '22', 'X', 'Y', 'MT', '*']\n",
    "\n",
    "microbes_fa=tmpDf[0][~tmpDf[0].isin(ignoreList)].values\n",
    "\n",
    "\n",
    "#inFname=toRunSrrs[0]\n",
    "def parseFile(inFname):\n",
    "    \"\"\"\n",
    "    Read one per-reference stat file and return its mapped-read counts.\n",
    "\n",
    "    input: file name (inside inSrrDir) of a TAB-delimited table with one row per\n",
    "        reference sequence: reference sequence name, sequence length,\n",
    "        # mapped reads and # unmapped reads.\n",
    "    output: pd.Series of the # mapped reads column, indexed by reference\n",
    "        sequence name, without the names listed in ignoreList. The Series is\n",
    "        named after inFname.\n",
    "    \"\"\"\n",
    "    statDf=pd.read_csv(inSrrDir+inFname,sep='\\t',header=None,dtype={0:str})\n",
    "    # assigning exactly four names also raises when the file does not have four columns\n",
    "    statDf.columns=['fa_name','seq_length','n_mapped_reads','n_unmapped_reads']\n",
    "    keepM=~statDf['fa_name'].isin(ignoreList)\n",
    "    mappedS=statDf[keepM].set_index('fa_name')['n_mapped_reads']\n",
    "    mappedS.name=inFname\n",
    "    return mappedS\n",
    "\n",
    "\n",
    "\n",
    "#tmpS=parseFile(inFname)\n",
    "\n",
    "\n",
    "def mergeSrrsL(i):\n",
    "    \"\"\"\n",
    "    Parse the chunk toRunSrrs[i:i+chunkSize] and pickle it as one table.\n",
    "\n",
    "    input: start offset of the chunk in toRunSrrs\n",
    "    side effect: writes tmp_dir+str(i)+'.pickle.gz' (rows: reference names,\n",
    "        sorted; columns: input file names)\n",
    "    output: list of the file names in the chunk that could not be parsed\n",
    "    \"\"\"\n",
    "    parsedL=[]\n",
    "    failedSrrsL=[]\n",
    "    for srr in toRunSrrs[i:(i+chunkSize)]:\n",
    "        try:\n",
    "            parsedL.append(parseFile(srr))\n",
    "        except Exception as err:\n",
    "            # best effort: one unreadable file must not abort the chunk, but say why it\n",
    "            # failed; catching Exception (not a bare except) still lets\n",
    "            # KeyboardInterrupt/SystemExit stop the run\n",
    "            print('failed: '+srr+' '+repr(err))\n",
    "            failedSrrsL.append(srr)\n",
    "    chunkDf=pd.DataFrame(parsedL).T.sort_index()\n",
    "    chunkPath=tmp_dir+str(i)+'.pickle.gz'\n",
    "    if TEST:\n",
    "        print(chunkPath)\n",
    "    chunkDf.to_pickle(chunkPath,compression='gzip')\n",
    "    return failedSrrsL\n",
    "\n",
    "Chunks=np.arange(0, len(toRunSrrs),chunkSize)\n",
    "if TEST:\n",
    "    # a list, not map(): under Python 3 map is lazy and would process no chunk at all\n",
    "    failed_srr_l=[mergeSrrsL(i) for i in Chunks.tolist()]\n",
    "else:\n",
    "    p=Pool(nThread)\n",
    "    try:\n",
    "        ### run every chunk; the failed inputs come back per chunk in failed_srr_l\n",
    "        failed_srr_l=p.map(mergeSrrsL,Chunks.tolist())\n",
    "    finally:\n",
    "        # always release the worker processes, even when a chunk raised\n",
    "        p.close()\n",
    "        p.join()\n",
    "\n",
    "\n",
    "#testDf3=pd.read_pickle('/nrnb/users/btsui/Data/all_seq/tmp/0.pickle.gz')\n",
    "\n",
    "## read back only the chunks written above, in chunk order: the rm of tmp_dir is\n",
    "## commented out, so pickles left there by an earlier run must not be merged in\n",
    "myL=[]\n",
    "for chunkStart in Chunks.tolist():\n",
    "    tmpDf10=pd.read_pickle(tmp_dir+str(chunkStart)+'.pickle.gz').astype(np.float32)\n",
    "    myL.append(tmpDf10)\n",
    "mergedDf=pd.concat(myL,axis=1)\n",
    "\n",
    "## literal suffix removal; does not depend on the regex default of pandas str.replace\n",
    "mergedDf.columns=[colName.replace(statSuffix,'') for colName in mergedDf.columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Chunks=np.arange(0, len(toRunSrrs),chunkSize)\n",
    "import sharedVariable as shv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "shv.exportDf(outMergedDir,mergedDf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"full_meta_df=pd.read_csv(full_meta_dir)\\n\\n#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\\n#existingMergedDf=pd.read_pickle(outMergedDir)\\n\\nmySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\\n\""
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"full_meta_df=pd.read_csv(full_meta_dir)\n",
    "\n",
    "#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n",
    "#existingMergedDf=pd.read_pickle(outMergedDir)\n",
    "\n",
    "mySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "#shv.loadDf(outMergedDir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### get the read counts for just the non chromosomal ones. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#!mv /cellar/users/btsui/0.pickle.gz /cellar/users/btsui/10000.pickle.gz ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tmpDf10=pd.read_pickle('0.pickle.gz')#.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## own name for the .fai file, so tmp_dir keeps pointing at the chunk directory\n",
    "viralFaiPath='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/../Microbiome/viral.1.1_2.1.genomic.fa.fai'\n",
    "tmpDf11=pd.read_csv(viralFaiPath,sep='\\t',header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tmpDf10[~tmpDf10.index.isin(tmpDf11[0])].sum(axis=0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}