{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!ls /cellar/users/btsui/Data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!mkdir /cellar/users/btsui/Data/SRA/microbes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Output path prefix handed to shv.exportDf further down when the merged table is saved.\n",
    "outMergedDir='/cellar/users/btsui/Data/SRA/microbes/'+'microbes'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import os\n",
    "import math\n",
    "from multiprocessing import Pool\n",
    "\n",
    "## init\n",
    "mySpecie='Homo_sapiens'\n",
    "\n",
    "## input locations -- change full_meta_dir to point to the updated csv\n",
    "full_meta_dir=\"/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv\"\n",
    "inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'\n",
    "tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'\n",
    "\n",
    "#os.system('rm '+tmp_dir+'*')\n",
    "\n",
    "## every file name in inSrrDir, narrowed to the per-reference stat files\n",
    "inAllFames=pd.Series(os.listdir(inSrrDir))\n",
    "perFaStatS=inAllFames[inAllFames.str.contains('per_fa_record_stat.txt.gz')].values\n",
    "\n",
    "## TEST=True: first 10 files only, chunks of 5, nThread=1\n",
    "TEST=False\n",
    "toRunSrrs=perFaStatS[:10] if TEST else perFaStatS\n",
    "chunkSize,nThread=(5,1) if TEST else (1000,64)\n",
    "\n",
    "## get the microbes FA\n",
    "## ignoreList holds the names '1'-'22', 'X', 'Y', 'MT' and '*'\n",
    "## (apparently the host chromosome records -- confirm)\n",
    "ignoreList=list(map(str,range(1,23)))+['X','Y','MT','*']\n",
    "\n",
    "## reference names of one stat file, minus everything in ignoreList\n",
    "tmpDf=pd.read_csv(inSrrDir+'ERR1497972_per_fa_record_stat.txt.gz',sep='\\t',header=None)\n",
    "microbes_fa=tmpDf.loc[~tmpDf[0].isin(ignoreList),0].values\n",
    "\n",
    "\n",
    "#inFname=toRunSrrs[0]\n",
    "def parseFile(inFname):\n",
    "    \"\"\"\n",
    "    Read one per-reference stat file and return its mapped-read counts.\n",
    "\n",
    "    input: inFname, a file name inside inSrrDir\n",
    "    output: Series named inFname, indexed by reference name, holding the\n",
    "    n_mapped_reads column; references listed in ignoreList are dropped\n",
    "    \"\"\"\n",
    "    # The file is TAB-delimited, one line per reference: sequence name,\n",
    "    # sequence length, # mapped reads, # unmapped reads.\n",
    "    statDf=pd.read_csv(inSrrDir+inFname,sep='\\t',header=None)\n",
    "    # only the name and mapped-read columns are used; the other two stay unnamed\n",
    "    statDf.columns=['fa_name','','n_mapped_reads','']\n",
    "    keepRows=~statDf['fa_name'].isin(ignoreList)\n",
    "    countS=statDf.loc[keepRows].set_index('fa_name')['n_mapped_reads']\n",
    "    # the file name becomes the column label once the Series are merged\n",
    "    countS.name=inFname\n",
    "    return countS\n",
    "\n",
    "\n",
    "\n",
    "#tmpS=parseFile(inFname)\n",
    "\n",
    "\n",
    "def mergeSrrsL(i):\n",
    "    \"\"\"\n",
    "    Merge one chunk of stat files into a single table and pickle it.\n",
    "\n",
    "    input: i, start offset into toRunSrrs; the chunk is toRunSrrs[i:i+chunkSize]\n",
    "    output: list of the file names in the chunk that could not be parsed\n",
    "    side effect: writes tmp_dir+str(i)+'.pickle.gz' (gzip-compressed pickle) with\n",
    "    one column per parsed file and the rows sorted by index\n",
    "    \"\"\"\n",
    "    tmpL=[]\n",
    "    failedSrrsL=[]\n",
    "    for srr in toRunSrrs[i:(i+chunkSize)]:\n",
    "        try:\n",
    "            tmpL.append(parseFile(srr))\n",
    "        except Exception:\n",
    "            # Exception rather than a bare except, so KeyboardInterrupt and\n",
    "            # SystemExit still propagate; an unreadable file is recorded, not fatal.\n",
    "            # print() in call form parses on both Python 2 and Python 3.\n",
    "            print('failed: '+srr)\n",
    "            failedSrrsL.append(srr)\n",
    "    # each parsed Series is a row here; transpose so every file becomes a column\n",
    "    tmpMergedDf=pd.DataFrame(tmpL).T\n",
    "    reorderedDf=tmpMergedDf.sort_index()\n",
    "    outPath=tmp_dir+str(i)+'.pickle.gz'\n",
    "    if TEST:\n",
    "        print(outPath)\n",
    "    reorderedDf.to_pickle(outPath,compression='gzip')\n",
    "    return failedSrrsL\n",
    "\n",
    "# start offset of every chunk of toRunSrrs\n",
    "Chunks=np.arange(0, len(toRunSrrs),chunkSize)\n",
    "if TEST:\n",
    "    # list(...) forces evaluation: on Python 3 a bare map() is lazy and would run nothing\n",
    "    failed_srr_l=list(map(mergeSrrsL,Chunks.tolist()))\n",
    "else:\n",
    "    # Pool is already imported at the top of this cell\n",
    "    p=Pool(nThread)\n",
    "    try:\n",
    "        ### sweep for uncompleted chunks\n",
    "        failed_srr_l=p.map(mergeSrrsL,Chunks.tolist())\n",
    "    finally:\n",
    "        # always release the worker processes, even when a chunk raises\n",
    "        p.close()\n",
    "        p.join()\n",
    "\n",
    "\n",
    "#testDf3=pd.read_pickle('/nrnb/users/btsui/Data/all_seq/tmp/0.pickle.gz')\n",
    "\n",
    "# Load every chunk pickle found in tmp_dir and join them column-wise.\n",
    "# NOTE(review): tmp_dir is not emptied beforehand, so pickles left by an earlier run\n",
    "# are merged as well -- confirm that is intended.\n",
    "# NOTE: read_pickle can execute arbitrary code; only point tmp_dir at trusted files.\n",
    "myL=[]\n",
    "for fname in os.listdir(tmp_dir):\n",
    "    # float32 keeps the merged table at half the size of float64\n",
    "    tmpDf10=pd.read_pickle(tmp_dir+fname).astype(np.float32)\n",
    "    myL.append(tmpDf10)\n",
    "mergedDf=pd.concat(myL,axis=1)\n",
    "\n",
    "# Strip the stat-file suffix so each column carries the bare run name.\n",
    "# Plain-string replace: no backslash escapes to encode in the notebook JSON and\n",
    "# no dependence on whether pandas treats the pattern as a regex.\n",
    "mergedDf.columns=[c.replace('_per_fa_record_stat.txt.gz','') for c in mergedDf.columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Chunks=np.arange(0, len(toRunSrrs),chunkSize)\n",
    "import sharedVariable as shv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hand the merged table to the project-local export helper.\n",
    "# NOTE(review): the on-disk format is decided inside sharedVariable -- confirm there.\n",
    "shv.exportDf(outMergedDir,mergedDf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"full_meta_df=pd.read_csv(full_meta_dir)\\n\\n#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\\n#existingMergedDf=pd.read_pickle(outMergedDir)\\n\\nmySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\\n\""
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"full_meta_df=pd.read_csv(full_meta_dir)\n",
    "\n",
    "#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n",
    "#existingMergedDf=pd.read_pickle(outMergedDir)\n",
    "\n",
    "mySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "#shv.loadDf(outMergedDir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### get the read counts for just the non chromosomal ones. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#!mv /cellar/users/btsui/0.pickle.gz /cellar/users/btsui/10000.pickle.gz ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load '0.pickle.gz' from the current working directory for ad-hoc inspection\n",
    "# (presumably a chunk written by mergeSrrsL and moved here -- confirm).\n",
    "tmpDf10=pd.read_pickle('0.pickle.gz')#.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Tab-separated .fai index of the viral genome set, read without a header row.\n",
    "# A dedicated name is used instead of tmp_dir so the chunk-pickle directory setting\n",
    "# is not clobbered if the merge cell is re-run afterwards.\n",
    "viral_fai_path='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/../Microbiome/viral.1.1_2.1.genomic.fa.fai'\n",
    "tmpDf11=pd.read_csv(viral_fai_path,sep='\\t',header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tmpDf10[~tmpDf10.index.isin(tmpDf11[0])].sum(axis=0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}