{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "The code in here is somewhat involved mostly because it needs to parallelize the data reading process, otherwise it could be a lot simpler. \n", "\n", "parseSrr is probably what u need to read the most" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re\n", "import os\n", "import math\n", "from multiprocessing import Pool\n", "\n", "## init\n", "mySpecie='Homo_sapiens'\n", "outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'\n", "\n", "##change this dir to point to the updated csv\n", "full_meta_dir=\"/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv\"\n", "inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'\n", "tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#!df -h /data/cellardata/" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", "Wall time: 11.7 µs\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (5,6,25,26) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n", "/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "%%time\n", "#%matplotlib inline\n", "##machine: 5-1\n", "\n", "\n", "\n", "full_meta_df=pd.read_csv(full_meta_dir)\n", "\n", "#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n", "#existingMergedDf=pd.read_pickle(outMergedDir)\n", "\n", "mySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\n", "\n", "#find the chromosomes \n", "tmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/'+mySpecie+'.bed',header=None,sep='\\t')\n", "unique_chroms=tmpBedDf[0].astype(np.str).unique()\n", "\n", "### start merging one by one \n", "if os.path.exists(tmp_dir):\n", " os.system('rm -r '+tmp_dir)\n", "os.system('mkdir -p '+tmp_dir)\n", "\n", "\n", "os.chdir(tmp_dir)\n", "#identify non empty files\n", "os.system('ls -la '+inSrrDir+' > ls_out.txt ')\n", "ls_df=pd.read_csv('ls_out.txt',sep='\\s+',header=None,names=np.arange(9)).iloc[1:]\n", "#ls_df=\n", "size_S=ls_df[4]\n", "m4=size_S.astype(np.int)>1000\n", "m5=ls_df[8].str.contains('.txt.snp.gz$')\n", "non_empty_files=ls_df[m4&m5][8].str.split('/').str[-1].str.split('.').str[0].values\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# of files to merge: 253005\n" ] } ], "source": [ "print ('# of files to merge:',len(non_empty_files))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.86 s, sys: 540 ms, total: 3.4 s\n", "Wall time: 3.93 s\n" ] } ], "source": [ "%%time\n", "\"\"\"\n", "given: srr id \n", "return: the merged file\n", "\"\"\"\n", "def parseSrr(inSrr):\n", " #print inSrr\n", " 
{ "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.86 s, sys: 540 ms, total: 3.4 s\n", "Wall time: 3.93 s\n" ] } ], "source": [ "%%time\n", "def parseSrr(inSrr):\n", "    \"\"\"\n", "    given: an SRA run accession\n", "    return: a tidy DataFrame of per-base ReadDepth and AverageBaseQuality\n", "    \"\"\"\n", "    #print inSrr\n", "    fname=inSrrDir+inSrr+'.txt.snp.gz'\n", "    tmpDf_all=pd.read_csv(fname,sep='\\s+',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)\n", "    myCols=['Chr','Pos','Ref','rd_all','','A','C','G','T','N']\n", "    tmpDf=tmpDf_all.iloc[:,:len(myCols)]\n", "    tmpDf.columns=myCols\n", "    tmpDf2=tmpDf.set_index(['Chr','Pos'])\n", "    myBases=['A','C','G','T']\n", "    myL=[]\n", "    for base in myBases:\n", "        splitL=tmpDf2[base].str.split(':',expand=True)\n", "        ### extract the read count and base quality\n", "        tmpDf5=splitL[[1,3]].astype(np.float)\n", "        tmpDf5.columns=['ReadDepth','AverageBaseQuality']\n", "        myL.append(tmpDf5)\n", "    tmpDf6=pd.concat(myL,keys=myBases,axis=0,names=['base'])\n", "    tmpDf6.columns.name='features'\n", "    mergedDf=tmpDf6.astype(np.uint16)\n", "    non_zero_df=mergedDf[mergedDf['ReadDepth']>0]\n", "    tmpDf7=non_zero_df.reset_index()\n", "    Run_digits=re.search('[DES]RR(\\\\d+)', inSrr)\n", "    Run_Db=re.search('([DES]RR)\\\\d+', inSrr)\n", "    tmpDf7['Run_digits']=Run_digits.group(1)\n", "    tmpDf7['Run_db']=Run_Db.group(1)\n", "    ### convert the datatypes\n", "    tmpDf7['Pos']=tmpDf7['Pos'].astype(np.uint32)  \n", "    tmpDf7['Run_digits']=tmpDf7['Run_digits'].astype(np.uint64)\n", "    tmpDf7['Chr']=tmpDf7['Chr'].astype(np.str).astype('category',\n", "                 categories=unique_chroms,ordered=True)\n", "    tmpDf7['Run_db']=tmpDf7['Run_db'].astype(np.str).astype('category',\n", "                 categories=['DRR','ERR','SRR'],ordered=True)\n", "    tmpDf7['base']=tmpDf7['base'].astype('category',\n", "                 categories=myBases,ordered=True)\n", "    srr_pickle_df=tmpDf7.set_index(['Run_db','Run_digits',u'Chr', u'Pos',u'base']).sort_index()\n", "    return srr_pickle_df\n", "\n", "### identify files to be merged\n", "fnames=pd.Series(os.listdir(inSrrDir))\n", "snpFnames=fnames[fnames.str.contains('.snp.gz$')]\n", "srrsWithData=snpFnames.str.split('.').str[0]\n", "#mergedSrrs=existingMergedDf.index.get_level_values('Run')\n", "#m1=~srrsWithData.isin(mergedSrrs)\n", "m2=srrsWithData.isin(mySpecieDf['Run'])\n", "m3=srrsWithData.isin(non_empty_files)\n", "toMergeSrrs=srrsWithData[m2&m3].values\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "TEST=True\n", "if TEST:\n", "    toRunSrrs=toMergeSrrs[:10]\n", "    chunkSize=5\n", "    nThread=1\n", "else:\n", "    toRunSrrs=toMergeSrrs\n", "    chunkSize=1000\n", "    nThread=64\n", "#optional: free up the memory\n", "if not TEST:\n", "    del mySpecieDf, full_meta_df\n", "\n", "def mergeSrrsL(i):\n", "    tmpL=[]\n", "    failedSrrsL=[]\n", "    for srr in toRunSrrs[i:(i+chunkSize)]:\n", "        try:\n", "            tmpL.append(parseSrr(srr))\n", "        except Exception:\n", "            print ('failed: '+srr)\n", "            failedSrrsL.append(srr)\n", "    tmpMergedDf=pd.concat(tmpL)\n", "    #tmpMergedDf=pd.concat([parseSrr(srr) for srr in toRunSrrs[i:(i+chunkSize)]])\n", "    reorderedDf=tmpMergedDf.sort_index()\n", "    reorderedDf.to_pickle(tmp_dir+str(i)+'.pickle.gz',compression='gzip')\n", "    return failedSrrsL\n", "\n", "Chunks=np.arange(0, len(toRunSrrs),chunkSize)\n", "if TEST:\n", "    # list() forces the lazy Python 3 map to actually run\n", "    failed_srr_l=list(map(mergeSrrsL,Chunks.tolist()))\n", "else:\n", "    from multiprocessing import Pool\n", "    p=Pool(nThread)\n", "    ### sweep for uncompleted chunks\n", "    failed_srr_l=p.map(mergeSrrsL,Chunks.tolist())\n", "    p.close()\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#!ls /cellar/users/btsui/Data/dbsnp/" ] },
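{ "cell_type": "markdown", "metadata": {}, "source": [ "A toy sketch (run count and chunk size made up) of how `Chunks` above partitions the run list: each call `mergeSrrsL(i)` handles the slice `toRunSrrs[i:i+chunkSize]`, so the chunk starts are just an `np.arange` over the run count:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "# made-up sizes, mirroring Chunks=np.arange(0,len(toRunSrrs),chunkSize) above\n", "demo_chunkSize=5\n", "demo_nRuns=12\n", "demo_Chunks=np.arange(0,demo_nRuns,demo_chunkSize)\n", "print(demo_Chunks) # [ 0  5 10]\n", "for i in demo_Chunks:\n", "    print(list(range(demo_nRuns))[i:(i+demo_chunkSize)])" ] },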
"/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+mySpecie+'.bed'\n", "\n", "\n", "### find the data that overlap exactly\n", "tmpDf_ref_all_snps_df=pd.read_csv(snpBed,\n", " sep='\\s+',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)\n", "myCols=['Chr','Pos','Ref','rd_all','','A','C','G','T','N']\n", "tmpDf=tmpDf_ref_all_snps_df.iloc[:,:len(myCols)]\n", "tmpDf.columns=myCols\n", "myReindexedDf=tmpDf.set_index(['Chr','Pos'])\n", "refIndex=myReindexedDf.index" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### merging the files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "changes: need to do the split for different bins in here\n", "\n", "Map the regions into the right bin\n", "\"\"\"\n", "#myInDir=''\n", "#feature=''\n", "from multiprocessing import Pool\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'\n", "TEST=False\n", "\n", "\"\"\"\n", "change directory and identify intermediate pickle objects\n", "\"\"\"\n", "os.chdir(tmp_dir)\n", "my_ls_S=pd.Series(os.listdir('./'))\n", "ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values\n", "\n", "### create another dirctory for \n", "tmp_dir_2='/nrnb/users/btsui/Data/all_seq/tmp_2/'\n", "os.system('rm -r '+tmp_dir_2)\n", "os.system('mkdir '+tmp_dir_2)\n", "\n", "\"\"\"\n", "input: pickle name\n", "output: split into chunks \n", "\"\"\"\n", "feature_for_hash='Pos'\n", "#2483 blocks per file \n", "def splitChunk(inChunkFname):\n", " tmpDf3=pd.read_pickle(inChunkFname).reset_index()\n", " my_base_shift=int(10**5)#int(10**(max_size_order-n))\n", " my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift\n", " ##add one for the loop at the end\n", " tmpDf3['block']=my_block_S #6.5s\n", " tmpDf3=tmpDf3.sort_values(feature_for_hash) # might improve perfromance of group by\n", " \"\"\"\n", " export chunks\n", " \"\"\"\n", " g=tmpDf3.groupby('block')\n", " for myBlock,tmpDf9 in g:\n", " myKeyName='Chunk_'+str(inChunkFname.split('.')[0])+'.'+feature_for_hash+'_block_'+str(myBlock)\n", " #print (tmp_dir_2+myKeyName)\n", " tmpDf9.to_pickle(tmp_dir_2+myKeyName)\n", " \n", "#splitChunk('60700.pickle.gz')\n", "\n", "\"\"\"\n", "test on a small number of files first \n", "\"\"\"\n", "p=Pool(64)\n", "if TEST:\n", " p.map(splitChunk,ChunksFnames[:10])\n", "else:\n", " p.map(splitChunk,ChunksFnames)\n", "p.close()\n", "\n", "\n", "\"\"\"\n", "\n", "\"\"\"\n", "tmp_dir_3='/nrnb/users/btsui/Data/all_seq/tmp_3/'\n", "os.system('rm -r '+tmp_dir_3)\n", "os.system('mkdir '+tmp_dir_3)\n", "my_chunked_pickles=pd.Series(os.listdir(tmp_dir_2))\n", "tmpDf6=pd.DataFrame({\"chunk\":my_chunked_pickles.str.split('.').str[1],\"fname\":my_chunked_pickles})\n", "g=tmpDf6.groupby('chunk')['fname']\n", "myChunks=g.groups.keys()\n", "\n", "def mergeChunks(inputChunk):\n", " myFiles=g.get_group(inputChunk)\n", " myL=[]\n", " for myFile in myFiles:\n", " myL.append(pd.read_pickle(tmp_dir_2+myFile))\n", " myMergedDf=pd.concat(myL,axis=0)\n", " myMergedDf.set_index(['Chr','base','Run_db']).to_hdf(tmp_dir_3+inputChunk,'chunk',mode='w',format='fixed')\n", "\n", "p=Pool(64)\n", "p.map(mergeChunks,myChunks)\n", 
"p.close()\n", "\"\"\"\n", "for each chunks\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "for each chunk, merge into a hdf5 file \n", "merge all the tiny chunks into \n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myDfDf=pd.read_hdf('Pos_block_140700000',mode='r')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#subDf=myDfDf.loc['7']\n", "#subDf[subDf['Pos']==140753336].reset_index()#.index.get_level_values('base')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['tmp.pickle.gz', '0.pickle.gz', '1000.pickle.gz', 'tmp2.pickle.gz'], dtype=object)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "changes: need to do the split for different bins in here\n", "\n", "Map the regions into the right bin\n", "\"\"\"\n", "#myInDir=''\n", "#feature=''\n", "import pandas as pd\n", "from multiprocessing import Pool\n", "import numpy as np\n", "import os\n", "\"\"\"\n", "change directory and identify intermediate pickle objects\n", "\"\"\"\n", "os.chdir(tmp_dir)\n", "my_ls_S=pd.Series(os.listdir('./'))\n", "ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values\n", "\n", "\n", "### create another dirctory for \n", "tmp_dir_2='/nrnb/users/btsui/Data/all_seq/tmp_2/'\n", "os.system('rm -r '+tmp_dir_2)\n", "os.system('mkdir '+tmp_dir_2)\n", "\n", "\"\"\"\n", "input: pickle name\n", "output: split into chunks \n", "\"\"\"\n", "feature_for_hash='Pos'\n", "inChunkFname=ChunksFnames[0]\n", "tmpDf3=pd.read_pickle(inChunkFname).reset_index()\n", "my_max=tmpDf3[feature_for_hash].max()\n", "\n", "print (my_max)# 6121752\n", "#max_size_order=math.ceil(np.log10(my_max))\n", "#print ('order of max',max_size_order)\n", "#n=4\n", "my_base_shift=int(10**5)#int(10**(max_size_order-n))\n", "my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift\n", "\n", "nBlocks=my_block_S.max()\n", "print ('# of blocks',)\n", "blocks_VC=my_block_S.value_counts()\n", "##add one for the loop at the end\n", "my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)\n", "print ('range size: ',len(my_range))\n", "outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')\n", "print ('output name:',outH5Name)\n", "os.system('rm '+outH5Name)\n", "%time tmpDf3['block']=my_block_S #6.5s\n", "%time tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins\n", "\n", "\"\"\"\n", "export chunks\n", "\"\"\"\n", "#myOrderDict={'Pos':[u'Chr', u'Pos',u'base','Run_digits','Run_db'],\n", "# 'Run_digits':['Run_digits','Run_db',u'Chr', u'Pos',u'base']}\n", "g=tmpDf3.groupby('block')\n", "for myBlock,tmpDf9 in g:\n", " tmpDf9=tmpDf3[tmpDf3['block']==myBlock]# For run, 27s, For Pos: 2 min :30 s\n", " my_index_order=myOrderDict[feature_for_hash]\n", " #tmpDf10=tmpDf9.set_index(my_index_order) \n", " myKeyName=feature_for_hash+'_'+str(myBlock)+'.block_'+str(myBlock)\n", " print (tmp_dir_2+myKeyName)\n", " tmpDf9.to_pickle(tmp_dir_2+myKeyName)\n", " #print (myKeyName,tmpDf9.shape)\n", " #myKeyName\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### merge the files\n", "\"\"\"\n", "my_max=tmpDf3[feature_for_hash].max()\n", "\n", "print (my_max)# 6121752\n", 
"max_size_order=math.ceil(np.log10(my_max))\n", "print ('order of max',max_size_order)\n", "n=4\n", "my_base_shift=int(10**(max_size_order-n))\n", "my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift\n", "##check the data type of my_block_S and Run_digits\n", "#my_block_S=my_block_S\n", "### make the data become int \n", "nBlocks=my_block_S.max()\n", "print ('# of blocks',)\n", "\n", "blocks_VC=my_block_S.value_counts()\n", "\n", "##add one for the loop at the end\n", "my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)\n", "print ('range size: ',len(my_range))\n", "outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')\n", "print ('output name:',outH5Name)\n", "os.system('rm '+outH5Name)\n", "%time tmpDf3['block']=my_block_S #6.5s\n", "%time tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins\n", "\n", "\"\"\"\n", "\"\"\"\n", "changes: need to do the split for different bins in here\n", "\n", "\n", "\"\"\"\n", "\"\"\"\n", "myTmpL=[]\n", "my_ls_S=pd.Series(os.listdir('./'))\n", "ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values\n", "for chunkFname in ChunksFnames:\n", " tmpDf=pd.read_pickle(chunkFname)\n", " tmpDf2=tmpDf.reset_index()\n", " #tmpDf3=tmpDf2.set_index(['Chr','Pos'])\n", " #m=tmpDf3.index.isin(refIndex)\n", " subTmpDf2=tmpDf2#[m]\n", " print '% sites in targets:', (m.mean())\n", " print subTmpDf2.shape\n", " myTmpL.append(subTmpDf2)\n", " \n", "mergedDf=pd.concat(myTmpL)\n", "\n", "#mergedS\n", "\n", "all_mergedDf=mergedDf.set_index(\n", " ['Run_db','Run_digits']).sort_index()\n", "#all_mergedDf.to_pickle(outMergedDir)\n", "\"\"\"\n", "all_mergedDf.to_hdf(outMergedDir,key='master',mode=\"w\")\n", "allFailedSrrs=pd.Series(reduce(lambda a,x:a+x,failed_srr_l,[]))\n", "allFailedSrrs.to_csv('/nrnb/users/btsui/Data/all_seq/'+mySpecie+'.failed.srrs.txt')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "###change corrdinate to chromsome base for slicing\n", "#mySpecie='Homo_sapiens'\n", "#skymap_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'\n", "sorted_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.chrom_pos_sorted.h5'\n", "tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')\n", "tmpDf4=tmpDf.reset_index()\n", "sortedDf=tmpDf4.set_index( [u'Chr', u'Pos',u'base']).sort_index()\n", "sortedDf.to_hdf(sorted_snp_dir,key='master',mode='w')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### scratch" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#sorted_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.chrom_pos_sorted.h5'\n", "\n", "\n", "\n", "tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')\n", "tmpDf3=tmpDf.reset_index()\n", "tmpDf3['base']=tmpDf3['base'].astype('category')\n", "\n", "tmpDf3['Chr']=tmpDf3['Chr'].astype('category')\n", "tmpDf3['Run_db']=tmpDf3['Run_db'].astype('category')\n", "\n", "tmpDf3['Pos']=tmpDf3['Pos'].astype(np.uint32)\n", "tmpDf3['Run_digits']=tmpDf3['Run_digits'].astype(np.uint32)\n", "\n", "#tmpDf4=tmpDf3.set_index(['Run_digits','Run_db',u'Chr', u'Pos',u'base'])\n", "#tmpDf5=tmpDf4.sort_index()\n", "outRunSortedOutDir=outMergedDir.replace('.h5','.pickle')\n", "os.system('rm '+outRunSortedOutDir)\n", "tmpDf3.to_pickle(outRunSortedOutDir)\n", "#tmpDf3.to_hdf(outRunSortedOutDir,key='master',mode='w')" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### test exporting to hdf5 data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'/cellar/users/btsui/'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re\n", "import os\n", "from multiprocessing import Pool\n", "\n", "## init\n", "mySpecie='Homo_sapiens'\n", "outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'\n", "\n", "\n", "%time tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')\n", "%time tmpDf3=tmpDf.reset_index()\n", "\"\"\"\n", "CPU times: user 8.21 s, sys: 19.9 s, total: 28.2 s\n", "Wall time: 1min 18s\n", "CPU times: user 1min 35s, sys: 57.8 s, total: 2min 33s\n", "Wall time: 2min 33s\n", "\n", "\"\"\"\n", "%time tmpDf5=tmpDf3.set_index(['Run_digits','Run_db'])\n", "\"\"\"\n", "#CPU times: user 2min 7s, sys: 1min 54s, total: 4min 1s\n", "Wall time: 4min 1s\n", "\"\"\"\n", "%time tmpDf5.to_hdf('./test_wo_chroms.hdf','master',mode='w',append=False,format='table')\n", "#Exception: cannot find the correct atom type -> [dtype->object,items->Index(['Chr', 'base'], dtype='object')] \n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "math.ceil()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#myCol=np.array(['Run_digits','Run_db',u'Chr', u'Pos',u'base'])\n", "### what's the datatype for those guys:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### let's just \n", "import pandas as pd\n", "import numpy as np\n", "import re\n", "import os\n", "import math\n", "\n", "from multiprocessing import Pool\n", "\n", "## init\n", "mySpecie='Homo_sapiens'\n", "outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'\n", "\n", "%time tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')\n", "%time tmpDf3=tmpDf.reset_index()\n", "\n", "#feature_for_hash='Run_digits'\n", "feature_for_hash='Pos'\n", "myOrderDict={'Pos':[u'Chr', u'Pos',u'base','Run_digits','Run_db'],\n", " 'Run_digits':['Run_digits','Run_db',u'Chr', u'Pos',u'base']}\n", "my_max=tmpDf3[feature_for_hash].max()\n", "\n", "print (my_max)# 6121752\n", "max_size_order=math.ceil(np.log10(my_max))\n", "print ('order of max',max_size_order)\n", "n=4\n", "my_base_shift=int(10**(max_size_order-n))\n", "my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift\n", "##check the data type of my_block_S and Run_digits\n", "#my_block_S=my_block_S\n", "### make the data become int \n", "nBlocks=my_block_S.max()\n", "print ('# of blocks',)\n", "\n", "blocks_VC=my_block_S.value_counts()\n", "\n", "##add one for the loop at the end\n", "my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)\n", "print ('range size: ',len(my_range))\n", "outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')\n", "print ('output name:',outH5Name)\n", "os.system('rm '+outH5Name)\n", "%time tmpDf3['block']=my_block_S #6.5s\n", "%time tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins\n", "#%time g=tmpDf3.groupby('block') #1min 35s\n", "#\n", "for myBlock in my_range:\n", "#for myBlock,tmpDf9 in g:\n", " \"\"\"\n", " 
in Pos, \n", " \"\"\"\n", " %time tmpDf9=tmpDf3[tmpDf3['block']==myBlock]# For run, 27s, For Pos: 2 min :30 s\n", " \n", " my_index_order=myOrderDict[feature_for_hash]\n", " \n", " #tmpDf10=tmpDf9.set_index(my_index_order) \n", " myKeyName=feature_for_hash+'_'+str(myBlock)\n", " print (myKeyName,tmpDf9.shape)\n", " \n", " tmpDf9.to_hdf(outH5Name,myKeyName,mode='a',format='fixed')\n", " #if myBlock >3000:\n", " # break" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0015764881876101904" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#('range size: ', 1201)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Acinetobacter_baumannii_all_merged_snp.pickle\r\n", "Acropora_millepora_all_merged_snp.pickle\r\n", "activated_sludge_metagenome_all_merged_snp.pickle\r\n", "Aedes_aegypti_all_merged_snp.pickle\r\n", "air_metagenome_all_merged_snp.pickle\r\n", "algae_metagenome_all_merged_snp.pickle\r\n", "anaerobic_digester_metagenome_all_merged_snp.pickle\r\n", "Anopheles_arabiensis_all_merged_snp.pickle\r\n", "Anopheles_gambiae_all_merged_snp.pickle\r\n", "Apis_mellifera_all_merged_snp.pickle\r\n", "aquatic_metagenome_all_merged_snp.pickle\r\n", "Arabidopsis_thaliana_all_merged_snp.pickle\r\n", "Bacteria_all_merged_snp.pickle\r\n", "biofilm_metagenome_all_merged_snp.pickle\r\n", "bioreactor_metagenome_all_merged_snp.pickle\r\n", "bird_metagenome_all_merged_snp.pickle\r\n", "Boechera_stricta_all_merged_snp.pickle\r\n", "Bordetella_pertussis_all_merged_snp.pickle\r\n", "Bos_taurus_all_merged_snp.pickle\r\n", "bovine_gut_metagenome_all_merged_snp.pickle\r\n", "bovine_metagenome_all_merged_snp.pickle\r\n", "Brachypodium_distachyon_all_merged_snp.pickle\r\n", "Brassica_napus_all_merged_snp.pickle\r\n", "Brassica_rapa_all_merged_snp.pickle\r\n", "Burkholderia_pseudomallei_all_merged_snp.pickle\r\n", "Caenorhabditis_elegans_all_merged_snp.pickle\r\n", "Calidris_pugnax_all_merged_snp.pickle\r\n", "Campylobacter_all_merged_snp.pickle\r\n", "Campylobacter_coli_all_merged_snp.pickle\r\n", "Campylobacter_jejuni_all_merged_snp.pickle\r\n", "Campylobacter_sp._all_merged_snp.pickle\r\n", "Candida_albicans_all_merged_snp.pickle\r\n", "Canis_lupus_familiaris_all_merged_snp.pickle\r\n", "Cannabis_sativa_all_merged_snp.pickle\r\n", "Capra_hircus_all_merged_snp.pickle\r\n", "Centaurea_solstitialis_all_merged_snp.pickle\r\n", "chicken_gut_metagenome_all_merged_snp.pickle\r\n", "Chlamydomonas_reinhardtii_all_merged_snp.pickle\r\n", "Chlorocebus_sabaeus_all_merged_snp.pickle\r\n", "Ciona_robusta_all_merged_snp.pickle\r\n", "Clostridioides_difficile_all_merged_snp.pickle\r\n", "[Clostridium]_difficile_all_merged_snp.pickle\r\n", "coral_metagenome_all_merged_snp.pickle\r\n", "Crassostrea_gigas_all_merged_snp.pickle\r\n", "Danio_rerio_all_merged_snp.pickle\r\n", "Drosophila_melanogaster_all_merged_snp.pickle\r\n", "endophyte_metagenome_all_merged_snp.pickle\r\n", "Enterobacter_cloacae_all_merged_snp.pickle\r\n", "Enterococcus_faecium_all_merged_snp.pickle\r\n", "environmental_samples_all_merged_snp.pickle\r\n", "Equus_caballus_all_merged_snp.pickle\r\n", "Erythranthe_guttata_all_merged_snp.pickle\r\n", "Escherichia_coli_all_merged_snp.pickle\r\n", "Escherichia_coli_str._K-12_substr._MG1655_all_merged_snp.pickle\r\n", "Fagopyrum_tataricum_all_merged_snp.pickle\r\n", "feces_metagenome_all_merged_snp.pickle\r\n", 
"fish_gut_metagenome_all_merged_snp.pickle\r\n", "fish_metagenome_all_merged_snp.pickle\r\n", "food_fermentation_metagenome_all_merged_snp.pickle\r\n", "food_metagenome_all_merged_snp.pickle\r\n", "freshwater_metagenome_all_merged_snp.pickle\r\n", "freshwater_sediment_metagenome_all_merged_snp.pickle\r\n", "Gallus_gallus_all_merged_snp.pickle\r\n", "Gasterosteus_aculeatus_all_merged_snp.pickle\r\n", "Glycine_max_all_merged_snp.pickle\r\n", "Gossypium_hirsutum_all_merged_snp.pickle\r\n", "gut_metagenome_all_merged_snp.pickle\r\n", "Haemonchus_contortus_all_merged_snp.pickle\r\n", "Helianthus_annuus_all_merged_snp.pickle\r\n", "Hepatitis_C_virus_all_merged_snp.pickle\r\n", "Homo_sapiens_all_merged_snp.chrom_pos_sorted.h5\r\n", "Homo_sapiens_all_merged_snp.h5\r\n", "Homo_sapiens_all_merged_snp.pickle\r\n", "Homo_sapiens_all_merged_snp.pickle.gz\r\n", "Homo_sapiens_all_merged_snp.run_sorted.h5\r\n", "Homo_sapiens_all_merged_snp.run_sorted.pickle\r\n", "Hordeum_vulgare_all_merged_snp.pickle\r\n", "Hordeum_vulgare_subsp._vulgare_all_merged_snp.pickle\r\n", "hot_springs_metagenome_all_merged_snp.pickle\r\n", "human_gut_metagenome_all_merged_snp.pickle\r\n", "Human_immunodeficiency_virus_1_all_merged_snp.pickle\r\n", "human_lung_metagenome_all_merged_snp.pickle\r\n", "human_metagenome_all_merged_snp.pickle\r\n", "human_nasopharyngeal_metagenome_all_merged_snp.pickle\r\n", "human_oral_metagenome_all_merged_snp.pickle\r\n", "human_skin_metagenome_all_merged_snp.pickle\r\n", "human_vaginal_metagenome_all_merged_snp.pickle\r\n", "hydrocarbon_metagenome_all_merged_snp.pickle\r\n", "hydrothermal_vent_metagenome_all_merged_snp.pickle\r\n", "indoor_metagenome_all_merged_snp.pickle\r\n", "Influenza_A_virus_all_merged_snp.pickle\r\n", "insect_gut_metagenome_all_merged_snp.pickle\r\n", "insect_metagenome_all_merged_snp.pickle\r\n", "Klebsiella_pneumoniae_all_merged_snp.pickle\r\n", "lake_water_metagenome_all_merged_snp.pickle\r\n", "Lates_calcarifer_all_merged_snp.pickle\r\n", "leaf_metagenome_all_merged_snp.pickle\r\n", "Legionella_pneumophila_all_merged_snp.pickle\r\n", "Listeria_monocytogenes_all_merged_snp.pickle\r\n", "Lolium_perenne_all_merged_snp.pickle\r\n", "lung_metagenome_all_merged_snp.pickle\r\n", "Macaca_fascicularis_all_merged_snp.pickle\r\n", "Macaca_mulatta_all_merged_snp.pickle\r\n", "Manihot_esculenta_all_merged_snp.pickle\r\n", "Mannheimia_haemolytica_all_merged_snp.pickle\r\n", "marine_metagenome_all_merged_snp.pickle\r\n", "marine_sediment_metagenome_all_merged_snp.pickle\r\n", "Medicago_truncatula_all_merged_snp.pickle\r\n", "Menidia_menidia_all_merged_snp.pickle\r\n", "metagenome_all_merged_snp.pickle\r\n", "metagenomes_all_merged_snp.pickle\r\n", "metagenome_sequence_all_merged_snp.pickle\r\n", "microbial_mat_metagenome_all_merged_snp.pickle\r\n", "milk_metagenome_all_merged_snp.pickle\r\n", "Miscanthus_sinensis_all_merged_snp.pickle\r\n", "mosquito_metagenome_all_merged_snp.pickle\r\n", "mouse_gut_metagenome_all_merged_snp.pickle\r\n", "mouse_metagenome_all_merged_snp.pickle\r\n", "Mus_musculus_domesticus_all_merged_snp.pickle\r\n", "Mycobacterium_bovis_all_merged_snp.pickle\r\n", "Mycobacterium_tuberculosis_all_merged_snp.pickle\r\n", "Mycobacterium_tuberculosis_complex_bacterium_all_merged_snp.pickle\r\n", "Neisseria_gonorrhoeae_all_merged_snp.pickle\r\n", "Neisseria_meningitidis_all_merged_snp.pickle\r\n", "Neurospora_crassa_all_merged_snp.pickle\r\n", "Nothobranchius_furzeri_all_merged_snp.pickle\r\n", "Oncorhynchus_mykiss_all_merged_snp.pickle\r\n", 
"Oncorhynchus_nerka_all_merged_snp.pickle\r\n", "Oncorhynchus_tshawytscha_all_merged_snp.pickle\r\n", "oral_metagenome_all_merged_snp.pickle\r\n", "Oryza_sativa_all_merged_snp.pickle\r\n", "Oryza_sativa_Indica_Group_all_merged_snp.pickle\r\n", "Oryza_sativa_Japonica_Group_all_merged_snp.pickle\r\n", "Ovis_aries_all_merged_snp.pickle\r\n", "Panicum_hallii_all_merged_snp.pickle\r\n", "Panicum_virgatum_all_merged_snp.pickle\r\n", "Pan_troglodytes_all_merged_snp.pickle\r\n", "phyllosphere_metagenome_all_merged_snp.pickle\r\n", "Picea_abies_all_merged_snp.pickle\r\n", "pig_gut_metagenome_all_merged_snp.pickle\r\n", "plant_metagenome_all_merged_snp.pickle\r\n", "Plasmodium_falciparum_all_merged_snp.pickle\r\n", "Plasmodium_vivax_all_merged_snp.pickle\r\n", "Populus_trichocarpa_all_merged_snp.pickle\r\n", "Pseudomonas_aeruginosa_all_merged_snp.pickle\r\n", "Pseudomonas_fluorescens_all_merged_snp.pickle\r\n", "Pseudotsuga_menziesii_all_merged_snp.pickle\r\n", "rat_gut_metagenome_all_merged_snp.pickle\r\n", "Rattus_norvegicus_all_merged_snp.pickle\r\n", "Rhinella_marina_all_merged_snp.pickle\r\n", "rhizosphere_metagenome_all_merged_snp.pickle\r\n", "root_associated_fungus_metagenome_all_merged_snp.pickle\r\n", "root_metagenome_all_merged_snp.pickle\r\n", "Saccharomyces_cerevisiae_all_merged_snp.pickle\r\n", "Saccharomyces_cerevisiae_S288C_all_merged_snp.pickle\r\n", "Salmonella_enterica_all_merged_snp.pickle\r\n", "Salmonella_enterica_subsp._enterica_all_merged_snp.pickle\r\n", "Salmonella_enterica_subsp._enterica_serovar_Enteritidis_all_merged_snp.pickle\r\n", "Salmonella_enterica_subsp._enterica_serovar_Typhi_all_merged_snp.pickle\r\n", "Salmonella_enterica_subsp._enterica_serovar_Typhimurium_all_merged_snp.pickle\r\n", "Salmo_salar_all_merged_snp.pickle\r\n", "Salvelinus_namaycush_all_merged_snp.pickle\r\n", "Schizosaccharomyces_pombe_all_merged_snp.pickle\r\n", "Schmidtea_mediterranea_all_merged_snp.pickle\r\n", "seawater_metagenome_all_merged_snp.pickle\r\n", "sediment_metagenome_all_merged_snp.pickle\r\n", "Sesamum_indicum_all_merged_snp.pickle\r\n", "Setaria_italica_all_merged_snp.pickle\r\n", "Shigella_flexneri_all_merged_snp.pickle\r\n", "Shigella_sonnei_all_merged_snp.pickle\r\n", "skin_metagenome_all_merged_snp.pickle\r\n", "sludge_metagenome_all_merged_snp.pickle\r\n", "soil_metagenome_all_merged_snp.pickle\r\n", "Solanum_lycopersicum_all_merged_snp.pickle\r\n", "Solanum_tuberosum_all_merged_snp.pickle\r\n", "Sorghum_bicolor_all_merged_snp.pickle\r\n", "sponge_metagenome_all_merged_snp.pickle\r\n", "Staphylococcus_aureus_all_merged_snp.pickle\r\n", "Streptococcus_agalactiae_all_merged_snp.pickle\r\n", "Streptococcus_pneumoniae_all_merged_snp.pickle\r\n", "Streptococcus_pyogenes_all_merged_snp.pickle\r\n", "Streptococcus_suis_all_merged_snp.pickle\r\n", "Sus_scrofa_all_merged_snp.pickle\r\n", "Syngnathus_scovelli_all_merged_snp.pickle\r\n", "synthetic_construct_all_merged_snp.pickle\r\n", "synthetic_metagenome_all_merged_snp.pickle\r\n", "terrestrial_metagenome_all_merged_snp.pickle\r\n", "Timema_cristinae_all_merged_snp.pickle\r\n", "Triticum_aestivum_all_merged_snp.pickle\r\n", "Triticum_turgidum_all_merged_snp.pickle\r\n", "unclassified_sequences_all_merged_snp.pickle\r\n", "uncultured_bacterium_all_merged_snp.pickle\r\n", "uncultured_fungus_all_merged_snp.pickle\r\n", "unidentified_all_merged_snp.pickle\r\n", "vaginal_metagenome_all_merged_snp.pickle\r\n", "Vibrio_cholerae_all_merged_snp.pickle\r\n", "viral_metagenome_all_merged_snp.pickle\r\n", 
"Vitis_vinifera_all_merged_snp.pickle\r\n", "wastewater_metagenome_all_merged_snp.pickle\r\n", "wetland_metagenome_all_merged_snp.pickle\r\n", "Zaire_ebolavirus_all_merged_snp.pickle\r\n", "Zea_mays_all_merged_snp.pickle\r\n", "Zea_mays_subsp._mays_all_merged_snp.pickle\r\n" ] } ], "source": [ "### make sure the results is chunked as expects\n", "#!ls /cellar/users/btsui/all_seq_snp/\n", "#2 seconds per data push " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#7 digits, \n", "# 18gb to 0.1\n", "#create an hdf5 object, \n", "tmpDf.to_hdf('./test.h5','master',mode='w',append=False,format='table')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%time tmpDf.to_hdf('./test.h5','master',mode='w',append=False,format='table')\n", "\n", "\n", "testDf=tmpDf.head()\n", "\n", "testDf.to_hdf('./test.h5','master',mode='w',format='table')\n", "#export succceedded, but the slicing operation time takes forever, the path only lead from /master/ to py table. \n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.34 s, sys: 11.4 s, total: 14.7 s\n", "Wall time: 14.9 s\n" ] } ], "source": [ "#v6\n", "#14.9s \n", "%time testDf20=pd.read_hdf('/cellar/users/btsui/test.pos.chunked.h5','Run_digits_790000',mode='r')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7.7110316966216308" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.log10(testDf20.shape[0])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(51408117, 7)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#dataset_name   = '/Configure:0000/Run:0000/CalibCycle:0000/Camera::FrameV1/XppSb4Pim.1:Tm6740.1/image'\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", " from ._conv import register_converters as _register_converters\n" ] } ], "source": [ "import h5py\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "hdf5_file_name='/cellar/users/btsui/test.h5'" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "f = h5py.File(hdf5_file_name, 'r') " ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "table=f['/master/table']\n", "#obj.iteritems()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "table[1000:2000]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype([('index', ')\n" ] } ], "source": [ "for i,myObj in enumerate(obj.iteritems()):\n", " print myObj\n", " if i>10:\n", " break\n", " " ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "ERROR: Internal Python error in the inspect module.\n", "Below is the traceback from this internal error.\n", "\n", "\n", "Unfortunately, your original traceback can not be constructed.\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.py\", line 1118, in get_records\n", " return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.py\", line 300, in wrapped\n", " return f(*args, **kwargs)\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.py\", line 345, in _fixed_getinnerframes\n", " records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/inspect.py\", line 1051, in getinnerframes\n", " framelist.append((tb.tb_frame,) + getframeinfo(tb, context))\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/inspect.py\", line 1011, in getframeinfo\n", " filename = getsourcefile(frame) or getfile(frame)\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/inspect.py\", line 450, in getsourcefile\n", " if os.path.exists(filename):\n", " File \"/cellar/users/btsui/anaconda2/lib/python2.7/genericpath.py\", line 26, in exists\n", " os.stat(path)\n", "KeyboardInterrupt\n" ] }, { "ename": "IndexError", "evalue": "string index out of range", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc\u001b[0m in \u001b[0;36mrun_code\u001b[1;34m(self, code_obj, result)\u001b[0m\n\u001b[0;32m 2900\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2901\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_in_exec\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2902\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshowtraceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2903\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2904\u001b[0m \u001b[0moutflag\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc\u001b[0m in \u001b[0;36mshowtraceback\u001b[1;34m(self, exc_tuple, filename, tb_offset, exception_only)\u001b[0m\n\u001b[0;32m 1828\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1829\u001b[0m stb = self.InteractiveTB.structured_traceback(etype,\n\u001b[1;32m-> 1830\u001b[1;33m value, tb, tb_offset=tb_offset)\n\u001b[0m\u001b[0;32m 1831\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1832\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_showtraceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0metype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.pyc\u001b[0m in \u001b[0;36mstructured_traceback\u001b[1;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[0;32m 1390\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtb\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1391\u001b[0m return FormattedTB.structured_traceback(\n\u001b[1;32m-> 1392\u001b[1;33m self, etype, value, tb, tb_offset, number_of_lines_of_context)\n\u001b[0m\u001b[0;32m 1393\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1394\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.pyc\u001b[0m in \u001b[0;36mstructured_traceback\u001b[1;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[0;32m 1298\u001b[0m \u001b[1;31m# Verbose modes need a full traceback\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1299\u001b[0m return VerboseTB.structured_traceback(\n\u001b[1;32m-> 1300\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0metype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb_offset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnumber_of_lines_of_context\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1301\u001b[0m )\n\u001b[0;32m 1302\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.pyc\u001b[0m in \u001b[0;36mstructured_traceback\u001b[1;34m(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[0;32m 1182\u001b[0m \u001b[0mstructured_traceback_parts\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mformatted_exception\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1183\u001b[0m 
\u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1184\u001b[1;33m \u001b[0mstructured_traceback_parts\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mformatted_exception\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1185\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1186\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mstructured_traceback_parts\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mIndexError\u001b[0m: string index out of range" ] } ], "source": [ "%time posDf=pd.read_hdf('/cellar/users/btsui/test.h5','master',where='Run_digits=15999')\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "the MultiIndex doesn't work no matter what;\n", "just use pickle for now\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "\"\"\"\n", "[u'Chr', u'Pos',u'base','Run']\n", "-rw-r--r-- 1 btsui users 1.8M Jan 2 16:42 0.pickle.gz\n", "-rw-r--r-- 1 btsui users 2.0M Jan 2 16:42 10.pickle.gz\n", "\n", "['Run', u'Chr', u'Pos',u'base']\n", "-rw-r--r-- 1 btsui users 1.9M Jan 2 16:43 0.pickle.gz\n", "-rw-r--r-- 1 btsui users 2.2M Jan 2 16:44 10.pickle.gz\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "### where did 1000.pickle.gz go from the first iteration\n", "### out of 82, only 60 processed, what about the rest?\n", "### \n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!rm /tmp/btsui/snp_merged/0.pickle" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#18G\n", "myDf=pd.read_hdf('/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.h5',key='master')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#myDf" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### keep everything in the same place?\n", "##" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### read in all the human runs.\n", "### take only those in human and merge them all together; be purely additive: add only runs that have not been added yet. \n", "#srr_pickle_df.iloc[:0].to_pickle('/data/cellardata/users/btsui/dbsnp/empty_base_snp.pickle')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## merge all the data in bulk, \n", "### generate an empty pickle for each species?" 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#orignal pickle, 2.7m\n", "#!ls -lah ./tmp.pickle" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!ls -lah tmp2.pickle.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!rm ./tmp2.pickle" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### merge all the files, by 1000 steps" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "srr_pickle_df.to_pickle('./tmp2.pickle')\n", "#srr_pickle_df.to_pickle('./tmp.pickle')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!ls -lah ./tmp.pickle ./tmp2.pickle" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!rm ./tmp2.pickle.gz\n", "!rm ./tmp.pickle.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "!gzip ./tmp2.pickle" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "!gzip ./tmp.pickle" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!ls -lah tmp2.pickle.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!ls -lah tmp.pickle.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### merge the pickle " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#0.4 mb\n", "np.log10(0.4*(4*(10**6)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### the multindex doesn't increase the " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "#no filtering on variants\n", "countDf=tmpDf6[tmpDf6.rd>0]#.rd.value_counts()\n", "##at a position, tell what's the value\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "subDf.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "my_data_L=[]\n", "my_key_L=[]\n", "g=countDf.stack().groupby(['base','features'])\n", "for myTuple, subS in g:\n", " sparse_Ids=subS.index.get_level_values(index_name)\n", " tmp_row_col=(sparse_Ids,np.zeros(len(sparse_Ids)))\n", " tmp_array=sparse.csc_matrix((subS ,tmp_row_col),shape=tmp_shape)\n", " my_data_L.append(tmp_array)\n", " my_key_L.append(myTuple)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sparseM=sparse.hstack(my_data_L)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "index_name='Chr_Pos_Id'\n", 
"baseDf.groupby(index_name)\n", "#csc_matrix((data, (row, col)), shape=(3, 3)).toarray()\n", "attrib='rd'\n", "tmp_shape=(Chr_Pos_to_ID_S.max(),1)\n", "for my_base, subDf in countDf.groupby(level='base'):\n", " sparse_Ids=subDf.index.get_level_values(index_name)\n", " \n", " tmp_row_col=(sparse_Ids,np.zeros(len(sparse_Ids)))\n", " \n", " tmpS=sparse.csc_matrix((subDf[attrib] ,tmp_row_col),shape=tmp_shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "countDf.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#countDf" ] }, { "cell_type": "code", "execution_count": 289, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#full_meta_df['Run']" ] }, { "cell_type": "code", "execution_count": 282, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re\n", "import os\n", "from multiprocessing import Pool\n", "\n", "## init\n", "mySpecie='Homo_sapiens'\n", "outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'\n", "\n", "full_meta_dir=\"/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv\"\n", "full_meta_df=pd.read_csv(full_meta_dir)\n", "\n", "#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n", "inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'\n", "existingMergedDf=pd.read_pickle(outMergedDir)" ] }, { "cell_type": "code", "execution_count": 292, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'Run_db'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mexistingMergedDf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Run_db'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'Run_digits'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc\u001b[0m in \u001b[0;36mgroupby\u001b[1;34m(self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs)\u001b[0m\n\u001b[0;32m 4414\u001b[0m return groupby(self, by=by, axis=axis, level=level, as_index=as_index,\n\u001b[0;32m 4415\u001b[0m \u001b[0msort\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroup_keys\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgroup_keys\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msqueeze\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4416\u001b[1;33m **kwargs)\n\u001b[0m\u001b[0;32m 4417\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4418\u001b[0m def asfreq(self, freq, method=None, how=None, normalize=False,\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc\u001b[0m in \u001b[0;36mgroupby\u001b[1;34m(obj, by, **kwds)\u001b[0m\n\u001b[0;32m 1697\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'invalid type: %s'\u001b[0m \u001b[1;33m%\u001b[0m 
\u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1698\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1699\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1700\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1701\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, **kwargs)\u001b[0m\n\u001b[0;32m 390\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 391\u001b[0m \u001b[0msort\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 392\u001b[1;33m mutated=self.mutated)\n\u001b[0m\u001b[0;32m 393\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 394\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/groupby.pyc\u001b[0m in \u001b[0;36m_get_grouper\u001b[1;34m(obj, key, axis, level, sort, mutated)\u001b[0m\n\u001b[0;32m 2688\u001b[0m \u001b[0min_axis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2689\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2690\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2691\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mGrouper\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mgpr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkey\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2692\u001b[0m \u001b[1;31m# Add key to exclusions\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyError\u001b[0m: 'Run_db'" ] } ], "source": [ "existingMergedDf.groupby(['Run_db','Run_digits']).size()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ " s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " C_CONTIGUOUS : True\n", " F_CONTIGUOUS : True\n", " OWNDATA : True\n", " WRITEABLE : 
True\n", " ALIGNED : True\n", " WRITEBACKIFCOPY : False\n", " UPDATEIFCOPY : False" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.flags" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " C_CONTIGUOUS : True\n", " F_CONTIGUOUS : True\n", " OWNDATA : False\n", " WRITEABLE : True\n", " ALIGNED : True\n", " WRITEBACKIFCOPY : False\n", " UPDATEIFCOPY : False" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.index.flags" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.__path__" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store = pd.HDFStore('test.h5','w')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### check compression ratio with at last ten reads: " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### select by variants, for select by " ] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }