{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Purpose of this script:\n",
    "\n",
    "\n",
    "dbSNP use the reference strand to annnotate. \n",
    "\n",
    "\n",
    "Map  genomic corrdinates with known human variants to other species (Like mouse)\n",
    "\n",
    "Input: Variant table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import gzip\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
   "source": [
    "pattern='VP=0x\\d{4}(\\d{2})'\n",
    "prog = re.compile(pattern)\n",
    "inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz'\n",
    "outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')\n",
    "humanDf=pd.read_csv(outDbDir,sep='\\t',header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |head   -n 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "humanDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## pos strand"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#humanDf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### identify regions in human to LiftOVer ( mapping homologous) to mouse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hg38ToDanRer10.over.chain.gz\r\n",
      "hg38ToDanRer11.over.chain.gz\r\n"
     ]
    }
   ],
   "source": [
    "!ls /cellar/users/btsui/Data/ucsc_chains/ | grep Dan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputDir='/cellar/users/btsui/Data/ucsc_chains/'\n",
    "fileNameInFolderS=pd.Series(os.listdir(inputDir))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "143    hg38ToMm10.over.chain.gz\n",
       "dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fileNameInFolderS[fileNameInFolderS.str.contains('mm10',case=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "chainFiles=fileNameInFolderS[fileNameInFolderS.str.contains('.chain.gz$')]\n",
    "chainFiles=pd.concat([chainFiles[chainFiles.str.contains('mm10|DanRer',case=False)],\n",
    "                      chainFiles[~chainFiles.str.contains('mm10|DanRer',case=False)]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "intervalLength=1\n",
    "refOffset=1\n",
    "humanDf['Chr_uscs']='chr'+humanDf['Chr'].astype(str)\n",
    "\n",
    "humanDf['Loc_+1']=humanDf['Loc']+intervalLength-refOffset\n",
    "\n",
    "humanDf['Loc_-1']=humanDf['Loc']-refOffset\n",
    "humanDf['Chr_uscs']=humanDf['Chr_uscs'].str.replace('chrMT','chrM')\n",
    "humanDf['Score']='.'\n",
    "humanDf['Strand']='+'\n",
    "humanDf[['Chr_uscs','Loc_-1','Loc_+1','rs','Score','Strand']\n",
    "       ].to_csv('tmp.human.ucsc.bed',sep='\\t',header=None,index=None)\n",
    "#humanDf[['Chr','Loc_-1','Loc_+1','rs']].to_csv('tmp.human.ensembl.bed',sep='\\t',header=None,index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!head tmp.human.ucsc.bed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### make chains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|          | 1/157 [00:01<03:20,  1.29s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  1%|▏         | 2/157 [00:03<04:30,  1.74s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  2%|▏         | 3/157 [08:08<6:58:07, 162.91s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  3%|▎         | 4/157 [08:10<5:12:26, 122.52s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  3%|▎         | 5/157 [08:19<4:13:16, 99.98s/it] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  4%|▍         | 6/157 [08:21<3:30:23, 83.60s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  4%|▍         | 7/157 [08:34<3:03:46, 73.51s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  5%|▌         | 8/157 [09:09<2:50:31, 68.67s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  6%|▌         | 9/157 [16:18<4:28:06, 108.69s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  6%|▋         | 10/157 [16:30<4:02:46, 99.09s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  7%|▋         | 11/157 [29:17<6:28:48, 159.78s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  8%|▊         | 12/157 [46:33<9:22:34, 232.79s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "for chainFile in tqdm(chainFiles):\n",
    "    outDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.bed')   \n",
    "    unmappedOutDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.unmapped.bed')   \n",
    "    liftOverCmd=\"liftOver {} {} {} {}\".format('tmp.human.ucsc.bed',\n",
    "                                                     inputDir+chainFile,\n",
    "                                                     outDir,unmappedOutDir)\n",
    "    print (os.system(liftOverCmd))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!mkdir /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "asdasd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#os.system(liftOverCmd)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scratch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### check SNP strand infos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "humanDf[['Chr','Loc_-1','Loc','rs']].to_csv('tmp.human.ensembl.bed',sep='\\t',header=None,index=None)\n",
    "\n",
    "!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed tmp.human.ensembl.bed -fo tmp.human.ucsc.bed.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmpS=pd.read_csv('tmp.human.ucsc.bed.fasta',header=None)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "extractedDf=pd.DataFrame({'Loc':tmpS[(tmpS.index%2)==0].values,\n",
    "              'Nucleotide':tmpS[(tmpS.index%2)==1].values})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "extractedDf['Chr']=extractedDf.Loc.str.extract('(\\w+):')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "extractedDf['Site']=extractedDf.Loc.str.extract('\\w+:\\d+-(\\d+)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "extractedDf['Chr-Site']=extractedDf['Chr']+'-'+extractedDf['Site'].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "chrToPosN=extractedDf.set_index(['Chr-Site'])['Nucleotide']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "chrToPosN=chrToPosN.groupby(chrToPosN.index).first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Chr</th>\n",
       "      <th>Loc</th>\n",
       "      <th>rs</th>\n",
       "      <th>REF</th>\n",
       "      <th>ALT</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Annot</th>\n",
       "      <th>Chr_uscs</th>\n",
       "      <th>Loc_+1</th>\n",
       "      <th>Loc_-1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>14727</td>\n",
       "      <td>rs1045587</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>.</td>\n",
       "      <td>.</td>\n",
       "      <td>RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR...</td>\n",
       "      <td>chr1</td>\n",
       "      <td>14727</td>\n",
       "      <td>14726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>630825</td>\n",
       "      <td>rs9783068</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>.</td>\n",
       "      <td>.</td>\n",
       "      <td>RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1...</td>\n",
       "      <td>chr1</td>\n",
       "      <td>630825</td>\n",
       "      <td>630824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>630833</td>\n",
       "      <td>rs9701099</td>\n",
       "      <td>C</td>\n",
       "      <td>T</td>\n",
       "      <td>.</td>\n",
       "      <td>.</td>\n",
       "      <td>RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1...</td>\n",
       "      <td>chr1</td>\n",
       "      <td>630833</td>\n",
       "      <td>630832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>817186</td>\n",
       "      <td>rs3094315</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>.</td>\n",
       "      <td>.</td>\n",
       "      <td>RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS...</td>\n",
       "      <td>chr1</td>\n",
       "      <td>817186</td>\n",
       "      <td>817185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>833068</td>\n",
       "      <td>rs12562034</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>.</td>\n",
       "      <td>.</td>\n",
       "      <td>RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=...</td>\n",
       "      <td>chr1</td>\n",
       "      <td>833068</td>\n",
       "      <td>833067</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Chr     Loc          rs REF ALT        \\\n",
       "0   1   14727   rs1045587   G   A  .  .   \n",
       "1   1  630825   rs9783068   T   C  .  .   \n",
       "2   1  630833   rs9701099   C   T  .  .   \n",
       "3   1  817186   rs3094315   G   A  .  .   \n",
       "4   1  833068  rs12562034   G   A  .  .   \n",
       "\n",
       "                                               Annot Chr_uscs  Loc_+1  Loc_-1  \n",
       "0  RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR...     chr1   14727   14726  \n",
       "1  RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1...     chr1  630825  630824  \n",
       "2  RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1...     chr1  630833  630832  \n",
       "3  RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS...     chr1  817186  817185  \n",
       "4  RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=...     chr1  833068  833067  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "humanDf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "humanDf['Chr-Site']=humanDf['Chr'].astype(str)+'-'+humanDf['Loc'].astype(str)#chrToPosN[extractedDf['Chr-Site']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#posStrandDf=humanDf[~humanDf.Annot.str.contains(';RV;')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "humanDf['Pos_N']=chrToPosN.loc[humanDf['Chr-Site']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pos_N  REF                     \n",
       "G      GCCTCCAGCCCCAGCT                1\n",
       "       GTGTAAACTCAGAAA                 1\n",
       "       GTGTACCCTT                      1\n",
       "       GTGTCAT                         1\n",
       "       GTGTGCCA                        1\n",
       "       GTGTGGGGCGGAGACTT               1\n",
       "       GTGTGT                          1\n",
       "       GTGTGTCAGC                      1\n",
       "       GTGTTCCAAGGGGACAGTAGCCCC        1\n",
       "       GTTA                            1\n",
       "       GTTAAA                          1\n",
       "       GTTAAATCCA                      1\n",
       "       GTTAAGGAA                       1\n",
       "       GTTACAAT                        1\n",
       "       GTTACAATTTACTGGCA               1\n",
       "       GTTACAGGAACT                    1\n",
       "       GTTACC                          1\n",
       "       GTTAGATCC                       1\n",
       "       GTTCTCCTCATTGAAAAAGA            1\n",
       "       GTTCTC                          1\n",
       "       GTTCTACAACAAGAGCGAGGAT          1\n",
       "       GTTCCT                          1\n",
       "       GTTCCATA                        1\n",
       "       GTTCAGCT                        1\n",
       "       GTGTA                           1\n",
       "       GTTCACC                         1\n",
       "       GTTATTTTATT                     1\n",
       "       GTTATTT                         1\n",
       "       GTTATT                          1\n",
       "       GTTATC                          1\n",
       "                                   ...  \n",
       "T      TTC                           103\n",
       "       TCA                           112\n",
       "A      AAG                           118\n",
       "C      CAA                           131\n",
       "       CCT                           154\n",
       "       CAT                           160\n",
       "A      ACT                           170\n",
       "C      CTG                           258\n",
       "       CTT                           263\n",
       "       CAG                           308\n",
       "       CG                            431\n",
       "G      GT                            487\n",
       "T      TA                            531\n",
       "A      AT                            637\n",
       "G      GC                            688\n",
       "       GA                            730\n",
       "A      AC                            817\n",
       "C      CA                            822\n",
       "T      TC                            833\n",
       "       TG                            883\n",
       "A      AG                            910\n",
       "C      CT                            941\n",
       "N      A                           14995\n",
       "       T                           15146\n",
       "       G                           18302\n",
       "       C                           18431\n",
       "T      T                           57432\n",
       "A      A                           57962\n",
       "G      G                           94487\n",
       "C      C                           94699\n",
       "Length: 5083, dtype: int64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "humanDf.groupby(['Pos_N','REF']).size().sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#tmpS.str.extract('(\\d+):\\d+-(\\d+)',expand=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#posStrandDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmpS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### map the human variants to mouse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!head tmp.mouse.bed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## remove 1000 when not testing\n",
    "liftOverBedDf=pd.read_csv('tmp.mouse.bed',sep='\\t',nrows=100000,header=None) \n",
    "\n",
    "liftOverBedDf.columns=['Chr_uscs','Start','End','rs']\n",
    "\n",
    "###slice out the region for mouse with name\n",
    "\n",
    "liftOverBedDf['Chr_ensmbl']=liftOverBedDf['Chr_uscs'].str.replace('^chr','')\n",
    "liftOverBedDf['Start']=liftOverBedDf['Start']\n",
    "liftOverBedDf['End']=liftOverBedDf['End']\n",
    "liftOverBedDf[['Chr_ensmbl','Start','End','rs']].to_csv('tmp.specie.ensembl.bed',sep='\\t',header=None,index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "specieFastaDir='/cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bedInterCmd=\"bedtools getfasta -name -fi {} -bed tmp.specie.ensembl.bed > liftOver.fasta\".format(specieFastaDir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#bedInterCmdHuman=\"bedtools getfasta -name -fi {} -bed tmp.human.ucsc.bed > liftOver.human.fasta\".format(humanFastaDir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!head liftOver.human.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!bedtools getfasta -name -fi /cellar/users/btsui/Data/ensembl/release/fasta/Homo_sapiens.GRCh38.dna_rm.toplevel.fa -bed tmp.human.ucsc.bed > liftOver.human.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.system(bedInterCmdHuman)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!bedtools getfasta -name -fi /cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa -bed tmp.specie.ensembl.bed > liftOver.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.system(bedInterCmd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#pd.read_csv('liftOver.human.fasta')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "myFastaS=pd.read_csv( 'liftOver.fasta',header=None)[0]\n",
    "\n",
    "rsToSeqDf=pd.DataFrame({'rs':myFastaS[(myFastaS.index%2)==0].str.replace('>','').values,\n",
    "                        'Seq':myFastaS[(myFastaS.index%2)==1].values})\n",
    "\n",
    "rsToSeqSpecie=rsToSeqDf.set_index('rs')['Seq']\n",
    "\n",
    "revS=pd.Series({'A':'T','T':'A','C':'G','G':'C'})\n",
    "\n",
    "\n",
    "rsToSeqSpecie_corrected=pd.Series(data=revS[rsToSeqSpecie].values,index=rsToSeqSpecie.index)\n",
    "\n",
    "humanDf['ref_sepcie']=rsToSeqSpecie_corrected[humanDf['rs']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf.Chr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "withValidDf=humanDf[humanDf['ref_sepcie'].isin(['A','C','G','T'])&(humanDf.Chr==3)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(withValidDf['ref_sepcie']==withValidDf['REF']).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf#.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#(withValidDf['REF']==withValidDf['ref_sepcie'])[withValidDf['Loc']<100000000].astype(int).reset_index().plot(x='index',y=0,kind='scatter',alpha=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf.Chr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(withValidDf['REF']==withValidDf['ref_sepcie'])[withValidDf['Loc']<100000000].astype(int).reset_index().plot(x='Loc',y=0,kind='scatter',alpha=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#(withValidDf['REF']==withValidDf['specie_ref_seq']).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "withValidDf[(withValidDf['REF']==withValidDf['specie_ref_seq'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "revS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "myDict={}\n",
    "for i  in range(0,50,1):\n",
    "    myDict[i]={'rev':(withValidDf['specie_ref_seq'].str[i]==revS[withValidDf['REF']].values).mean(),\n",
    "               'pos':(withValidDf['specie_ref_seq'].str[i]==withValidDf['REF'].values).mean()\n",
    "              }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "withValidDf['rev_ref']=revS[withValidDf['REF']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#withValidDf[withValidDf.specie_ref_seq.str[0]!=revS[withValidDf['REF']].values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(myDict).T#.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "i=25\n",
    "#pd.Series(myDict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmpDf=pd.DataFrame( {'specie':withValidDf['specie_ref_seq'].str[i].values,'ref':withValidDf['REF'].values})\n",
    "tmpDf['rev_ref']=revS[tmpDf['ref']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(tmpDf['rev_ref']==tmpDf['specie']).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(myDict\n",
    "         ).plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"myDict={}\n",
    "for i  in range(0,100,1):\n",
    "    myDict[i]=(withValidDf['specie_ref_seq'].str[i]).value_counts()\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#withValidDf['specie_ref_seq'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(myDict).T.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(myDict)#.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head tmp.human.ensembl.bed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa -bed tmp.human.ensembl.bed > liftOver.bed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!getFastaFromBed -fi "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#getFastaFromBed [OPTIONS] -fi <input FASTA> -bed <BED/GFF/VCF>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## get the fasta "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#liftOverBedDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wc -l tmp.mouse.bed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wc -l unMapped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!liftOver"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### check in fasta to make sure ref base is correct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}