{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "import pandas as pd\n",
    "import os\n",
    "import subprocess\n",
    "\n",
    "# Work from the project directory so the relative data paths used in later cells resolve there.\n",
    "CWD = '/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n",
    "os.chdir(CWD)\n",
    "#gdc_meta_df=pd.read_json('files.2017-12-09T19_29_39.496570.json')\n",
    "\n",
    "\n",
    "#gdc_meta_df=pd.read_csv('gdc_manifest.2018-07-11.txt',sep='\\t')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "#gdc_meta_df=pd.read_csv('./gdc_manifest.2017-12-27T02_43_35.959399.txt',sep='\\t')\n",
    "#/cellar/users/andreabc/GDC_barcodes/uuid_barcode_map.txt\n",
    "#gdc_meta_df.str.contains('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GDC file-metadata export: 516 cases, BAM files only.\n",
    "gdc_meta_df = pd.read_json('files.2018-07-11.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BAM    2105\n",
       "Name: data_format, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tally how many metadata rows there are per data format.\n",
    "gdc_meta_df.loc[:, 'data_format'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mask 1: rows whose data_type is exactly 'Aligned Reads'.\n",
    "m_dtype = gdc_meta_df['data_type'].eq('Aligned Reads')\n",
    "# Mask 2: rows whose experimental_strategy is WXS or RNA-Seq.\n",
    "m_experimental_strategy = gdc_meta_df['experimental_strategy'].isin(['WXS', 'RNA-Seq'])\n",
    "# Rows passing both masks: the TCGA realigned BAMs to process.\n",
    "gdc_meta_df_sub = gdc_meta_df.loc[m_dtype & m_experimental_strategy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#gdc_meta_df['experimental_strategy'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the filtered metadata ordered by ascending file_size.\n",
    "# Original note: with each file, the pipeline can extract the data quickly.\n",
    "gdc_meta_df_sub.sort_values(by='file_size').to_pickle('./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/./tcga_lgg_wgs_bams.df.pickle\r\n"
     ]
    }
   ],
   "source": [
    "# Print the working-directory-prefixed path of the metadata pickle.\n",
    "# NOTE(review): this names tcga_lgg_wgs_bams.df.pickle, while the to_pickle call in this notebook writes\n",
    "# tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle -- confirm which file is meant.\n",
    "!echo $PWD/./tcga_lgg_wgs_bams.df.pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1045, 11)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### For each file, generate the allelic read counts using the standard reference.\n",
    "# (rows, columns) of the filtered subset. Sorting cannot change a frame's shape, so no sort is done here.\n",
    "gdc_meta_df_sub.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size of bams in TB: 12.400216759716\n"
     ]
    }
   ],
   "source": [
    "# Total of file_size over the filtered subset, each value scaled by 10**12 before summing (reported as TB).\n",
    "print(\"size of bams in TB:\", gdc_meta_df_sub['file_size'].div(10**12).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory passed to `gdc-client download -d`.\n",
    "out_dir='/nrnb/users/btsui/Data/tcga_raw_lgg/'\n",
    "\n",
    "# Path of the GDC user-token file passed to `gdc-client -t` (only the path is stored here, not the token).\n",
    "token_dir='/cellar/users/ramarty/tokens/gdc-user-token.2018-06-25T22_21_40.089Z.txt'\n",
    "gdc_cmd_fmt='gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'\n",
    "\n",
    "# Build one gdc-client command per file id.\n",
    "# Iterating the 'id' column directly avoids building a Series per row, and gives tqdm a sized iterable\n",
    "# so the progress bar can show a total (a bare iterrows() generator has no length).\n",
    "# NOTE(review): this walks the unfiltered gdc_meta_df rather than gdc_meta_df_sub -- confirm that is intended.\n",
    "for file_uuid in tqdm(gdc_meta_df['id']):\n",
    "    gdc_cmd=gdc_cmd_fmt.format(out_dir=out_dir,file_uuid=file_uuid,token_dir=token_dir)\n",
    "    # Dry run: the actual download call is intentionally left disabled.\n",
    "    #result = os.system(gdc_cmd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ls: cannot access '/nrnb/users/btsui/Data/tcga_raw_lgg/3a0e5ae0-dc79-468d-b459-a6d43b612851': No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "# List the raw-download directory (the same path assigned to out_dir in the gdc-client cell).\n",
    "!ls /nrnb/users/btsui/Data/tcga_raw_lgg/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!gunzip -c  /nrnb/users/btsui/Data/tcga_extracted_lgg_snp/3a0e5ae0-dc79-468d-b459-a6d43b612851.snp.txt.gz | head -n 2000|tail -n 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plan: download each file, then extract it; run the smallest files first.\n",
    "#need to use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!ls -lah /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0085c844-82bf-414a-bc05-5e7488a70c25.vcf\r\n"
     ]
    }
   ],
   "source": [
    "# List the contents of one UUID-named directory under tcga_orig_vcf.\n",
    "!ls /nrnb/users/btsui/Data/tcga_orig_vcf/0085c844-82bf-414a-bc05-5e7488a70c25/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}