{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GDC BAM manifest: select WXS / RNA-Seq aligned reads\n",
    "\n",
    "Loads a GDC file manifest (`files.2018-07-11.json`), keeps the `Aligned Reads` rows whose experimental strategy is `WXS` or `RNA-Seq`, pickles that selection sorted by file size, and builds the matching `gdc-client download` commands (downloads are disabled by default)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shlex\n",
    "import subprocess\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Every relative path used below (manifest, pickle) is resolved against this directory.\n",
    "CWD='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n",
    "os.chdir(CWD)\n",
    "\n",
    "# Alternative manifest loaders, left disabled:\n",
    "#gdc_meta_df=pd.read_json('files.2017-12-09T19_29_39.496570.json')\n",
    "#gdc_meta_df=pd.read_csv('gdc_manifest.2018-07-11.txt',sep='\\t')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "#gdc_meta_df=pd.read_csv('./gdc_manifest.2017-12-27T02_43_35.959399.txt',sep='\\t')\n",
    "#/cellar/users/andreabc/GDC_barcodes/uuid_barcode_map.txt\n",
    "#gdc_meta_df.str.contains('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#516 cases\n",
    "##bams only\n",
    "# Manifest is read relative to CWD (set in the first code cell).\n",
    "gdc_meta_df=pd.read_json('files.2018-07-11.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BAM 2105\n",
       "Name: data_format, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gdc_meta_df['data_format'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean row masks over the manifest.\n",
    "m_dtype=gdc_meta_df['data_type']=='Aligned Reads'\n",
    "m_experimental_strategy=gdc_meta_df['experimental_strategy'].isin(['WXS','RNA-Seq'])\n",
    "### process all the TCGA, realigned BAMs.\n",
    "# Keep only rows that satisfy both masks.\n",
    "gdc_meta_df_sub=gdc_meta_df[m_dtype&m_experimental_strategy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#gdc_meta_df['experimental_strategy'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with each file, the pipeline can extract the data quickly\n",
    "# Single definition of the pickle location so later cells cannot drift from it.\n",
    "# NOTE(review): the file name says \"wgs\" but the selection above is WXS / RNA-Seq - confirm the name is intended.\n",
    "PICKLE_PATH='./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle'\n",
    "# Rows are written smallest file first.\n",
    "gdc_meta_df_sub.sort_values('file_size').to_pickle(PICKLE_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Absolute location of the pickle written above (previously echoed with a stale file name).\n",
    "os.path.abspath(PICKLE_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1045, 11)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### for each of the files, generate the allelic read count using the standard reference\n",
    "# (rows, columns) of the filtered selection\n",
    "gdc_meta_df_sub.sort_values('file_size').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size of bams in TB: 12.400216759716\n"
     ]
    }
   ],
   "source": [
    "# file_size is divided by 10**12, i.e. the column is treated as bytes and reported in decimal TB.\n",
    "print (\"size of bams in TB:\",(gdc_meta_df_sub['file_size']/10**12).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_dir='/nrnb/users/btsui/Data/tcga_raw_lgg/'\n",
    "\n",
    "#gdc_meta_df.cases.iloc[0]\n",
    "# The token file defaults to the original hard-coded location; set GDC_TOKEN_FILE to use your own.\n",
    "token_dir=os.environ.get('GDC_TOKEN_FILE','/cellar/users/ramarty/tokens/gdc-user-token.2018-06-25T22_21_40.089Z.txt')\n",
    "gdc_cmd_fmt='gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'\n",
    "\n",
    "# Downloads stay off by default, matching the original notebook where the call was commented out.\n",
    "# The filtered selection is ~12.4 TB (see the size cell above), so enable this deliberately.\n",
    "RUN_DOWNLOAD=False\n",
    "\n",
    "# NOTE(review): the original loop walked the unfiltered gdc_meta_df. It now walks the\n",
    "# WXS / RNA-Seq selection, smallest file first, to match the pickle and the size estimate - confirm.\n",
    "gdc_download_df=gdc_meta_df_sub.sort_values('file_size')\n",
    "\n",
    "# total= gives tqdm a length; iterrows() is a generator and has none.\n",
    "for _,rowS in tqdm(gdc_download_df.iterrows(),total=len(gdc_download_df)):\n",
    "    file_uuid=rowS.loc['id']\n",
    "    gdc_cmd=gdc_cmd_fmt.format(out_dir=out_dir,file_uuid=file_uuid,token_dir=token_dir)\n",
    "    if RUN_DOWNLOAD:\n",
    "        # check=True raises CalledProcessError on a non-zero exit instead of silently continuing.\n",
    "        subprocess.run(shlex.split(gdc_cmd),check=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ls: cannot access '/nrnb/users/btsui/Data/tcga_raw_lgg/3a0e5ae0-dc79-468d-b459-a6d43b612851': No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!ls /nrnb/users/btsui/Data/tcga_raw_lgg/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!gunzip -c /nrnb/users/btsui/Data/tcga_extracted_lgg_snp/3a0e5ae0-dc79-468d-b459-a6d43b612851.snp.txt.gz | head -n 2000|tail -n 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#download, and then extract, run the smallest first. \n",
    "#need to use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!ls -lah /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0085c844-82bf-414a-bc05-5e7488a70c25.vcf\r\n"
     ]
    }
   ],
   "source": [
    "!ls /nrnb/users/btsui/Data/tcga_orig_vcf/0085c844-82bf-414a-bc05-5e7488a70c25/"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}