{ "metadata": { "name": "", "signature": "sha256:5644523ab323fc645802e9e543149b3a32a46783bb356f60d3b4d8a1d4b3d3dd" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

# Download Data
## Get All Available MAF Files from TCGA Data Portal

" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import os as os\n", "\n", "from bs4 import BeautifulSoup\n", "from urllib2 import HTTPError" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import NotebookImport\n", "from Imports import OUT_PATH" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "importing IPython notebook from Imports.ipynb\n", "Populating the interactive namespace from numpy and matplotlib" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "changing to source dirctory\n" ] }, { "html": [ "" ], "metadata": {}, "output_type": "display_data", "text": [ "" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "populating namespace with data\n" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

GLOBAL VARIABLE WARNING

\n", "Here I download updated mutation (MAF) data from the TCGA Data Portal. \n", "This is a secure site which uses HTTPS. I had to give it a path \n", "to my ca-cert for the download to work. \n", "\n", "Download a copy of a generic cacert.pem [here](http://curl.haxx.se/ca/cacert.pem).\n", "
" ] }, { "cell_type": "code", "collapsed": false, "input": [ "PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Download most recent files from MAF dashboard" ] }, { "cell_type": "code", "collapsed": false, "input": [ "out_path = OUT_PATH + '/MAFs_new_2/'\n", "if not os.path.isdir(out_path):\n", " os.makedirs(out_path)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 17629 0 17629 0 0 7060 0 --:--:-- 0:00:02 --:--:-- 7308" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 131k 0 131k 0 0 47417 0 --:--:-- 0:00:02 --:--:-- 48890\r\n" ] } ], "prompt_number": 7 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Use BeutifulSoup to parse out all of the links in the table" ] }, { "cell_type": "code", "collapsed": false, "input": [ "f = open('tmp.html', 'rb').read()\n", "soup = BeautifulSoup(f)" ], "language": "python", "metadata": {}, "outputs": [], 
"prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "r = [l.get('href') for l in soup.find_all('a')\n", " if l.get('href') != None\n", " and '.maf' in l.get('href')]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Download all of the MAFs by following the links" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* This takes a while, as I'm downloading all of the data.\n", "* I read in the table first to count the number of comment lines and a second time to actuall load the data.\n", "* Yes there is likely a more efficient way to do this, but I'm waiting on https://github.com/pydata/pandas/issues/2685" ] }, { "cell_type": "code", "collapsed": false, "input": [ "maf = {}\n", "for f in r:\n", " try:\n", " #t = pd.read_table(f, nrows=10, sep='not_real_term', header=None, squeeze=True,\n", " # low_memory=False)\n", " #skip = t.apply(lambda s: s.startswith('#'))\n", " #skip = list(skip[skip==True].index)\n", " maf[f] = pd.read_table(f, header=0, index_col=0,\n", " low_memory=True, comment='#')\n", " except HTTPError:\n", " print f" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (86,87) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,80,81) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,56,57,67,72,73,74,75,77,78,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,81,82,83,84) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4,38,50,57,60,61,62,67,70,71,72,73,74,75,76,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (42,43,44,45) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (17,18,36,63,81) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (48,50,67) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n", "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,85) have mixed types. 
# Stack the per-link tables into one frame.  Concatenating a dict puts its keys
# (the MAF links) in the outer index level, above each table's own index.
m2 = pd.concat(maf)
# Drop columns that are empty in every row of the combined table.
m3 = m2.dropna(axis=1, how='all')

# --- Reduce MAF down to most useful columns ---------------------------------
cols = ['NCBI_Build', 'Chromosome', 'Start_position',
        'End_position', 'Strand', 'Reference_Allele',
        'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'Tumor_Sample_Barcode', 'Protein_Change',
        'Variant_Classification', 'Variant_Type']

# 'Hugo_Symbol' is not in `cols`; it is expected to come out of the index when
# reset_index() turns the index levels into columns (the tables were read with
# index_col=0 -- presumably the Hugo_Symbol column; verify against the files).
# Rows sharing gene, sample and start position are collapsed to one row,
# keeping the first non-null value of each remaining column.
dedupe_keys = ['Hugo_Symbol', 'Tumor_Sample_Barcode', 'Start_position']
m4 = (m3[cols]
      .reset_index()
      .groupby(dedupe_keys)
      .first()
      .reset_index())

m4.to_csv(out_path + 'mega_maf.csv')

# --- Get gene by patient mutation count matrix and save ---------------------
# Boolean-mask indexing replaces the deprecated `.ix` indexer and selects the
# same rows: everything whose classification is not 'Silent'.
m5 = m4[m4['Variant_Classification'] != 'Silent']
# Number of remaining mutation rows per (gene, sample) pair, as a flat table.
cc = m5.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode']).size()
cc = cc.reset_index()
"cc.to_csv(out_path + 'meta.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "cc.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 20, "text": [ "(1411730, 3)" ] } ], "prompt_number": 20 } ], "metadata": {} } ] }