{
 "metadata": {
  "name": "",
  "signature": "sha256:5644523ab323fc645802e9e543149b3a32a46783bb356f60d3b4d8a1d4b3d3dd"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<h1 class=\"alert alert-info\">Download Data <small>  <i class=\"icon-download\"></i>  Get All Available MAF Files from TCGA Data Portal</small></h1>"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import os as os\n",
      "\n",
      "from bs4 import BeautifulSoup\n",
      "from urllib2 import HTTPError"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import NotebookImport\n",
      "from Imports import OUT_PATH"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "importing IPython notebook from Imports.ipynb\n",
        "Populating the interactive namespace from numpy and matplotlib"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "changing to source dirctory\n"
       ]
      },
      {
       "html": [
        "<style>\n",
        "@font-face {\n",
        "    font-family: \"Computer Modern\";\n",
        "    src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');\n",
        "}\n",
        "div.cell{\n",
        "    width:900px;\n",
        "    margin-left:auto;\n",
        "    margin-right:auto;\n",
        "}\n",
        "h1, h2, h3, h4 {\n",
        "    font-family: Helvetica, serif;\n",
        "    color: #000000;\n",
        "}\n",
        "div.text_cell_render {\n",
        "    font-family: Computer Modern, \"Helvetica Neue\", Arial, Helvetica, Geneva, sans-serif;\n",
        "    line-height: 125%;\n",
        "    font-size: 115%;\n",
        "    color: #4d4d4d;\n",
        "    width:800px;\n",
        "    margin-left:0px;\n",
        "    margin-right:auto;\n",
        "}\n",
        "\n",
        ".warning{\n",
        "    color: rgb( 240, 20, 20 )\n",
        "    }  \n",
        "\n",
        "</style>"
       ],
       "metadata": {},
       "output_type": "display_data",
       "text": [
        "<IPython.core.display.HTML object>"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "populating namespace with data\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<div class='alert alert-warning' style='width:600px; font-size:16px'>\n",
      "<h1>GLOBAL VARIABLE WARNING</h1>\n",
      "Here I download updated clinical data from the TCGA Data Portal. \n",
      "This is a secure site which uses HTTPS.  I had to give it a path \n",
      "to my ca-cert for the download to work.  \n",
      "\n",
      "Download a copy of a generic cacert.pem [here](http://curl.haxx.se/ca/cacert.pem).\n",
      "</div>"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Download most recent files from MAF dashboard"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "out_path = OUT_PATH + '/MAFs_new_2/'\n",
      "if not os.path.isdir(out_path):\n",
      "    os.makedirs(out_path)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\r\n",
        "                                 Dload  Upload   Total   Spent    Left  Speed\r\n",
        "\r",
        "  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\r",
        "  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\r",
        "  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\r",
        "100 17629    0 17629    0     0   7060      0 --:--:--  0:00:02 --:--:--  7308"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\r",
        "100  131k    0  131k    0     0  47417      0 --:--:--  0:00:02 --:--:-- 48890\r\n"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Use BeutifulSoup to parse out all of the links in the table"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "f = open('tmp.html', 'rb').read()\n",
      "soup = BeautifulSoup(f)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "r = [l.get('href') for l in soup.find_all('a')\n",
      "   if l.get('href') != None\n",
      "   and '.maf' in l.get('href')]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Download all of the MAFs by following the links"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "* This takes a while, as I'm downloading all of the data.\n",
      "* I read in the table first to count the number of comment lines and a second time to actuall load the data.\n",
      "* Yes there is likely a more efficient way to do this, but I'm waiting on https://github.com/pydata/pandas/issues/2685"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "maf = {}\n",
      "for f in r:\n",
      "    try:\n",
      "        #t = pd.read_table(f, nrows=10, sep='not_real_term', header=None, squeeze=True,\n",
      "        #                  low_memory=False)\n",
      "        #skip = t.apply(lambda s: s.startswith('#'))\n",
      "        #skip = list(skip[skip==True].index)\n",
      "        maf[f] = pd.read_table(f, header=0, index_col=0,\n",
      "                               low_memory=True, comment='#')\n",
      "    except HTTPError:\n",
      "        print f"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (86,87) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,80,81) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,56,57,67,72,73,74,75,77,78,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4,38,50,57,60,61,62,67,70,71,72,73,74,75,76,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (42,43,44,45) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (17,18,36,63,81) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (48,50,67) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n",
        "/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,85) have mixed types. Specify dtype option on import or set low_memory=False.\n",
        "  data = self._reader.read(nrows)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/laml/gsc/genome.wustl.edu/illuminaga_dnaseq/mutations/genome.wustl.edu_LAML.IlluminaGA_DNASeq.Level_2.1.2.0/genome.wustl.edu_LAML.IlluminaGA_DNASeq.preliminary.1.maf\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "m2 = pd.concat(maf)\n",
      "m3 = m2.dropna(axis=1, how='all')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Reduce MAF down to most usefull columns"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cols = ['NCBI_Build', 'Chromosome', 'Start_position', \n",
      "         'End_position', 'Strand', 'Reference_Allele', \n",
      "         'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',\n",
      "         'Tumor_Sample_Barcode', 'Protein_Change',\n",
      "         'Variant_Classification','Variant_Type']\n",
      "\n",
      "m4 = m3[cols]\n",
      "m4 = m4.reset_index()\n",
      "#m4.index = map(lambda s: s.split('/')[-1], m4.index)\n",
      "m4 = m4.groupby(['Hugo_Symbol','Tumor_Sample_Barcode','Start_position']).first()\n",
      "m4 = m4.reset_index()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "m4.to_csv(out_path + 'mega_maf.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Get gene by patient mutation count matrix and save"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "m5 = m4.ix[m4.Variant_Classification != 'Silent']\n",
      "cc = m5.groupby(['Hugo_Symbol','Tumor_Sample_Barcode']).size()\n",
      "cc = cc.reset_index()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cc.to_csv(out_path + 'meta.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cc.shape"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "(1411730, 3)"
       ]
      }
     ],
     "prompt_number": 20
    }
   ],
   "metadata": {}
  }
 ]
}