\n",
"Here I download updated clinical data from the TCGA Data Portal. \n",
"This is a secure site which uses HTTPS. I had to give it a path \n",
"to my ca-cert for the download to work. \n",
"\n",
"Download a copy of a generic cacert.pem [here](http://curl.haxx.se/ca/cacert.pem).\n",
"
"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Download most recent files from MAF dashboard"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"out_path = OUT_PATH + '/MAFs_new_2/'\n",
"if not os.path.isdir(out_path):\n",
" os.makedirs(out_path)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\r\n",
" Dload Upload Total Spent Left Speed\r\n",
"\r",
" 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r",
" 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r",
" 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r",
"100 17629 0 17629 0 0 7060 0 --:--:-- 0:00:02 --:--:-- 7308"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r",
"100 131k 0 131k 0 0 47417 0 --:--:-- 0:00:02 --:--:-- 48890\r\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Use BeautifulSoup to parse out all of the links in the table"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"f = open('tmp.html', 'rb').read()\n",
"soup = BeautifulSoup(f)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"r = [l.get('href') for l in soup.find_all('a')\n",
" if l.get('href') != None\n",
" and '.maf' in l.get('href')]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Download all of the MAFs by following the links"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* This takes a while, as I'm downloading all of the data.\n",
"* I read in the table first to count the number of comment lines and a second time to actually load the data.\n",
"* Yes there is likely a more efficient way to do this, but I'm waiting on https://github.com/pydata/pandas/issues/2685"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"maf = {}\n",
"for f in r:\n",
" try:\n",
" #t = pd.read_table(f, nrows=10, sep='not_real_term', header=None, squeeze=True,\n",
" # low_memory=False)\n",
" #skip = t.apply(lambda s: s.startswith('#'))\n",
" #skip = list(skip[skip==True].index)\n",
" maf[f] = pd.read_table(f, header=0, index_col=0,\n",
" low_memory=True, comment='#')\n",
" except HTTPError:\n",
" print f"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (86,87) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,80,81) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,56,57,67,72,73,74,75,77,78,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4,38,50,57,60,61,62,67,70,71,72,73,74,75,76,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (42,43,44,45) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (17,18,36,63,81) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (48,50,67) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,85) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/laml/gsc/genome.wustl.edu/illuminaga_dnaseq/mutations/genome.wustl.edu_LAML.IlluminaGA_DNASeq.Level_2.1.2.0/genome.wustl.edu_LAML.IlluminaGA_DNASeq.preliminary.1.maf\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"m2 = pd.concat(maf)\n",
"m3 = m2.dropna(axis=1, how='all')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Reduce MAF down to the most useful columns"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cols = ['NCBI_Build', 'Chromosome', 'Start_position', \n",
" 'End_position', 'Strand', 'Reference_Allele', \n",
" 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',\n",
" 'Tumor_Sample_Barcode', 'Protein_Change',\n",
" 'Variant_Classification','Variant_Type']\n",
"\n",
"m4 = m3[cols]\n",
"m4 = m4.reset_index()\n",
"#m4.index = map(lambda s: s.split('/')[-1], m4.index)\n",
"m4 = m4.groupby(['Hugo_Symbol','Tumor_Sample_Barcode','Start_position']).first()\n",
"m4 = m4.reset_index()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"m4.to_csv(out_path + 'mega_maf.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Get gene by patient mutation count matrix and save"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"m5 = m4.ix[m4.Variant_Classification != 'Silent']\n",
"cc = m5.groupby(['Hugo_Symbol','Tumor_Sample_Barcode']).size()\n",
"cc = cc.reset_index()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cc.to_csv(out_path + 'meta.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cc.shape"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": [
"(1411730, 3)"
]
}
],
"prompt_number": 20
}
],
"metadata": {}
}
]
}