{ "metadata": { "name": "", "signature": "sha256:9ed6e8c28a04e9cd0f283c1ecff54483ba63318e8a1a668dbaaeeb9ab19c70dc" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

Download Data: get updated clinical data from the TCGA Data Portal.

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "
\n", "


\n", "Here I download updated clinical data from the TCGA Data Portal. \n", "This is a secure site that uses HTTPS, so curl needs the path \n", "to a CA certificate bundle (set in `PATH_TO_CACERT` below) for the download to work. \n", "\n", "Download a copy of a generic cacert.pem [here](http://curl.haxx.se/ca/cacert.pem).\n", "
" ] },
{ "cell_type": "code", "collapsed": false, "input": [
    "# CA certificate bundle handed to curl (--cacert) for the HTTPS downloads below.\n",
    "PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'"
  ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [ "cd ../src" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/cellar/users/agross/TCGA_Code/TCGA/src\n" ] } ], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
    "# NOTE(review): pd, os and get_run are never imported explicitly in this notebook,\n",
    "# so they presumably come from this star import -- confirm.\n",
    "from Processing.Imports import *\n",
    "from IPython.display import clear_output"
  ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 },
{ "cell_type": "code", "collapsed": false, "input": [
    "# Load the global parameters as a Series keyed by the first column of the file,\n",
    "# then open the run they point at.\n",
    "params = pd.read_table('../global_params.txt', header=None, squeeze=True,\n",
    "                       index_col=0)\n",
    "run_path = '{}/Firehose__{}/'.format(params['OUT_PATH'], params['RUN_DATE'])\n",
    "run = get_run(run_path, 'Run_' + params['VERSION'])"
  ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [
    "# Remote base URL (one sub-folder per cancer) and the local download folder.\n",
    "path = 'https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/'\n",
    "out = params['OUT_PATH'] + '/Followup_R3'\n",
    "\n",
    "if not os.path.isdir(out):\n",
    "    os.makedirs(out)"
  ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 },
{ "cell_type": "code", "collapsed": false, "input": [ "run.cancers" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "array(['ACC', 'BLCA', 'BRCA', 'CESC', 'COAD', 'DLBC', 'ESCA', 'GBM',\n", " 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD',\n", " 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD',\n", " 'THCA', 'UCEC'], dtype=object)" ] } ], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false, "input": [
    "# Download every file listed in each cancer's MANIFEST.txt into out/<CANCER>/.\n",
    "failed = []  # (cancer, error) pairs, reported once the loop is done\n",
    "for cancer in run.cancers:\n",
    "    print(cancer)\n",
    "    try:\n",
    "        base_url = '{}{}/bcr/biotab/clin/'.format(path, cancer.lower())\n",
    "        files = pd.read_table(base_url + 'MANIFEST.txt', sep=' ', header=None)\n",
    "        cancer_dir = out + '/' + cancer\n",
    "        if not os.path.isdir(cancer_dir):\n",
    "            os.makedirs(cancer_dir)\n",
    "        # Column 1 of the manifest is used as the remote and local file name.\n",
    "        for fname in files[1]:\n",
    "            url = base_url + fname\n",
    "            dest = cancer_dir + '/' + fname\n",
    "            # Quote the expanded values so paths with spaces survive the shell.\n",
    "            !curl --cacert \"$PATH_TO_CACERT\" \"$url\" > \"$dest\"\n",
    "    except Exception as err:\n",
    "        # Keep the real error instead of swallowing it with a bare except.\n",
    "        failed.append((cancer, err))\n",
    "    # Wipe the per-cancer output (curl progress) so the cell stays short.\n",
    "    clear_output()\n",
    "\n",
    "# Report failures last: anything printed inside the loop is erased by clear_output().\n",
    "for cancer, err in failed:\n",
    "    print('FAIL ({}): {} -- make sure path to cacert.pem is set!'.format(cancer, err))"
  ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 },
{ "cell_type": "code", "collapsed": false, "input": [
    "# Rename downloaded files whose name contains 'nationwidechildrens' by dropping\n",
    "# everything up to and including the first underscore.\n",
    "# Local names are distinct from `path` so the base URL above is not overwritten.\n",
    "for entry in os.listdir(out):\n",
    "    folder = out + '/' + entry + '/'\n",
    "    if not os.path.isdir(folder):\n",
    "        continue  # stray files in the download folder are left alone\n",
    "    for fname in os.listdir(folder):\n",
    "        if 'nationwidechildrens' in fname:\n",
    "            os.rename(folder + fname, folder + '_'.join(fname.split('_')[1:]))"
  ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }
], "metadata": {} } ] }