{
 "metadata": {
  "name": "THREDDS crawler"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# THREDDS crawler\n",
      "\n",
      "Recursively walk a THREDDS catalog over HTTP and yield every dataset element that has a `urlPath` attribute."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "try:\n",
      "    # C-accelerated parser alias; removed in Python 3.9\n",
      "    from xml.etree import cElementTree\n",
      "except ImportError:\n",
      "    from xml.etree import ElementTree as cElementTree\n",
      "try:\n",
      "    from urlparse import urljoin  # Python 2\n",
      "except ImportError:\n",
      "    from urllib.parse import urljoin  # Python 3\n",
      "\n",
      "import requests\n",
      "\n",
      "# namespaces for XML parsing\n",
      "thredds = \"http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0\"\n",
      "xlink = \"http://www.w3.org/1999/xlink\"\n",
      "\n",
      "# seconds to wait for each catalog response (requests has no default timeout)\n",
      "REQUEST_TIMEOUT = 30\n",
      "\n",
      "\n",
      "def crawl(catalog, timeout=REQUEST_TIMEOUT, _ancestors=frozenset()):\n",
      "    \"\"\"Recursively walk a THREDDS catalog and yield its dataset elements.\n",
      "\n",
      "    catalog    -- URL of a THREDDS catalog XML document\n",
      "    timeout    -- seconds to wait for each HTTP response\n",
      "    _ancestors -- internal: catalog URLs on the current recursion path\n",
      "\n",
      "    Yields every dataset element that carries a urlPath attribute.\n",
      "    Sub-catalogs referenced through catalogRef are visited before the\n",
      "    datasets of the catalog itself.  Raises requests.HTTPError when a\n",
      "    catalog request returns an HTTP error status.\n",
      "    \"\"\"\n",
      "    if catalog in _ancestors:\n",
      "        # a catalogRef pointing back at a catalog we are already inside;\n",
      "        # following it again would recurse forever\n",
      "        return\n",
      "\n",
      "    r = requests.get(catalog, timeout=timeout)\n",
      "    # fail on 4xx/5xx here instead of feeding an error page to the XML parser\n",
      "    r.raise_for_status()\n",
      "    xml = cElementTree.fromstring(r.content)\n",
      "\n",
      "    # depth first traversal\n",
      "    path = _ancestors | frozenset([catalog])\n",
      "    for subdir in xml.iterfind(\".//{%s}catalogRef\" % thredds):\n",
      "        link = subdir.attrib[\"{%s}href\" % xlink]\n",
      "        # hrefs may be relative, so resolve them against the current catalog\n",
      "        for dataset in crawl(urljoin(catalog, link), timeout, path):\n",
      "            yield dataset\n",
      "\n",
      "    for dataset in xml.iterfind(\".//{%s}dataset[@urlPath]\" % thredds):\n",
      "        yield dataset"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "catalog = 'http://geoport.whoi.edu/thredds/COAWST_catalog.xml'\n",
      "foo = crawl(catalog)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(next(crawl(catalog)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "r = requests.get(catalog, timeout=REQUEST_TIMEOUT)\n",
      "r.raise_for_status()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "r.content"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "'\\r\\n\\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n allServices\\r\\n gov.usgs.er.whsc\\r\\n GRID\\r\\n netCDF\\r\\n \\r\\n \\r\\n \\r\\n OM/WHSC/USGS\\r\\n \\r\\n \\r\\n \\r\\n OM/WHSC/USGS\\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n \\r\\n\\r\\n\\r\\n'"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}