{
 "metadata": {
  "name": "",
  "signature": "sha256:6cb8dc1be14a8faf0059391ce2a87f197d282046498af069dc859ecc34fc1206"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from IPython.core.display import HTML\n",
      "\n",
      "with open('creative_commons.txt', 'r') as f:\n",
      "    html = f.read()\n",
      "    \n",
      "name = '2015-07-06-podcasts'\n",
      "\n",
      "html = '''\n",
      "<small>\n",
      "<p> This post was written as an IPython notebook.\n",
      " It is available for <a href='https://ocefpaf.github.com/python4oceanographers/downloads/notebooks/%s.ipynb'>download</a>\n",
      " or as a static <a href='https://nbviewer.ipython.org/url/ocefpaf.github.com/python4oceanographers/downloads/notebooks/%s.ipynb'>html</a>.</p>\n",
      "<p></p>\n",
      "%s''' % (name, name, html)\n",
      "\n",
      "%matplotlib inline\n",
      "from matplotlib import style\n",
      "style.use('ggplot')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "from datetime import datetime\n",
      "\n",
      "title = \"Web scraping 101 (or how to get ready for a long trip)\"\n",
      "hour = datetime.utcnow().strftime('%H:%M')\n",
      "comments=\"true\"\n",
      "\n",
      "date = '-'.join(name.split('-')[:3])\n",
      "slug = '-'.join(name.split('-')[3:])\n",
      "\n",
      "metadata = dict(title=title,\n",
      "                date=date,\n",
      "                hour=hour,\n",
      "                comments=comments,\n",
      "                slug=slug,\n",
      "                name=name)\n",
      "\n",
      "markdown = \"\"\"Title: {title}\n",
      "date:  {date} {hour}\n",
      "comments: {comments}\n",
      "slug: {slug}\n",
      "\n",
      "{{% notebook {name}.ipynb cells[2:] %}}\n",
      "\"\"\".format(**metadata)\n",
      "\n",
      "content = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, '{}.md'.format(name)))\n",
      "with open('{}'.format(content), 'w') as f:\n",
      "    f.writelines(markdown)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "In this post I will show how to write a simple script to scrape a webpage\n",
      "with a list of podcasts links.  I did that while preparing to go to Austin.\n",
      "It is a nice way to use my extra \"airport time\" to study a little bit.\n",
      "\n",
      "The first step is to list all the links in the podcast webpage,"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import requests\n",
      "from bs4 import BeautifulSoup, SoupStrainer\n",
      "\n",
      "\n",
      "def urllister(url):\n",
      "    r = requests.get(url)\n",
      "    soup = r.content\n",
      "    urls = []\n",
      "    for link in BeautifulSoup(soup, parse_only=SoupStrainer('a')):\n",
      "        try:\n",
      "            if link.has_attr('href'):\n",
      "                urls.append(link['href'])\n",
      "        except AttributeError:\n",
      "            pass\n",
      "    return urls"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "and filter it by the file extension you want to download:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import fnmatch\n",
      "\n",
      "\n",
      "def filter_url(urls, filetype=\"*.mp3\"):\n",
      "    return (fname for fname in fnmatch.filter(urls, filetype))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now we need to create a download function.  I do not remember where I got\n",
      "the function below.  It is probably a mixture of StackOverflow and some\n",
      "customizations.  The beauty of this function is that it can resume a partial \n",
      "download and displays a nice progress bar."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "import sys\n",
      "\n",
      "try:\n",
      "    from urllib.error import HTTPError\n",
      "    from urllib.request import FancyURLopener\n",
      "except ImportError:\n",
      "    from urllib2 import HTTPError\n",
      "    from urllib import FancyURLopener\n",
      "\n",
      "from progressbar import ProgressBar\n",
      "\n",
      "\n",
      "class URLOpener(FancyURLopener):\n",
      "    \"\"\"Subclass to override error 206 (partial file being sent).\"\"\"\n",
      "    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):\n",
      "        pass  # Ignore the expected \"non-error\" code.\n",
      "\n",
      "\n",
      "def download(fname, url, verbose=False):\n",
      "    \"\"\"Resume download.\"\"\"\n",
      "    current_size = 0\n",
      "    url_obj = URLOpener()\n",
      "    if os.path.exists(fname):\n",
      "        output = open(fname, \"ab\")\n",
      "        current_size = os.path.getsize(fname)\n",
      "        # If the file exists, then download only the remainder.\n",
      "        url_obj.addheader(\"Range\", \"bytes=%s-\" % (current_size))\n",
      "    else:\n",
      "        output = open(fname, \"wb\")\n",
      "\n",
      "    web_page = url_obj.open(url)\n",
      "\n",
      "    if verbose:\n",
      "        for key, value in web_page.headers.items():\n",
      "            sys.stdout.write(\"{} = {}\\n\".format(key, value))\n",
      "\n",
      "    # If we already have the whole file, there is no need to download it again.\n",
      "    num_bytes = 0\n",
      "    full_size = int(web_page.headers['Content-Length'])\n",
      "    if full_size == current_size:\n",
      "        msg = \"File ({}) was already downloaded from URL ({})\".format\n",
      "        sys.stdout.write(msg(fname, url))\n",
      "    elif full_size == 0:\n",
      "        sys.stdout.write(\"Full file size equal zero!\"\n",
      "                         \"Try again later or check the file\")\n",
      "    else:\n",
      "        if verbose:\n",
      "            msg = \"Downloading {:d} more bytes\".format\n",
      "            sys.stdout.write(msg(full_size - current_size))\n",
      "        pbar = ProgressBar(maxval=full_size)\n",
      "        pbar.start()\n",
      "        while True:\n",
      "            try:\n",
      "                data = web_page.read(8192)\n",
      "            except ValueError:\n",
      "                break\n",
      "            if not data:\n",
      "                break\n",
      "            output.write(data)\n",
      "            num_bytes = num_bytes + len(data)\n",
      "            pbar.update(num_bytes)\n",
      "        pbar.finish()\n",
      "    web_page.close()\n",
      "    output.close()\n",
      "\n",
      "    if verbose:\n",
      "        msg = \"Downloaded {} bytes from {}\".format\n",
      "        sys.stdout.write(msg(num_bytes, web_page.url))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now find a URL with the podcasts you want and start scrapping.  Be nice and\n",
      "sleep a little bit before each download!"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "```python\n",
      "from time import sleep\n",
      "\n",
      "podcasts = range(0, 101)\n",
      "uri = (\"http://some-url-with-podcasts/podcast-{}.mp3\".format)\n",
      "\n",
      "for podcast in podcasts:\n",
      "    url = uri(podcast)\n",
      "    print(url + '\\n')\n",
      "    try:\n",
      "        fname = url.split('/')[-1]\n",
      "        download(fname, url, verbose=True)\n",
      "    except HTTPError:\n",
      "        print('Cannot download {}\\n'.format(url))\n",
      "    print('\\n')\n",
      "    sleep(2)\n",
      "```"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Be sure to read the page terms of use.  Some podcasts providers do not\n",
      "like scrapping!! \n",
      "\n",
      "~~I will be listening to some Spanish classes.~~ Nope, just lost my phone at\n",
      "the airport... I won't be listening to anything :-("
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "HTML(html)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "\n",
        "<small>\n",
        "<p> This post was written as an IPython notebook.\n",
        " It is available for <a href='https://ocefpaf.github.com/python4oceanographers/downloads/notebooks/2015-07-06-podcasts.ipynb'>download</a>\n",
        " or as a static <a href='https://nbviewer.ipython.org/url/ocefpaf.github.com/python4oceanographers/downloads/notebooks/2015-07-06-podcasts.ipynb'>html</a>.</p>\n",
        "<p></p>\n",
        "<a rel=\"license\" href=\"https://creativecommons.org/licenses/by-sa/4.0/\"><img\n",
        "alt=\"Creative Commons License\" style=\"border-width:0\"\n",
        "src=\"https://i.creativecommons.org/l/by-sa/4.0/88x31.png\" /></a><br /><span\n",
        "xmlns:dct=\"https://purl.org/dc/terms/\"\n",
        "property=\"dct:title\">python4oceanographers</span> by <a\n",
        "xmlns:cc=\"https://creativecommons.org/ns#\" href=\"https://ocefpaf.github.io/\"\n",
        "property=\"cc:attributionName\" rel=\"cc:attributionURL\">Filipe Fernandes</a> is\n",
        "licensed under a <a rel=\"license\"\n",
        "href=\"https://creativecommons.org/licenses/by-sa/4.0/\">Creative Commons\n",
        "Attribution-ShareAlike 4.0 International License</a>.<br />Based on a work at <a\n",
        "xmlns:dct=\"https://purl.org/dc/terms/\" href=\"https://ocefpaf.github.io/\"\n",
        "rel=\"dct:source\">https://ocefpaf.github.io/</a>.\n"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "<IPython.core.display.HTML at 0x7f0aa41d3350>"
       ]
      }
     ],
     "prompt_number": 6
    }
   ],
   "metadata": {}
  }
 ]
}