{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scraping Demo\n", "Companion to Lecture 4 of Harvard [CS109: Data Science](http://cs109.org)\n" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import requests\n", "from pattern import web\n", "from BeautifulSoup import BeautifulSoup" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Task\n", "\n", "Find and print the title, list of genres, runtime, and rating of every movie on [this page](http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Two ways of making GET requests" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 1. Explicit URL" ] }, { "cell_type": "code", "collapsed": false, "input": [ "url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'\n", "r = requests.get(url)\n", "print r.url" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012\n" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2. 
Base URL with GET dictionary" ] }, { "cell_type": "code", "collapsed": false, "input": [ "url = 'http://www.imdb.com/search/title'\n", "params = dict(sort='num_votes,desc', start=1, title_type='feature', year='1950,2012')\n", "r = requests.get(url, params=params)\n", "print r.url # notice it constructs the full url for you" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "http://www.imdb.com/search/title?sort=num_votes%2Cdesc&start=1&title_type=feature&year=1950%2C2012\n" ] } ], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Using Pattern" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#selection in pattern follows the rules of CSS\n", "\n", "dom = web.Element(r.text)\n", "for movie in dom.by_tag('td.title'): \n", " title = movie.by_tag('a')[0].content\n", " genres = movie.by_tag('span.genre')[0].by_tag('a')\n", " genres = [g.content for g in genres]\n", " runtime = movie.by_tag('span.runtime')[0].content\n", " rating = movie.by_tag('span.value')[0].content\n", " print title, genres, runtime, rating" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n", "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n", "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n", "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n", "Fight Club [u'Drama'] 139 mins. 8.9\n", "The Lord of the Rings: The Fellowship of the Ring" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n", "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n", "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n", "The Godfather [u'Crime', u'Drama'] 175 mins. 
9.2\n", "Forrest Gump [u'Drama', u'Romance'] 142 mins. 8.7\n", "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n", "The Lord of the Rings: The Two Towers" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n", "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n", "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n", "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n", "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n", "Star Wars" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 8.8\n", "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n", "Memento [u'Mystery', u'Thriller'] 113 mins. 8.6\n", "American Beauty [u'Drama'] 122 mins. 8.5\n", "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n", "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n", "The Departed" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n", "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n", "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n", "Star Wars: Episode V - The Empire Strikes Back" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n", "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n", "V for Vendetta [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n", "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n", "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n", "American History X" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Crime', u'Drama'] 119 mins. 8.6\n", "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 
9.0\n", "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n", "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n", "Terminator 2: Judgment Day" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n", "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n", "Kill Bill: Vol. 1 [u'Action', u'Crime'] 111 mins. 8.2\n", "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n", "Sin City [u'Crime', u'Thriller'] 124 mins. 8.2\n", "Léon: The Professional" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n", "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n", "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n", "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n", "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n", "Eternal Sunshine of the Spotless Mind" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n", "Shutter Island [u'Drama', u'Thriller'] 138 mins. 8.0\n", "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n", "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n", "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n", "300" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 
7.7\n" ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Using BeautifulSoup" ] }, { "cell_type": "code", "collapsed": false, "input": [ "bs = BeautifulSoup(r.text)\n", "for movie in bs.findAll('td', 'title'):\n", " title = movie.find('a').contents[0]\n", " genres = movie.find('span', 'genre').findAll('a')\n", " genres = [g.contents[0] for g in genres]\n", " runtime = movie.find('span', 'runtime').contents[0]\n", " rating = movie.find('span', 'value').contents[0]\n", " print title, genres, runtime, rating\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n", "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n", "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n", "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n", "Fight Club [u'Drama'] 139 mins. 8.9\n", "The Lord of the Rings: The Fellowship of the Ring [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n", "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n", "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n", "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n", "Forrest Gump" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Drama', u'Romance'] 142 mins. 8.7\n", "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n", "The Lord of the Rings: The Two Towers [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n", "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n", "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n", "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n", "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n", "Star Wars [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 
8.8\n", "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n", "Memento" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Mystery', u'Thriller'] 113 mins. 8.6\n", "American Beauty [u'Drama'] 122 mins. 8.5\n", "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n", "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n", "The Departed [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n", "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n", "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n", "Star Wars: Episode V - The Empire Strikes Back [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n", "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n", "V for Vendetta" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n", "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n", "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n", "American History X [u'Crime', u'Drama'] 119 mins. 8.6\n", "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n", "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n", "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n", "Terminator 2: Judgment Day [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n", "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n", "Kill Bill: Vol. 1" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Action', u'Crime'] 111 mins. 8.2\n", "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n", "Sin City [u'Crime', u'Thriller'] 124 mins. 8.2\n", "Léon: The Professional [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n", "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n", "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 
8.8\n", "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n", "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n", "Eternal Sunshine of the Spotless Mind [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n", "Shutter Island" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " [u'Drama', u'Thriller'] 138 mins. 8.0\n", "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n", "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n", "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n", "300 [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 7.7\n" ] } ], "prompt_number": 5 } ], "metadata": {} } ] }