{ "metadata": { "name": "", "signature": "sha256:6838a5bece2cc08348dec4d9651bafa8e181545ce2a843dd993b70f8459c64fa" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scraping a HTML with Beauitful Soup\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:** -" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Import required modules\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 105 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a dataframe. We will scrape iPython's HTML table output" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Create a values as dictionary of lists\n", "raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], \n", " 'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], \n", " 'age': [42, 52, 36, 24, 73], \n", " 'preTestScore': [4, 24, 31, 2, 3],\n", " 'postTestScore': [25, 94, 57, 62, 70]}\n", "\n", "# Create a dataframe\n", "raw_df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])\n", "\n", "# View a dataframe\n", "raw_df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagepreTestScorepostTestScore
0 Jason Miller 42 4 25
1 Molly Jacobson 52 24 94
2 Tina Ali 36 31 57
3 Jake Milner 24 2 62
4 Amy Cooze 73 3 70
\n", "

5 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 106, "text": [ " first_name last_name age preTestScore postTestScore\n", "0 Jason Miller 42 4 25\n", "1 Molly Jacobson 52 24 94\n", "2 Tina Ali 36 31 57\n", "3 Jake Milner 24 2 62\n", "4 Amy Cooze 73 3 70\n", "\n", "[5 rows x 5 columns]" ] } ], "prompt_number": 106 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download the HTML and create a Beautiful Soup object" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Create a variable with the URL to this tutorial\n", "url = 'http://nbviewer.ipython.org/github/chrisalbon/code_py/blob/master/beautiful_soup_scrape_table.ipynb'\n", "\n", "# Scrape the HTML at the url\n", "r = requests.get(url)\n", "\n", "# Turn the HTML into a Beautiful Soup object\n", "soup = BeautifulSoup(r.text)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 107 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parse the Beautiful Soup object" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Create four variables to score the scraped data in\n", "first_name = []\n", "last_name = []\n", "age = []\n", "preTestScore = []\n", "postTestScore = []\n", "\n", "# Create an object of the first object that is class=dataframe\n", "table = soup.find(class_='dataframe')\n", "\n", "# Find all the tag pairs, skip the first one, then for each.\n", "for row in table.find_all('tr')[1:]:\n", " # Create a variable of all the tag pairs in each tag pair,\n", " col = row.find_all('td')\n", " \n", " # Create a variable of the string inside 1st tag pair,\n", " column_1 = col[0].string.strip()\n", " # and append it to first_name variable\n", " first_name.append(column_1)\n", " \n", " # Create a variable of the string inside 2nd tag pair,\n", " column_2 = col[1].string.strip()\n", " # and append it to last_name variable\n", " last_name.append(column_2)\n", " \n", " # Create a variable of the string inside 3rd tag pair,\n", " column_3 = 
col[2].string.strip()\n", " # and append it to the age variable\n", " age.append(column_3)\n", " \n", " # Create a variable of the string inside the 4th <td> tag pair,\n", " column_4 = col[3].string.strip()\n", " # and append it to the preTestScore variable\n", " preTestScore.append(column_4)\n", " \n", " # Create a variable of the string inside the 5th <td> tag pair,\n", " column_5 = col[4].string.strip()\n", " # and append it to the postTestScore variable\n", " postTestScore.append(column_5)\n", "\n", "# Create a dictionary mapping each column name to its list of scraped values\n", "columns = {'first_name': first_name, 'last_name': last_name, 'age': age, 'preTestScore': preTestScore, 'postTestScore': postTestScore}\n", "\n", "# Create a dataframe from the columns variable\n", "df = pd.DataFrame(columns)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 108 }, { "cell_type": "code", "collapsed": false, "input": [ "# View the dataframe\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agefirst_namelast_namepostTestScorepreTestScore
0 42 Jason Miller 25 4
1 52 Molly Jacobson 94 24
2 36 Tina Ali 57 31
3 24 Jake Milner 62 2
4 73 Amy Cooze 70 3
\n", "

5 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 109, "text": [ " age first_name last_name postTestScore preTestScore\n", "0 42 Jason Miller 25 4\n", "1 52 Molly Jacobson 94 24\n", "2 36 Tina Ali 57 31\n", "3 24 Jake Milner 62 2\n", "4 73 Amy Cooze 70 3\n", "\n", "[5 rows x 5 columns]" ] } ], "prompt_number": 109 } ], "metadata": {} } ] }