{ "metadata": { "name": "scraping menupages" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": "from contextlib import closing\n\ntry:\n    from urllib.request import urlopen  # Python 3\nexcept ImportError:\n    from urllib import urlopen  # Python 2\n\nLISTING_URL = \"http://www.menupages.com/restaurants/all-areas/morningside-heights/all-cuisines/\"\n\n# Download the listing page once; closing() releases the connection even if\n# read() raises, instead of leaving the response object open.\nwith closing(urlopen(LISTING_URL)) as response:\n    html_str = response.read()", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": "from bs4 import BeautifulSoup\nimport re\n\n# Parse the downloaded page and build one dict per restaurant row of the\n# search-results table: name, rating (float), price (int) and reviews.\ndocument = BeautifulSoup(html_str)\n\ntable_tag = document.find(\"table\", attrs={\"class\": \"search-results\"})\n# find() returns None when nothing matches; fail with a clear message rather\n# than an AttributeError on the find_all call below\nif table_tag is None:\n    raise ValueError(\"no table with class 'search-results' found in the downloaded page\")\n\nrestaurant_list = []\n\nfor tr_tag in table_tag.find_all('tr'):\n    name_address_td_tag = tr_tag.find('td', attrs={'class': 'name-address'})\n    # if we can't find a td tag with a name-address class, skip this tr\n    if name_address_td_tag is None:\n        continue\n    name_a_tag = name_address_td_tag.find('a')\n    rating_td_tag = tr_tag.find('td', attrs={'class': 'rating'})\n    rating_image_tag = rating_td_tag.find('img')\n    rating_image_url = rating_image_tag['src']\n    # find the \"number of stars\" in the URL for the image tag, we'll use that\n    # to determine the rating value\n    rating_val = re.findall(r'[0-9_]+', rating_image_url)[0]\n    rating_val = rating_val.replace('_', '.')\n    rating_val = float(rating_val)\n    price_td_tag = tr_tag.find('td', attrs={'class': 'price'})\n    review_td_tag = tr_tag.find('td', attrs={'class': 'reviews'})\n    # .contents is a list of things inside a tag, regardless of whether or\n    # not it's a string or another tag. (look at how the tag is organized\n    # in the source to see why this is necessary---we can't just use .string)\n    restaurant = {\n        'name': name_a_tag.contents[1],\n        'rating': rating_val,\n        # price level is the length of the cell's text (presumably a run of\n        # price symbols -- confirm against the page source)\n        'price': len(price_td_tag.string),\n        # kept as the cell's text, not converted to a number\n        'reviews': review_td_tag.string\n    }\n    restaurant_list.append(restaurant)\n\nrestaurant_list", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": "[{'name': u'Ajanta', 'price': 2, 'rating': 3.0, 'reviews': u'43'},\n {'name': u'Amigos', 'price': 3, 'rating': 3.0, 'reviews': u'5'},\n {'name': u\"Amir's Grill\", 'price': 2, 'rating': 3.0, 'reviews': u'38'},\n {'name': u'Amsterdam Restaurant',\n 'price': 4,\n 'rating': 3.5,\n 'reviews': u'31'},\n {'name': u'Artopolis Espresso Cafe',\n 'price': 2,\n 'rating': 4.5,\n 'reviews': u'4'},\n {'name': u'Bettolona', 'price': 3, 'rating': 4.0, 'reviews': u'19'},\n {'name': u'Bier International', 'price': 2, 'rating': 4.0, 'reviews': u'5'},\n {'name': u\"Brad's\", 'price': 1, 'rating': 4.0, 'reviews': u'2'},\n {'name': u'Cafe Amrita', 'price': 2, 'rating': 3.5, 'reviews': u'34'},\n {'name': u'Cafe Nana', 'price': 1, 'rating': 4.5, 'reviews': u'8'},\n {'name': u\"Camille's\", 'price': 2, 'rating': 4.0, 'reviews': u'18'},\n {'name': u'Chapati House', 'price': 2, 'rating': 3.0, 'reviews': u'1'},\n {'name': u\"Che' Bella Pizza\", 'price': 2, 'rating': 4.0, 'reviews': u'11'},\n {'name': u'China Place', 'price': 2, 'rating': 3.5, 'reviews': u'7'},\n {'name': u'Chipotle', 'price': 2, 'rating': 4.0, 'reviews': u'2'},\n {'name': u'Chokolat Patisserie', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Chokolat Patisserie', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Columbia Cottage (CLOSED)',\n 'price': 2,\n 'rating': 3.0,\n 'reviews': u'37'},\n {'name': u'Community Food & Juice',\n 'price': 4,\n 'rating': 3.5,\n 'reviews': u'52'},\n {'name': u'Deluxe', 'price': 2, 'rating': 3.0, 'reviews': u'70'},\n {'name': u'Dig Inn 
Seasonal Market',\n 'price': 2,\n 'rating': 0.0,\n 'reviews': u'0'},\n {'name': u'Dinosaur Bar-B-Que', 'price': 3, 'rating': 4.0, 'reviews': u'116'},\n {'name': u'Domain NYC', 'price': 5, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'El Paso Truck', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'El Porton', 'price': 3, 'rating': 4.0, 'reviews': u'10'},\n {'name': u'Falafel on Broadway', 'price': 2, 'rating': 5.0, 'reviews': u'1'},\n {'name': u'Famous Famiglia', 'price': 2, 'rating': 4.0, 'reviews': u'18'},\n {'name': u'Five Guys', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Flat Top', 'price': 3, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Haagen-Dazs', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u\"Haakon's Hall\", 'price': 3, 'rating': 4.0, 'reviews': u'21'},\n {'name': u'Hamilton Deli', 'price': 1, 'rating': 3.5, 'reviews': u'18'},\n {'name': u'Havana Central at The West End (CLOSED)',\n 'price': 3,\n 'rating': 3.0,\n 'reviews': u'48'},\n {'name': u'Henan Cart', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Insomnia Cookies', 'price': 1, 'rating': 3.0, 'reviews': u'1'},\n {'name': u'Joe the Art of Coffee',\n 'price': 1,\n 'rating': 4.5,\n 'reviews': u'1'},\n {'name': u\"Joe's G-H Deli\", 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Kissaten Jin', 'price': 2, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Kitchenette', 'price': 3, 'rating': 3.0, 'reviews': u'46'},\n {'name': u'Koronet Pizza', 'price': 1, 'rating': 4.0, 'reviews': u'5'},\n {'name': u'Kuro Kuma', 'price': 1, 'rating': 4.5, 'reviews': u'2'},\n {'name': u'Le Monde', 'price': 4, 'rating': 3.0, 'reviews': u'39'},\n {'name': u'Levain Bakery', 'price': 1, 'rating': 4.5, 'reviews': u'2'},\n {'name': u'M2M - Morning To Midnight',\n 'price': 1,\n 'rating': 2.5,\n 'reviews': u'8'},\n {'name': u\"Mama's Fried Chicken and Pizza\",\n 'price': 1,\n 'rating': 0.0,\n 'reviews': u'0'},\n {'name': u'Maoz Vegetarian', 'price': 2, 'rating': 4.5, 'reviews': 
u'5'},\n {'name': u'Massawa', 'price': 3, 'rating': 3.5, 'reviews': u'33'},\n {'name': u'Max Caffe', 'price': 2, 'rating': 3.5, 'reviews': u'11'},\n {'name': u'Max Soha', 'price': 3, 'rating': 3.5, 'reviews': u'17'},\n {'name': u\"Mel's Burger Bar\", 'price': 3, 'rating': 3.0, 'reviews': u'16'},\n {'name': u\"Melba's\", 'price': 3, 'rating': 3.5, 'reviews': u'35'},\n {'name': u'Milano Market', 'price': 2, 'rating': 4.0, 'reviews': u'23'},\n {'name': u'New Aroma', 'price': 2, 'rating': 3.5, 'reviews': u'3'},\n {'name': u'Nikko', 'price': 2, 'rating': 4.0, 'reviews': u'14'},\n {'name': u'Nussbaum & Wu', 'price': 1, 'rating': 3.0, 'reviews': u'36'},\n {'name': u\"Ollie's\", 'price': 2, 'rating': 2.5, 'reviews': u'97'},\n {'name': u'Orange Peel (CLOSED)', 'price': 2, 'rating': 4.0, 'reviews': u'2'},\n {'name': u\"Oren's\", 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Panino Sportivo Roma',\n 'price': 2,\n 'rating': 3.5,\n 'reviews': u'19'},\n {'name': u'Peking Garden', 'price': 1, 'rating': 3.5, 'reviews': u'14'},\n {'name': u'Pinkberry', 'price': 1, 'rating': 4.0, 'reviews': u'3'},\n {'name': u'Pisticci', 'price': 3, 'rating': 4.0, 'reviews': u'34'},\n {'name': u'Pita Grill', 'price': 3, 'rating': 5.0, 'reviews': u'1'},\n {'name': u'Serengeti Teas & Spices',\n 'price': 1,\n 'rating': 0.0,\n 'reviews': u'0'},\n {'name': u'Sezz Medi (CLOSED)', 'price': 3, 'rating': 3.5, 'reviews': u'34'},\n {'name': u'Silvana', 'price': 2, 'rating': 5.0, 'reviews': u'1'},\n {'name': u'Strokos Pizza', 'price': 2, 'rating': 3.0, 'reviews': u'16'},\n {'name': u'Subsconscious', 'price': 2, 'rating': 3.0, 'reviews': u'38'},\n {'name': u'Sushi Sushi', 'price': 2, 'rating': 4.5, 'reviews': u'3'},\n {'name': u'Symposium Greek Restaurant',\n 'price': 3,\n 'rating': 3.5,\n 'reviews': u'19'},\n {'name': u'The Heights Bar & Grill',\n 'price': 3,\n 'rating': 4.0,\n 'reviews': u'26'},\n {'name': u'The Mill', 'price': 2, 'rating': 3.5, 'reviews': u'52'},\n {'name': u'Toast', 'price': 2, 
'rating': 4.0, 'reviews': u'26'},\n {'name': u'Toast Chicken', 'price': 2, 'rating': 3.0, 'reviews': u'4'},\n {'name': u\"Tom's Delicious Pizza\",\n 'price': 2,\n 'rating': 3.5,\n 'reviews': u'12'},\n {'name': u\"Tom's Restaurant\", 'price': 2, 'rating': 3.0, 'reviews': u'44'},\n {'name': u'Uncle Luoyang', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'V & T Pizza', 'price': 3, 'rating': 3.5, 'reviews': u'68'},\n {'name': u'Vareli', 'price': 5, 'rating': 3.5, 'reviews': u'16'},\n {'name': u'Vegenation', 'price': 1, 'rating': 0.0, 'reviews': u'0'},\n {'name': u'Vinateria', 'price': 3, 'rating': 3.5, 'reviews': u'1'},\n {'name': u'Vine', 'price': 2, 'rating': 3.0, 'reviews': u'26'},\n {'name': u'West Place', 'price': 2, 'rating': 3.5, 'reviews': u'13'}]" } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": "import pandas as pd\n\n# one row per scraped restaurant dict\ndf = pd.DataFrame(restaurant_list)\n\n# keep the restaurants at price level 1 that are rated 4.0 or higher\nis_cheap = df['price'] == 1\nis_well_rated = df['rating'] >= 4.0\ndf[is_cheap & is_well_rated]", "language": "python", "metadata": {}, "outputs": [ { "html": "
\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
namepriceratingreviews
7 Brad's 1 4.0 2
9 Cafe Nana 1 4.5 8
35 Joe the Art of Coffee 1 4.5 1
39 Koronet Pizza 1 4.0 5
40 Kuro Kuma 1 4.5 2
42 Levain Bakery 1 4.5 2
60 Pinkberry 1 4.0 3
\n

7 rows \u00d7 4 columns

\n
", "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": " name price rating reviews\n7 Brad's 1 4.0 2\n9 Cafe Nana 1 4.5 8\n35 Joe the Art of Coffee 1 4.5 1\n39 Koronet Pizza 1 4.0 5\n40 Kuro Kuma 1 4.5 2\n42 Levain Bakery 1 4.5 2\n60 Pinkberry 1 4.0 3\n\n[7 rows x 4 columns]" } ], "prompt_number": 3 } ], "metadata": {} } ] }