{ "metadata": { "name": "", "signature": "sha256:a4a69223c03f18a962e744ea22b3a05af0adb40b25b79db697844a290bba748e" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Web Scraping\n", "It is just a way of getting data from web.\n", "\n", "There are following ways how we can collect data:\n", "* Direct Download\n", "* Using API\n", "* Gathering from web pages\n", "\n", "The third one above is web scraping. It simply means to writing the scripts(may be python scripts) to grab the data from web pages. It is more fun than direct download.\n", "\n", "Many different Python libraries are available for web Scraping. I used *[pattern](https://pypi.python.org/pypi/Pattern)* and [requests](http://docs.python-requests.org/en/latest/) here. BeautifilSoup, [Scrapy](http://scrapy.org/) are other libraries for same purpose." ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Web Scraping for finding list of all food trucks and their details" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# requests is for downloading data(text) from the web.\n", "import requests\n", "\n", "# pattern and beautifulsoup are for navigating through DOM\n", "from pattern import web\n", "\n", "import csv\n", "\n", "# function 3\n", "def get_food_truck_link(url):\n", " '''for a given food truck link inside the main_url find the real\n", " link of food truck and return'''\n", " \n", "def get_city_url(url):\n", "\t'''To fetch and return the dictonary of all cities and \n", "\ttheir id present on main page.\n", " Step 1'''\n", " # making get request to the given url\n", "\tr = requests.get(url)\n", "\thtml = r.text\n", "\t'''Since sub-footer of the main page contain list of all countries and their link, \n", " parsing is being done only on sub-footer of main page.'''\n", "\tsubfooter_index = html.find('id=\"subfooter\"')\n", "\tfooter = html[subfooter_index : html[subfooter_index:].find('')+subfooter_index]\n", "\tdom = web.Element(footer)\n", "\tcity_link = {}\n", "\tfor li in dom.by_tag('li'):\n", "\t\tcity = li.by_tag('a')[0].content\n", "\t\tinner_link = li.by_tag('a')[0].attributes.get('href', '')\n", "\t\tcity_link[city] = inner_link\n", "\treturn city_link\n", "\t\n", "\n", "def get_food_trucks(url):\n", "\t'''For a given city url find all food trucks present on the \n", "\tpage of that city.\n", " Step 2'''\n", "\tr = requests.get(url)\n", "\tdom = web.Element(r.text)\n", " truck_name_link = {}\n", " for li in dom.by_tag('li.squarelisting'):\n", " \t truck_name = li.by_tag('a')[0].attributes.get('href',)[1:]\n", " # step 3 for finding links for each truck is being called from here.\n", " \t truck_link = get_food_truck_link(main_url+truck_name)\n", " truck_name_link[truck_name] = truck_link\n", " return truck_name_link\n", " \n", "def get_food_truck_link(url):\n", " '''for a given food truck link inside the main_url find the real\n", " link of food truck and return.\n", " Step 3'''\n", " # print('inside get_food_truck_LINK', url)\n", " r = requests.get(url)\n", " dom = web.Element(r.text)\n", " #truck_link = dom.by_tag('a.drawbutton menu_link')[0].attributes.get('href',) \n", " truck_link = ''\n", " for div in dom.by_tag('div.widget cf'):\n", " truck_link = div.by_tag('a.drawbutton')[0].attributes.get('href',) \n", " return truck_link \n", "\t\n", " \n", "main_url = 'http://roaminghunger.com/' \n", "\n", "if __name__ == \"__main__\":\n", " # step 1 is to get a dict of all cities and their link address in roaminhunger.com\n", " city_link = get_city_url(main_url)\n", " # print city_link\n", " \n", " # step 2 - Now for each of these cities find food-truck\n", " city_food_trucks = {}\n", " for city in city_link:\n", "\tfood_trucks = get_food_trucks(main_url+city_link[city])\n", "\tcity_food_trucks[city] = food_trucks\n", " # print(city_food_trucks)\n", " \n", " # step - 4 Write whole data in a csv file\n", " with open('data_food_truck.csv', 'w') as csvfile:\n", " \tfilewriter = csv.writer(csvfile, delimiter='\\t',\n", " quotechar='|', quoting=csv.QUOTE_MINIMAL)\n", " \tfilewriter.writerow(['Food Truck', 'City', 'link'])\n", " \tfor city in city_food_trucks:\n", " \t for food_truck, link in city_food_trucks[city].items():\n", " \t\tfilewriter.writerow([food_truck, city, link])\n", "\tprint('Done!!')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }