{ "metadata": { "name": "", "signature": "sha256:78c99bd2617277ff59f18e1c301cc5a4df915edb097df1e1f39bebd4c8cc2095" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "## Data Wrangling with MongoDB - Exercises" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q1.1" ] }, { "cell_type": "code", "collapsed": false, "input": [ "\n", "\n", "\"\"\"\n", "Your task is to process the supplied file and use the csv module to extract data from it.\n", "The data comes from NREL (National Renewable Energy Laboratory) website. Each file\n", "contains information from one meteorological station, in particular - about amount of\n", "solar and wind energy for each hour of day.\n", "\n", "Note that the first line of the datafile is neither data entry, nor header. It is a line\n", "describing the data source. You should extract the name of the station from it.\n", "\n", "The data should be returned as a list of lists (not dictionaries).\n", "You can use the csv modules \"reader\" method to get data in such format.\n", "Another useful method is next() - to get the next line from the iterator.\n", "You should only change the parse_file function.\n", "\"\"\"\n", "import csv\n", "import os\n", "\n", "DATADIR = \"\"\n", "DATAFILE = \"data/745090.csv\"\n", "\n", "\n", "def parse_file(datafile):\n", " name = \"\"\n", " data = []\n", " with open(datafile,'rb') as f:\n", " csvFile = csv.reader(f, delimiter=\",\", quotechar='\"')\n", " descriptionRow = csvFile.next() # Description Row that contains the name of the city for this csv file.\n", " name = descriptionRow[1] # Pull out name from description Row\n", " csvFile.next() # Skip over Header Row\n", " for row in csvFile: # Read the rest of the csv data sheet into a list, with each row as a separate list.\n", " data.append(row)\n", " return (name, data)\n", "\n", "\n", "def test():\n", " datafile = os.path.join(DATADIR, DATAFILE)\n", " name, data = parse_file(datafile)\n", " \n", " \n", " assert name == \"MOUNTAIN VIEW MOFFETT FLD NAS\"\n", " assert data[0][1] == \"01:00\"\n", " assert data[2][0] == \"01/01/2005\"\n", " assert data[2][5] == \"2\"\n", " print \"Passed.\"\n", " \n", "\n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q 1.2" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Find the time and value of max load for each of the regions\n", "# COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST\n", "# and write the result out in a csv file, using pipe character | as the delimiter.\n", "# An example output can be seen in the \"example.csv\" file.\n", "import xlrd\n", "#import os\n", "#import csv\n", "from zipfile import ZipFile\n", "datafile = \"data/2013_ERCOT_Hourly_Load_Data.xls\"\n", "outfile = \"data/2013_Max_Loads.csv\"\n", "\n", "\n", "def open_zip(datafile):\n", " with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:\n", " myzip.extractall(path=\"data\")\n", "\n", "\n", "def parse_file(datafile):\n", " workbook = xlrd.open_workbook(datafile)\n", " sheet = workbook.sheet_by_index(0)\n", " data = []\n", " headers = sheet.row_values(0, start_colx=0, end_colx=None)\n", "\n", " writer_header = [\"Station\", \"Year\", \"Month\", \"Day\", \"Hour\", \"Max Load\"]\n", " data.append(writer_header)\n", "\n", "\n", " for i in range(1, 
len(headers)):\n", " temp_sheet_column = sheet.col_values(i, start_rowx=1, end_rowx=None)\n", " temp_max = max(temp_sheet_column)\n", " temp_max_index = temp_sheet_column.index(temp_max) + 1\n", " raw_max_date = sheet.cell_value(temp_max_index, 0)\n", " max_date_values = xlrd.xldate_as_tuple(raw_max_date, 0)\n", "\n", "\n", " d = [headers[i], max_date_values[0], max_date_values[1], max_date_values[2], max_date_values[3], temp_max]\n", "\n", " data.append(d)\n", "\n", " return data\n", "\n", "def save_file(data, filename):\n", " with open(filename, 'wb') as f:\n", " writer = csv.writer(f, delimiter='|')\n", " writer.writerows(data)\n", "\n", " \n", "def test():\n", " open_zip(datafile)\n", " data = parse_file(datafile)\n", " save_file(data, outfile)\n", "\n", " ans = {'FAR_WEST': {'Max Load': \"2281.2722140000024\", 'Year': \"2013\", \"Month\": \"6\", \"Day\": \"26\", \"Hour\": \"17\"}}\n", " \n", " fields = [\"Year\", \"Month\", \"Day\", \"Hour\", \"Max Load\"]\n", " with open(outfile) as of:\n", " csvfile = csv.DictReader(of, delimiter=\"|\")\n", " for line in csvfile:\n", " s = line[\"Station\"]\n", " if s == 'FAR_WEST':\n", " for field in fields:\n", " assert ans[s][field] == line[field]\n", " print \"Passed.\"\n", "\n", " \n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q 1.3" ] }, { "cell_type": "code", "collapsed": false, "input": [ "\"\"\"\n", "This exercise shows some important concepts that you should be aware about:\n", "- using codecs module to write unicode files\n", "- using authentication with web APIs\n", "- using offset when accessing web APIs\n", "\n", "To run this code locally you have to register at the NYTimes developer site \n", "and get your own API key. 
You will be able to complete this exercise in our UI without doing so,\n", "as we have provided a sample result.\n", "\n", "Your task is to process the saved file that represents the most popular (by view count)\n", "articles in the last day, and return the following data:\n", "- list of dictionaries, where the dictionary key is \"section\" and value is \"title\"\n", "- list of URLs for all media entries with \"format\": \"Standard Thumbnail\"\n", "\n", "All your changes should be in the article_overview function.\n", "The rest of functions are provided for your convenience, if you want to access the API by yourself.\n", "\"\"\"\n", "import json\n", "import codecs\n", "import requests\n", "\n", "URL_MAIN = \"http://api.nytimes.com/svc/\"\n", "URL_POPULAR = URL_MAIN + \"mostpopular/v2/\"\n", "API_KEY = { \"popular\": \"\",\n", " \"article\": \"\"}\n", "\n", "\n", "def get_from_file(kind, period):\n", " filename = \"data/popular-{0}-{1}.json\".format(kind, period)\n", " with open(filename, \"r\") as f:\n", " return json.loads(f.read())\n", "\n", "\n", "def article_overview(kind, period):\n", " data = get_from_file(kind, period)\n", " urls = []\n", " titles = []\n", " for asset_id in data:\n", " try:\n", " # Add article section:title as a dictionary to the list of the titles.\n", " titles.append({asset_id[\"section\"]: asset_id[\"title\"]})\n", " # Json python objects are nested dictionaries.\n", " # Each article entry has 0+ media record entries.\n", " for media_record in asset_id[\"media\"]:\n", " # Each media record has 0+ metadata entries for the media record\n", " for meta_data_record in media_record[\"media-metadata\"]:\n", " # If the format is what we are looking for (Standard Thumbnail)..\n", " if meta_data_record[\"format\"] == \"Standard Thumbnail\":\n", " # We grab the url for that format that passed the check.\n", " urls.append(meta_data_record[\"url\"])\n", " except IndexError as err:\n", " print \"ERROR:\", err\n", " return titles, urls\n", "\n", "\n", "def query_site(url, target, offset):\n", " # This will set up the query with the API key and offset\n", " # Web services often use offset paramter to return data in small chunks\n", " # NYTimes returns 20 articles per request, if you want the next 20\n", " # You have to provide the offset parameter\n", " if API_KEY[\"popular\"] == \"\" or API_KEY[\"article\"] == \"\":\n", " print \"You need to register for NYTimes Developer account to run this program.\"\n", " print \"See Intructor notes for information\"\n", " return False\n", " params = {\"api-key\": API_KEY[target], \"offset\": offset}\n", " r = requests.get(url, params = params)\n", "\n", " if r.status_code == requests.codes.ok:\n", " return r.json()\n", " else:\n", " r.raise_for_status()\n", "\n", "\n", "def get_popular(url, kind, days, section=\"all-sections\", offset=0):\n", " # This function will construct the query according to the requirements of the site\n", " # and return the data, or print an error message if called incorrectly\n", " if days not in [1,7,30]:\n", " print \"Time period can be 1,7, 30 days only\"\n", " return False\n", " if kind not in [\"viewed\", \"shared\", \"emailed\"]:\n", " print \"kind can be only one of viewed/shared/emailed\"\n", " return False\n", "\n", " url = URL_POPULAR + \"most{0}/{1}/{2}.json\".format(kind, section, days)\n", " data = query_site(url, \"popular\", offset)\n", "\n", " return data\n", "\n", "\n", "def save_file(kind, period):\n", " # This will process all results, by calling the API repeatedly with supplied offset value,\n", 
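" # (the NYTimes API returns 20 articles per request, so the offset below advances in steps of 20),\n",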
" # combine the data and then write all results in a file.\n", " data = get_popular(URL_POPULAR, \"viewed\", 1)\n", " num_results = data[\"num_results\"]\n", " full_data = []\n", " with codecs.open(\"popular-{0}-{1}-full.json\".format(kind, period), encoding='utf-8', mode='w') as v:\n", " for offset in range(0, num_results, 20): \n", " data = get_popular(URL_POPULAR, kind, period, offset=offset)\n", " full_data += data[\"results\"]\n", " \n", " v.write(json.dumps(full_data, indent=2))\n", "\n", "\n", "def test():\n", " titles, urls = article_overview(\"viewed\", 1)\n", " assert len(titles) == 20\n", " assert len(urls) == 30\n", " assert titles[2] == {'Opinion': 'Professors, We Need You!'}\n", " assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'\n", " print \"Passed.\"\n", " \n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 13 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lesson 2 Exercise" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# Your task here is to extract data from xml on authors of an article\n", "# and add it to a list, one item for an author.\n", "# See the provided data structure for the expected format.\n", "# The tags for first name, surname and email should map directly\n", "# to the dictionary keys\n", "import xml.etree.ElementTree as ET\n", "\n", "article_file = \"data/exampleResearchArticle.xml\"\n", "\n", "\n", "def get_root(fname):\n", " tree = ET.parse(fname)\n", " return tree.getroot()\n", "\n", "\n", "def get_authors(root):\n", " #authors = []\n", " #for author in root.findall('./fm/bibl/aug/au'):\n", " #\n", " # # Find the appropriate object tag listed under authors.\n", " # # Call that objects' .text function to extract the text value.\n", " # data = {\n", " # \"fnm\": author.find(\"fnm\").text,\n", " # \"snm\": author.find(\"snm\").text,\n", " # \"email\": author.find(\"email\").text\n", " # }\n", " #\n", " #\n", " # authors.append(data)\n", " authors = [{\"fnm\": author.find(\"./fnm\").text, \n", " \"snm\": author.find(\"./snm\").text,\n", " \"email\": author.find(\"./email\").text} for author in root.findall('./fm/bibl/aug/au')] \n", "\n", " return authors\n", "\n", "\n", "def test():\n", " solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}, {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}, {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}, {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}, {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}, {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}, {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}, {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]\n", " \n", " root = get_root(article_file)\n", " data = get_authors(root)\n", "\n", " assert data[0] == solution[0]\n", " assert data[1][\"fnm\"] == solution[1][\"fnm\"]\n", " print \"Passed.\"\n", "\n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# Your task here is to extract data from xml on authors of an article\n", "# and add it to a list, one item for an author.\n", "# See the provided data structure for the 
expected format.\n", "# The tags for first name, surname and email should map directly\n", "# to the dictionary keys, but you have to extract the attributes from the \"insr\" tag\n", "# and add them to the list for the dictionary key \"insr\"\n", "import xml.etree.ElementTree as ET\n", "\n", "article_file = \"data/exampleResearchArticle.xml\"\n", "\n", "\n", "def get_root(fname):\n", " tree = ET.parse(fname)\n", " return tree.getroot()\n", "\n", "\n", "\n", "def get_authors(root):\n", " authors = []\n", " for author in root.findall('./fm/bibl/aug/au'):\n", "\n", " # Find the appropriate object tag listed under authors.\n", " # Call that objects' .text function to extract the text value.\n", "\n", " all_iid = [insr.get('iid') for insr in author.findall('./insr')]\n", "\n", " data = {\n", " \"fnm\": author.find(\"./fnm\").text,\n", " \"snm\": author.find(\"./snm\").text,\n", " \"email\": author.find(\"./email\").text,\n", " \"insr\": all_iid\n", " }\n", "\n", "\n", " authors.append(data)\n", "\n", " return authors\n", "\n", "\n", "def test():\n", " solution = [{'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},\n", " {'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},\n", " {'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},\n", " {'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},\n", " {'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},\n", " {'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},\n", " {'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},\n", " {'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]\n", "\n", " root = get_root(article_file)\n", " data = get_authors(root)\n", "\n", " assert data[0] == solution[0]\n", " assert data[1][\"insr\"] == solution[1][\"insr\"]\n", " print \"Passed.\"\n", "\n", "\n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 23 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using Beautiful Soup" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "# Please note that the function 'make_request' is provided for your reference only.\n", "# You will not be able to to actually use it from within the Udacity web UI.\n", "# Your task is to process the HTML using BeautifulSoup, extract the hidden\n", "# form field values for \"__EVENTVALIDATION\" and \"__VIEWSTATE\" and set the approprate\n", "# values in the data dictionary.\n", "# All your changes should be in the 'extract_data' function\n", "from bs4 import BeautifulSoup\n", "import requests\n", "import json\n", "\n", "html_page = \"data/page_source.html\"\n", "\n", "\n", "def extract_data(page):\n", " data = {\"eventvalidation\": \"\",\n", " \"viewstate\": \"\"}\n", " with open(page, \"r\") as html:\n", " bs = BeautifulSoup(html)\n", " event_list = bs.find(id='__EVENTVALIDATION')\n", " event_view = bs.find(id='__VIEWSTATE')\n", " data[\"eventvalidation\"] = event_list['value']\n", " data[\"viewstate\"] = event_view['value']\n", "\n", " return data\n", "\n", "\n", "def make_request(data):\n", " eventvalidation = data[\"eventvalidation\"]\n", " viewstate = data[\"viewstate\"]\n", "\n", " r = requests.post(\"http://www.transtats.bts.gov/Data_Elements.aspx?Data=2\",\n", " 
data={'AirportList': \"BOS\",\n", " 'CarrierList': \"VX\",\n", " 'Submit': 'Submit',\n", " \"__EVENTTARGET\": \"\",\n", " \"__EVENTARGUMENT\": \"\",\n", " \"__EVENTVALIDATION\": eventvalidation,\n", " \"__VIEWSTATE\": viewstate\n", " })\n", "\n", " return r.text\n", "\n", "\n", "def test():\n", " data = extract_data(html_page)\n", " assert data[\"eventvalidation\"] != \"\"\n", " assert data[\"eventvalidation\"].startswith(\"/wEWjAkCoIj1ng0\")\n", " assert data[\"viewstate\"].startswith(\"/wEPDwUKLTI\")\n", " print \"Passed.\"\n", "\n", " \n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 27 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q 2.1" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "# Please note that the function 'make_request' is provided for your reference only.\n", "# You will not be able to to actually use it from within the Udacity web UI\n", "# All your changes should be in the 'extract_carrier' function\n", "# Also note that the html file is a stripped down version of what is actually on the website.\n", "\n", "# Your task in this exercise is to get a list of all airlines. Exclude all of the combination\n", "# values, like \"All U.S. Carriers\" from the data that you return.\n", "# You should return a list of codes for the carriers.\n", "\n", "from bs4 import BeautifulSoup\n", "html_page = \"data/options.html\"\n", "\n", "\n", "def extract_carriers(page):\n", " with open(page, \"r\") as html:\n", " soup = BeautifulSoup(html)\n", " \n", " data = [options['value'] for options in soup.find(id='CarrierList') if options.name == 'option' and options['value'][:3] != 'All']\n", " \n", "# for options in soup.find(id='CarrierList'):\n", "# try:\n", "# if options.name == \"option\" and options['value'][:3] != \"All\":\n", "# data.append(options['value'])\n", "# except TypeError as err:\n", "# print \"ERROR:\", err\n", "# pass\n", "\n", " return data\n", "\n", "\n", "def make_request(data):\n", " eventvalidation = data[\"eventvalidation\"]\n", " viewstate = data[\"viewstate\"]\n", " airport = data[\"airport\"]\n", " carrier = data[\"carrier\"]\n", "\n", " r = requests.post(\"http://www.transtats.bts.gov/Data_Elements.aspx?Data=2\",\n", " data={'AirportList': airport,\n", " 'CarrierList': carrier,\n", " 'Submit': 'Submit',\n", " \"__EVENTTARGET\": \"\",\n", " \"__EVENTARGUMENT\": \"\",\n", " \"__EVENTVALIDATION\": eventvalidation,\n", " \"__VIEWSTATE\": viewstate\n", " })\n", "\n", " return r.text\n", "\n", "\n", "def test():\n", " data = extract_carriers(html_page)\n", " assert len(data) == 16\n", " assert \"FL\" in data\n", " assert \"NK\" in data\n", " print \"Passed.\"\n", "\n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 32 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q2.2" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "# All your changes should be in the 'extract_airports' function\n", "# It should return a list of airport codes, excluding any combinations like \"All\"\n", "\n", "from bs4 import BeautifulSoup\n", "html_page = \"data/options.html\"\n", "\n", "\n", "\n", "def extract_airports(page):\n", " with open(page, \"r\") as html:\n", " soup = BeautifulSoup(html)\n", " data = [airport['value'] for 
airport in soup.find(id='AirportList') if airport.name=='option' and airport['value'][:3] != 'All']\n", "\n", " return data\n", "\n", "def test():\n", " data = extract_airports(html_page)\n", " assert len(data) == 15\n", " assert \"ATL\" in data\n", " assert \"ABR\" in data\n", " print \"Passed.\"\n", "\n", "test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 35 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q2.3" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "# Let's assume that you combined the code from the previous 2 exercises\n", "# with code from the lesson on how to build requests, and downloaded all the data locally.\n", "# The files are in a directory \"data\", named after the carrier and airport:\n", "# \"{}-{}.html\".format(carrier, airport), for example \"FL-ATL.html\".\n", "# The table with flight info has a table class=\"dataTDRight\".\n", "# There are couple of helper functions to deal with the data files.\n", "# Please do not change them for grading purposes.\n", "# All your changes should be in the 'process_file' function\n", "\n", "from bs4 import BeautifulSoup\n", "from zipfile import ZipFile\n", "import os\n", "\n", "#datadir = \"data\"\n", "\n", "\n", "#def open_zip(datadir):\n", "# with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:\n", "# myzip.extractall()\n", "\n", "\n", "#def process_all(datadir):\n", "# files = os.listdir(datadir)\n", "# return files\n", "\n", "\n", "def process_file(f):\n", " # This is example of the datastructure you should return\n", " # Each item in the list should be a dictionary containing all the relevant data\n", " # Note - year, month, and the flight data should be integers\n", " # You should skip the rows that contain the TOTAL data for a year\n", " # data = [{\"courier\": \"FL\",\n", " # \"airport\": \"ATL\",\n", " # \"year\": 2012,\n", " # \"month\": 12,\n", " # \"flights\": {\"domestic\": 100,\n", " # \"international\": 100}\n", " # },\n", " # {\"courier\": \"...\"}\n", " # ]\n", "\n", " # info = {}\n", " #\n", " #\n", " # info[\"courier\"], info[\"airport\"] = f[:6].split(\"-\")\n", "\n", " #print info\n", " \n", " data = []\n", " with open(\"{}/{}\".format(datadir, f), \"r\") as html:\n", "\n", " soup = BeautifulSoup(html)\n", "\n", " flight_table = soup.find(\"table\", {\"class\": \"dataTDRight\"})\n", "\n", " for flight_data_row in flight_table.findAll('tr'):\n", "\n", " info = {}\n", " info[\"courier\"], info[\"airport\"] = f[:6].split(\"-\")\n", "\n", " col = flight_data_row.findAll(\"td\")\n", " year = col[0].string.strip()\n", " month = col[1].string.strip()\n", " domestic = col[2].string.strip()\n", " international = col[3].string.strip()\n", " skip_total = col[4].string.strip()\n", "\n", " try:\n", "\n", " year = int(str(year))\n", " month = int(str(month))\n", " domestic = int(str(domestic).replace(\",\", \"\"))\n", " international = int(str(international).replace(\",\", \"\"))\n", "\n", " info[\"year\"] = year\n", " info[\"month\"] = month\n", " info[\"flights\"] = {\"domestic\": domestic,\n", " \"international\": international}\n", " data.append(info)\n", " except ValueError as err:\n", " #print \"Tried converting a non-int type to int:\", err\n", " pass\n", " return data\n", "\n", "# Udacity accesses many external files; not available locally. 
\n", "# Passes grader checks with lines uncommented back in.\n", "def test():\n", " print \"Running a simple test...\"\n", " #open_zip(datadir)\n", " #files = process_all(datadir)\n", " #data = []\n", " #for f in files:\n", " # data += process_file(f)\n", " data = process_file('FL-ATL.html')\n", " \n", " #assert len(data) == 399\n", " for entry in data[:3]:\n", " assert type(entry[\"year\"]) == int\n", " assert type(entry[\"flights\"][\"domestic\"]) == int\n", " assert len(entry[\"airport\"]) == 3\n", " assert len(entry[\"courier\"]) == 2\n", " assert data[-1][\"airport\"] == \"ATL\"\n", " #assert data[-1][\"flights\"] == {'international': 108289, 'domestic': 701425}\n", " \n", " print \"... success!\"\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Running a simple test...\n", "... success!\n" ] } ], "prompt_number": 43 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q2.4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "# This and the following exercise are using US Patent database.\n", "# The patent.data file is a small excerpt of a much larger datafile\n", "# that is available for download from US Patent website. They are pretty large ( >100 MB each).\n", "# The data itself is in XML, however there is a problem with how it's formatted.\n", "# Please run this script and observe the error. Then find the line that is causing the error.\n", "# You can do that by just looking at the datafile in the web UI, or programmatically.\n", "# For quiz purposes it does not matter, but as an exercise we suggest that you try to do it programmatically.\n", "# The original file is ~600MB large, you might not be able to open it in a text editor.\n", "\n", "import xml.etree.ElementTree as ET\n", "\n", "PATENTS = 'data/patent.data'\n", "\n", "def get_root(fname):\n", " tree = ET.parse(fname)\n", " return tree.getroot()\n", "\n", "get_root(PATENTS)" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "ParseError", "evalue": "junk after document element: line 657, column 0", "output_type": "pyerr", "traceback": [ "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32munknown\u001b[0m\n\u001b[1;31mParseError\u001b[0m\u001b[1;31m:\u001b[0m junk after document element: line 657, column 0\n" ] } ], "prompt_number": 51 }, { "cell_type": "code", "collapsed": false, "input": [ "import linecache\n", "\n", "print linecache.getline('data/patent.data', 657)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "\n" ] } ], "prompt_number": 53 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q2.5a Please enter the content of the line that is causing the error:" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q2.5b What do you think is the problem?" 
] }, { "cell_type": "raw", "metadata": {}, "source": [ "There are two root elements\n", "\n", "http://stackoverflow.com/questions/15837529/junk-after-document-element-line-13-column-2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q2.6" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "# So, the problem is that the gigantic file is actually not a valid XML, because\n", "# it has several root elements, and XML declarations.\n", "# It is, a matter of fact, a collection of a lot of concatenated XML documents.\n", "# So, one solution would be to split the file into separate documents,\n", "# so that you can process the resulting files as valid XML documents.\n", "\n", "import xml.etree.ElementTree as ET\n", "PATENTS = 'data/patent.data'\n", "\n", "def get_root(fname):\n", " tree = ET.parse(fname)\n", " return tree.getroot()\n", "\n", "\n", "def split_file(filename):\n", " # we want you to split the input file into separate files\n", " # each containing a single patent.\n", " # As a hint - each patent declaration starts with the same line that was causing the error\n", " # The new files should be saved with filename in the following format:\n", " # \"{}-{}\".format(filename, n) where n is a counter, starting from 0.\n", "\n", " output = []\n", " data = {}\n", "\n", " f = open(filename)\n", " count = 0\n", " file_number = 0\n", "\n", " # import pprint\n", " # pprint.pprint(f.readlines())\n", "\n", "\n", " output.append(f.readline())\n", "\n", " for line in f.readlines():\n", "\n", " if line.startswith(\" len(l2):\n", " return float(l[0])\n", " else:\n", " return float(l[1])\n", " return float(area)\n", "\n", "\n", "def process_file(filename):\n", " # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE\n", " data = []\n", "\n", " with open(filename, \"r\") as f:\n", " reader = csv.DictReader(f)\n", "\n", " #skipping the extra matadata\n", " for i in range(3):\n", " l = reader.next()\n", "\n", " # processing file\n", " for line in reader:\n", " # calling your function to fix the area value\n", " if \"areaLand\" in line:\n", " line[\"areaLand\"] = fix_area(line[\"areaLand\"])\n", " data.append(line)\n", "\n", " return data\n", "\n", "\n", "def test():\n", " data = process_file(CITIES)\n", "\n", " print \"Printing three example results:\"\n", " for n in range(5,8):\n", " pprint.pprint(data[n][\"areaLand\"])\n", " \n", "\n", " assert data[8][\"areaLand\"] == 55166700.0\n", " assert data[3][\"areaLand\"] == None\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Printing three example results:\n", "None\n", "101787000.0\n", "31597900.0\n" ] } ], "prompt_number": 72 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q3.4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.\n", "\n", "In the previous quiz you recognized that the \"name\" value can be an array (or list in Python terms).\n", "It would make it easier to process and query the data later, if all values for the name \n", "would be in a Python list, instead of being just a string separated with special characters, like now.\n", "\n", "Finish the function fix_name(). 
It will receive a string as an input, and it has to return a list\n", "of all the names. If there is only one name, the list will have only one item in it; if the name is \"NULL\",\n", "the list should be empty.\n", "The rest of the code is just an example of how this function can be used.\n", "\"\"\"\n", "import codecs\n", "import csv\n", "import pprint\n", "\n", "CITIES = 'data/cities.csv'\n", "\n", "\n", "def fix_name(name):\n", "\n", " if name == \"NULL\":\n", " return []\n", " if name.startswith(\"{\"):\n", " return name.replace(\"{\", \"\").replace(\"}\", \"\").strip().split(\"|\")\n", " else:\n", " return [name]\n", "\n", "\n", "def process_file(filename):\n", " data = []\n", " with open(filename, \"r\") as f:\n", " reader = csv.DictReader(f)\n", " #skipping the extra metadata\n", " for i in range(3):\n", " l = reader.next()\n", " # processing file\n", " for line in reader:\n", " # calling your function to fix the area value\n", " if \"name\" in line:\n", " line[\"name\"] = fix_name(line[\"name\"])\n", " data.append(line)\n", " return data\n", "\n", "\n", "def test():\n", " data = process_file(CITIES)\n", "\n", " print \"Printing 20 results:\"\n", " for n in range(20):\n", " pprint.pprint(data[n][\"name\"])\n", "\n", " assert data[14][\"name\"] == ['Negtemiut', 'Nightmute']\n", " assert data[3][\"name\"] == ['Kumhari']\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Printing 20 results:\n", "['Kud']\n", "['Kuju']\n", "['Kumbhraj']\n", "['Kumhari']\n", "['Kunigal']\n", "['Kurgunta']\n", "['Athens']\n", "['Demopolis']\n", "['Chelsea Alabama']\n", "['Pell City Alabama']\n", "['City of Northport']\n", "['Sand Point']\n", "['Unalaska Alaska']\n", "['City of Menlo Park']\n", "['Negtemiut', 'Nightmute']\n", "['Fairbanks Alaska']\n", "['Homer']\n", "['Ketchikan Alaska']\n", "['Nuniaq', 'Old Harbor']\n", "['Rainier Washington']\n" ] } ], "prompt_number": 79 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q3.6" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.\n", "\n", "If you look at the full city data, you will notice that there are a couple of values that seem to provide\n", "the same information in different formats: \"point\" seems to be the combination of \"wgs84_pos#lat\" and \"wgs84_pos#long\".\n", "However, we do not know if that is the case and should check if they are equivalent.\n", "\n", "Finish the function check_loc(). It will receive 3 strings: the first will be the combined value of \"point\" and then the\n", "\"wgs84_pos#\" values separately. You have to extract the lat and long values from the \"point\" and compare\n", "to the \"wgs84_pos#\" values and return True or False.\n", "\n", "Note that you do not have to fix the values, just determine if they are consistent. To fix them in this case\n", "you would need more information. 
Feel free to discuss possible strategies for fixing this on the discussion forum.\n", "\n", "The rest of the code is just an example of how this function can be used.\n", "Changes to the \"process_file\" function will not be taken into account.\n", "\"\"\"\n", "import csv\n", "import pprint\n", "\n", "CITIES = 'data/cities.csv'\n", "\n", "\n", "def check_loc(point, lat, longi):\n", "\n", " check1, check2 = point.split(\" \")\n", " if check1 == lat and check2 == longi:\n", " return True\n", " else:\n", " return False\n", "\n", "\n", "def process_file(filename):\n", " data = []\n", " with open(filename, \"r\") as f:\n", " reader = csv.DictReader(f)\n", " #skipping the extra metadata\n", " for i in range(3):\n", " l = reader.next()\n", " # processing file\n", " for line in reader:\n", " # calling your function to check the location\n", " result = check_loc(line[\"point\"], line[\"wgs84_pos#lat\"], line[\"wgs84_pos#long\"])\n", " if not result:\n", " print \"{}: {} != {} {}\".format(line[\"name\"], line[\"point\"], line[\"wgs84_pos#lat\"], line[\"wgs84_pos#long\"])\n", " data.append(line)\n", "\n", " return data\n", "\n", "\n", "def test():\n", " assert check_loc(\"33.08 75.28\", \"33.08\", \"75.28\") == True\n", " assert check_loc(\"44.57833333333333 -91.21833333333333\", \"44.5783\", \"-91.2183\") == False\n", " print \"Passed.\"\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 80 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Kicking the tires on MongoDB" ] }, { "cell_type": "code", "collapsed": false, "input": [ "\"\"\"\n", "Your task is to successfully run the exercise to see how pymongo works\n", "and how easy it is to start using it.\n", "You don't actually have to change anything in this exercise,\n", "but you can change the city name in the add_city function if you like.\n", "\n", "Your code will be run against a MongoDB instance that we have provided.\n", "If you want to run this code locally on your machine,\n", "you have to install MongoDB (see Instructor comments for link to installation information)\n", "and uncomment the get_db function.\n", "\"\"\"\n", "\n", "\n", "def get_db():\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " # 'examples' here is the database name. 
It will be created if it does not exist.\n", " db = client.examples\n", " return db\n", "\n", "\n", "def add_city(db):\n", " db.cities.insert({\"name\" : \"Chicago\"})\n", " \n", "def get_city(db):\n", " return db.cities.find_one()\n", "\n", "\n", "if __name__ == \"__main__\":\n", "\n", " db = get_db() # uncomment this line if you want to run this locally\n", " add_city(db)\n", " print get_city(db)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'_id': ObjectId('53b27aeecf9ef83870ffda22'), u'name': u'Chicago'}\n" ] } ], "prompt_number": 83 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Finding Porsche" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "Your task is to complete the 'porsche_query' function and in particular the query\n", "to find all autos where the manufacturer field matches \"Porsche\".\n", "Please modify only 'porsche_query' function, as only that will be taken into account.\n", "\n", "Your code will be run against a MongoDB instance that we have provided.\n", "If you want to run this code locally on your machine,\n", "you have to install MongoDB and download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials at\n", "the following link:\n", "https://www.udacity.com/wiki/ud032\n", "\"\"\"\n", "\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "\n", "def porsche_query():\n", " \n", " query = {\"manufacturer\": \"Porsche\"}\n", " return query\n", "\n", "\n", "def find_porsche(db, query):\n", " return db.autos.find(query)\n", "\n", "\n", "if __name__ == \"__main__\":\n", "\n", " db = get_db('examples')\n", " query = porsche_query()\n", " p = find_porsche(db, query)\n", " import pprint\n", " # Print only the first record.\n", " for a in p[0:1]:\n", " pprint.pprint(a)\n", " " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'_id': ObjectId('53b3629acf9ef81fe4999d69'),\n", " u'assembly': [u'Germany', u'Stuttgart'],\n", " u'bodyStyle': u'coup\\xe9',\n", " u'class': u'grand tourer',\n", " u'dimensions': {u'height': 1.27508,\n", " u'length': 4.52,\n", " u'weight': 1450.0,\n", " u'wheelbase': 2.5,\n", " u'width': 1.89},\n", " u'engine': [u'Porsche_928__1',\n", " u'Porsche_928__2',\n", " u'Porsche_928__3',\n", " u'Porsche_928__4'],\n", " u'layout': u'front-engine rear-wheel-drive layout',\n", " u'manufacturer': u'Porsche',\n", " u'modelYears': [],\n", " u'name': u'Porsche 928',\n", " u'productionYears': [1977,\n", " 1978,\n", " 1979,\n", " 1980,\n", " 1981,\n", " 1982,\n", " 1983,\n", " 1984,\n", " 1985,\n", " 1986,\n", " 1987,\n", " 1988,\n", " 1989,\n", " 1990,\n", " 1991,\n", " 1992,\n", " 1993,\n", " 1994,\n", " 1995],\n", " u'transmission': [u'3-speed automatic',\n", " u'4-speed automatic',\n", " u'5-speed manual']}\n" ] } ], "prompt_number": 93 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Inserting Multiple Documents" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pymongo import MongoClient\n", "import csv\n", "import json\n", "import io\n", "import re\n", "import pprint\n", "\n", "\n", "field_map = {\n", " \"name\" : \"name\",\n", " \"bodyStyle_label\" : \"bodyStyle\",\n", " \"assembly_label\" : \"assembly\",\n", " \"class_label\" : \"class\",\n", " \"designer_label\" 
: \"designer\",\n", " \"engine_label\" : \"engine\",\n", " \"length\" : \"length\",\n", " \"height\" : \"height\",\n", " \"width\" : \"width\",\n", " \"weight\" : \"weight\",\n", " \"wheelbase\" : \"wheelbase\",\n", " \"layout_label\" : \"layout\",\n", " \"manufacturer_label\" : \"manufacturer\",\n", " \"modelEndYear\" : \"modelEndYear\",\n", " \"modelStartYear\" : \"modelStartYear\",\n", " \"predecessorLabel\" : \"predecessorLabel\",\n", " \"productionStartYear\" : \"productionStartYear\",\n", " \"productionEndYear\" : \"productionEndYear\",\n", " \"transmission\" : \"transmission\"\n", "}\n", "fields = field_map.keys()\n", "\n", "\n", "def skip_lines(input_file, skip):\n", " for i in range(0, skip):\n", " next(input_file)\n", "\n", "def is_number(s):\n", " try:\n", " float(s)\n", " return True\n", " except ValueError:\n", " return False\n", "\n", "def strip_automobile(v):\n", " return re.sub(r\"\\s*\\(automobile\\)\\s*\", \" \", v)\n", "\n", "def strip_city(v):\n", " return re.sub(r\"\\s*\\(city\\)\\s*\", \" \", v)\n", "\n", "def parse_array(v):\n", " if (v[0] == \"{\") and (v[-1] == \"}\"):\n", " v = v.lstrip(\"{\")\n", " v = v.rstrip(\"}\")\n", " v_array = v.split(\"|\")\n", " v_array = [i.strip() for i in v_array]\n", " return v_array\n", " return v\n", "\n", "def mm_to_meters(v):\n", " if v < 0.01:\n", " return v * 1000\n", " return v\n", "\n", "def clean_dimension(d, field, v):\n", " if is_number(v):\n", " if field == \"weight\":\n", " d[field] = float(v) / 1000.0\n", " else:\n", " d[field] = mm_to_meters(float(v))\n", " \n", "def clean_year(d, field, v):\n", " d[field] = v[0:4]\n", "\n", "def parse_array2(v):\n", " if (v[0] == \"{\") and (v[-1] == \"}\"):\n", " v = v.lstrip(\"{\")\n", " v = v.rstrip(\"}\")\n", " v_array = v.split(\"|\")\n", " v_array = [i.strip() for i in v_array]\n", " return (True, v_array)\n", " return (False, v)\n", "\n", "def ensure_not_array(v):\n", " (is_array, v) = parse_array(v)\n", " if is_array:\n", " return v[0]\n", " return v\n", "\n", "def ensure_array(v):\n", " (is_array, v) = parse_array2(v)\n", " if is_array:\n", " return v\n", " return [v]\n", "\n", "def ensure_float(v):\n", " if is_number(v):\n", " return float(v)\n", "\n", "def ensure_int(v):\n", " if is_number(v):\n", " return int(v)\n", "\n", "def ensure_year_array(val):\n", " #print \"val:\", val\n", " vals = ensure_array(val)\n", " year_vals = []\n", " for v in vals:\n", " v = v[0:4]\n", " v = int(v)\n", " if v:\n", " year_vals.append(v)\n", " return year_vals\n", "\n", "def empty_val(val):\n", " val = val.strip()\n", " return (val == \"NULL\") or (val == \"\")\n", "\n", "def years(row, start_field, end_field):\n", " start_val = row[start_field]\n", " end_val = row[end_field]\n", "\n", " if empty_val(start_val) or empty_val(end_val):\n", " return []\n", "\n", " start_years = ensure_year_array(start_val)\n", " if start_years:\n", " start_years = sorted(start_years)\n", " end_years = ensure_year_array(end_val)\n", " if end_years:\n", " end_years = sorted(end_years)\n", " all_years = []\n", " if start_years and end_years:\n", " #print start_years\n", " #print end_years\n", " for i in range(0, min(len(start_years), len(end_years))):\n", " for y in range(start_years[i], end_years[i]+1):\n", " all_years.append(y)\n", " return all_years\n", "\n", "\n", "def process_file(input_file):\n", " input_data = csv.DictReader(open(input_file))\n", " autos = []\n", " skip_lines(input_data, 3)\n", " for row in input_data:\n", " auto = {}\n", " model_years = {}\n", " production_years = {}\n", " dimensions = 
{}\n", " for field, val in row.iteritems():\n", " if field not in fields or empty_val(val):\n", " continue\n", " if field in [\"bodyStyle_label\", \"class_label\", \"layout_label\"]:\n", " val = val.lower()\n", " val = strip_automobile(val)\n", " val = strip_city(val)\n", " val = val.strip()\n", " val = parse_array(val)\n", " if field in [\"length\", \"width\", \"height\", \"weight\", \"wheelbase\"]:\n", " clean_dimension(dimensions, field_map[field], val)\n", " elif field in [\"modelStartYear\", \"modelEndYear\"]:\n", " clean_year(model_years, field_map[field], val)\n", " elif field in [\"productionStartYear\", \"productionEndYear\"]:\n", " clean_year(production_years, field_map[field], val)\n", " else:\n", " auto[field_map[field]] = val\n", " if dimensions:\n", " auto['dimensions'] = dimensions\n", " auto['modelYears'] = years(row, 'modelStartYear', 'modelEndYear')\n", " auto['productionYears'] = years(row, 'productionStartYear', 'productionEndYear')\n", " autos.append(auto)\n", " return autos\n", "\n", "# -------------------------------------------------------------------------------------------------------------\n", "\n", "def insert_autos(infile, db):\n", " autos = process_file(infile)\n", " \n", " for a in autos:\n", " db.autos.insert(a)\n", " # Your code here. Insert the data in one command\n", " # autos will be a list of dictionaries, as in the example in the previous video\n", " # You have to insert data in a collection 'autos'\n", "\n", "\n", " \n", "if __name__ == \"__main__\":\n", " \n", " from pymongo import MongoClient\n", " client = MongoClient(\"mongodb://localhost:27017\")\n", " db = client.examples\n", "\n", " insert_autos('data/autos-small.csv', db)\n", " pprint.pprint(db.autos.find_one())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'_id': ObjectId('53b3629acf9ef81fe4999d66'),\n", " u'dimensions': {u'length': 39.9288, u'weight': 2721000.0, u'width': 34.7472},\n", " u'engine': u'Crawler-transporter__1',\n", " u'manufacturer': u'Marion Power Shovel Company',\n", " u'modelYears': [],\n", " u'name': u'Crawler-transporter',\n", " u'productionYears': [],\n", " u'transmission': u'16 traction motors powered by four generators'}\n" ] } ], "prompt_number": 107 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Range Queries" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\" Your task is to write a query that will return all cities\n", "that are founded in 21st century.\n", "Please modify only 'range_query' function, as only that will be taken into account.\n", "\n", "Your code will be run against a MongoDB instance that we have provided.\n", "If you want to run this code locally on your machine,\n", "you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\"\"\"\n", "from datetime import datetime\n", " \n", "def get_db():\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client.examples\n", " return db\n", "\n", "\n", "def range_query():\n", " # You can use datetime(year, month, day) to specify date in the query\n", " query = {\"foundingDate\": {\"$gte\": datetime(2001, 1, 1), \"$lt\": datetime(2100, 1, 1)}}\n", " return query\n", "\n", "\n", "if __name__ == \"__main__\":\n", "\n", " db = get_db()\n", " query = range_query()\n", " cities = db.cities.find(query)\n", "\n", " print \"Found cities:\", 
cities.count()\n", " import pprint\n", " pprint.pprint(cities[0])\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using $in Operator" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\" Your task is to write a query that will return all cars manufactured by \"Ford Motor Company\"\n", "that are assembled in Germany, United Kingdom, or Japan.\n", "Please modify only 'in_query' function, as only that will be taken into account.\n", "\n", "Your code will be run against a MongoDB instance that we have provided.\n", "If you want to run this code locally on your machine,\n", "you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\"\"\"\n", "\n", "def get_db():\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client.examples\n", " return db\n", "\n", "\n", "def in_query():\n", " # Write the query\n", " query = {\"manufacturer\": \"Ford Motor Company\", \"assembly\":{\"$in\": [\"Germany\", \"United Kingdom\", \"Japan\"]} }\n", " \n", " return query\n", "\n", "\n", "if __name__ == \"__main__\":\n", "\n", " db = get_db()\n", " query = in_query()\n", " autos = db.autos.find(query, {\"name\":1, \"manufacturer\":1, \"assembly\": 1, \"_id\":0})\n", "\n", " print \"Found autos:\", autos.count()\n", " import pprint\n", " # Print first record only\n", " for a in autos[0:1]:\n", " pprint.pprint(a)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Found autos: 34\n", "{u'assembly': [u'Argentina',\n", " u'Australia',\n", " u'Berlin',\n", " u'Brazil',\n", " u'Buenos Aires',\n", " u'Canada',\n", " u'Copenhagen',\n", " u'Cork',\n", " u'Denmark',\n", " u'Detroit',\n", " u'Dothan Alabama',\n", " u'England',\n", " u'Geelong',\n", " u'Germany',\n", " u'Highland Park Michigan',\n", " u'Ireland',\n", " u'Manchester',\n", " u'Minneapolis',\n", " u'Ontario',\n", " u'S\\xe3o Bernardo do Campo',\n", " u'Saint Paul Minnesota',\n", " u'Toronto',\n", " u'Walkerville Ontario'],\n", " u'manufacturer': u'Ford Motor Company',\n", " u'name': u'Ford Model T'}\n" ] } ], "prompt_number": 105 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Dot Notation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\" Your task is to write a query that will return all cars with width dimension greater than 2.5\n", "Please modify only 'dot_query' function, as only that will be taken into account.\n", "\n", "Your code will be run against a MongoDB instance that we have provided.\n", "If you want to run this code locally on your machine,\n", "you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\"\"\"\n", "\n", "\n", "def get_db():\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client.examples\n", " return db\n", "\n", "\n", "def dot_query():\n", " query = {\"dimensions.width\":{\"$gt\":2.5}}\n", " return query\n", "\n", "\n", "if __name__ == \"__main__\":\n", "\n", " db = get_db()\n", " query = dot_query()\n", " cars = db.autos.find(query, {\"dimensions.width\":1, \"_id\":0, \"name\":1})\n", "\n", " print \"Found cars:\", autos.count()\n", " import pprint\n", " for car in cars[0:5]:\n", " pprint.pprint(car)\n" ], 
"language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Found cars: 34\n", "{u'dimensions': {u'width': 34.7472}, u'name': u'Crawler-transporter'}\n", "{u'dimensions': {u'width': 3.7}, u'name': u'Thrust SSC'}\n", "{u'dimensions': {u'width': 8.7}, u'name': u'Liebherr T 282B'}\n", "{u'dimensions': {u'width': 2.5908}, u'name': u'Nova Bus LFS'}\n", "{u'dimensions': {u'width': 17.2212}, u'name': u'Wolseley 6/90'}\n" ] } ], "prompt_number": 126 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q4.1" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "In this problem set you work with another type of infobox data, audit it, clean it, \n", "come up with a data model, insert it into a MongoDB and then run some queries against your database.\n", "The set contains data about Arachnid class.\n", "Your task in this exercise is to parse the file, process only the fields that are listed in the\n", "FIELDS dictionary as keys, and return a dictionary of cleaned values. \n", "\n", "The following things should be done:\n", "- keys of the dictionary changed according to the mapping in FIELDS dictionary\n", "- trim out redundant description in parenthesis from the 'rdf-schema#label' field, like \"(spider)\"\n", "- if 'name' is \"NULL\" or contains non-alphanumeric characters, set it to the same value as 'label'.\n", "- if a value of a field is \"NULL\", convert it to None\n", "- if there is a value in 'synonym', it should be converted to an array (list)\n", " by stripping the \"{}\" characters and splitting the string on \"|\". Rest of the cleanup is up to you,\n", " eg removing \"*\" prefixes etc\n", "- strip leading and ending whitespace from all fields, if there is any\n", "- the output structure should be as follows:\n", "{ 'label': 'Argiope',\n", " 'uri': 'http://dbpedia.org/resource/Argiope_(spider)',\n", " 'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',\n", " 'name': 'Argiope',\n", " 'synonym': [\"One\", \"Two\"],\n", " 'classification': {\n", " 'family': 'Orb-weaver spider',\n", " 'class': 'Arachnid',\n", " 'phylum': 'Arthropod',\n", " 'order': 'Spider',\n", " 'kingdom': 'Animal',\n", " 'genus': None\n", " }\n", "}\n", "\"\"\"\n", "import codecs\n", "import csv\n", "import json\n", "import pprint\n", "import re\n", "\n", "DATAFILE = 'data/arachnid.csv'\n", "FIELDS ={'rdf-schema#label': 'label',\n", " 'URI': 'uri',\n", " 'rdf-schema#comment': 'description',\n", " 'synonym': 'synonym',\n", " 'name': 'name',\n", " 'family_label': 'family',\n", " 'class_label': 'class',\n", " 'phylum_label': 'phylum',\n", " 'order_label': 'order',\n", " 'kingdom_label': 'kingdom',\n", " 'genus_label': 'genus'}\n", "\n", "\n", "def clean_array(temp_array):\n", " final_array = []\n", " remove_these = [\"Pocock\", \"Forster\", \"Couzijn\", \"Thorell\", \"Peckham\"]\n", " for strings in temp_array:\n", " temp_string = strings.replace(\"*\", \"\").strip()\n", " for r in remove_these:\n", " if r in strings:\n", " temp_string = temp_string.split(r)[0].strip()\n", " if temp_string[-1:] == \"(\":\n", " temp_string = temp_string[:-1].strip()\n", " final_array.append(temp_string.strip())\n", " return final_array\n", "\n", "\n", "def process_file(filename, fields):\n", " process_fields = fields.keys()\n", "\n", " data = []\n", "\n", " class_fields = [\"class\", \"family\", \"genus\", \"kingdom\", \"order\", \"phylum\"]\n", "\n", " with 
open(filename, \"r\") as f:\n", " reader = csv.DictReader(f)\n", " for i in range(3):\n", " l = reader.next()\n", " for line in reader:\n", "\n", " # Create new dictionary structure containing only the fields that are passed in.\n", " temp_dict = {}\n", " class_dict = {}\n", " for old_key in process_fields:\n", " new_key = fields[old_key]\n", " if new_key in [\"label\", \"uri\", \"description\", \"name\", \"synonym\"]:\n", " temp_dict[new_key] = line[old_key].strip()\n", " else:\n", " class_dict[new_key] = line[old_key].strip()\n", " if class_dict[new_key] == \"NULL\":\n", " class_dict[new_key] = None\n", "\n", " temp_dict[\"classification\"] = class_dict\n", "\n", " for new_field in temp_dict.keys():\n", "\n", " # Remove (extra names) from labels\n", " if new_field == \"label\":\n", " temp_dict[\"label\"] = temp_dict[\"label\"].split(\"(\")[0].strip()\n", "\n", " # Check for non-alphanumeric chars, if found, replace \"name\" with \"label\"\n", " if new_field == \"name\":\n", " if re.search('[A-Za-z0-9]*', temp_dict[new_field]).group() != temp_dict[new_field]:\n", " temp_dict[new_field] = temp_dict[\"label\"].strip()\n", "\n", " # Change all NULL entries to None, except in \"name\" where NULL is changed to \"label\" entry.\n", " if temp_dict[new_field] == \"NULL\":\n", " if new_field == \"name\":\n", " temp_dict[new_field] = temp_dict[\"label\"].strip()\n", " else:\n", " temp_dict[new_field] = None\n", " # Split synonyms into list of synonyms. Pass to clean_array() for further cleaning.\n", " if new_field == \"synonym\" and temp_dict[\"synonym\"] is not None:\n", " temp_array = parse_array(temp_dict[\"synonym\"])\n", " temp_dict[\"synonym\"] = clean_array(temp_array)\n", "\n", " data.append(temp_dict)\n", "\n", " return data\n", "\n", "\n", "def parse_array(v):\n", " if (v[0] == \"{\") and (v[-1] == \"}\"):\n", " v = v.lstrip(\"{\")\n", " v = v.rstrip(\"}\")\n", " v_array = v.split(\"|\")\n", " v_array = [i.strip() for i in v_array]\n", " return v_array\n", " return [v]\n", "\n", "\n", "def test():\n", " data = process_file(DATAFILE, FIELDS)\n", "\n", " pprint.pprint(data[0])\n", " assert data[0] == {\n", " \"synonym\": None, \n", " \"name\": \"Argiope\", \n", " \"classification\": {\n", " \"kingdom\": \"Animal\", \n", " \"family\": \"Orb-weaver spider\", \n", " \"order\": \"Spider\", \n", " \"phylum\": \"Arthropod\", \n", " \"genus\": None, \n", " \"class\": \"Arachnid\"\n", " }, \n", " \"uri\": \"http://dbpedia.org/resource/Argiope_(spider)\", \n", " \"label\": \"Argiope\", \n", " \"description\": \"The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.\"\n", " }\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{'classification': {'class': 'Arachnid',\n", " 'family': 'Orb-weaver spider',\n", " 'genus': None,\n", " 'kingdom': 'Animal',\n", " 'order': 'Spider',\n", " 'phylum': 'Arthropod'},\n", " 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. 
The etymology of the name is from a Greek name meaning silver-faced.',\n", " 'label': 'Argiope',\n", " 'name': 'Argiope',\n", " 'synonym': None,\n", " 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}\n" ] } ], "prompt_number": 127 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q4.2" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import json\n", "import pprint\n", "\n", "def insert_data(data, db):\n", "\n", " for a in data:\n", " db.arachnid.insert(a)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " \n", " from pymongo import MongoClient\n", " client = MongoClient(\"mongodb://localhost:27017\")\n", " db = client.examples\n", "\n", " with open('data/arachnid.json') as f:\n", " data = json.loads(f.read())\n", " insert_data(data, db)\n", " pprint.pprint(db.arachnid.find_one())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'_id': ObjectId('53b3ef08cf9ef82c701090b5'),\n", " u'classification': {u'class': u'Arachnid',\n", " u'family': u'Orb-weaver spider',\n", " u'genus': None,\n", " u'kingdom': u'Animal',\n", " u'order': u'Spider',\n", " u'phylum': u'Arthropod'},\n", " u'description': u'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',\n", " u'label': u'Argiope',\n", " u'name': u'Argiope',\n", " u'synonym': None,\n", " u'uri': u'http://dbpedia.org/resource/Argiope_(spider)'}\n" ] } ], "prompt_number": 130 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q4.3" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "In this problem set you work with another type of infobox data, audit it, clean it, \n", "come up with a data model, insert it into a MongoDB and then run some queries against your database.\n", "The set contains data about Arachnid class.\n", "\n", "The data is already in the database. 
But you have been given a task to also include 'binomialAuthority'\n", "information in the data, so you have to go through the data and update the existing entries.\n", "\n", "The following things should be done in the function add_field:\n", "- process the csv file and extract 2 fields - 'rdf-schema#label' and 'binomialAuthority_label'\n", "- clean up the 'rdf-schema#label' same way as in the first exercise - removing redundant \"(spider)\" suffixes\n", "- return a dictionary, with 'label' being the key, and 'binomialAuthority_label' the value\n", "- if 'binomialAuthority_label' is \"NULL\", skip the item\n", "\n", "The following should be done in the function update_db:\n", "- query the database by using the field 'label'\n", "- update the data, by adding a new item under 'classification' with a key 'binomialAuthority'\n", "\n", "\n", "The resulting data should look like this:\n", "- the output structure should be as follows:\n", "{ 'label': 'Argiope',\n", " 'uri': 'http://dbpedia.org/resource/Argiope_(spider)',\n", " 'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',\n", " 'name': 'Argiope',\n", " 'synonym': [\"One\", \"Two\"],\n", " 'classification': {\n", " 'binomialAuthority': None,\n", " 'family': 'Orb-weaver spider',\n", " 'class': 'Arachnid',\n", " 'phylum': 'Arthropod',\n", " 'order': 'Spider',\n", " 'kingdom': 'Animal',\n", " 'genus': None\n", " }\n", "}\n", "\"\"\"\n", "import codecs\n", "import csv\n", "import json\n", "import pprint\n", "\n", "DATAFILE = 'data/arachnid.csv'\n", "FIELDS ={'rdf-schema#label': 'label',\n", " 'binomialAuthority_label': 'binomialAuthority'}\n", "\n", "\n", "def add_field(filename, fields):\n", "\n", " process_fields = fields.keys()\n", " data = {}\n", " with open(filename, \"r\") as f:\n", " reader = csv.DictReader(f)\n", " for i in range(3):\n", " l = reader.next()\n", "\n", " for line in reader:\n", " if line[\"binomialAuthority_label\"] != \"NULL\":\n", " data[line[\"rdf-schema#label\"].split(\"(\")[0].strip()] = line[\"binomialAuthority_label\"]\n", "\n", " return data\n", "\n", "\n", "def update_db(data, db):\n", "\n", " for k in data.keys():\n", "\n", " #all_r = db.arachnid.find({\"label\": k})\n", " #for a in all_r:\n", " # pprint.pprint(a), \"P\"\n", "\n", " db.arachnid.update({\"label\": k}, {\"$set\": {\"classification.binomialAuthority\": data[k]}}, multi=True)\n", "\n", "\n", "def test():\n", " # Please change only the add_field and update_db functions!\n", " # Changes done to this function will not be taken into account\n", " # when doing a Test Run or Submit, they are just for your own reference\n", " # and as an example for running this code locally!\n", " \n", " data = add_field(DATAFILE, FIELDS)\n", " from pymongo import MongoClient\n", " client = MongoClient(\"mongodb://localhost:27017\")\n", " db = client.examples\n", "\n", " update_db(data, db)\n", "\n", " updated = db.arachnid.find_one({'label': 'Opisthoncana'})\n", " assert updated['classification']['binomialAuthority'] == 'Embrik Strand'\n", " pprint.pprint(data)\n", "\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{'Opisthoncana': 'Embrik Strand',\n", " 'Orvilleus': 'Arthur M. 
Chickering',\n", " 'Six-spotted fishing spider': 'Charles Athanase Walckenaer',\n", " 'Zealanapis australis': '{1951 in science|Raymond Robert Forster}'}\n" ] } ], "prompt_number": 133 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using Group" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import json \n", "import pprint\n", "\n", "def insert_data(data, db):\n", " for a in data:\n", " db.twitter.insert(a)\n", "\n", "if __name__ == \"__main__\":\n", " \n", " from pymongo import MongoClient\n", " client = MongoClient(\"mongodb://localhost:27017\")\n", " db = client.examples\n", " \n", " # Available here: http://content.udacity-data.com/ud032/twitter/twitter.json.zip\n", " with open('data/twitter.json', 'r') as f:\n", " ## json.loads() takes a string, while json.load() takes a file-like object.\n", " ## http://stackoverflow.com/questions/11568246/loading-several-text-files-into-mongodb-using-pymongo\n", " for tweet in f.readlines():\n", " db.twitter.insert(json.loads(tweet))\n", " pprint.pprint(db.twitter.find_one())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'_id': ObjectId('54c81ed3cf9ef82b0871a383'),\n", " u'contributors': None,\n", " u'coordinates': None,\n", " u'created_at': u'Thu Sep 02 18:11:23 +0000 2010',\n", " u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []},\n", " u'favorited': False,\n", " u'geo': None,\n", " u'id': 22819396900L,\n", " u'in_reply_to_screen_name': None,\n", " u'in_reply_to_status_id': None,\n", " u'in_reply_to_user_id': None,\n", " u'place': None,\n", " u'retweet_count': None,\n", " u'retweeted': False,\n", " u'source': u'web',\n", " u'text': u'eu preciso de terminar de fazer a minha tabela, est\\xe1 muito foda **',\n", " u'truncated': False,\n", " u'user': {u'contributors_enabled': False,\n", " u'created_at': u'Fri Jul 03 21:44:05 +0000 2009',\n", " u'description': u's\\xf3 os loucos sabem (:',\n", " u'favourites_count': 1,\n", " u'follow_request_sent': None,\n", " u'followers_count': 102,\n", " u'following': None,\n", " u'friends_count': 73,\n", " u'geo_enabled': False,\n", " u'id': 53507833,\n", " u'lang': u'en',\n", " u'listed_count': 0,\n", " u'location': u'',\n", " u'name': u'Beatriz Helena Cunha',\n", " u'notifications': None,\n", " u'profile_background_color': u'081114',\n", " u'profile_background_image_url': u'http://a1.twimg.com/profile_background_images/133178546/biatwitter.jpg',\n", " u'profile_background_tile': True,\n", " u'profile_image_url': u'http://a2.twimg.com/profile_images/1036412454/OgAAADXK9q6kaxrvfwQTINH66RVLAH9YHb-veRTA4FaWb9KtbGGV_yKTGzmvzTfJidqAb5gK_mpspIE-MIvAASGH2CwAm1T1UIPQk0-HS8x_TV5kdnW30nch7ODk-1_normal.jpg',\n", " u'profile_link_color': u'eb55b6',\n", " u'profile_sidebar_border_color': u'1c9dbd',\n", " u'profile_sidebar_fill_color': u'768575',\n", " u'profile_text_color': u'25b8c2',\n", " u'profile_use_background_image': True,\n", " u'protected': False,\n", " u'screen_name': u'Bia_cunha1',\n", " u'show_all_inline_media': False,\n", " u'statuses_count': 3504,\n", " u'time_zone': u'Brasilia',\n", " u'url': u'http://http://www.orkut.com.br/Main#Profile?uid=1433295880233078770',\n", " u'utc_offset': -10800,\n", " u'verified': False}}\n" ] } ], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "The tweets in our twitter collection have a field called \"source\". This field describes the application\n", "that was used to create the tweet. 
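For reference, a minimal $group stage that counts tweets per distinct value of a field looks like\n", "\n", "    {\"$group\": {\"_id\": \"$source\", \"count\": {\"$sum\": 1}}}\n", "\n", "where \"$source\" (note the leading $) makes each document's source value the group key, and\n", "\"$sum\": 1 adds one to the counter for every document that falls into that group.\n", "\n", "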
Following the examples for using the $group operator, your task is \n", "to modify the 'make-pipeline' function to identify most used applications for creating tweets. \n", "As a check on your query, 'web' is listed as the most frequently used application.\n", "'Ubertwitter' is the second most used. \n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline\n", "that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation \n", "pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. \n", "If you want to run this code locally on your machine, you have to install MongoDB, \n", "download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset \n", "used in examples in this lesson. \n", "If you attempt some of the same queries that we looked at in the lesson examples,\n", "your results will be different.\n", "\"\"\"\n", "\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient(\"mongodb://localhost:27017\")\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " pipeline = [{\"$group\": {\"_id\": \"$source\",\n", " \"count\": {\"$sum\": 1}}},\n", " {\"$sort\": {\"count\": -1}},\n", " {\"$limit\" : 5 }]\n", " return pipeline\n", "\n", "def tweet_sources(db, pipeline):\n", " result = db.tweets.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = tweet_sources(db, pipeline)\n", " import pprint\n", " pprint.pprint(result)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'ok': 1.0,\n", " u'result': [{u'_id': u'web', u'count': 69410},\n", " {u'_id': u'\\xdcberTwitter',\n", " u'count': 10179},\n", " {u'_id': u'TweetDeck',\n", " u'count': 10110},\n", " {u'_id': u'Twitter for BlackBerry\\xae',\n", " u'count': 6747},\n", " {u'_id': u'Twitter for iPhone',\n", " u'count': 6027}]}\n" ] } ], "prompt_number": 59 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using match and project" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "Write an aggregation query to answer this question:\n", "\n", "Of the users in the \"Brasilia\" timezone who have tweeted 100 times or more,\n", "who has the largest number of followers?\n", "\n", "The following hints will help you solve this problem:\n", "- Time zone is found in the \"time_zone\" field of the user object in each tweet.\n", "- The number of tweets for each user is found in the \"statuses_count\" field.\n", " To access these fields you will need to use dot notation (from Lesson 4)\n", "- Your aggregation query should return something like the following:\n", "{u'ok': 1.0,\n", " u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'),\n", " u'followers': 2597,\n", " u'screen_name': u'marbles',\n", " u'tweets': 12334}]}\n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation \n", "pipeline that can be passed to the MongoDB aggregate function. 
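For example, a $match stage can filter on nested fields with dot notation, and a $project stage can\n", "pull those fields out under new names:\n", "\n", "    {\"$match\": {\"user.time_zone\": \"Brasilia\"}},\n", "    {\"$project\": {\"followers\": \"$user.followers_count\", \"screen_name\": \"$user.screen_name\"}}\n", "\n", "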
As in our examples in this lesson,\n", "the aggregation pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. If you want to run this code\n", "locally on your machine, you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset used \n", "in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$match\": {\"user.time_zone\": \"Brasilia\",\n", " \"user.statuses_count\": {\"$gte\": 100}}},\n", " {\"$project\": {\"followers\": \"$user.followers_count\",\n", " \"screen_name\": \"$user.screen_name\",\n", " \"tweets\": \"$user.statuses_count\"}},\n", " {\"$sort\": {\"followers\": -1}},\n", " {\"$limit\": 1}]\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.tweets.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " import pprint\n", " pprint.pprint(result)\n", " assert len(result[\"result\"]) == 1\n", " # Online quiz uses smaller dataset. \n", " # Full dataset is loaded here giving slightly different results for aggregations.\n", " # assert result[\"result\"][0][\"followers\"] == 17209\n", "\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'ok': 1.0,\n", " u'result': [{u'_id': ObjectId('54c81ddbcf9ef82b086f7611'),\n", " u'followers': 259760,\n", " u'screen_name': u'otaviomesquita',\n", " u'tweets': 10997}]}\n" ] } ], "prompt_number": 67 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using Unwind" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "For this exercise, let's return to our cities infobox dataset. The question we would like you to answer\n", "is as follows: Which region in India contains the most cities?\n", "\n", "As a starting point, use the solution for the example question we looked at -- \"Who includes the most\n", "user mentions in their tweets?\"\n", "\n", "One thing to note about the cities data is that the \"isPartOf\" field contains an array of regions or \n", "districts in which a given city is found. See the example document in Instructor Comments below.\n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline \n", "that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation \n", "pipeline should be a list of one or more dictionary objects. Please review the lesson examples if you \n", "are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. 
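(As a reminder of how $unwind behaves: a stage such as {\"$unwind\": \"$isPartOf\"} emits one copy of a city document for every entry in its \"isPartOf\" array, so that a later $group stage can count cities per region.)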
If you want to run this code \n", "locally on your machine, you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the dataset used in \n", "examples in this lesson. If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$unwind\": \"$isPartOf\"},\n", " {\"$match\": {\"country\": \"India\"}},\n", " {\"$group\": {\"_id\": \"$isPartOf\",\n", " \"count\": {\"$sum\": 1}}},\n", " {\"$sort\": {\"count\": -1}}]\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.cities.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " print \"Printing the first result:\"\n", " import pprint\n", " pprint.pprint(result[\"result\"][0])\n", " assert result[\"result\"][0][\"_id\"] == \"Uttar Pradesh\"\n", " assert result[\"result\"][0][\"count\"] == 623\n", "\n", "\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using Push" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "$push is similar to $addToSet. The difference is that rather than accumulating only unique values \n", "it aggregates all values into an array.\n", "\n", "Using an aggregation query, count the number of tweets for each user. In the same $group stage, \n", "use $push to accumulate all the tweet texts for each user. Limit your output to the 5 users\n", "with the most tweets. \n", "Your result documents should include only the fields:\n", "\"_id\" (screen name of user), \n", "\"count\" (number of tweets found for the user),\n", "\"tweet_texts\" (a list of the tweet texts found for the user). \n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation \n", "pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, \n", "the aggregation pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. If you want to run this code \n", "locally on your machine, you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset used in \n", "examples in this lesson. 
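(Concretely, {\"$push\": \"$text\"} accumulates every tweet text for a user, duplicates included, while swapping in {\"$addToSet\": \"$text\"} would keep only the distinct texts.)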
If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$group\": {\"_id\": \"$user.screen_name\",\n", " \"tweet_texts\": {\"$push\": \"$text\"},\n", " \"count\": {\"$sum\": 1}}},\n", " {\"$sort\": {\"count\": -1}},\n", " {\"$limit\": 5}]\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.tweets.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " assert len(result[\"result\"]) == 5\n", " assert result[\"result\"][0][\"count\"] > result[\"result\"][4][\"count\"]\n", " import pprint\n", " pprint.pprint(result['result'][0])\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{u'_id': u'behcolin',\n", " u'count': 24,\n", " u'tweet_texts': [u'RT @VouConfessarQue: #VouConfessarQue j\\xe1 aprendi uma mat\\xe9ria inteira poucos minutos antes de uma prova.',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'volto jah',\n", " u'RT @VouConfessarQue: #VouConfessarQue j\\xe1 aprendi uma mat\\xe9ria inteira poucos minutos antes de uma prova.',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'volto jah',\n", " u'RT @VouConfessarQue: #VouConfessarQue j\\xe1 aprendi uma mat\\xe9ria inteira poucos minutos antes de uma prova.',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? 
Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\\xe9 trouxe ela de volta!',\n", " u'volto jah']}\n" ] } ], "prompt_number": 85 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Same Operator P" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "In an earlier exercise we looked at the cities dataset and asked which region in India contains \n", "the most cities. In this exercise, we'd like you to answer a related question regarding regions in \n", "India. What is the average city population for a region in India? Calculate your answer by first \n", "finding the average population of cities in each region and then by calculating the average of the \n", "regional averages.\n", "\n", "Hint: If you want to accumulate using values from all input documents to a group stage, you may use \n", "a constant as the value of the \"_id\" field. For example, \n", " { \"$group\" : {\"_id\" : \"India Regional City Population Average\",\n", " ... }\n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation \n", "pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, \n", "the aggregation pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. If you want to run this code \n", "locally on your machine, you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset used \n", "in examples in this lesson. 
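(For instance, if one region had cities with populations 10 and 30 and another region a single city of 40, the regional averages would be 20 and 40, so the answer here would be 30 rather than the overall per-city average of roughly 26.7.)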
If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$match\": {\"country\": \"India\"}},\n", " # First, match India as the country of interest; data contains world data.\n", " {\"$unwind\": \"$isPartOf\"},\n", " # Unwind regions; some cities belong to multiple regions.\n", " {\"$group\": {\"_id\": \"$isPartOf\",\n", " # Now group on each region.\n", " \"totPop\": {\"$sum\": \"$population\"},\n", " # Sum up the population of all of the cities for each region.\n", " \"count\": {\"$sum\": 1},\n", " # Count the number of times each region shows up.\n", " \"average\": {\"$avg\": \"$population\"}}},\n", " # Create an average for each region.\n", " {\"$group\": {\"_id\": \"India Regional City Population Average\",\n", " # Now group by a constant to group everything together.\n", " \"avg\": {\"$avg\": \"$average\"}}}]\n", " # And finally, get an average of the average region populations.\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.cities.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " assert len(result[\"result\"]) == 1\n", " assert result[\"result\"][0][\"avg\"] == 196025.97814809752\n", " import pprint\n", " pprint.pprint(result)\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q5.1" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "Use an aggregation query to answer the following question. \n", "\n", "What is the most common city name in our cities collection?\n", "\n", "Your first attempt probably identified None as the most frequently occurring city name. \n", "What that actually means is that there are a number of cities without a name field at all. \n", "It's strange that such documents would exist in this collection and, depending on your situation, \n", "might actually warrant further cleaning. \n", "\n", "To solve this problem the right way, we should really ignore cities that don't have a name specified. \n", "As a hint ask yourself what pipeline operator allows us to simply filter input? \n", "How do we test for the existence of a field?\n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline \n", "that can be passed to the MongoDB aggregate function. As in our examples in this lesson, \n", "the aggregation pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. \n", "If you want to run this code locally on your machine, you have to install MongoDB, \n", "download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset used in \n", "examples in this lesson. 
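(Field existence can be tested with the $exists query operator inside a $match stage, for example {\"$match\": {\"name\": {\"$exists\": True}}}, which keeps only documents that actually have a \"name\" field.)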
If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$match\": {\"name\": {\"$exists\": True}}},\n", " {\"$group\": {\"_id\": \"$name\",\n", " \"count\": {\"$sum\": 1}}},\n", " {\"$sort\": {\"count\": -1}},\n", " {\"$limit\": 1}]\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.cities.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " import pprint\n", " pprint.pprint(result[\"result\"][0])\n", " assert len(result[\"result\"]) == 1\n", " assert result[\"result\"][0] == {'_id': 'Shahpur', 'count': 6}\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q5.2" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "Use an aggregation query to answer the following question. \n", "\n", "Which Region in India has the largest number of cities with longitude between 75 and 80?\n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation \n", "pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, \n", "the aggregation pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. If you want to run this \n", "code locally on your machine, you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset used in \n", "examples in this lesson. 
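(When matching a numeric range on a single field, both bounds belong in one condition document, for example {\"lon\": {\"$gte\": 75, \"$lte\": 80}}; repeating the \"lon\" key in a Python dict literal silently keeps only the last bound.)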
If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$match\": {\"country\": \"India\",\n", " \"lon\": {\"$gte\": 75},\n", " \"lon\": {\"$lte\": 80}}},\n", " {\"$unwind\": \"$isPartOf\"},\n", " {\"$group\": {\"_id\": \"$isPartOf\",\n", " \"count\": {\"$sum\": 1}}},\n", " {\"$sort\": {\"count\": -1}},\n", " {\"$limit\": 1}]\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.cities.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " import pprint\n", " pprint.pprint(result[\"result\"][0])\n", " assert len(result[\"result\"]) == 1\n", " assert result[\"result\"][0][\"_id\"] == 'Tamil Nadu'\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Q5.3" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\"\"\"\n", "Use an aggregation query to answer the following question. \n", "\n", "Extrapolating from an earlier exercise in this lesson, find the average regional city population \n", "for all countries in the cities collection. What we are asking here is that you first calculate the \n", "average city population for each region in a country and then calculate the average of all the \n", "regional averages for a country. As a hint, _id fields in group stages need not be single values. \n", "They can also be compound keys (documents composed of multiple fields). You will use the same \n", "aggregation operator in more than one stage in writing this aggregation query. I encourage you to \n", "write it one stage at a time and test after writing each stage.\n", "\n", "Please modify only the 'make_pipeline' function so that it creates and returns an aggregation \n", "pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, \n", "the aggregation pipeline should be a list of one or more dictionary objects. \n", "Please review the lesson examples if you are unsure of the syntax.\n", "\n", "Your code will be run against a MongoDB instance that we have provided. If you want to run this code \n", "locally on your machine, you have to install MongoDB, download and insert the dataset.\n", "For instructions related to MongoDB setup and datasets please see Course Materials.\n", "\n", "Please note that the dataset you are using here is a smaller version of the twitter dataset used in \n", "examples in this lesson. 
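(A compound group key looks like {\"_id\": {\"Country\": \"$country\", \"Region\": \"$isPartOf\"}}, and a later stage can refer back to one of its parts with dot notation such as \"$_id.Country\".)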
If you attempt some of the same queries that we looked at in the lesson \n", "examples, your results will be different.\n", "\"\"\"\n", "\n", "def get_db(db_name):\n", " from pymongo import MongoClient\n", " client = MongoClient('localhost:27017')\n", " db = client[db_name]\n", " return db\n", "\n", "def make_pipeline():\n", " # complete the aggregation pipeline\n", " pipeline = [{\"$unwind\": \"$isPartOf\"},\n", " {\"$group\": {\"_id\": {\"Country\": \"$country\",\n", " \"Region\": \"$isPartOf\"},\n", " \"avgCity\": {\"$avg\": \"$population\"}}},\n", " {\"$group\": {\"_id\": \"$_id.Country\",\n", " \"avgRegionalPopulation\": {\"$avg\": \"$avgCity\"}}}]\n", " return pipeline\n", "\n", "def aggregate(db, pipeline):\n", " result = db.cities.aggregate(pipeline)\n", " return result\n", "\n", "if __name__ == '__main__':\n", " db = get_db('examples')\n", " pipeline = make_pipeline()\n", " result = aggregate(db, pipeline)\n", " import pprint\n", " if len(result[\"result\"]) < 150:\n", " pprint.pprint(result[\"result\"])\n", " else:\n", " pprint.pprint(result[\"result\"][:100])\n", " for country in result[\"result\"]:\n", " if country[\"_id\"] == 'Kuwait':\n", " assert country == {'_id': 'Kuwait', 'avgRegionalPopulation': 115945.66666666667}\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Iterative Parsing" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "Your task is to use the iterative parsing to process the map file and\n", "find out not only what tags are there, but also how many, to get the\n", "feeling on how much of which data you can expect to have in the map.\n", "The output should be a dictionary with the tag name as the key\n", "and number of times this tag can be encountered in the map as value.\n", "\n", "Note that your code will be tested with a different data file than the 'example.osm'\n", "\"\"\"\n", "import xml.etree.cElementTree as ET\n", "import pprint\n", "from collections import defaultdict\n", "\n", "def count_tags(filename):\n", "# counts = dict()\n", "# for line in ET.iterparse(filename):\n", "# current = line[1].tag\n", "# counts[current] = counts.get(current, 0) + 1\n", " counts = defaultdict(int)\n", " for line in ET.iterparse(filename):\n", " current = line[1].tag\n", " counts[current] += 1\n", " return counts\n", "\n", "\n", "\n", "def test():\n", "\n", " tags = count_tags('data/example.osm')\n", " pprint.pprint(tags)\n", " assert tags == {'bounds': 1,\n", " 'member': 3,\n", " 'nd': 4,\n", " 'node': 20,\n", " 'osm': 1,\n", " 'relation': 1,\n", " 'tag': 7,\n", " 'way': 1}\n", "\n", " \n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "defaultdict(, {'node': 20, 'nd': 4, 'bounds': 1, 'member': 3, 'tag': 7, 'relation': 1, 'way': 1, 'osm': 1})\n" ] } ], "prompt_number": 98 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Tag Types" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "import xml.etree.ElementTree as ET\n", "import pprint\n", "import re\n", "\"\"\"\n", "Your task is to explore the data a bit more.\n", "Before you process the data and add it into MongoDB, you should\n", "check the \"k\" value for each \"\" and see if they can be valid keys in MongoDB,\n", "as well as see if there are any other potential problems.\n", 
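"(Background for this audit: MongoDB field names may not contain '.' and may not start with '$', so a key such as \"addr:street\" is usable as-is, while keys containing spaces, dots, or other problematic characters would need to be cleaned or dropped before import.)\n",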
"\n", "We have provided you with 3 regular expressions to check for certain patterns\n", "in the tags. As we saw in the quiz earlier, we would like to change the data model\n", "and expand the \"addr:street\" type of keys to a dictionary like this:\n", "{\"address\": {\"street\": \"Some value\"}}\n", "So, we have to see if we have such tags, and if we have any tags with problematic characters.\n", "Please complete the function 'key_type'.\n", "\"\"\"\n", "\n", "\n", "lower = re.compile(r'^([a-z]|_)*$')\n", "lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')\n", "problemchars = re.compile(r'[=\\+/&<>;\\'\"\\?%#$@\\,\\. \\t\\r\\n]')\n", "\n", "\n", "def key_type(element, keys):\n", " if element.tag == \"tag\":\n", " k_value = element.attrib['k']\n", " if lower.search(k_value) is not None:\n", " keys['lower'] += 1\n", " elif lower_colon.search(k_value) is not None:\n", " keys['lower_colon'] += 1\n", " elif problemchars.search(k_value) is not None:\n", " keys[\"problemchars\"] += 1\n", " else:\n", " keys['other'] += 1\n", "\n", " return keys\n", "\n", "\n", "\n", "def process_map(filename):\n", " keys = {\"lower\": 0, \"lower_colon\": 0, \"problemchars\": 0, \"other\": 0}\n", " for _, element in ET.iterparse(filename):\n", " keys = key_type(element, keys)\n", "\n", " return keys\n", "\n", "\n", "\n", "def test():\n", " # You can use another testfile 'map.osm' to look at your solution\n", " # Note that the assertions will be incorrect then.\n", " keys = process_map('data/example.osm')\n", " pprint.pprint(keys)\n", " assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}\n" ] } ], "prompt_number": 101 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Exploring Users" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "import xml.etree.ElementTree as ET\n", "import pprint\n", "import re\n", "\"\"\"\n", "Your task is to explore the data a bit more.\n", "The first task is a fun one - find out how many unique users\n", "have contributed to the map in this particular area!\n", "\n", "The function process_map should return a set of unique user IDs (\"uid\")\n", "\"\"\"\n", "\n", "def get_user(element):\n", " return\n", "\n", "\n", "def process_map(filename):\n", " users = set()\n", " for not_used, element in ET.iterparse(filename):\n", " #print \"TAG:\", element.tag\n", " #pprint.pprint(element.attrib)\n", " if element.tag == \"node\" or element.tag == \"way\" or element.tag == \"relation\":\n", " users.add(element.attrib['uid'])\n", " #pprint.pprint(element.attrib['uid'])\n", "\n", " return users\n", "\n", "\n", "def test():\n", "\n", " users = process_map('data/example.osm')\n", " pprint.pprint(users)\n", " assert len(users) == 6\n", "\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "set(['1219059', '147510', '26299', '451048', '567034', '939355'])\n" ] } ], "prompt_number": 103 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Improving Street Names" ] }, { "cell_type": "code", "collapsed": false, "input": [ "\"\"\"\n", "Your task in this exercise has two steps:\n", "\n", "- audit the OSMFILE and change the variable 'mapping' to 
reflect the changes needed to fix \n", " the unexpected street types to the appropriate ones in the expected list.\n", " You have to add mappings only for the actual problems you find in this OSMFILE,\n", " not a generalized solution, since that may and will depend on the particular area you are auditing.\n", "- write the update_name function, to actually fix the street name.\n", " The function takes a string with street name as an argument and should return the fixed name\n", " We have provided a simple test so that you see what exactly is expected\n", "\"\"\"\n", "import xml.etree.cElementTree as ET\n", "from collections import defaultdict\n", "import re\n", "import pprint\n", "\n", "OSMFILE = \"data/example.osm\"\n", "street_type_re = re.compile(r'\\b\\S+\\.?$', re.IGNORECASE)\n", "\n", "\n", "expected = [\"Street\", \"Avenue\", \"Boulevard\", \"Drive\", \"Court\", \"Place\", \"Square\", \"Lane\", \"Road\", \n", " \"Trail\", \"Parkway\", \"Commons\"]\n", "\n", "# UPDATE THIS VARIABLE\n", "mapping = { \"St\": \"Street\",\n", " \"St.\": \"Street\",\n", " \"Ave\": \"Avenue\",\n", " \"Rd.\": \"Road\",\n", " \"W.\": \"West\",\n", " \"N.\": \"North\",\n", " \"S.\": \"South\",\n", " \"E\": \"East\"}\n", "\n", "\n", "def audit_street_type(street_types, street_name):\n", " m = street_type_re.search(street_name)\n", " if m:\n", " street_type = m.group()\n", " if street_type not in expected:\n", " street_types[street_type].add(street_name)\n", "\n", "\n", "def is_street_name(elem):\n", " return (elem.attrib['k'] == \"addr:street\")\n", "\n", "\n", "def audit(osmfile):\n", " osm_file = open(osmfile, \"r\")\n", " street_types = defaultdict(set)\n", " for event, elem in ET.iterparse(osm_file, events=(\"start\",)):\n", " if elem.tag == \"node\" or elem.tag == \"way\":\n", " for tag in elem.iter(\"tag\"):\n", " if is_street_name(tag):\n", " audit_street_type(street_types, tag.attrib['v'])\n", "\n", " return street_types\n", "\n", "\n", "def update_name(name, mapping):\n", " after = []\n", " # Split name string to test each part of the name;\n", " # Replacements may come anywhere in the name.\n", " for part in name.split(\" \"):\n", " # Check each part of the name against the keys in the correction dict\n", " if part in mapping.keys():\n", " # If exists in dict, overwrite that part of the name with the dict value for it.\n", " part = mapping[part]\n", " # Assemble each corrected piece of the name back together.\n", " after.append(part)\n", " # Return all pieces of the name as a string joined by a space.\n", " return \" \".join(after)\n", " \n", "\n", "# for w in mapping.keys():\n", "# if w in name:\n", "# if flag:\n", "# continue\n", "# # Replace abbrev. 
name in string with full name value from the mapping dict.\n", "# name = name.replace(w, mapping[w], 1)\n", "# # If St., flag to not check again in this string looking for St since new 'Street' will contain St\n", "# # re.compile() might be better\n", "# if w == \"St.\":\n", "# flag = True\n", "\n", "\n", "def test():\n", " st_types = audit(OSMFILE)\n", " assert len(st_types) == 3\n", " pprint.pprint(dict(st_types))\n", "\n", " for st_type, ways in st_types.iteritems():\n", " for name in ways:\n", " better_name = update_name(name, mapping)\n", " print name, \"=>\", better_name\n", " if name == \"West Lexington St.\":\n", " assert better_name == \"West Lexington Street\"\n", " if name == \"Baldwin Rd.\":\n", " assert better_name == \"Baldwin Road\"\n", "\n", "\n", "if __name__ == '__main__':\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{'Ave': set(['N. Lincoln Ave', 'North Lincoln Ave']),\n", " 'Rd.': set(['Baldwin Rd.']),\n", " 'St.': set(['West Lexington St.'])}\n", "N. Lincoln Ave => North Lincoln Avenue\n", "North Lincoln Ave => North Lincoln Avenue\n", "West Lexington St. => West Lexington Street\n", "Baldwin Rd. => Baldwin Road\n" ] } ], "prompt_number": 158 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Preparing for database" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "import xml.etree.ElementTree as ET\n", "import pprint\n", "import re\n", "import codecs\n", "import json\n", "\"\"\"\n", "Your task is to wrangle the data and transform the shape of the data\n", "into the model we mentioned earlier. The output should be a list of dictionaries\n", "that look like this:\n", "\n", "{\n", "\"id\": \"2406124091\",\n", "\"type: \"node\",\n", "\"visible\":\"true\",\n", "\"created\": {\n", " \"version\":\"2\",\n", " \"changeset\":\"17206049\",\n", " \"timestamp\":\"2013-08-03T16:43:42Z\",\n", " \"user\":\"linuxUser16\",\n", " \"uid\":\"1219059\"\n", " },\n", "\"pos\": [41.9757030, -87.6921867],\n", "\"address\": {\n", " \"housenumber\": \"5157\",\n", " \"postcode\": \"60625\",\n", " \"street\": \"North Lincoln Ave\"\n", " },\n", "\"amenity\": \"restaurant\",\n", "\"cuisine\": \"mexican\",\n", "\"name\": \"La Cabana De Don Luis\",\n", "\"phone\": \"1 (773)-271-5176\"\n", "}\n", "\n", "You have to complete the function 'shape_element'.\n", "We have provided a function that will parse the map file, and call the function with the element\n", "as an argument. You should return a dictionary, containing the shaped data for that element.\n", "We have also provided a way to save the data in a file, so that you could use\n", "mongoimport later on to import the shaped data into MongoDB. You could also do some cleaning\n", "before doing that, like in the previous exercise, but for this exercise you just have to\n", "shape the structure.\n", "\n", "In particular the following things should be done:\n", "- you should process only 2 types of top level tags: \"node\" and \"way\"\n", "- all attributes of \"node\" and \"way\" should be turned into regular key/value pairs, except:\n", " - attributes in the CREATED array should be added under a key \"created\"\n", " - attributes for latitude and longitude should be added to a \"pos\" array,\n", " for use in geospacial indexing. Make sure the values inside \"pos\" array are floats\n", " and not strings. 
\n", "- if second level tag \"k\" value contains problematic characters, it should be ignored\n", "- if second level tag \"k\" value starts with \"addr:\", it should be added to a dictionary \"address\"\n", "- if second level tag \"k\" value does not start with \"addr:\", but contains \":\", you can process it\n", " same as any other tag.\n", "- if there is a second \":\" that separates the type/direction of a street,\n", " the tag should be ignored, for example:\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " should be turned into:\n", "\n", "{...\n", "\"address\": {\n", " \"housenumber\": 5158,\n", " \"street\": \"North Lincoln Avenue\"\n", "}\n", "\"amenity\": \"pharmacy\",\n", "...\n", "}\n", "\n", "- for \"way\" specifically:\n", "\n", " \n", " \n", "\n", "should be turned into\n", "\"node_ref\": [\"305896090\", \"1719825889\"]\n", "\"\"\"\n", "\n", "\n", "lower = re.compile(r'^([a-z]|_)*$')\n", "lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')\n", "problemchars = re.compile(r'[=\\+/&<>;\\'\"\\?%#$@\\,\\. \\t\\r\\n]')\n", "\n", "CREATED = [ \"version\", \"changeset\", \"timestamp\", \"user\", \"uid\"]\n", "\n", "\n", "def is_address(elem):\n", " if elem.attrib['k'][:5] == \"addr:\":\n", " return True\n", "\n", "\n", "\n", "\n", "def shape_element(element):\n", " # Make an empty dictionary for the output node/element.\n", " node = {}\n", " # Process 'node' or 'way' elements only.\n", " if element.tag == \"node\" or element.tag == \"way\":\n", " address_info = {}\n", " nd_info = []\n", " # Add 'node'/'way' as 'type' \n", " node[\"type\"] = element.tag\n", " # Add 'id' as 'id'\n", " node[\"id\"] = element.attrib[\"id\"]\n", " # If visible exists, add it to dict\n", " if \"visible\" in element.attrib.keys():\n", " node[\"visible\"] = element.attrib[\"visible\"]\n", " # Add 'lat'/'lon' if they ar available\n", " if \"lat\" in element.attrib.keys():\n", " node[\"pos\"] = [float(element.attrib['lat']), float(element.attrib['lon'])]\n", " # Add version, changeset, timestamp, uid, and user under the root node 'created'\n", " node[\"created\"] = {\"version\": element.attrib['version'],\n", " \"changeset\": element.attrib['changeset'],\n", " \"timestamp\": element.attrib['timestamp'],\n", " \"uid\": element.attrib['uid'],\n", " \"user\": element.attrib['user']}\n", " # Iterate through the tags of k,v pairs.\n", " for tag in element.iter(\"tag\"):\n", " #print tag.attrib\n", " p = problemchars.search(tag.attrib['k'])\n", " if p:\n", " # print \"PROBLEM:\", p.group()\n", " # Do nothing currently\n", " continue\n", " elif is_address(tag):\n", " if \":\" in tag.attrib['k'][5:]:\n", " # print \"Bad Address:\", tag.attrib['k'], \"--\", tag.attrib['v']\n", " # first 5 char of address attributes should be 'addr:'\n", " # If they're not, it's a bad address for this script.\n", " # Skip.\n", " continue\n", " else:\n", " # If first 5 char contain ':' (i.e. 'addr:'), add the last part of the string as a key and\n", " # the value from 'v' as the value in our address_info dict.\n", " # i.e. 
'addr:state' will add 'state'\n", " address_info[tag.attrib['k'][5:]] = tag.attrib['v']\n", " #print \"Good Address:\", tag.attrib['k'], \"--\", tag.attrib['v']\n", " else:\n", " # If there's no ':', just add the 'k' as a key, and 'v' as a value in our node dict.\n", " node[tag.attrib['k']] = tag.attrib['v']\n", " #print \"Outside:\", tag.attrib['k'], \"--\", tag.attrib['v']\n", " # If we found 'addr:' info and added it to our address_info dict,\n", " if address_info != {}:\n", " # Then add that address_info dict under the node 'address'\n", " node['address'] = address_info\n", " # Iterate through the 'nd' nodes if they exist.\n", " for tag2 in element.iter(\"nd\"):\n", " # add each entry in a running list.\n", " nd_info.append(tag2.attrib['ref'])\n", " # If the resulting list isn't empty,\n", " if nd_info != []:\n", " # Add the list under the node 'node_refs'\n", " node['node_refs'] = nd_info\n", " return node\n", " else:\n", " # If the element isn't 'node' or 'way', just return None.\n", " return None\n", "\n", "def process_map(file_in, pretty = False):\n", " # You do not need to change this file\n", " file_out = \"{0}.json\".format(file_in)\n", " data = []\n", " with codecs.open(file_out, \"w\") as fo:\n", " for _, element in ET.iterparse(file_in):\n", " el = shape_element(element)\n", " if el:\n", " data.append(el)\n", " if pretty:\n", " fo.write(json.dumps(el, indent=2)+\"\\n\")\n", " else:\n", " fo.write(json.dumps(el) + \"\\n\")\n", " return data\n", "\n", "def test():\n", "\n", " data = process_map('data/example.osm', False)\n", " #pprint.pprint(data)\n", " assert data[0] == {\n", " \"id\": \"261114295\", \n", " \"visible\": \"true\", \n", " \"type\": \"node\", \n", " \"pos\": [\n", " 41.9730791, \n", " -87.6866303\n", " ], \n", " \"created\": {\n", " \"changeset\": \"11129782\", \n", " \"user\": \"bbmiller\", \n", " \"version\": \"7\", \n", " \"uid\": \"451048\", \n", " \"timestamp\": \"2012-03-28T18:31:23Z\"\n", " }\n", " }\n", " assert data[-1][\"address\"] == {\n", " \"street\": \"West Lexington St.\", \n", " \"housenumber\": \"1412\"\n", " }\n", " assert data[-1][\"node_refs\"] == [ \"2199822281\", \"2199822390\", \"2199822392\", \"2199822369\", \n", " \"2199822370\", \"2199822284\", \"2199822281\"]\n", " print \"Passed.\"\n", "\n", "if __name__ == \"__main__\":\n", " test()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Passed.\n" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython import utils\n", "from IPython.core.display import HTML\n", "import os\n", "def css_styling():\n", " \"\"\"Load default custom.css file from ipython profile\"\"\"\n", " base = utils.path.get_ipython_dir()\n", " styles = \"\" % (open(os.path.join(base,'profile_custom1/static/custom/custom.css'),'r').read())\n", " return HTML(styles)\n", "css_styling()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "" ], "metadata": {}, "output_type": "pyout", "prompt_number": 1, "text": [ "" ] } ], "prompt_number": 1 } ], "metadata": {} } ] }