# Download this example's [Ruby code](https://raw.githubusercontent.com/jpmckinney/pupa-ruby/gh-pages/docs/organization.rb)
# to run locally.
#
# The [cat.rb](http://jpmckinney.github.io/pupa-ruby/docs/cat.html),
# [bill.rb](http://jpmckinney.github.io/pupa-ruby/docs/bill.html) and
# [legislator.rb](http://jpmckinney.github.io/pupa-ruby/docs/legislator.html)
# examples show you how to scrape and import data. This example shows you how
# to transform scraped data.
require 'pupa'
require 'csv'

# We're going to scrape organizations and output them as CSV, which we can then
# upload to the [Open Knowledge Foundation](https://github.com/okfn/publicbodies)'s
# [Public Bodies](http://publicbodies.org/) project.
class PublicBodyProcessor < Pupa::Processor
  # The CSV columns, in output order. Every row is built from this list, so a
  # row can never drift out of alignment with the header.
  CSV_HEADERS = %w[
    title
    abbr
    key
    category
    parent
    parent_key
    description
    url
    jurisdiction
    jurisdiction_code
    source
    source_url
    address
    contact
    email
    tags
  ].freeze

  # This transformation task will write a CSV row for each scraped organization.
  # You can name transformation tasks whatever you like.
  def csv
    puts CSV.generate_line(CSV_HEADERS)

    # `organizations` is a lazy enumerator of all scraped organizations, so
    # we'll see a CSV row printed as soon as an organization is scraped.
    organizations.each do |organization|
      # Only the first source is exported; `scrape_organizations` adds exactly
      # one source per organization.
      source = organization.sources[0]

      # Key each value by its column name. Columns we have no data for (`abbr`,
      # `parent`, `parent_key`, `description`, `url`, `tags`) are simply left
      # out of the hash and are written as empty cells by `values_at`.
      values = {
        'title' => organization.name,
        'key' => organization._id,
        'category' => organization.classification,
        'jurisdiction' => 'New Brunswick',
        'jurisdiction_code' => 'ocd-division/country:ca/province:nb',
        'source' => source[:note],
        'source_url' => source[:url],
        'address' => organization.contact_details.address,
        # Must be the same extras key that `scrape_organizations` writes.
        'contact' => organization.extras[:contact_point],
        'email' => organization.contact_details.email,
      }

      puts CSV.generate_line(values.values_at(*CSV_HEADERS))
    end
  end

  # To keep this example short, we'll just scrape the departments and agencies
  # of the Government of New Brunswick.
  def scrape_organizations
    url = 'http://www1.gnb.ca/cnb/DsS/display-e.asp?typyofPublicBodyID=1'
    doc = get(url)

    # Each nested table matched here is treated as one organization.
    doc.xpath('//table[4]//table').each do |table|
      organization = Pupa::Organization.new
      organization.name = table.at_xpath('.//u').text

      # The address is the text between the organization's name and the first
      # of the "Co-ordinator:", "Email:", "Phone:" or "Fax:" labels. Runs of
      # whitespace are collapsed into single line breaks.
      address_pattern = /\A#{Regexp.escape(organization.name)}(.+?)(?=Co-ordinator:|Email:|Phone:|Fax:)/m
      address = table.text.strip[address_pattern, 1].gsub(/[[:space:]]{2,}/, "\n").strip

      # The email address is taken from the first link's `href`, minus its
      # `mailto:` scheme.
      email = clean(table.at_xpath('.//a/@href').value).sub(/\Amailto:/, '')

      # The co-ordinator's name is the node following the underlined
      # "Co-ordinator" label, minus the leading ": ".
      contact_point = table.at_xpath('.//u[text()="Co-ordinator"]').next.text.sub(/\A: /, '')

      organization.add_contact_detail('address', address)
      # Stored under `:contact_point`, which is the key the `csv` task reads.
      organization.add_extra(:contact_point, contact_point)
      organization.add_contact_detail('email', email)
      organization.add_source(url, note: 'New Brunswick Directory of Public Bodies')

      dispatch(organization)
    end
  end
end

PublicBodyProcessor.add_scraping_task(:organizations)

runner = Pupa::Runner.new(PublicBodyProcessor)
# Registers the `csv` action, so that we can run it with:
#
#     ruby organization.rb --action csv > output.csv
runner.add_action(name: 'csv', description: 'Output organizations as CSV')
runner.run(ARGV)

# You've won at Pupa.rb! Explore the [class and method documentation](http://rdoc.info/gems/pupa)
# to learn how to do even more with Pupa.rb.