# Download this example's [Ruby code](https://raw.githubusercontent.com/jpmckinney/pupa-ruby/gh-pages/docs/bill.rb)
# to run locally.
#
# The [cat.rb](http://jpmckinney.github.io/pupa-ruby/docs/cat.html) example goes
# over the basics of using Pupa.rb. This example covers how to relate objects
# and how to separate scraping tasks for different types of data.
require 'pupa'

require 'nokogiri'

# Defines a new class to model legislative bills. In this example, we will
# simply scrape the names of bills and associate each bill with a sponsor and a
# legislative body.
class Bill
  include Pupa::Model

  attr_accessor :number, :name, :sponsor_id, :organization_id
  # Only readers are generated for the foreign objects, because their setters
  # are written by hand below.
  attr_reader :sponsor, :organization

  # When saving scraped objects to a database, these foreign keys will be used
  # to derive an evaluation order.
  foreign_key :sponsor_id, :organization_id

  # Sometimes, you may not know the ID of an existing foreign object, but you
  # may have other information to identify it. In that case, put the information
  # you have in a property named after the foreign key without the `_id` suffix:
  # for example, `sponsor` for `sponsor_id`. Before saving the object to the
  # database, Pupa.rb will use this information to identify the foreign object.
  foreign_object :sponsor, :organization

  # We want to dump all properties, including foreign objects, to JSON after
  # scraping. However, we do not want to import foreign objects into MongoDB.
  # Pupa.rb automatically excludes foreign objects during import.
  dump :number, :name, :sponsor_id, :organization_id, :sponsor, :organization

  # Overrides the `sponsor=` setter to automatically add the `_type` property,
  # instead of having to add it each time in the processor. The given hash is
  # merged over the default, so a `:_type` key supplied by the caller wins.
  def sponsor=(sponsor)
    @sponsor = {_type: 'pupa/person'}.merge(sponsor)
  end

  # Does the same for `organization=`, defaulting `_type` to
  # `pupa/organization`.
  def organization=(organization)
    @organization = {_type: 'pupa/organization'}.merge(organization)
  end

  # Returns only the `number` property of the bill, so that two bills with the
  # same number have the same fingerprint. (Presumably Pupa.rb uses this to
  # recognize a bill it has already imported - see the cat.rb example.)
  def fingerprint
    to_h.slice(:number)
  end

  # Returns the bill's name. `to_s` must always return a String, so a bill
  # whose name has not been set yet is rendered as an empty string rather than
  # `nil`.
  def to_s
    name.to_s
  end
end

# Scrapes legislative information about the Parliament of Canada.
class ParliamentOfCanada < Pupa::Processor
  # Instead of defining a single `scrape_objects` method to perform all the
  # scraping, we define a scraping task for each type of data we want to scrape:
  # people, organizations and bills.
  #
  # This will let us later, for example, run each task on a different schedule.
  # Bill data is updated more frequently than person data; we would therefore
  # run the bills task more frequently.
  #
  # See the [`scraping_task_method`](https://github.com/jpmckinney/pupa-ruby/blob/master/lib/pupa/processor.rb#L222)
  # documentation for more information on the naming of scraping methods.
  def scrape_people
    doc = get('http://www.parl.gc.ca/MembersOfParliament/MainMPsCompleteList.aspx?TimePeriod=Historical&Language=E')
    doc.css('#MasterPage_MasterPage_BodyContent_PageContent_Content_ListContent_ListContent_grdCompleteList tr:gt(1)').each do |row|
      # The first cell is expected to read `Before, After`, optionally followed
      # by a parenthesized note. Both `at_css` and `match` return `nil` when
      # nothing matches, so we skip such a row with a warning instead of letting
      # a single malformed row abort the whole task with a `NoMethodError`.
      cell = row.at_css('td:eq(1)')
      text = cell && cell.text
      match = text && text.match(/\A([^,]+?), ([^(]+?)(?: \(.+\))?\z/)
      unless match
        warn("Skipping row with an unrecognized name: #{text.inspect}")
        next
      end

      person = Pupa::Person.new
      # Swap the two parts around the comma (presumably "Family, Given" becomes
      # "Given Family") and collapse any repeated spaces.
      person.name = match[1..2].reverse.map{|component| component.strip.squeeze(' ')}.join(' ')
      # Some bills omit sponsors' middle names, so we add an alternate name that
      # omits any middle names.
      components = person.name.split(' ')
      person.add_name("#{components.first} #{components.last}")
      dispatch(person)
    end
  end

  # Hardcodes the top-level organizations within Parliament. Both chambers
  # point to the Parliament through `parent_id`.
  def scrape_organizations
    parliament = Pupa::Organization.new(name: 'Parliament of Canada')
    dispatch(parliament)

    house_of_commons = Pupa::Organization.new(name: 'House of Commons', parent_id: parliament._id)
    dispatch(house_of_commons)

    senate = Pupa::Organization.new(name: 'Senate', parent_id: parliament._id)
    dispatch(senate)
  end

  # Scrapes the bills of one parliamentary session. The response is read as
  # nested hashes, with one entry per bill under `Bills` > `Bill`.
  def scrape_bills
    doc = get('http://www.parl.gc.ca/LegisInfo/Home.aspx?language=E&ParliamentSession=41-1&Mode=1&download=xml')
    doc['Bills']['Bill'].each do |row|
      # The prefix is needed three times below, so we look it up once.
      prefix = row['BillNumber']['prefix']

      # Skip Senate bills, since we currently only scrape Members of Parliament.
      next if prefix == 'S'

      bill = Bill.new
      bill.number = prefix + row['BillNumber']['number']
      bill.name = row['BillTitle']['Title'].find{|x| x['language'] == 'en'}['__content__']
      # Here, we tell the Bill everything we know about the sponsor and the
      # legislative body. Pupa.rb will later determine which objects match the
      # given information.
      name = row['SponsorAffiliation']['Person']['FullName']
      # The sponsor may be recorded under either its primary name or one of the
      # alternate names added in `scrape_people`.
      bill.sponsor = {
        '$or' => [
          {'name' => name},
          {'other_names.name' => name},
        ],
      }
      bill.organization = {
        name: prefix == 'C' ? 'House of Commons' : 'Senate',
      }
      dispatch(bill)
    end
  end
end

# Registers one scraping task for each `scrape_*` method of the processor, in
# this order.
[:bills, :organizations, :people].each do |task|
  ParliamentOfCanada.add_scraping_task(task)
end

# By default, if you run `bill.rb`, it will perform all scraping tasks and
# import all the scraped objects into the database. Use the `--action` and
# `--task` switches to control the processor's behavior.
Pupa::Runner.new(ParliamentOfCanada).run(ARGV)

# Ready for more? Check out the next example: [legislator.rb](http://jpmckinney.github.io/pupa-ruby/docs/legislator.html).