# Download this example's [Ruby code](https://raw.githubusercontent.com/jpmckinney/pupa-ruby/gh-pages/docs/legislator.rb)
# to run locally.
#
# The [cat.rb](http://jpmckinney.github.io/pupa-ruby/docs/cat.html) example goes
# over the basics of using Pupa.rb, and [bill.rb](http://jpmckinney.github.io/pupa-ruby/docs/bill.html)
# covers how to relate objects and how to separate scraping tasks for different
# types of data. This example explains how to run different methods to scrape
# legislators depending on the legislative term - particularly useful if a data
# source changes format from year to year.

require 'pupa'
require 'nokogiri'
# parl.gc.ca uses ASP.NET forms, so we need [bigger guns](http://mechanize.rubyforge.org/).
require 'mechanize'

class LegislatorProcessor < Pupa::Processor
  # The data source publishes information from different parliaments in
  # different formats. We override `scraping_task_method` to select the method
  # used to scrape legislators according to the parliament.
  def scraping_task_method(task_name)
    if task_name == :people
      # If the task is to scrape people and a parliament is given, we select a
      # method according to the parliament.
      if @options.key?('parliament')
        if @options['parliament'].to_i >= 36
          "scrape_people_36th_to_date"
        else
          "scrape_people_1st_to_35th"
        end
      # If no parliament is given, we assume the parliament is recent, as it is
      # more common to scrape current data than historical data.
      else
        "scrape_people_36th_to_date"
      end
    # Otherwise, we use `scraping_task_method`'s default behavior for other
    # scraping tasks.
    else
      super
    end
  end

  # A helper method to put name components in a typical order, e.g. turning
  # "Lastname, Firstname (Suffix)" into "Firstname Lastname".
  def swap_first_last_name(name)
    name.strip.match(/\A([^,]+?), ([^(]+?)(?: \(.+\))?\z/)[1..2].
      reverse.map{|component| component.strip.squeeze(' ')}.join(' ')
  end

  def scrape_people_36th_to_date
    url = 'http://www.parl.gc.ca/MembersOfParliament/MainMPsCompleteList.aspx?TimePeriod=Historical&Language=E'

    doc = if @options.key?('parliament')
      # Since we aren't using the default Faraday HTTP client, we manually
      # configure the Mechanize client to use Pupa.rb's logger.
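      #
      # The parliament filter on this page is an ASP.NET form control, so we
      # fill in its dropdown and submit the form rather than fetching a
      # parliament-specific URL.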
      client = Mechanize.new
      client.log = Pupa::Logger.new('mechanize', level: @level)
      page = client.get(url)
      page.form['MasterPage$MasterPage$BodyContent$PageContent$Content$ListCriteriaContent$ListCriteriaContent$ucComboParliament$cboParliaments'] = @options['parliament']
      page.form.submit.parser
    else
      get(url)
    end

    doc.css('#MasterPage_MasterPage_BodyContent_PageContent_Content_ListContent_ListContent_grdCompleteList tr:gt(1)').each do |row|
      person = Pupa::Person.new
      person.name = swap_first_last_name(row.at_css('td:eq(1)').text)
      dispatch(person)
    end
  end

  def scrape_people_1st_to_35th
    list_url = 'http://www.parl.gc.ca/Parlinfo/Lists/Members.aspx?Language=E'
    page_url = 'http://www.parl.gc.ca/Parlinfo/Lists/Members.aspx?Language=E&Parliament=%s&Riding=&Name=&Party=&Province=&Gender=&New=False&Current=False&First=False&Picture=False&Section=False&ElectionDate='

    doc = get(list_url)
    value = doc.at_xpath("//select[@id='ctl00_cphContent_cboParliamentCriteria']/option[starts-with(.,'#{@options['parliament']}')]/@value").value

    doc = get(page_url % value)
    doc.css('tr:gt(1)').each do |row|
      person = Pupa::Person.new
      person.name = swap_first_last_name(row.at_css('td:eq(1)').text)
      dispatch(person)
    end
  end
end

LegislatorProcessor.add_scraping_task(:people)

# To add scraping method selection criteria when running the processor, call
# `legislator.rb` following the pattern:
#
#     ruby legislator.rb [options] -- [criteria]
#
# So, for example, to scrape and import legislators from the 37th parliament:
#
#     ruby legislator.rb -- parliament 37
#
# Or, to scrape but not import legislators from the 12th parliament:
#
#     ruby legislator.rb --action scrape -- parliament 12
runner = Pupa::Runner.new(LegislatorProcessor)
runner.run(ARGV)

# Tired of scraping and importing data? See [organization.rb](http://jpmckinney.github.io/pupa-ruby/docs/organization.html)
# to learn how to transform scraped data with Pupa.rb.
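
# A closing note on the method selection above, with illustrative values: the
# criteria passed after `--` are what populate `@options`, and the code treats
# the value as a string (hence the `.to_i`). For example:
#
#     ruby legislator.rb -- parliament 37
#     # @options['parliament'] == '37' and '37'.to_i >= 36, so
#     # scraping_task_method(:people) selects "scrape_people_36th_to_date"
#
#     ruby legislator.rb -- parliament 12
#     # '12'.to_i < 36, so "scrape_people_1st_to_35th" is selected instead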