# -*- coding: utf-8 -*-
""" Web scraping script using requests and bs4
    Used for web scraping tutorial session by the Stanford Data Science
    Drop-in. February 2, 2015.

    Requires:
    BeautifulSoup(http://www.crummy.com/software/BeautifulSoup)
        pip install beautifulsoup4
            or
        easy_install beautifulsoup4
    Requests(http://docs.python-requests.org/)
        pip install requests
            or
        easy_install requests

    This will collect and print  
    {Actor Name}    {Character Name}
    from the cast overview in a given IMDb.com movie page
@author: Jongbin Jung (jongbin at stanford edu)
"""
from bs4 import BeautifulSoup
import requests


def clean_text(text):
    """ Collapse every run of white-space in a string into a single space

    Leading and trailing white-space is dropped entirely.

    :param text: the string to clean
    :return: the white-space-separated pieces of text, joined back together
    with exactly one space between them
    """
    # split() with no argument splits on any run of white-space and never
    # yields empty pieces, so joining restores single-space separation.
    words = text.split()
    return ' '.join(words)


""" Go to the IMDb Movie page in link, and find the cast overview list.
    Prints tab-separated actor_name and character_played to stdout as a
    result.
"""
link = 'http://www.imdb.com/title/tt0111161/?ref_=nv_sr_1'
# Bound the wait so a stalled connection cannot hang the script forever, and
# fail loudly on an HTTP error instead of silently parsing an error page.
movie_page = requests.get(link, timeout=30)
movie_page.raise_for_status()

# Parse the whole movie page. The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed (and so
# bs4 does not warn about having to guess one).
soup = BeautifulSoup(movie_page.content, 'html.parser')

# Iterate through every table row and extract the actor name and character.
# Remember that some rows might not be a row of interest (e.g., a blank
# row for spacing the layout). Such rows lack at least one of the two
# elements we look for, in which case find() returns None and the row is
# skipped.
for row in soup.find_all('tr'):
    actor_tag = row.find(itemprop='name')
    character_tag = row.find(class_='character')
    if actor_tag is None or character_tag is None:
        continue

    actor = clean_text(actor_tag.text)
    character = clean_text(character_tag.text)

    # A single parenthesized argument prints identically on Python 2 and 3.
    print('\t'.join([actor, character]))