import urllib2
import json
import lxml.html
import pandas as pd
import numpy as np
act = "Angelina Jolie"
act = urllib2.quote(act.encode("utf-8"))
response = urllib2.urlopen('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q='+act)
data = json.load(response)
try:
act_id = data['name_exact'][0]['id']
except:
act_id = data['name_popular'][0]['id']
url = urllib2.urlopen('http://www.imdb.com/name/%s/'%(act_id))
html = url.read()
tree = lxml.html.fromstring(html)
elements = tree.find_class("filmo-row")
movie_list = []
for element in elements:
[movie_role, movie_id] = element.get('id').split("-")
if (movie_role == "actor") or (movie_role == "actress"):
movie_url = urllib2.urlopen("http://www.omdbapi.com/?i=%s&tomatoes=true"%(movie_id))
movie_data = json.load(movie_url)
if movie_data['Response'] == "True":
print "Title: ",movie_data['Title']
print "Rating: ",movie_data['imdbRating']
movie_values = [movie_id, movie_data['Title'], movie_data['Year'], movie_data['Released'], movie_data['Metascore'], movie_data['imdbRating'], movie_data['tomatoRating']]
movie_list.append(movie_values)
df = pd.DataFrame(movie_list, columns=['id', 'Title', 'Year', 'Released', 'Metascore', 'imdbRating', 'tomatoRating'])
df.set_index('id')
df.imdbRating = pd.to_numeric(df.imdbRating, errors='coerce')
print df.imdbRating.describe()
print df[['Title', 'Year','imdbRating']].sort_values("imdbRating",ascending=0).head()
print df[['Title', 'Year','imdbRating']].sort_values("imdbRating",ascending=1).head()