"""
Data input with Pandas for Data Bootcamp course.

Course materials
* http://databootcamp.nyuecon.com/
* https://github.com/NYUDataBootcamp/Materials

Written by Dave Backus, August 2015
Created with Python 3.4

Edited by Chase Coleman and Spencer Lyon, October 2016
Created with Python 3.5
"""

#%%
"""
Read csv file from internet (and why we like csv's)
The result is a data frame:  like a sheet with row and column labels
"""
import pandas as pd

# read file from url
url1 = 'https://raw.githubusercontent.com/NYUDataBootcamp'
url2 = '/Materials/master/Data/test.csv'
url  = url1 + url2
df = pd.read_csv(url)

# if the internet is down
#df_fromdict = pd.DataFrame({'name': ['Dave', 'Chase', 'Spencer'],
#            'x1': [1, 4, 5], 'x2': [2, 3, 6], 'x3': [3.5, 4.3, 7.8]})


#%%

# windows users only
# folder = "\C:\Users"  # WILL NOT WORK
folder = "\\C:\\Users"  # Good
folder = "/C:/Users"  # Good


# NEED TO CHANGE TO YOUR OWN PATH HERE
folder = "/Users/sglyon/Teaching/NYUDataBootcamp/Materials/Data/"
csv_file = folder + "test.csv"
df = pd.read_csv(csv_file)


#%%
"""
Examples
"""
import pandas as pd

# Penn World Table
url = 'http://www.rug.nl/research/ggdc/data/pwt/v81/pwt81.xlsx'
pwt = pd.read_excel(url, sheetname='Data')

#%%
# World Economic Outlook
url1 = 'https://www.imf.org/external/pubs/ft/weo/'
url2 = '2015/02/weodata/WEOOct2015all.xls'
weo = pd.read_csv(url1+url2,
                  sep='\t',                 # \t = tab
                  thousands=',',            # kill commas
                  na_values=['n/a', '--'])  # missing values

#%%
# PISA test scores
url = 'http://dx.doi.org/10.1787/888932937035'
pisa = pd.read_excel(url,
                     skiprows=18,       # skip the first 18 rows
                     skipfooter=7,      # skip the last 7
                     parse_cols=[0,1,9,13],   # select columns of interest
                     index_col=0,       # set the index as the first column
                     header=[0,1]       # set the variable names
                     )


#%%
pisa = pisa.dropna()                            # drop blank lines
pisa.columns = ['Math', 'Reading', 'Science']   # simplify variable names
pisa['Math'].plot(kind='barh', figsize=(5, 12))

#%%
# UN population data
url1 = 'http://esa.un.org/unpd/wpp/DVD/Files/'
url2 = '1_Indicators%20(Standard)/EXCEL_FILES/1_Population/'
url3 = 'WPP2015_POP_F07_1_POPULATION_BY_AGE_BOTH_SEXES.XLS'
url = url1 + url2 + url3

cols = [2, 4, 5] + list(range(6,28))
est = pd.read_excel(url, sheetname=0, skiprows=16, parse_cols=cols)

#%%
# income by colleage major
url1 = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/'
url2 = 'college-majors/recent-grads.csv'
url = url1 + url2
df538 = pd.read_csv(url)
df538 = df538.set_index("Major")
df538["Median"].plot(kind="barh", figsize=(5, 12))

#%%
# IMDb movies and parts
# WARNING: this file is approx 200 MB -- this might take a while
url  = 'http://pages.stern.nyu.edu/~dbackus/Data/cast.csv'
cast = pd.read_csv(url, encoding='utf-8')

#%% first 2016 presdidential debate
# NOTE: data came from here:
# https://www.kaggle.com/mrisdal/2016-us-presidential-debates
url1 = "https://raw.githubusercontent.com/NYUDataBootcamp/Materials/"
url2 = "master/Data/pres_debate_2016.csv"
url= url1 + url2
debate = pd.read_csv(url)

# who spoke more
trump = debate[debate["Speaker"] == "Trump"]
clinton = debate[debate["Speaker"] == "Clinton"]

print("Length of Trumps's talking ", len(trump["Text"].sum()))
print("Length of Clinton's talking ", len(clinton["Text"].sum()))

#%%
"""
APIs
"""
from pandas_datareader import data  # Package to access FRED
import datetime as dt               # package to handle dates

start = dt.datetime(2010, 1, 1)  # start date
codes = ['GDPC1', 'PCECC96']     # real GDP, real consumption
fred  = data.DataReader(codes, 'fred', start)
fred = fred/1000                # convert billions to trillions

fred.plot()

#%%
# World Bank
from pandas_datareader import wb   # World Bank api

var = ['NY.GDP.PCAP.PP.KD']         # GDP per capita
iso = ['USA', 'FRA', 'JPN', 'CHN', 'IND', 'BRA', 'MEX']  # country codes
year = 2013
wbdf = wb.download(indicator=var, country=iso, start=year, end=year)

#%%
wbdf = wbdf.reset_index(level='year', drop=True)
wbdf.plot(kind='barh')

#%%
# Fama-French equity returns
from pandas_datareader import data  # Package to access FF

ff = data.DataReader('F-F_Research_Data_factors', 'famafrench')[0]
ff.columns = ['xsm', 'smb', 'hml', 'rf']      # rename variables

#%%
"""
Review
"""
data = {'EG.ELC.ACCS.ZS': [53.2, 47.3, 85.4, 22.1],    # access to elec (%)
        'IT.CEL.SETS.P2': [153.8, 95.0, 130.6, 74.8],  # cell contracts per 100
        'IT.NET.USER.P2': [11.5, 12.9, 41.0, 13.5],    # internet access (%)
        'Country': ['Botswana', 'Namibia', 'South Africa', 'Zambia']}
af = pd.DataFrame(data)