# 2017-08-18 Building the Data Download Function

## Function (Including Packages)



In [1]:
# define a function to download the data if not already downloaded, and then read it in

import pandas_datareader
from pandas import DataFrame, Series
import pandas as pd
import os
from urllib.request import urlretrieve

def getdata_read_or_download(filename, source_URL, force_download = False):
    '''Read a CSV file into a pandas dataframe, downloading it first if needed.

    The data is read from `filename` (a relative name is looked up in the
    current working directory). If that file does not exist, or if
    `force_download` is set, it is first fetched from `source_URL`. Forcing
    the download is useful when the local file is corrupt or out of date.

    The download is written to "<filename>.part" and only moved over
    `filename` once it has finished, so a failed or interrupted download
    neither leaves a truncated file behind nor destroys an existing copy.

    Parameters:
    ===========

    filename : string
        location of the already-downloaded data (and where a fresh
        download is stored)
    source_URL : string
        location of the data on the internet
    force_download : boolean (optional)
        if True, force re-download of the data even if `filename` exists

    Returns:
    ========

    dataframe : pandas dataframe
        the data file for the analysis

    Raises:
    =======

    Any error raised by `urlretrieve` while downloading or by
    `pd.read_csv` while parsing is propagated unchanged.
    '''

    if force_download or not os.path.exists(filename):
        partial_path = os.fspath(filename) + ".part"
        try:
            urlretrieve(source_URL, partial_path)
            # Swap the finished download into place in a single step.
            os.replace(partial_path, filename)
        finally:
            # Only still present if the download or the swap failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return pd.read_csv(filename)

In [12]:
# NOTE(review): each `!` line appears to run in its own subshell, so this `cd`
# would not carry over to the `pwd` below (the recorded output path does not
# end in that directory) -- confirm; `%cd` is the magic that changes the
# kernel's working directory.
# NOTE(review): `getdata_read_or_download` is the name of the function defined
# above; no directory of that name is visible here -- confirm the target.
!cd getdata_read_or_download
! pwd

/Users/delong/Dropbox/jupyter notebook files (.ipynb)/2017-08-05-delong-jupyter


## Tests:

In [2]:
# Test: read in the 2014 data (downloaded from the web unless "pandp.csv" already exists locally)

# The cache file name is relative, so it lands in the current working directory.
ccuds_pandp_data = getdata_read_or_download(
    "pandp.csv",
    "http://delong.typepad.com/2017-08-15-distance-to-frontier-2014-3.csv",
)

# ccuds_pandp_data = pd.read_csv(
#    'http://delong.typepad.com/2017-08-15-distance-to-frontier-2014-3.csv')

In [3]:
# Test: check to see if the data is in a comprehensible format...
# .info() prints the index range, each column's non-null count and dtype,
# and the frame's memory usage.

ccuds_pandp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189 entries, 0 to 188
Data columns (total 4 columns):
code                               189 non-null object
country                            189 non-null object
distance_to_frontier_2014          189 non-null float64
national_income_per_capita_2014    189 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 6.0+ KB


In [4]:
# Test: check to see if the data looks like the right data...
# Preview only the first rows rather than displaying the whole 189-row frame.

ccuds_pandp_data.head(10)

Unnamed: 0,code,country,distance_to_frontier_2014,national_income_per_capita_2014
0,TCD,Chad,32.06,2141
1,CAF,Central African Republic,32.75,578
2,ERI,Eritrea,32.81,1140
3,SSD,South Sudan,34.07,2574
4,LBY,Libya,35.43,14887
5,ZAR,"Congo, Dem. Rep.",37.80,768
6,VEN,"Venezuela, RB",38.81,16666
7,COG,"Congo, Rep.",40.24,5905
8,AFG,Afghanistan,40.78,1877
9,HTI,Haiti,42.82,1670
