#Z0096

# import from python libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import ceil
import scipy.stats as stats

# import data manipulation tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# import from created modules
from acquire import acquire_mvp, get_sql


#################### Prepare Data ####################


def shed_zscore_outliers(df, exclude=None):
    '''
    Takes in a DataFrame and removes rows with values that are more
    than three standard deviations from the mean

    exclude=None passes all columns into the check, default behavior

    Can pass a column name or list of column names as strings to
    exclude from the z-score check
    '''

    # check if any columns are to be excluded
    if exclude is None:
        # keep only rows where every value is within 3 standard
        # deviations of its column mean
        df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]
        return df
    else:
        # wrap a single column name in a list so both usages work
        cols = [exclude] if isinstance(exclude, str) else exclude
        # keep only rows where every non-excluded value is within 3
        # standard deviations of its column mean
        checked = df.loc[:, ~df.columns.isin(cols)]
        df = df[(np.abs(stats.zscore(checked)) < 3).all(axis=1)]
        return df


def make_dummies(df, cols):
    '''
    Takes in a DataFrame and a list of columns, encoding each listed
    column as one-hot dummy variables while dropping the first level
    '''

    # create dummy variables from passed cols list, drop first level
    df = pd.get_dummies(df, columns=cols, drop_first=True)

    return df


def split_data(df, target):
    '''
    Takes in a DataFrame and the name of the target column, splits the
    data 60/20/20 into train, validate, and test X and y sets, and
    returns a recombined train DataFrame for exploration
    '''

    # assign X, y
    X = df.drop(columns=target)
    y = pd.DataFrame(df[target])
    # split X data into train, validate, and test datasets
    X_train_validate, X_test = train_test_split(X, test_size=0.2,
                                                random_state=19)
    X_train, X_validate = train_test_split(X_train_validate,
                                           test_size=0.25,
                                           random_state=19)
    # split y data into train, validate, and test datasets; the shared
    # random_state keeps the X and y splits row-aligned
    y_train_validate, y_test = train_test_split(y, test_size=0.2,
                                                random_state=19)
    y_train, y_validate = train_test_split(y_train_validate,
                                           test_size=0.25,
                                           random_state=19)
    # create explore DataFrame using train data
    df = pd.concat((X_train, y_train), axis=1)

    return (df, X_train, y_train,
            X_validate, y_validate,
            X_test, y_test)


def prepare_mvp(use_csv=False):
    '''
    Takes the DataFrame from the acquire_mvp function and prepares a
    DataFrame for obtaining and exploring the project MVP, converting
    the fips column into human-readable county names
    '''

    # acquire mvp data from database
    df = acquire_mvp(use_csv=use_csv)
    # remove outliers more than 3 stdev from mean, excluding fips codes
    df = shed_zscore_outliers(df, exclude='fips')
    # remove 0 values from bedrooms and bathrooms
    df = df.loc[((df.bedrooms != 0) & (df.bathrooms != 0))]
    # replace fips numeric codes with string county names and rename
    # column; codes are compared as floats to match the column dtype
    df.fips = np.where(df.fips == 6037.0, 'Los Angeles', df.fips)
    df.fips = np.where(df.fips == 6059.0, 'Orange', df.fips)
    df.fips = np.where(df.fips == 6111.0, 'Ventura', df.fips)
    df = df.rename(columns={'fips': 'county'})

    return df


#################### Visualize Data ####################


def show_distributions(df):
    '''
    Plots the distribution of every column in the DataFrame and
    automatically scales the figure dimensions to the number of
    required subplots
    '''

    # define dimensions for subplots, four histograms per row
    n_rows = ceil(len(list(df)) / 4)
    n_cols = 4
    n_plot = 0
    # define figure dimensions
    plt.figure(figsize=(40, n_rows * 10))
    # start loop for plotting variables
    for col in list(df):
        n_plot += 1
        plt.subplot(n_rows, n_cols, n_plot)
        plt.hist(x=df[col])
        plt.xlabel(col)
        plt.xscale('linear')
    # set figure title
    plt.suptitle('Distribution of Variables')
    plt.show()
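# A minimal usage sketch for the functions above, assuming this module
# is saved as wrangle.py and that the acquire_mvp data carries a
# 'tax_value' target column (both names are assumptions, adjust to the
# actual project schema):
#
#   import wrangle
#
#   df = wrangle.prepare_mvp(use_csv=True)
#   df = wrangle.make_dummies(df, cols=['county'])
#   explore, X_train, y_train, \
#       X_validate, y_validate, \
#       X_test, y_test = wrangle.split_data(df, target='tax_value')
#   wrangle.show_distributions(explore)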
#################### Obtain Wrangled Zillow ####################


query = '''
SELECT
    properties_2017.id AS property_id,
    bedroomcnt,
    calculatedbathnbr,
    calculatedfinishedsquarefeet,
    latitude,
    longitude,
    regionidzip,
    taxvaluedollarcnt
FROM properties_2017
INNER JOIN predictions_2017 USING(parcelid)
LEFT JOIN propertylandusetype USING(propertylandusetypeid)
WHERE propertylandusetypeid IN (261, 263, 264, 266, 268, 275, 276, 279)
    AND CAST(transactiondate AS DATE) BETWEEN '2017-05-01' AND '2017-08-31'
ORDER BY properties_2017.id ASC;
'''


def wrangle_zillow(use_csv=True):
    '''
    Acquires the Zillow data with the module query, drops null values,
    engineers per-ZIP latitude and longitude ratio features, removes
    z-score outliers, and returns train, validate, and test splits
    with taxvaluedollarcnt as the target
    '''

    # acquire zillow data from database or cached csv
    df = get_sql(query, 'zillow', use_csv=use_csv)
    # drop null values and set property_id as the index
    df = df.dropna()
    df = df.set_index('property_id')
    # engineer latitude and longitude ratios relative to ZIP code
    df['latzip'] = df.latitude / df.regionidzip
    df['lonzip'] = df.longitude / df.regionidzip
    # remove outliers more than 3 stdev from mean
    df = shed_zscore_outliers(df)
    # split into train, validate, and test datasets
    df, \
        X_train, y_train, \
        X_validate, y_validate, \
        X_test, y_test = split_data(df, 'taxvaluedollarcnt')

    return X_train, y_train, X_validate, y_validate, X_test, y_test
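
# MinMaxScaler is imported above but never used in this module; below
# is a minimal sketch of how it might be applied to the X splits,
# fitting on train only so validate and test information does not leak
# into the scaler (the scale_data name is an assumption, not part of
# the original module)

def scale_data(X_train, X_validate, X_test):
    '''
    Fits a MinMaxScaler on the training data and transforms all three
    X splits, returning scaled DataFrames with the original columns
    and indices preserved
    '''

    # fit the scaler on train only
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    # transform each split, rebuilding DataFrames from the arrays
    X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                                  columns=X_train.columns,
                                  index=X_train.index)
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate),
                                     columns=X_validate.columns,
                                     index=X_validate.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns,
                                 index=X_test.index)

    return X_train_scaled, X_validate_scaled, X_test_scaled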