# Modeling in Python

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn as sl

  from pandas.core import datetools


In [21]:
# Toy dataset: two predictor columns (x0, x1) and one response column (y),
# five observations each.
data = dict(
    x0=[1, 2, 3, 4, 5],
    x1=[0.01, -0.01, 0.25, -4.1, 0.0],
    y=[-1.5, 0.0, 3.6, 1.3, -2.0],
)
df = pd.DataFrame(data)
df

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [9]:
# Underlying data as a single NumPy array; the integer x0 column is shown
# as floats because all three columns share one array.
df.values

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [10]:
# Column labels of the frame, returned as a pandas Index.
df.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [17]:
# Relabel the columns on a copy rather than on `df` itself: the patsy formula
# "y ~ x0 + x1" used later in this notebook looks the columns up by their
# original names, so mutating `df` here would break a top-to-bottom run.
df_renamed = df.copy()
df_renamed.columns = ["one", "two", "three"]
df_renamed

Unnamed: 0,one,two,three
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


## patsy

In [19]:
import patsy

The `patsy.dmatrices` function takes a formula string along with a dataset and produces design matrices for a linear model:

In [22]:
# Build the response (y) and predictor (X) design matrices from the formula.
# As the displayed X shows, an Intercept column is added next to x0 and x1.
y, X = patsy.dmatrices("y ~ x0 + x1", data=df)

In [23]:
# Response design matrix: a single column holding the `y` values.
y

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

In [24]:
# Predictor design matrix: Intercept, x0 and x1 columns.
X

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

## statsmodels

In [28]:
# Set up an ordinary least squares model with response y and predictors X
# (the patsy design matrices). Nothing is estimated until .fit() is called.
model = sm.OLS(y, X)

In [29]:
# Estimate the model; `fit` is the results object whose summary() is printed below.
fit = model.fit()

In [35]:
# Print the plain-text regression summary (fit statistics and coefficient table).
print(fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                 -0.915
Method:                 Least Squares   F-statistic:                   0.04431
Date:                Mon, 30 Oct 2017   Prob (F-statistic):              0.958
Time:                        03:13:21   Log-Likelihood:                -10.515
No. Observations:                   5   AIC:                             27.03
Df Residuals:                       2   BIC:                             25.86
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3129      3.313      0.094      0.9



## scikit-learn

In [36]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Load the Titanic training set; the path is relative to the notebook's
# working directory. Then preview the first five rows.
train = pd.read_csv("datasets/titanic/train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
# Replace any missing ages with the median age of the training set.
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(impute_value)

# Encode sex as a 0/1 indicator column (1 where Sex is 'female').
train['IsFemale'] = train['Sex'].eq('female').astype(int)

# Feature matrix and target vector as plain NumPy arrays for scikit-learn.
predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train.loc[:, predictors].values
y_train = train.loc[:, 'Survived'].values

In [37]:
# Logistic regression classifier created with no arguments (library defaults).
# NOTE: this rebinds `model`, which earlier in the notebook held the statsmodels OLS model.
model = LogisticRegression()

In [42]:
# Train the classifier on the prepared arrays.
# NOTE: this rebinds `fit` (earlier the OLS results object); the repr printed
# below shows it is now a LogisticRegression estimator.
fit = model.fit(X_train, y_train)

In [45]:
# Show the estimator's repr, which lists the hyperparameters it was created with.
print(fit)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
