In [1]:
import pandas as pd

# Notebook-wide pandas display settings.
# NOTE(review): the float formatter below appears to take precedence over
# `precision` for float columns (the tables in this notebook show five
# decimals) — confirm whether the precision setting is still needed.
pd.options.display.precision = 3
pd.options.display.float_format = lambda num: '%.5f' % num

import warnings

import janitor
import numpy as np
import pingouin as pg

# Install a blanket "ignore" filter so warnings do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Location of the sleep75 CSV and the columns discarded right after loading.
_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"
drop_var = ['case', 'leis1', 'leis2', 'leis3']

# Read the CSV (its first column becomes the index) and remove the unwanted columns.
df = (
    pd.read_csv(_url, index_col=0)
    .drop(columns=drop_var)
    # Disabled step, kept for reference:
    # .assign(lspsepay=lambda df: np.log1p(df.spsepay))
)
df.head(3)

Unnamed: 0,age,black,clerical,construc,educ,earns74,gdhlth,inlf,smsa,lhrwage,...,spwrk75,totwrk,union,worknrm,workscnd,exper,yngkid,yrsmarr,hrwage,agesq
1,32,0,0.0,0.0,12,0,0,1,0,1.95586,...,0,3438,0,3438,0,14,0,13,7.07,1024
2,31,0,0.0,0.0,14,9500,1,1,0,0.35767,...,0,5020,0,5020,0,11,0,0,1.43,961
3,44,0,0.0,0.0,17,42500,1,1,1,3.02189,...,1,2815,0,2815,0,21,0,0,20.53,1936


In [2]:
# Prepare variable labels (fold cell)
# Label source: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
df_label = (
    pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
    # Build `label` from `des`, discarding every character that is not ASCII.
    .assign(label=lambda raw: raw['des'].str.encode('ascii', 'ignore').str.decode('ascii'))
    .drop(columns='des')
    # Index by variable name so the rows listed in `drop_var` can be removed by label,
    # then restore `var` as an ordinary column.
    .set_index('var')
    .drop(index=drop_var)
    .reset_index()
)

df_label.head(3)

Unnamed: 0,var,group,label
0,age,age,in years
1,black,other factors,=1 if black
2,clerical,occupation,=1 if clerical worker


In [3]:
# Keyword options forwarded to every `pg.linear_regression` call in this cell.
stdopts = {'relimp': False, 'remove_na': True}

# Regressors shared by all models; the outcome is always `sleep`.
x = [
    "age",
    "black",
    "clerical",
    "construc",
    "educ",
    "gdhlth",
    "inlf",
    "smsa",
    "lhrwage",
    "prot",
    "selfe",
    "south",
    "spsepay",
    "totwrk",
    "yrsmarr",
]


def _fit_sleep_model(data, model, subset=None):
    """Regress `sleep` on the columns in `x` and tag the result with a model name.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding `sleep` and every column listed in `x`.
    model : str
        Value written to the `model` column of the returned table.
    subset : str, optional
        `DataFrame.query` expression selecting the estimation sample.
        When omitted, all rows of `data` are used.

    Returns
    -------
    pandas.DataFrame
        The table returned by `pg.linear_regression`, plus a `model` column.
    """
    # Filter once so predictors and outcome come from the same sample.
    sample = data if subset is None else data.query(subset)
    return (pg.linear_regression(sample[x], sample['sleep'], **stdopts)
            .assign(model=model))


# One regression per sample; the individual tables stay available by name.
lm_all = _fit_sleep_model(df, 'all')
lm_male = _fit_sleep_model(df, 'men', 'male==1')
lm_female = _fit_sleep_model(df, 'women', 'male==0')
lm_kids = _fit_sleep_model(df, 'young kids', 'yngkid==1')

df_results = (pd.concat([lm_all, lm_female, lm_male, lm_kids])
              .query('names!="Intercept"')
              .reset_index(drop=True)
              # Get labels: every coefficient row must match at most one label row.
              .rename_column('names', 'var')
              .merge(df_label, how='left', on='var', validate='m:1')
              .sort_values(['var', 'model', 'group'])
              .reset_index(drop=True)
              # Tidy up columns: shorter names for the confidence-interval bounds.
              .rename_column("CI[2.5%]", "ll")
              .rename_column("CI[97.5%]", "hl")
             )
df_results

Unnamed: 0,var,coef,se,T,pval,r2,adj_r2,ll,hl,model,group,label
0,age,0.99489,1.96925,0.50521,0.61362,0.12729,0.10366,-2.87382,4.8636,all,age,in years
1,age,1.48289,2.88546,0.51392,0.60772,0.18038,0.13911,-4.19724,7.16302,men,age,in years
2,age,0.9532,2.94321,0.32386,0.74634,0.10911,0.05342,-4.84672,6.75312,women,age,in years
3,age,22.63402,15.49532,1.4607,0.14931,0.17815,-0.01362,-8.36124,53.62927,young kids,age,in years
4,black,-84.79661,82.15012,-1.03222,0.30245,0.12729,0.10366,-246.18572,76.59249,all,other factors,=1 if black
5,black,-132.96711,132.58117,-1.00291,0.31678,0.18038,0.13911,-393.95764,128.02342,men,other factors,=1 if black
6,black,-68.35063,108.90175,-0.62764,0.53088,0.10911,0.05342,-282.95362,146.25236,women,other factors,=1 if black
7,black,-119.74104,296.43295,-0.40394,0.68769,0.17815,-0.01362,-712.69523,473.21315,young kids,other factors,=1 if black
8,clerical,22.50583,48.96326,0.45965,0.64596,0.12729,0.10366,-73.68559,118.69725,all,occupation,=1 if clerical worker
9,clerical,-229.27596,102.88822,-2.2284,0.02665,0.18038,0.13911,-431.81492,-26.737,men,occupation,=1 if clerical worker


In [4]:
# Write the combined coefficient table to CSV; index=False leaves the row index out of the file.
df_results.to_csv('../examples/data/sleep-mmodel.csv', index=False)

In [5]:
# _cols = ['var', 'label', 'coef', 'model', 'group', 'pval', 'll', 'hl']
# df_results[_cols].head(6).to_markdown()