# Feature Data

### This notebook serves as demonstration of how our data is created and provides an overview of what kind of information is available in each feature in the data set

In [1]:
import train
import build_features

import pickle
import numpy as np
import pandas as pd
import importlib as imp
from scipy.stats import norm

pd.options.display.max_columns = 500
pd.options.display.max_rows = 20000
pd.options.display.width = 20000
pd.options.display.float_format = '{:,.4f}'.format

## Load the data dictionaries

In [2]:
d1 = pickle.load(open('../python objects/patientdata_20170823.pkl', 'rb'))
d1mom = pickle.load(open('../python objects/patient_mother_data_20170724.pkl', 'rb'))
lat_lon_dic = pickle.load(open('../python objects/lat_lon_data_20180329.pkl', 'rb'))
env_dic= pickle.load(open('../python objects/census_data_20170920.pkl', 'rb'))
d1mom_hist = pickle.load(open('../python objects/full_lutheran_mother_data.pkl', 'rb'))

## Set the data creation parameters

In [3]:
agex_low = 4.5
agex_high = 5.5
months_from = 0
months_to = 24

label_ix = {'underweight':0,'normal':1,'overweight':2,'obese':3,'class I severe obesity':4,'class II severe obesity':5}

## Convert the data from a nested dictionary format to a user-friendly matrix format

In [4]:
x1,y1,y1label,feature_headers,mrns = build_features.call_build_function(d1, d1mom, d1mom_hist, lat_lon_dic, env_dic, agex_low, agex_high, months_from, months_to, False, prediction='multi')

Processing 52,945 patients: 52945it [03:19, 265.70it/s]   


In [5]:
print('Number of patients: {0:,d}'.format(int(x1.shape[0])))
print('Number of features: {0:,d}'.format(int(x1.shape[1])))

Number of patients: 52,945
Number of features: 19,290


In [6]:
print('Total number of children in cohort: {0:,d}'.format(int(y1.shape[0])))
print('Total number of eligible children at age 5 (4.5-5.5): {0:,d}'.format(int(y1label.sum())))
print('Total number of ineligible children at age 5 (4.5-5.5): {0:,d}'.format(int(y1.shape[0] - y1label.sum())))

Total number of children in cohort: 52,945
Total number of eligible children at age 5 (4.5-5.5): 11,967
Total number of ineligible children at age 5 (4.5-5.5): 40,978


## Summarize the number of features by category

In [7]:
ft_info = []
ft_cats = {}

for f in feature_headers:
    try:
        ft_cats[f.split(':')[0]] += 1
    except:
        ft_cats[f.split(':')[0]] = 1
        
xsum = x1.sum(axis=0)
for k in ft_cats:
    cols = [f.startswith(k) for f in feature_headers]
    ft_info.append([k, ft_cats[k], (xsum[cols] > 0).sum(), (xsum[cols] >= 5).sum()])

ft_info = pd.DataFrame(ft_info, columns=['Feature Category', 'Number of Features', 'Number of Features with >0 Occurrences', 'Number of Features with >=5 Occurrences'])
ft_info.to_csv('../summary_statistics/feature_categories.csv', index=False)
ft_info

Unnamed: 0,Feature Category,Number of Features,Number of Features with >0 Occurrences,Number of Features with >=5 Occurrences
0,Diagnosis,566,160,107
1,Lab,549,73,57
2,Medication,2968,78,14
3,Gender,2,2,2
4,Ethnicity,2,2,2
5,Race,11,9,8
6,Vital,343,228,228
7,Number of Visits,1,1,1
8,Zipcode,652,207,86
9,Census,34,34,34


## Summarize the amount of information available in each feature

In [8]:
ft_info[['Number of Features','Number of Features with >0 Occurrences','Number of Features with >=5 Occurrences']].sum(axis=0)

Number of Features                         19290
Number of Features with >0 Occurrences      2311
Number of Features with >=5 Occurrences     1509
dtype: int64

In [9]:
xsum = (x1 > 0).sum(axis=0)
for f, s in zip(feature_headers, xsum):
    print('{0:s} has {1:,d} occurrences'.format(f,s))

Diagnosis:9ccsCCS CATEGORY:CCS CATEGORY DESCRIPTION has 0 occurrences
Diagnosis:9ccs0:No DX has 180 occurrences
Diagnosis:9ccs1:Tuberculosis has 0 occurrences
Diagnosis:9ccs2:Septicemia has 16 occurrences
Diagnosis:9ccs3:Oth bact inf has 25 occurrences
Diagnosis:9ccs4:Mycoses has 471 occurrences
Diagnosis:9ccs5:HIV infectn has 2 occurrences
Diagnosis:9ccs6:Hepatitis has 3 occurrences
Diagnosis:9ccs7:Viral infect has 2,001 occurrences
Diagnosis:9ccs8:Oth infectns has 67 occurrences
Diagnosis:9ccs9:Sexual Infxs has 3 occurrences
Diagnosis:9ccs10:Immuniz/scrn has 3,423 occurrences
Diagnosis:9ccs11:Hd/nck cancr has 0 occurrences
Diagnosis:9ccs12:Esoph cancer has 0 occurrences
Diagnosis:9ccs13:Stomch cancr has 0 occurrences
Diagnosis:9ccs14:Colon cancer has 0 occurrences
Diagnosis:9ccs15:Rctm/anus ca has 0 occurrences
Diagnosis:9ccs16:Liver/ibd ca has 1 occurrences
Diagnosis:9ccs17:Pancreas can has 0 occurrences
Diagnosis:9ccs18:GI/perit can has 0 occurrences
Diagnosis:9ccs19:Brnch/lng ca h

Medication:Neutrogena Facial Soap has 0 occurrences
Medication:Multi-Vitamin Daily has 0 occurrences
Medication:Promethazine-Codeine has 0 occurrences
Medication:Magnesium has 0 occurrences
Medication:Naprosyn has 0 occurrences
Medication:Diclofenac Sodium has 0 occurrences
Medication:Accutane has 0 occurrences
Medication:CVS Vitamin D3 has 0 occurrences
Medication:Mefloquine HCl has 0 occurrences
Medication:Ocuflox has 0 occurrences
Medication:Fleet Enema has 0 occurrences
Medication:Fluticasone Propionate  HFA has 0 occurrences
Medication:Mult-Vitamin/Fluoride has 0 occurrences
Medication:Childrens Chewable Multi Vits has 1 occurrences
Medication:Advair HFA has 0 occurrences
Medication:NyQuil has 0 occurrences
Medication:Augmentin ES-600 has 0 occurrences
Medication:Pseudoephedrine-Guaifenesin CR has 0 occurrences
Medication:Triple Antibiotic has 0 occurrences
Medication:Anusol-HC has 0 occurrences
Medication:Felbatol has 0 occurrences
Medication:Cholecalciferol-Vitamin C has 0 occur

Medication:Latex Gloves has 0 occurrences
Medication:BIPAP Machine has 0 occurrences
Medication:Gammagard S/D has 0 occurrences
Medication:Sinus Wash Saline Refills has 0 occurrences
Medication:Liner (Chucks) has 0 occurrences
Medication:One Daily Multivitamin/Iron has 0 occurrences
Medication:12 Hour Nasal Spray has 0 occurrences
Medication:Atomoxetine HCl has 0 occurrences
Medication:Tussin DM has 0 occurrences
Medication:Mercaptopurine has 0 occurrences
Medication:Poly-Vi-Sol has 10 occurrences
Medication:Beclomethasone Dipropionate has 0 occurrences
Medication:Guaifenesin AC has 0 occurrences
Medication:Bactericin has 0 occurrences
Medication:Systane has 0 occurrences
Medication:Neupogen has 0 occurrences
Medication:OXcarbazepine ER has 0 occurrences
Medication:Oxtellar XR has 0 occurrences
Medication:Ferrous Fumarate has 0 occurrences
Medication:Propylene Glycol has 0 occurrences
Medication:Robitussin Cough+ Chest Max St has 0 occurrences
Medication:Carafate has 0 occurrences
Medi

Medication:L-Carnitine has 0 occurrences
Medication:Maxalt-MLT has 0 occurrences
Medication:KLS Natural Psyllium Fiber has 0 occurrences
Medication:Enfamil Nutramigen LIPIL has 1 occurrences
Medication:Allergy Medication Childrens has 0 occurrences
Medication:Allergy Medicine has 0 occurrences
Medication:Cetaphil DermaControl Foam Wsh has 0 occurrences
Medication:A+D Diaper Rash has 0 occurrences
Medication:Aveeno Baby Calming Comfort has 0 occurrences
Medication:Peak Flow Meter-Inh Assist Dev has 0 occurrences
Medication:CVS Gummy Multivitamin Kids has 0 occurrences
Medication:Paroxetine HCl has 0 occurrences
Medication:EryPed 400 has 0 occurrences
Medication:EryPed 200 has 0 occurrences
Medication:AF-Ibuprofen Infant has 0 occurrences
Medication:Vitamin C Plus has 0 occurrences
Medication:Alclometasone Dipropionate has 0 occurrences
Medication:EQL Children Multivitamin/Iron has 0 occurrences
Medication:Cal-Gest Antacid has 0 occurrences
Medication:Vitamins A &amp; D has 0 occurrences

Zipcode:11552-birth has 1 occurrences
Zipcode:11273-birth has 0 occurrences
Zipcode:11202-birth has 0 occurrences
Zipcode:11427-birth has 0 occurrences
Zipcode:07307-birth has 0 occurrences
Zipcode:11205-birth has 8 occurrences
Zipcode:11249-birth has 0 occurrences
Zipcode:11428-birth has 0 occurrences
Zipcode:07111-birth has 0 occurrences
Zipcode:11215-birth has 106 occurrences
Zipcode:08096-birth has 0 occurrences
Zipcode:11516-birth has 0 occurrences
Zipcode:11250-birth has 0 occurrences
Zipcode:10010-birth has 0 occurrences
Zipcode:10705-birth has 0 occurrences
Zipcode:07036-birth has 0 occurrences
Zipcode:08879-birth has 0 occurrences
Zipcode:10469-birth has 0 occurrences
Zipcode:10316-birth has 0 occurrences
Zipcode:11232-birth has 441 occurrences
Zipcode:11436-birth has 1 occurrences
Zipcode:11209-birth has 168 occurrences
Zipcode:11228-birth has 95 occurrences
Zipcode:11218-birth has 217 occurrences
Zipcode:08854-birth has 0 occurrences
Zipcode:11385-birth has 4 occurrences
Zip

Maternal Diagnosis:10ccs2613:E Codes: Poisoning has 0 occurrences
Maternal Diagnosis:10ccs2614:E Codes: Struck by- against has 0 occurrences
Maternal Diagnosis:10ccs2615:E Codes: Suffocation has 0 occurrences
Maternal Diagnosis:10ccs2616:E Codes: Adverse effects of medical care has 0 occurrences
Maternal Diagnosis:10ccs2617:E Codes: Adverse effects of medical drugs has 0 occurrences
Maternal Diagnosis:10ccs2618:E Codes: Other specified and classifiable has 0 occurrences
Maternal Diagnosis:10ccs2619:e codes: other specified- nec has 0 occurrences
Maternal Diagnosis:10ccs2620:E Codes: Unspecified has 0 occurrences
Maternal Diagnosis:10ccs2621:E Codes: Place of occurrence has 0 occurrences
Newborn Diagnosis:9ccsCCS CATEGORY:CCS CATEGORY DESCRIPTION has 0 occurrences
Newborn Diagnosis:9ccs0:No DX has 18 occurrences
Newborn Diagnosis:9ccs1:Tuberculosis has 0 occurrences
Newborn Diagnosis:9ccs2:Septicemia has 0 occurrences
Newborn Diagnosis:9ccs3:Oth bact inf has 0 occurrences
Newborn Diagno

Second_Insur:WAGE WORKS has 0 occurrences
Second_Insur:BLUE CROSS COMM HMO has 0 occurrences
Second_Insur:HEALTHPLUS AMERIGROUP CAID HMO has 0 occurrences
Second_Insur:HEALTH PLUS CAID HMO has 7 occurrences
Second_Insur:MCD PENDING HMO LIFT has 0 occurrences
Second_Insur:NGHBRHD HLTH PVD MCD HMO has 0 occurrences
Second_Insur:FIDELIS CARE MCAID HMO has 0 occurrences
Second_Insur:WELLCARE MCAID HMO has 0 occurrences
Second_Insur:MAGNA HLTHCARE FHC 1500 has 0 occurrences
Second_Insur:METROPLUS MCAID HMO has 3 occurrences
Second_Insur:HEALTHFIRST MCAID HMO has 0 occurrences
Second_Insur:MEDICAID has 6 occurrences
Second_Insur:1199 LMC NON EMP has 1 occurrences
Second_Insur:MEDICAID  GME has 0 occurrences
Second_Insur:MEDICAID-2ND TO MCARE IP PART A has 1 occurrences
Second_Insur:UNITED HLTHCR FAM HLTH PL has 0 occurrences
Second_Insur:AETNA USHC COMM HMO has 0 occurrences
Second_Insur:GEICO has 0 occurrences
Second_Insur:UNITED HLTHCARE COMM HMO has 0 occurrences
Second_Insur:BLUE CROSS I

Maternal Maternal Lab History: HEMOGLOBIN A-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN A1-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN A1C-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN A2-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN C-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN F-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN OTHER-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN S-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEMOGLOBIN VARIANT-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEP B DNA PCR COPIES/ML-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEP B DNA QUANT PCR IU/ML-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEP B VIRUS(CALC)-prePregnancy has 0 occurrences
Maternal Maternal Lab History: HEP B VIRUS(RT-PCR)-prePre

Maternal Maternal Lab History: IG G SUBCLASS 4-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IGA IMMUNOGLOBULIN-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IgA Subclass 1-firstTrimester has 0 occurrences
Maternal Maternal Lab History: Iga Subclass 2-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IgA Total-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IgA-ELISA-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IGF-1-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IGG AND IGM-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IGG COOMBS-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IGG IMMUNOGLOBULIN-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IGM IMMUNOGLOBULIN-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IgM-ELISA-firstTrimester has 0 occurrences
Maternal Maternal Lab History: IMIPENEM-firstTrimester has 0 occurrenc

Maternal Maternal Lab History: LD-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LDH-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LDL-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LDL Particle Number-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LDL Size-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LDL/HDL Ratio-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LEAD-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LEAD LEVEL-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LETTUCE RAST KU-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LEUKOCYTE ALKALINE PHOSPHATASE-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LEUKOCYTE ESTERASE-secondTrimester has 0 occurrences
Maternal Maternal Lab History: LEVETIRACETAM-secondTrimester has 0 occurrences
Maternal Maternal Lab History: Levofloxacin-secondTrimester has 0 occu

Maternal Maternal Lab History: METHYLMALONIC ACID URINE-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: METRONIDAZOLE-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MICRO/CREAT RATIO-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MICROALBUMIN-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MICROALBUMIN:CR RATIO-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MILK (F2)IGE-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MILK CLASS-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MILK RAST %-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MILK RAST KU-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MIN. OF UR COLLECT-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MISCELLANEOUS 1-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MISCELLANEOUS 3-thirdTrimester has 0 occurrences
Maternal Maternal Lab History: MISCELLAN

Maternal Maternal Lab History: NT MoM-postPregnancy has 113 occurrences
Maternal Maternal Lab History: NTQR-LOCATION ID-postPregnancy has 3 occurrences
Maternal Maternal Lab History: NTQR-ULTRASONGRPHER ID-postPregnancy has 1 occurrences
Maternal Maternal Lab History: NUCHAL TRANSLUCENCY-postPregnancy has 114 occurrences
Maternal Maternal Lab History: NUCLEATED RBC'S-postPregnancy has 1 occurrences
Maternal Maternal Lab History: NUM OF CELLS COUNTED-postPregnancy has 0 occurrences
Maternal Maternal Lab History: NY HBV DNA VIRAL LOG-postPregnancy has 4 occurrences
Maternal Maternal Lab History: O2 SATURATION-postPregnancy has 18 occurrences
Maternal Maternal Lab History: Oak Tree IgE-postPregnancy has 2 occurrences
Maternal Maternal Lab History: OAT (F7)IGE-postPregnancy has 0 occurrences
Maternal Maternal Lab History: OAT CLASS-postPregnancy has 0 occurrences
Maternal Maternal Lab History: OAT RAST %-postPregnancy has 0 occurrences
Maternal Maternal Lab History: OAT RAST KU-postPregnan

Maternal Maternal Lab History: PERENNIAL RYE RAST %-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PH-otherPregnancy has 65 occurrences
Maternal Maternal Lab History: PHENOBARBITAL-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PHENYTOIN-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PHOSPHOLIPIDS-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PHOSPHOROUS-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PIGEON DROPPINGS RAST KU-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PIP/TAZOBACTAM-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PLACENTAL ISOENZYMES-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PLASMA CELL-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PLATELET COUNT-otherPregnancy has 522 occurrences
Maternal Maternal Lab History: PLEASE CALL LAB-otherPregnancy has 0 occurrences
Maternal Maternal Lab History: PNH GRANULOCYTES-oth

Maternal Maternal Diagnosis:9ccs166:Oth male gen-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs167:Breast dx-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs168:PID-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs169:Endometrios-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs170:Prolapse-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs171:Menstrual dx-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs172:Ovarian cyst-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs173:Menopausl dx-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs174:Fem infertil-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs175:Ot femal gen-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs176:Contraceptiv-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs177:Spont abortn-firstTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs1

Maternal Maternal Diagnosis:9ccs33:Kidny/rnl ca-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs34:Uriny org ca-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs35:Brain/ns can-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs36:Thyroid cncr-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs37:Hodgkin-s ds-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs38:Non-Hodg lym-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs39:Leukemias-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs40:Mult myeloma-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs41:Ot primry ca-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs42:2ndary malig-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs43:Malig neopls-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9ccs44:Neoplsm unsp-thirdTrimester has 0 occurrences
Maternal Maternal Diagnosis:9cc

Maternal Maternal Diagnosis:10ccs198:Ot infl skin-postPregnancy has 1 occurrences
Maternal Maternal Diagnosis:10ccs199:Ulcer skin-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs200:Oth skin dx-postPregnancy has 1 occurrences
Maternal Maternal Diagnosis:10ccs201:Infect arth-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs202:Rheum arth-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs203:Osteoarthros-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs204:Ot joint dx-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs205:Back problem-postPregnancy has 3 occurrences
Maternal Maternal Diagnosis:10ccs206:Osteoporosis-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs207:Patholog fx-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs208:Acq foot def-postPregnancy has 0 occurrences
Maternal Maternal Diagnosis:10ccs209:Ot acq defor-postPregnancy has 0 occurrences
Maternal Maternal Diagno

Maternal Maternal Procedure History:90648-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:01369-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:57800-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:95908-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:56605-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:92611-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:92526-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:90834-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:90675-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:01060-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:10003-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:95992-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:95861-prePregnancy has 0 occurrences
Maternal Maternal Procedure History:10021-prePregna

Maternal Maternal Procedure History:90384-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:90721-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:82489-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:98929-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:92504-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:99407-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:90654-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:Influenza-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:05176-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:G8783-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:S9439-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:00100-secondTrimester has 0 occurrences
Maternal Maternal Procedure History:99214-secondTrimester has 0 occurrences
Maternal

Maternal Maternal Procedure History:20526-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:65205-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:92563-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:S8262-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:20551-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:92568-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:92567-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:T1013-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:11400-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:29700-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:0501F-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:03080-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:0545F-postPregnancy has 0 occurrences
Maternal Maternal Procedure History:59

In [10]:
print('Average number of occurrences per feature {0:4.2f} with standard deviation: {1:4.2f}'.format(np.mean(xsum), np.std(xsum)))

Average number of occurrences per feature 40.96 with standard deviation: 330.50


## Create the valid study cohort and output the relevant information for each feature if there are at least 5 occurrences in the data

**NOTE: in this step there will be less features with at least 5 occurrences because we are also filtering rows that do not have maternal data or do not have a valid BMI reading (10 > BMI < 40) in the data.**

In [11]:
x2, y2, y2label, mrns2, ix_filter, feature_headers2, corr_headers_filtered, corrs_matrix_filtered, ix_corr_headers = \
    train.prepare_data_for_analysis({}, {}, {}, {}, {},
        x1, y1, y1label[:,label_ix['obese']], feature_headers, mrns,
        agex_low, agex_high, months_from, months_to,
        filterSTR=[], # use both boys and girls
        variablesubset=[], # do not remove any features
        do_impute=False, # do not impute values
        do_normalize=False, # do not normalize the values
        min_occur=5, # use only features with meaningful information
        delay_print=False, # print out all information as it's available
        lasso_selection=False # do not use LASSO feature selection
    )

Using pre-prepared data

Original cohort size is: 52945 num features: 19290
total number of people who have a BMI measured: 11484
total number of people who have all filtered variables: 52945
total number of people who have maternal data available: 3451
intersection of the three above is: 3449
3449 patients selected..
1,249 features filtered with number of occurrences less than 5
filtered correlated features to: 1,032
corr matrix is filtered to size: (1249, 1032)
output is: average: 16.684, min: 10.470, max: 35.320
total patients: 3,449, positive: 642.00, negative: 2,807.00
normalizing output...
Using pre-prepared data
1698 features are binary
Predicting BMI at age: 4.5 to 5.5 years, from data in ages: 0 - 24 months
filtering patients with: []
total size: 3,449 x 1,249


In [12]:
char_table = []
cols = ['Variable', 'Total N', 'Total Average', 'Total Std Dev', 'Obese N', 'Obese Average', 'Obese Std Dev', 'Not Obese N', 'Not Obese Average', 'Not Obese Std Dev', 'Unadjusted Odds Ratio', 'Unadjusted OR Low', 'Unadjusted OR High', 'Relative Risk', 'p-value for OR']
y2pos_ix = (y2label > 0)
with np.errstate(divide='ignore', invalid='ignore'):
    for ix, h in enumerate(feature_headers2):
        bin_indicator = x2[:,ix].max()==1 and x2[:,ix].min()==0

        ix_total = (x2[:,ix] != 0)
        ix_total_pos = (y2label > 0) & (x2[:,ix] != 0)
        ix_total_neg = (y2label == 0) & (x2[:,ix] != 0)

        De = sum((y2label > 0) & (x2[:,ix] != 0)) * 1.0
        He = sum((y2label == 0) & (x2[:,ix] != 0)) * 1.0
        Dn = sum((y2label > 0) & (x2[:,ix] == 0)) * 1.0
        Hn = sum((y2label == 0) & (x2[:,ix] == 0)) * 1.0

        OR = (De/He)/(Dn/Hn)
        OR_sterror = np.sqrt(1/De + 1/He + 1/Dn + 1/Hn)
        OR_low, OR_high = np.exp(np.log(OR) - 1.96*OR_sterror), np.exp(np.log(OR) + 1.96*OR_sterror)

        RR = (De/(De+He))/(Dn/(Dn+Hn))

        md = x2[ix_total_pos,:][:,ix].mean() - x2[ix_total_neg,:][:,ix].mean()
        se = np.sqrt( np.var(x2[ix_total_pos,:][:,ix]) / len(x2[ix_total_pos,:][:,ix]) + np.var(x2[ix_total_neg,:][:,ix])/len(x2[ix_total_neg,:][:,ix]))
        lcl, ucl = md-2*se, md+2*se
        z = md/se

        pvalue = 2 * norm.cdf(-1*(np.abs(np.log(OR))/OR_sterror)) if bin_indicator else 2 * norm.cdf(-np.abs(z))
        char_table.append([
                h, ix_total.sum(), x2[ix_total,:][:,ix].mean(), x2[ix_total,:][:,ix].std(),
                ix_total_pos.sum(), x2[ix_total_pos,:][:,ix].mean() if not(bin_indicator) else 0, x2[ix_total_pos,:][:,ix].std() if not(bin_indicator) else 0,
                ix_total_neg.sum(), x2[ix_total_neg,:][:,ix].mean() if not(bin_indicator) else 0,  x2[ix_total_neg,:][:,ix].std() if not(bin_indicator) else 0,
                OR if bin_indicator else 0, OR_low if bin_indicator else 0, OR_high if bin_indicator else 0, RR if bin_indicator else 0, pvalue
        ])

ft_info = pd.DataFrame(char_table, columns=cols)
ft_info.to_csv('../summary_statistics/cohort_feature_summary_table.csv', index=False)

  **kwargs)
  keepdims=keepdims)


In [13]:
ft_info.sort_values(by='Variable')

Unnamed: 0,Variable,Total N,Total Average,Total Std Dev,Obese N,Obese Average,Obese Std Dev,Not Obese N,Not Obese Average,Not Obese Std Dev,Unadjusted Odds Ratio,Unadjusted OR Low,Unadjusted OR High,Relative Risk,p-value for OR
421,Census:Estimate; Median household income in th...,2179,44612.5438,13476.9961,429,44402.1772,13598.1422,1750,44664.1137,13446.6292,0.0,0.0,0.0,0.0,0.7201
438,Census:Estimate; Median household income in th...,2188,44606.8684,13486.0795,432,44362.6088,13567.6713,1756,44666.9596,13465.252,0.0,0.0,0.0,0.0,0.6757
422,Census:Percent households receiving food stamp...,2179,28.7569,10.9304,429,28.7103,11.3442,1750,28.7683,10.8265,0.0,0.0,0.0,0.0,0.9237
439,Census:Percent households receiving food stamp...,2188,28.7641,10.9359,432,28.7444,11.3267,1756,28.7689,10.8376,0.0,0.0,0.0,0.0,0.9677
418,Census:Percent with a disability; Estimate; To...,2179,8.9662,3.4274,429,9.3135,3.528,1750,8.881,3.3969,0.0,0.0,0.0,0.0,0.0219
435,Census:Percent with a disability; Estimate; To...,2188,8.9758,3.4628,432,9.3144,3.5174,1756,8.8925,3.4442,0.0,0.0,0.0,0.0,0.025
412,Census:Percent; Estimate; Percent bachelor's d...,2180,23.2917,12.4839,429,23.4744,12.7289,1751,23.247,12.4227,0.0,0.0,0.0,0.0,0.739
429,Census:Percent; Estimate; Percent bachelor's d...,2189,23.3025,12.4911,432,23.4215,12.7037,1757,23.2732,12.4381,0.0,0.0,0.0,0.0,0.8272
428,Census:Percent; Estimate; Percent high school ...,2180,63.8712,16.5442,429,64.6436,16.3247,1751,63.682,16.5921,0.0,0.0,0.0,0.0,0.2757
445,Census:Percent; Estimate; Percent high school ...,2189,63.8667,16.5469,432,64.534,16.331,1757,63.7026,16.5954,0.0,0.0,0.0,0.0,0.3447


### Create the boys subset of the data and output relevant information for each feature that has at least 5 occurrences in the data

In [14]:
x2_boys, y2_boys, y2label_boys, mrns2_boys, ix_filter_boys, feature_headers2_boys, corr_headers_filtered_boys, corrs_matrix_filtered_boys, ix_corr_headers_boys = \
    train.prepare_data_for_analysis({}, {}, {}, {}, {},
        x1, y1, y1label[:,label_ix['obese']], feature_headers, mrns,
        agex_low, agex_high, months_from, months_to,
        filterSTR=['Gender:0'], # only the boys data
        variablesubset=[], # do not remove any features
        do_impute=False, # do not impute values
        do_normalize=False, # do not normalize the values
        min_occur=5, # use only features with meaningful information
        delay_print=False, # print out all information as it's available
        lasso_selection=False # do not use LASSO feature selection
    )

Using pre-prepared data

Original cohort size is: 52945 num features: 19290
total number of people who have:  ['Gender:0 male']  is: 5775
total number of people who have a BMI measured: 11484
total number of people who have all filtered variables: 5775
total number of people who have maternal data available: 3451
intersection of the three above is: 1751
1751 patients selected..
1,045 features filtered with number of occurrences less than 5
filtered correlated features to: 834
corr matrix is filtered to size: (1045, 834)
output is: average: 16.875, min: 11.260, max: 30.480
total patients: 1,751, positive: 386.00, negative: 1,365.00
normalizing output...
Using pre-prepared data
1698 features are binary
Predicting BMI at age: 4.5 to 5.5 years, from data in ages: 0 - 24 months
filtering patients with: ['Gender:0']
total size: 1,751 x 1,045


  c /= stddev[:, None]
  c /= stddev[None, :]


In [15]:
char_table = []
cols = ['Variable', 'Total N', 'Total Average', 'Total Std Dev', 'Obese N', 'Obese Average', 'Obese Std Dev', 'Not Obese N', 'Not Obese Average', 'Not Obese Std Dev', 'Unadjusted Odds Ratio', 'Unadjusted OR Low', 'Unadjusted OR High', 'Relative Risk', 'p-value for OR']
y2pos_ix = (y2label_boys > 0)
with np.errstate(divide='ignore', invalid='ignore'):
    for ix, h in enumerate(feature_headers2_boys):
        bin_indicator = x2_boys[:,ix].max()==1 and x2_boys[:,ix].min()==0

        ix_total = (x2_boys[:,ix] != 0)
        ix_total_pos = (y2label_boys > 0) & (x2_boys[:,ix] != 0)
        ix_total_neg = (y2label_boys == 0) & (x2_boys[:,ix] != 0)

        De = sum((y2label_boys > 0) & (x2_boys[:,ix] != 0)) * 1.0
        He = sum((y2label_boys == 0) & (x2_boys[:,ix] != 0)) * 1.0
        Dn = sum((y2label_boys > 0) & (x2_boys[:,ix] == 0)) * 1.0
        Hn = sum((y2label_boys == 0) & (x2_boys[:,ix] == 0)) * 1.0

        OR = (De/He)/(Dn/Hn)
        OR_sterror = np.sqrt(1/De + 1/He + 1/Dn + 1/Hn)
        OR_low, OR_high = np.exp(np.log(OR) - 1.96*OR_sterror), np.exp(np.log(OR) + 1.96*OR_sterror)

        RR = (De/(De+He))/(Dn/(Dn+Hn))

        md = x2_boys[ix_total_pos,:][:,ix].mean() - x2_boys[ix_total_neg,:][:,ix].mean()
        se = np.sqrt(np.var(x2_boys[ix_total_pos,:][:,ix]) / len(x2_boys[ix_total_pos,:][:,ix]) + np.var(x2_boys[ix_total_neg,:][:,ix])/len(x2_boys[ix_total_neg,:][:,ix]))
        lcl, ucl = md-2*se, md+2*se
        z = md/se

        pvalue = 2 * norm.cdf(-1*(np.abs(np.log(OR))/OR_sterror)) if bin_indicator else 2 * norm.cdf(-np.abs(z))
        char_table.append([
                h, ix_total.sum(), x2_boys[ix_total,:][:,ix].mean(), x2_boys[ix_total,:][:,ix].std(),
                ix_total_pos.sum(), x2_boys[ix_total_pos,:][:,ix].mean() if not(bin_indicator) else 0, x2_boys[ix_total_pos,:][:,ix].std() if not(bin_indicator) else 0,
                ix_total_neg.sum(), x2_boys[ix_total_neg,:][:,ix].mean() if not(bin_indicator) else 0,  x2_boys[ix_total_neg,:][:,ix].std() if not(bin_indicator) else 0,
                OR if bin_indicator else 0, OR_low if bin_indicator else 0, OR_high if bin_indicator else 0,
                RR if bin_indicator else 0, pvalue
        ])

ft_info = pd.DataFrame(char_table, columns=cols)
ft_info.to_csv('../summary_statistics/cohort_boys_feature_summary_table.csv', index=False)

  **kwargs)
  keepdims=keepdims)


In [16]:
ft_info

Unnamed: 0,Variable,Total N,Total Average,Total Std Dev,Obese N,Obese Average,Obese Std Dev,Not Obese N,Not Obese Average,Not Obese Std Dev,Unadjusted Odds Ratio,Unadjusted OR Low,Unadjusted OR High,Relative Risk,p-value for OR
0,Diagnosis:9ccs0:No DX,89,1.0,0.0,25,0.0,0.0,64,0.0,0.0,1.4078,0.8739,2.2677,1.2932,0.1597
1,Diagnosis:9ccs2:Septicemia,7,1.0,0.0,0,0.0,0.0,7,0.0,0.0,0.0,0.0,,0.0,
2,Diagnosis:9ccs3:Oth bact inf,11,1.0,0.0,1,0.0,0.0,10,0.0,0.0,0.3519,0.0449,2.758,0.4109,0.3201
3,Diagnosis:9ccs4:Mycoses,182,1.0,0.0,42,0.0,0.0,140,0.0,0.0,1.0683,0.7416,1.5389,1.0525,0.7227
4,Diagnosis:9ccs7:Viral infect,797,1.0,0.0,166,0.0,0.0,631,0.0,0.0,0.8777,0.6989,1.1023,0.9032,0.2619
5,Diagnosis:9ccs8:Oth infectns,21,1.0,0.0,5,0.0,0.0,16,0.0,0.0,1.1065,0.4027,3.0398,1.0811,0.8444
6,Diagnosis:9ccs10:Immuniz/scrn,1428,1.0,0.0,313,0.0,0.0,1115,0.0,0.0,0.9614,0.7198,1.2839,0.9698,0.7895
7,Diagnosis:9ccs47:Ot bnign neo,39,1.0,0.0,10,0.0,0.0,29,0.0,0.0,1.2252,0.5917,2.5369,1.1675,0.5844
8,Diagnosis:9ccs48:Thyroid dsor,14,1.0,0.0,3,0.0,0.0,11,0.0,0.0,0.9642,0.2676,3.4735,0.9718,0.9555
9,Diagnosis:9ccs55:Fluid/elc dx,13,1.0,0.0,1,0.0,0.0,12,0.0,0.0,0.2929,0.038,2.2594,0.3473,0.2388


### Create the girls subset of the data and output relevant information for each feature that has at least 5 occurrences in the data

In [17]:
x2_girls, y2_girls, y2label_girls, mrns2_girls, ix_filter_girls, feature_headers2_girls, corr_headers_filtered_girls, corrs_matrix_filtered_girls, ix_corr_headers_girls = \
    train.prepare_data_for_analysis({}, {}, {}, {}, {},
        x1, y1, y1label[:,label_ix['obese']], feature_headers, mrns,
        agex_low, agex_high, months_from, months_to,
        filterSTR=['Gender:1'], # only the girls data
        variablesubset=[], # do not remove any features
        do_impute=False, # do not impute values
        do_normalize=False, # do not normalize the values
        min_occur=5, # use only features with meaningful information
        delay_print=False, # print out all information as it's available
        lasso_selection=False # do not use LASSO feature selection
    )

Using pre-prepared data

Original cohort size is: 52945 num features: 19290
total number of people who have:  ['Gender:1 female']  is: 5719
total number of people who have a BMI measured: 11484
total number of people who have all filtered variables: 5719
total number of people who have maternal data available: 3451
intersection of the three above is: 1698
1698 patients selected..
1,052 features filtered with number of occurrences less than 5
filtered correlated features to: 838
corr matrix is filtered to size: (1052, 838)
output is: average: 16.487, min: 10.470, max: 35.320
total patients: 1,698, positive: 256.00, negative: 1,442.00
normalizing output...
Using pre-prepared data
1698 features are binary
Predicting BMI at age: 4.5 to 5.5 years, from data in ages: 0 - 24 months
filtering patients with: ['Gender:1']
total size: 1,698 x 1,052


  c /= stddev[:, None]
  c /= stddev[None, :]


In [18]:
char_table = []
cols = ['Variable', 'Total N', 'Total Average', 'Total Std Dev', 'Obese N', 'Obese Average', 'Obese Std Dev', 'Not Obese N', 'Not Obese Average', 'Not Obese Std Dev', 'Unadjusted Odds Ratio', 'Unadjusted OR Low', 'Unadjusted OR High', 'Relative Risk', 'p-value for OR']
y2pos_ix = (y2label_girls > 0)
with np.errstate(divide='ignore', invalid='ignore'):
    for ix, h in enumerate(feature_headers2_girls):
        bin_indicator = x2_girls[:,ix].max()==1 and x2_girls[:,ix].min()==0

        ix_total = (x2_girls[:,ix] != 0)
        ix_total_pos = (y2label_girls > 0) & (x2_girls[:,ix] != 0)
        ix_total_neg = (y2label_girls == 0) & (x2_girls[:,ix] != 0)

        De = sum((y2label_girls > 0) & (x2_girls[:,ix] != 0)) * 1.0
        He = sum((y2label_girls == 0) & (x2_girls[:,ix] != 0)) * 1.0
        Dn = sum((y2label_girls > 0) & (x2_girls[:,ix] == 0)) * 1.0
        Hn = sum((y2label_girls == 0) & (x2_girls[:,ix] == 0)) * 1.0

        OR = (De/He)/(Dn/Hn)
        OR_sterror = np.sqrt(1/De + 1/He + 1/Dn + 1/Hn)
        OR_low, OR_high = np.exp(np.log(OR) - 1.96*OR_sterror), np.exp(np.log(OR) + 1.96*OR_sterror)

        RR = (De/(De+He))/(Dn/(Dn+Hn))

        md = x2_girls[ix_total_pos,:][:,ix].mean() - x2_girls[ix_total_neg,:][:,ix].mean()
        se = np.sqrt(np.var(x2_girls[ix_total_pos,:][:,ix]) / len(x2_girls[ix_total_pos,:][:,ix]) + np.var(x2_girls[ix_total_neg,:][:,ix])/len(x2_girls[ix_total_neg,:][:,ix]))
        lcl, ucl = md-2*se, md+2*se
        z = md/se

        pvalue = 2 * norm.cdf(-1*(np.abs(np.log(OR))/OR_sterror)) if bin_indicator else 2 * norm.cdf(-np.abs(z))
        char_table.append([
                h, ix_total.sum(), x2_girls[ix_total,:][:,ix].mean(), x2_girls[ix_total,:][:,ix].std(),
                ix_total_pos.sum(), x2_girls[ix_total_pos,:][:,ix].mean() if not(bin_indicator) else 0, x2_girls[ix_total_pos,:][:,ix].std() if not(bin_indicator) else 0,
                ix_total_neg.sum(), x2_girls[ix_total_neg,:][:,ix].mean() if not(bin_indicator) else 0,  x2_girls[ix_total_neg,:][:,ix].std() if not(bin_indicator) else 0,
                OR if bin_indicator else 0, OR_low if bin_indicator else 0, OR_high if bin_indicator else 0,
                RR if bin_indicator else 0, pvalue
        ])

ft_info = pd.DataFrame(char_table, columns=cols)
ft_info.to_csv('../summary_statistics/cohort_girls_feature_summary_table.csv', index=False)

  **kwargs)
  keepdims=keepdims)


In [19]:
ft_info

Unnamed: 0,Variable,Total N,Total Average,Total Std Dev,Obese N,Obese Average,Obese Std Dev,Not Obese N,Not Obese Average,Not Obese Std Dev,Unadjusted Odds Ratio,Unadjusted OR Low,Unadjusted OR High,Relative Risk,p-value for OR
0,Diagnosis:9ccs0:No DX,84,1.0,0.0,9,0.0,0.0,75,0.0,0.0,0.6641,0.3283,1.3435,0.7001,0.2549
1,Diagnosis:9ccs2:Septicemia,6,1.0,0.0,2,0.0,0.0,4,0.0,0.0,2.8307,0.5158,15.5362,2.2205,0.231
2,Diagnosis:9ccs3:Oth bact inf,10,1.0,0.0,3,0.0,0.0,7,0.0,0.0,2.4308,0.6245,9.4626,2.0016,0.2002
3,Diagnosis:9ccs4:Mycoses,178,1.0,0.0,25,0.0,0.0,153,0.0,0.0,0.9118,0.5841,1.4234,0.9242,0.6844
4,Diagnosis:9ccs7:Viral infect,718,1.0,0.0,100,0.0,0.0,618,0.0,0.0,0.8547,0.6512,1.1217,0.8749,0.2577
5,Diagnosis:9ccs8:Oth infectns,31,1.0,0.0,3,0.0,0.0,28,0.0,0.0,0.5988,0.1807,1.9845,0.6376,0.4016
6,Diagnosis:9ccs10:Immuniz/scrn,1406,1.0,0.0,201,0.0,0.0,1205,0.0,0.0,0.7188,0.5172,0.999,0.759,0.0493
7,Diagnosis:9ccs47:Ot bnign neo,37,1.0,0.0,5,0.0,0.0,32,0.0,0.0,0.8777,0.3388,2.2742,0.8943,0.7883
8,Diagnosis:9ccs48:Thyroid dsor,6,1.0,0.0,0,0.0,0.0,6,0.0,0.0,0.0,0.0,,0.0,
9,Diagnosis:9ccs51:Ot endo dsor,12,1.0,0.0,1,0.0,0.0,11,0.0,0.0,0.5102,0.0656,3.9688,0.551,0.5202
