import pandas as pd
import numpy as np
import csv
import functools
import regex as re

# combine the device, patient, and follow-up csvs into one
# and then separate pre- and post-implant variables
# @input:
#   - patient_INTERMACS_Data_Dictionary.csv
#   - device_INTERMACS_Data_Dictionary.csv
#   - followup_INTERMACS_Data_Dictionary.csv
# @output:
#   - all (csv): the three csvs joined
#   - pre (csv): pre-implant-related columns, match with the output columns on 'OPER_ID'
#   - post (csv): post-implant-related columns, match with the input columns on 'OPER_ID'

blood_type_dict = {
    1: 'O',
    2: 'A',
    3: 'B',
    4: 'AB',
    998: '-',
}

nyha_dict = {
    'Class I: No Limitation or Symptoms': 'ClassI',
    'Class II: Slight Limitation': 'ClassII',
    'Class III: Marked Limitation': 'ClassIII',
    'Class IV: Unable to Carry on Minimal Physical Activity': 'ClassIV',
    'Unknown': '-',
}

num_card_hosp_dict = {
    '0-1': 0,
    '2-3': 2,
    '4 or more': 4,
    '-': None,
}

time_card_dgn_dict = {
    '< 1 month': 0,
    '1 month - 1 year': 1,
    '1-2 years': 12,
    '> 2 years': 24,
    '-': None,
}


def feature_dict():
    """
    imports the INTERMACS Data Dictionaries (with their FORMAT_VALUE tables) into one dictionary

    :returns: a dictionary that combines all three data dictionaries
    """
    # import each data dictionary csv as a dict
    # :return: {'VARIABLE': {'TABLE': csv_name, 'TYPE': ..., 'FORMAT_VALUE': ...}, ...}
    p = pd.read_csv('data/patient_INTERMACS_Data_Dictionary.csv', index_col='VARIABLE',
                    encoding="ISO-8859-1", low_memory=False).to_dict(orient='index')
    d = pd.read_csv('data/device_INTERMACS_Data_Dictionary.csv', index_col='VARIABLE',
                    encoding="ISO-8859-1", low_memory=False).to_dict(orient='index')
    f = pd.read_csv('data/followup_INTERMACS_Data_Dictionary.csv', index_col='VARIABLE',
                    encoding="ISO-8859-1", low_memory=False).to_dict(orient='index')
    # print(len(d.keys() | p.keys() | f.keys()))
    _dict = {**d, **p, **f}  # all columns, 1080 in total

    # parse FORMAT_VALUE into a dictionary, i.e.
    # transform the string ".=Missing; .U=Unknown; 0=No; 1=Yes; 9=Unknown"
    # into the dict {'.': 'Missing', '.U': 'Unknown', '0': 'No', '1': 'Yes', '9': 'Unknown'}
    for k, v in _dict.items():
        # only parse entries where FORMAT_VALUE is present
        if isinstance(v['FORMAT_VALUE'], str):
            # transform the string into a list of (code, label) tuples
            _v = [tuple(item.split('=')) for item in v['FORMAT_VALUE'].split('; ')]
            # print(dict(re.findall(r'([^ ]*?)=([a-zA-Z\s]+)[^;]*?', v['FORMAT_VALUE'])))
            v['FORMAT_VALUE'] = dict(_v)
            # print(k)
            # print(_dict[k]['FORMAT_VALUE'])
    return _dict


def import_and_join(feature_dict):
    """
    import the three datasets (device, patient, follow-up) and join them on 'OPER_ID'
    """
    # load the csvs; keep only the pre-implant rows of the follow-up data
    pdf = pd.read_csv('data/patientnewdata.csv', na_values=['nan'], low_memory=False,
                      encoding="ISO-8859-1")   # 456 cols, 17075 rows (patients)
    ddf = pd.read_csv('data/devicenewdata.csv', na_values=['nan'], low_memory=False,
                      encoding="ISO-8859-1")   # 19207 rows (implants)
    fdf = pd.read_csv('data/followupnewdata.csv', na_values=['nan'], low_memory=False,
                      encoding="ISO-8859-1")
    fdf = fdf.loc[fdf['FORM_ID'] == 'Pre-Implant']  # 19207 rows (implants)

    # merge the device and follow-up tables
    df = ddf.merge(fdf[list(fdf.columns.difference(ddf.columns)) + ['OPER_ID']],
                   on='OPER_ID', how='left', suffixes=['', ''])
    # append the patient table to the joined table
    df = df.merge(pdf[list(pdf.columns.difference(df.columns)) + ['OPER_ID']],
                  on='OPER_ID', how='left', suffixes=['', ''])  # 1447 cols, 19207 rows (implants)

    """ filter out PATIENT_IDs without death records """
    # 4989 rows are death records
    # == 4989 patients have death records in the df
    dead_pts = df[df['DEAD'] == 1.0]['PATIENT_ID'].unique()
    df = df[df['PATIENT_ID'].isin(dead_pts)]  # 5820 rows left

    # recode the nominal variables
    df['BLOOD_TYPE'].replace(blood_type_dict, inplace=True)
    df['TIME_CARD_DGN'].replace(time_card_dgn_dict, inplace=True)
    df['NUM_CARD_HOSP'].replace(num_card_hosp_dict, inplace=True)
    df['NYHA'].replace(nyha_dict, inplace=True)

    """ drop columns with no value """
    # print("before dropna", len(df.columns))  # 806
    df = df.dropna(axis=1, how='all')
    # print("after dropna", len(df.columns))  # 653

    """ separate columns about pre-implant conditions and post-implant outcomes """
    idx_col = ['OPER_ID', 'PATIENT_ID', 'FORM_ID']

    # elicit the outcome cols from each table, starting with the device table
    # - VERSION: INTERMACS version
    # - DEAD/DEATH_*: death-related outcomes; EXPL_*: explant-related
    # - INT_*: interval between implant and result time
    # - LOS: length of stay from implant to discharge
    # - DIS_INT_*: interventions since implant
    # - QRTR: quarter of implant
    outcome_col = ['VERSION', 'DEAD', 'DEATH_DEVICE_EXPLANT', 'DEATH_LOCATION', 'TXPL',
                   'SURGICAL_APPROACH', 'TIMING_DEATH', 'INT_DEAD', 'INT_EXPL', 'INT_TXPL',
                   'PRIMARY_COD', 'PRIMARY_COD_CANCER', 'EXPL', 'EXPLANT_DEVICE_TY',
                   'EXPLANT_REASON', 'EXPL_THROM', 'LOS', 'DISCHARGE_TO', 'DISCHARGE_STATUS',
                   'DIS_INT_AVS_REPAIR_NC', 'DIS_INT_AVS_REPAIR_WC', 'DIS_INT_AVS_REPLACE_BIO',
                   'DIS_INT_AVS_REPLACE_MECH', 'DIS_INT_BLEED_GT_48', 'DIS_INT_BLEED_LE_48',
                   'DIS_INT_BRONCHOSCOPY', 'DIS_INT_CARD_OTHER', 'DIS_INT_CARD_UNKNOWN',
                   'DIS_INT_DIALYSIS', 'DIS_INT_DRAINAGE', 'DIS_INT_INV_CARD_PROC',
                   'DIS_INT_MVS_REPAIR', 'DIS_INT_MVS_REPLACE_BIO', 'DIS_INT_MVS_REPLACE_MECH',
                   'DIS_INT_NONE', 'DIS_INT_OTHER', 'DIS_INT_PVS_REPAIR',
                   'DIS_INT_PVS_REPLACE_BIO', 'DIS_INT_PVS_REPLACE_MECH',
                   'DIS_INT_REINTUBATION', 'DIS_INT_SURG_PROC_DEV', 'DIS_INT_SURG_PROC_NC',
                   'DIS_INT_SURG_PROC_OTHER', 'DIS_INT_SURG_PROC_UNKNOWN', 'DIS_INT_TRANSPLANT',
                   'DIS_INT_TVS_REPAIR_DEVEGA', 'DIS_INT_TVS_REPAIR_OTHER',
                   'DIS_INT_TVS_REPAIR_RING', 'DIS_INT_TVS_REPLACE_BIO',
                   'DIS_INT_TVS_REPLACE_MECH', 'DIS_INT_UNKNOWN']

    # outcome cols from the events table, in addition to the DEATH, EXPLANT, and DIS_INT ones
    # - OPn*: outcomes of each operation
    # - PC_PUMP_EXCHANGE: pump exchange
    outcome_col += ['OP',
                    'OP1COD', 'OP1CONT', 'OP1DEAD', 'OP1DEV_TY', 'OP1EVTID', 'OP1EXPDEV',
                    'OP1EXPL', 'OP1EXPREA', 'OP1INTD', 'OP1INTR', 'OP1INTT', 'OP1REC', 'OP1TXPL',
                    'OP2COD', 'OP2CONT', 'OP2DEAD', 'OP2DEV_TY', 'OP2EVTID', 'OP2EXPDEV',
                    'OP2EXPL', 'OP2EXPREA', 'OP2INTD', 'OP2INTR', 'OP2INTT', 'OP2REC', 'OP2TXPL',
                    'OP3COD', 'OP3CONT', 'OP3DEAD', 'OP3DEV_TY', 'OP3EVTID', 'OP3EXPDEV',
                    'OP3EXPL', 'OP3EXPREA', 'OP3INTD', 'OP3INTR', 'OP3INTT', 'OP3REC', 'OP3TXPL',
                    'OP4COD', 'OP4CONT', 'OP4DEAD', 'OP4DEV_TY', 'OP4EVTID', 'OP4EXPDEV',
                    'OP4EXPL', 'OP4EXPREA', 'OP4INTD', 'OP4INTR', 'OP4INTT', 'OP4REC', 'OP4TXPL',
                    'OP5COD', 'OP5CONT', 'OP5DEAD', 'OP5DEV_TY', 'OP5EVTID', 'OP5EXPDEV',
                    'OP5EXPL', 'OP5EXPREA', 'OP5INTD', 'OP5INTR', 'OP5INTT', 'OP5REC', 'OP5TXPL',
                    'OP6COD', 'OP6CONT', 'OP6DEAD', 'OP6DEV_TY', 'OP6EVTID', 'OP6EXPDEV',
                    'OP6EXPL', 'OP6EXPREA', 'OP6INTD', 'OP6INTR', 'OP6INTT', 'OP6REC', 'OP6TXPL',
                    'OP7COD', 'OP7CONT', 'OP7DEAD', 'OP7DEV_TY', 'OP7EVTID', 'OP7EXPDEV',
                    'OP7EXPL', 'OP7EXPREA', 'OP7INTD', 'OP7INTR', 'OP7INTT', 'OP7REC', 'OP7TXPL',
                    'OP8COD', 'OP8CONT', 'OP8DEAD', 'OP8DEV_TY', 'OP8EVTID', 'OP8EXPDEV',
                    'OP8EXPL', 'OP8EXPREA', 'OP8INTD', 'OP8INTR', 'OP8INTT', 'OP8REC', 'OP8TXPL']

    # patient csv
    # - KCCQ, LIFE, LIFESTYLE: quality-of-life measures
    # - OUTCOME
    # - EXCH_NEW_STUDY: exchanged
    # - PC_PUMP: pump change
    outcome_col += ['INT_AFOL', 'INT_FOL', 'KCCQ12', 'KCCQ12PL', 'KCCQ12QL', 'KCCQ12SF',
                    'KCCQ12SL', 'KCCQ_PARENT_QUESTION', 'LIFE', 'LIFESTYLE_CHORES',
                    'LIFESTYLE_HOBBIES', 'LIFESTYLE_VISITING', 'OUTCOME', 'OUTCOME_I',
                    'PC_PUMP_EXCHANGE', 'PC_PUMP_EXCHANGE_REASON']

    # among these outcome cols, exclude the columns that were removed by dropna
    # this reduces the outcomes from 172 cols to 118 cols
    outcome_col = [col for col in outcome_col if col in df.columns.tolist()]
    # the remaining columns are pre-implant variables eligible to be features
    input_col = list(set(df.columns.tolist()) - set(outcome_col) - set(idx_col))

    """ export the whole df, the pre-implant df, and the outcome df to respective csvs """
    df.to_csv('data/03-14-2018-all.csv', encoding='utf-8')
    # excludes PATIENT_ID and FORM_ID
    df[outcome_col + ['OPER_ID']].to_csv('data/03-14-2018-post.csv', encoding='utf-8')
    # excludes PATIENT_ID and FORM_ID
    df[input_col + ['OPER_ID']].to_csv('data/03-14-2018-pre.csv', encoding='utf-8')

    """ prepare data for modeling by combining the input cols and the label 'INT_DEAD' """
    traintest = df[input_col + ['INT_DEAD']]
    traintest.to_csv('data/03-14-2018-traintest.csv', encoding='utf-8')

    print('Integrated and imported %d input columns, %d output columns and %d observations.'
          % (len(input_col), len(outcome_col), len(df)))  # 532 cols, 5821 rows

    # treat all columns of df as object (str) columns for now
    df[df.columns] = df[df.columns].astype(str)
    return df, input_col, outcome_col, idx_col


# feature_dict = feature_dict()
# df = import_and_join(feature_dict)
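
# Minimal usage sketch (not part of the original pipeline): it runs both steps and shows
# how a parsed FORMAT_VALUE mapping from the data dictionary could be used to decode one
# coded column. It assumes the data/*.csv files listed above are present; 'DEVICE_STRATEGY'
# is only an illustrative column name and may not exist in your extract.
if __name__ == '__main__':
    dd = feature_dict()
    df, input_col, outcome_col, idx_col = import_and_join(dd)

    example_col = 'DEVICE_STRATEGY'  # hypothetical column name for illustration
    fmt = dd.get(example_col, {}).get('FORMAT_VALUE')
    if isinstance(fmt, dict) and example_col in df.columns:
        # map the raw codes to their dictionary labels and show the most frequent ones
        print(df[example_col].map(fmt).value_counts(dropna=False).head())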