capture log close
log using lecture04.log, replace text

//This log file contains most of the examples used in Lecture 4 of
//Stata Programming and Data Management, along with additional explanations
//and examples.

version 14 //We're teaching new-style merging. According to the CoursePlus survey (70% response rate so far), all students have version >=11
//also, "export excel" requires version 14
//if you have version 13 or earlier, you can comment that stuff out and edit line 8
clear all //clear all data from memory
macro drop _all //clear all macros in memory
set more off   //give output all at once (not one screenful at a time)
set linesize 80 //maximum allowed width for output
set varabbrev on //allows variable abbreviation



////////////////////////////////////////////////////////
//                  MERGING
////////////////////////////////////////////////////////

// Merging
use transplants, clear

//examples of syntax for merging
//NOTE: after each example we reload transplants.dta

//simplest merge: link data in memory (transplants.dta) to donors_recipients.dta
//by matching on the variable fake_id which appears in both datasets
//NOTE: rename your datasets after downloading 
//e.g. donors_recipients2022.dta -> donors_recipients.dta
merge 1:1 fake_id using donors_recipients

//the merge command creates a new variable, _merge
//check how many records matched
tab _merge

//"master" = dataset that was in memory before we merged
//"using" = dataset that we are bringing in with the merge command
//"match" = appears in both master and using datasets

use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(match)
//only keep records that appear both in the original dataset and in the new dataset


use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(master match)
//only keep records that appear in the master dataset only, or both datasets

use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(master using)
//only keep records that appear in the master dataset only, 
//or the using dataset only, but not both

use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(master match) gen(mergevar)
//call the newly created created variable "mergevar" instead of "_merge"

use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(match) nogen
//don't create a new variable

merge m:1 fake_don_id using donors, keep(match) nogen keepusing(age_don)
//don't load all the new variables from the donors dataset.
//just load age_don.

corr age*
//are donor age and recipient age correlated in our fake data? No.

//merging: using assert to make sure that all your records match
use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(master match)
assert _merge==3
drop _merge

//merging: using assert to make sure that *most* of your records match
use transplants, clear
merge 1:1 fake_id using donors_recipients, keep(master match)
assert inlist(_merge, 1, 3) //master only, or matched
quietly count if _merge == 3
assert r(N)/_N > 0.99  //99% of records have _merge==3
drop _merge


////////////////////////////////////////////////////////
//                  POSTFILE
////////////////////////////////////////////////////////

//minimal postfile example

//create a new dataset. The "postname" (what we'll call this dataset within
//our Stata script) is "mypost". There will be three variables, a, b, and c.
//the file name on disk will be abc.dta
postfile mypost a b c using abc.dta, replace

//write three rows of data directly into the new dataset.
//note that each value has to be surrounded by parentheses
post mypost (1) (2) (3)
post mypost (4) (5) (6)
post mypost (7) (8) (-2)

//all done. Save the file and close it
postclose mypost

//check out our results
use abc, clear
list, clean noobs


use transplants, clear

regress bmi gender age 
postfile pp str80 coef double(result CIleft CIright pvalue) ///
    using betas.dta, replace
foreach v of varlist gender age {
    local res=_b[`v']
    local left=_b[`v']+_se[`v']*invttail(e(df_r),0.975)
    local right=_b[`v']+_se[`v']*invttail(e(df_r),0.025)
    local pval=2*ttail(e(df_r), abs(_b[`v']/_se[`v']))
    post pp ("`v'") (`res') (`left') (`right') (`pval')
}
postclose pp

//let's have a look
use betas, clear
list, clean noobs

////////////////////////////////////////////////////////
//                  EXPORT EXCEL
////////////////////////////////////////////////////////

export excel coef result CIleft CIright using betas_table.xlsx, replace

format %3.2f result

gen ci = "(" + string(CIleft, "%3.2f") + ", " + string(CIright, "%3.2f") + ")"

export excel coef result ci using betas_table2.xlsx, replace firstrow(variables)

label var coef "Variable"
label var result "Coefficient"
label var ci "95% CI"
export excel coef result ci using betas_table3.xlsx, replace firstrow(varlabels)


////////////////////////////////////////////////////////
//                  OPTIONS
////////////////////////////////////////////////////////


//demonstrate options
use transplants, clear

capture program drop opt_demo
program define opt_demo
    syntax, [myoption]  //the user can choose to specify "myoption"
    if "`myoption'" != "" {  //if they said "myoption", display it
        disp "You picked the option `myoption'"
    }
    else {
        disp "No option" //otherwise, display this
    }
end

opt_demo   //"No option"
opt_demo, myoption   //"You picked the option myoption"

capture program drop table1_o
program define table1_o
    syntax varlist [if] , [round]

    //set display format
    //if they specified "round", then macro round will equal the word "round"
    //otherwise, it's blank (equals "")
    if "`round'" != "" {  
        local D %1.0f  //macro D will contain the display format
    }
    else {
        local D %3.2f  //set macro D to %3.2f
    }

    disp "Variable" _col(12) "mean(SD)" _col(25) "range"
    foreach v of varlist `varlist' {
        quietly sum `v' `if'
        disp "`v'" _col(12) `D' r(mean) "(" `D' r(sd) ")" _col(25) ///
            `D' r(min) "-" `D' r(max)  //notice we use macro D for format
    }
end


table1_o age bmi wait_yrs if age<40
table1_o age bmi wait_yrs if age>40, round

capture program drop minmax
program define minmax
    syntax  , [min(int 0) max(int 99)]
    disp "Min: `min'" _col(12) "Max: `max'"
end
minmax
minmax, min(20)
minmax, min(20) max(40)

//another example
capture program drop list_top
program define list_top
    syntax varname, [top(int 10)]
    //list the top `top' values
    //if `top' not specified, list the top 10
    preserve
    gsort -`varlist'
    list `varlist' in 1/`top', clean noobs
end

list_top age
list_top age, top(5)

//illustrating a function that takes a string option
capture program drop disptwice
program define disptwice
    syntax, [mystring(string)]
    disp "`mystring'"
    disp "`mystring'"
end


program define birthday
    syntax, name(string)
    disptwice, mystring("Happy Birthday to you")
    disp "Happy Birthday, dear `name'"
    disp "Happy birthday to you"
end

birthday, name("Gugu Mbatha")  //04/21/1983


//combining several of the above techniques into a "table1" program
capture program drop table1_nice
program define table1_nice
    syntax varlist [if] , [replace] [precision(int 1)] [title(string)]
    //if: specify records to include in analysis (optional)
    //precision: # of digits to the right of decimal point (optional, default 1)
    //title: title to display before the table (optional)

    preserve
    capture keep `if' //temporarily drop any records not required

    //set display format
    assert inrange(`precision', 0, 6) //allow 0-6 digits to R of decimal point
    local pplus = `precision' + 1 //first number for formatting code
    local D %`pplus'.`precision'f

    if "`title'" != "" {
        disp "`title'"
    }

    disp "Variable" _col(12) "mean(SD)" _col(25) "range"
    foreach v of varlist `varlist' {
        quietly sum `v' `if'
        disp "`v'" _col(12) `D' r(mean) "(" `D' r(sd) ")" _col(25) ///
            `D' r(min) "-" `D' r(max)  //notice we use macro D for format
    }
end


table1_nice age bmi wait_yrs if age>40, precision(2) replace ///
    title("Study population")





//put it in a program
capture program drop model
program define model
    //takes a varlist of independent (x) variables,
    //a filename to post the data to,
    //and the name of a dependent (y) variable
    syntax varlist, yvar(varname) filename(string)

    regress `yvar' `varlist'

    postfile pp str80 variable double(result CIleft CIright pvalue) ///
        using `filename', replace
    foreach v of varlist `varlist' {
        local res=_b[`v']
        local left=_b[`v']+_se[`v']*invttail(e(df_r),0.975)
        local right=_b[`v']+_se[`v']*invttail(e(df_r),0.025)
        local pval=2*ttail(e(df_r), abs(_b[`v']/_se[`v']))
        post pp ("`v'") (`res') (`left') (`right') (`pval')
    }
    postclose pp
end

use transplants, clear
model age wait_yrs gender rec_hcv_antibody, yvar(bmi) filename(model_bmi)


use model_bmi, clear
list, clean noobs

//BONUS: make it fancy
capture program drop model2
program define model2
    syntax varlist, cmd(string) [show replace] filename(string)

    `cmd' `varlist'
     local cmd1=word("`cmd'",1)

    assert inlist("`cmd1'", "regress", "logit", "logistic")

    postfile pp str80 variable double(result CIleft CIright pvalue) ///
        using `filename', `replace'
    foreach v of varlist `varlist' {
        if "`cmd1'"=="logistic" | "`cmd1'" == "logit" {
            local res=exp(_b[`v'])
            local left=exp(_b[`v']+_se[`v']*invnormal(0.025))
            local right=exp(_b[`v']+_se[`v']*invnormal(0.975))
            local pval=(1-normal(abs(_b[`v']/_se[`v'])))*2
        }
        else if "`cmd1'"=="regress" {
            local res=_b[`v']
            local left=_b[`v']+_se[`v']*invttail(e(df_r),0.975)
            local right=_b[`v']+_se[`v']*invttail(e(df_r),0.025)
            local pval=2*ttail(e(df_r), abs(_b[`v']/_se[`v']))
        }
        post pp ("`v'") (`res') (`left') (`right') (`pval')
    }
    postclose pp

    quietly {
        preserve
        use `filename', clear
        if "`show'"!="" {
            noisily list, clean noobs
        }
        compress
        save, replace
        format %3.2f result CIleft CIright pvalue
        export excel using `filename'.xlsx, replace firstrow(variables)
    }
end

use transplants, clear
model2 gender race age, filename(model_bmi) cmd("regress bmi") show replace
model2 bmi race age, filename(model_gender) cmd("logistic gender") show replace



log close