# Test & Roll: Profit Maximizing A/B Tests # Functions for computing expected profit and sample size # for normal-normal (nn) with posterior mean as decision rule # closed form (when possible) and by simulation # Elea McDonnell Feit, eleafeit@gmail.com # Updated 1 May 2019 library(doParallel) test_size_nht <- function(s, d, conf=0.95, power=0.8, N=NULL) { # computes the reccomended sample size for a null hypothesis test # comparing two treatments with finite population correction # s is a vector of length 1 (symmetric) or 2 (asymmetric) # indicating repsonse std deviation(s) # d is the minimum detectable difference between treatments # conf is 1 - rate of type I error of the null hypothesis test # power is 1 - rate of type II error of the null hypothesis test # N is the finite population. If N=NULL, then no finite population correction is used. z_alpha <- qnorm(1 - (1-conf)/2) z_beta <- qnorm(power) if(length(s) == 1) { # symmetric response variance if (is.null(N)) { out <- (z_alpha + z_beta)^2 * (2 * s^2) / d^2 } else { out <- (z_alpha + z_beta)^2 * (2 * s^2) * N / (d^2 * (N-1) + (z_alpha + z_beta)^2 * 4 * s^2) } } else { # asymmetric response variance if(is.null(N)) { n1 <- (z_alpha + z_beta)^2 * (s[1]^2 + s[1]*s[2]) / d^2 n2 <- (z_alpha + z_beta)^2 * (s[1]*s[2] + s[2]^2) / d^2 out <- c(n1, n2) } else { n1 <- (z_alpha + z_beta)^2 * N * (s[1]^2 + s[1]*s[2]) / (d^2 * (N-1) + (z_alpha + z_beta)^2 * (s[1] + s[2])^2) n2 <- (z_alpha + z_beta)^2 * N * (s[2]^2 + s[1]*s[2]) / (d^2 * (N-1) + (z_alpha + z_beta)^2 * (s[1] + s[2])^2) out <- c(n1, n2) } } out } # FUNCTIONS FOR 2-ARM TEST & ROLL (SYMMETRIC AND ASYMMETRIC) ===== profit_nn <- function(n, N, s, mu, sigma, log_n=FALSE) { # computes the per-customer profit for test & roll with 2 arms # where response is normal with (possibly assymetric) normal priors # n is a vector of length 2 of sample sizes # N is the size of the deployment population # s is a vector of lenght 2 of the (known) std dev of the outcome # mu is a vector of length 2 of the means of the prior on the mean response # sigma is a vector of length 2 of the std dev of the prior on the mean response # if length(n)=1, equal sample sizes are assumed # if lenght(s)=1 symmetric priors are assumed and only the first elements of mu and sigma are used if (length(n) == 1) n <- rep(n, 2) if (log_n) n <- exp(n) stopifnot(N >= sum(n), n[1]>0, n[2]>0, sum(sigma <= 0) == 0, sum(s <= 0) == 0) if (length(s) == 1) { # symmetric deploy <- (N - n[1] - n[2]) * (mu[1] + (2 * sigma[1]^2) / (sqrt(2*pi) * sqrt(s[1]^2 * (n[1] + n[2]) / (n[1]*n[2]) + 2 * sigma[1]^2))) test <- mu[1] * (n[1] + n[2]) } else { # asymmetric e <- mu[1] - mu[2] v <- sqrt (sigma[1]^4 / (sigma[1]^2 + s[1]^2 / n[1] ) + sigma[2]^4 / (sigma[2]^2 + s[2]^2 / n[2] ) ) deploy <- (N - n[1] - n[2]) * ( mu[2] + e * pnorm(e/v) + v * dnorm(e/v) ) test <- mu[1] * n[1] + mu[2] * n[2] } #c(profit = test + deploy, profit_test = test, profit_deploy = deploy) (test + deploy)/N } test_size_nn <- function(N, s, mu, sigma) { # computes the profit-maximizing test size for a 2-armed Test & Roll # where response is normal with normal priors (possibly asymmetric) # N is the size of the deployment population # s is a vector of lenght 2 of the (known) std dev of the outcome # mu is a vector of length 2 of the means of the prior on the mean response # sigma is a vector of length 2 of the std dev of the prior on the mean response # if lenght(s)=1 symmetric priors are assumed and only the first elements of mu and sigma are used stopifnot(N>2, sum(s <= 0) == 0, sum(sigma <= 0) == 0) if (length(s)==1) { # symmetric n <- ( -3 * s[1]^2 + sqrt( 9*s[1]^4 + 4*N*s[1]^2*sigma[1]^2 ) ) / (4 * sigma[1]^2) n <- rep(n, 2) } else { n <- optim(par=log(rep(round(N*0.05), 2 ) - 1), fn=profit_nn, control=list(fnscale=-1), N=N, s=s, mu=mu, sigma=sigma, log_n=TRUE)$par n <- exp(n) } n } profit_perfect_nn <- function(mu, sigma){ # computes the per-customer profit with perfect information # where response is normal with symmetric normal priors # todo: adapt to asymmetric case (closed form may not be possible) stopifnot(sigma > 0) (mu + sigma/sqrt(pi)) } error_rate_nn <- function(n, s, sigma) { # computes the rate of incorrect deployments # where response is normal with symmetric normal priors if (length(n)==1) n <- rep(n, 2) stopifnot(n[1]>0, n[2]>0, s>0, sigma>0) stopifnot(length(s)==1, length(sigma)==1) 1/2 - 2*atan( sqrt(2)*sigma*sqrt(n[1]*n[2] / (n[1] + n[2]) ) / s ) / (2*pi) } plot_prior_mean_resp_nn <- function(mu, sigma) { if (length(mu)==1) {mu <- rep(mu, 2); sigma <- rep(sigma,2)} xlim <- c(min(mu[1] - 4*sigma[1], mu[2] - 4*sigma[2]), max(mu[1] + 4*sigma[1], mu[2] + 4*sigma[2])) ylim <- c(0, 1.05*max(dnorm(mu[1], mean=mu[1], sd=sigma[1]), dnorm(mu[2], mean=mu[2], sd=sigma[2]))) xs <- seq(xlim[1], xlim[2], length.out=100) plot(xs, dnorm(xs, mean=mu[1], sd=sigma[1]), type="l", col="red", xlim=xlim, ylim=ylim, main="Prior on Mean Response", xlab="Profit Per Customer", ylab="Prior Density") lines(xs, dnorm(xs, mean=mu[2], sd=sigma[2]), type="l", col="blue", lty=3) legend("topright", legend=c("Treatment 1", "Treatment 2"), col=c("red", "blue"), lty=c(1,3), bty="n") } plot_prior_resp_nn <- function(s, mu, sigma) { if (length(mu)==1) {mu <- rep(mu, 2); sigma <- rep(sigma,2); s <- rep(s, 2)} sigma_resp1 <- sqrt(sigma[1]^2 + s[1]^2) sigma_resp2 <- sqrt(sigma[2]^2 + s[2]^2) xlim <- c(min(mu[1] - 4*sigma_resp1, mu[2] - 4*sigma_resp2), max(mu[1] + 4*sigma_resp1, mu[2] + 4*sigma_resp2)) ylim <- c(0, 1.05*max(dnorm(mu[1], mean=mu[1], sd=sigma_resp1), dnorm(mu[2], mean=mu[2], sd=sigma_resp2))) xs <- seq(xlim[1], xlim[2], length.out=100) plot(xs, dnorm(xs, mean=mu[1], sd=sigma_resp1), type="l", col="red", xlim=xlim, ylim=ylim, main="Prior on Response", xlab="Profit Per Customer", ylab="Prior Density") lines(xs, dnorm(xs, mean=mu[2], sd=sigma_resp2), type="l", col="blue", lty=3) legend("topright", legend=c("Treatment 1", "Treatment 2"), col=c("red", "blue"), lty=c(1,3), bty="n") } plot_prior_effect_nn <- function(mu, sigma, abs=FALSE) { if (length(mu)==1) {mu <- rep(mu, 2); sigma <- rep(sigma,2)} sigma_effect <- sqrt(sigma[1]^2 + sigma[2]^2) xlim <- c(mu[2] - mu[1] - 4*sigma_effect, mu[2] - mu[1] + 4*sigma_effect) if (!abs) { xs <- seq(xlim[1], xlim[2], length.out=100) plot(xs, dnorm(xs, mean=mu[2] - mu[1], sd=sigma_effect), type="l", xlim=xlim, main="Prior on Treatment Effect (m2 - m1)", xlab="Difference in Profit per Customer", ylab="Prior Density") } else { xs <- seq(0, max(-xlim[1], xlim[2]), length.out=100) ys <- dnorm(xs, mean=mu[2] - mu[1], sd=sigma_effect) + dnorm(-xs, mean=mu[2] - mu[1], sd=sigma_effect) plot(xs, ys, type="l", main="Prior on Treatment Effect |m2 - m1|", xlab="Absolute Difference in Profit Per Customer", ylab="Prior Density") } } # FUNCTIONS FOR K-ARM TEST & ROLL (REQUIRES SIMULATION) ===== one_rep_profit <-function(n, N, s, mu, sigma, K, TS=FALSE) { # utility function used in profit_nn_sim() to simulate one set of potential outcomes m <- rnorm(K, mu, sigma) # draw a true mean for the arm y <- matrix(rnorm(N*K, m, s), nrow=N, ncol=K, byrow=TRUE) # N observations from each arm # perfect information profit perfect_info <- sum(y[,which.max(m)]) # Perfect information: sum observations from arm with highest m # test and roll profit with sample sizes n postmean <- rep(NA, K) for (k in 1:K) { postvar <- 1/(1/sigma[k]^2 + n[k]/s[k]^2) postmean[k] <- postvar*(mu[k]/sigma[k]^2 + sum(y[1:n[k], k])/s[k]^2) } delta <- which.max(postmean) # pick the arm with the highest posterior mean error <- delta != which.max(m) deploy_1 <- delta == 1 test_roll <- 0 for (k in 1:K) test_roll <- test_roll + sum(y[1:n[k], k]) # profit from first n observations for each arm test_roll <- test_roll + sum(y[(sum(n)+1):N, delta]) # profit from remaining observations for selected arm # Thompson sampling profit thom_samp <- NA if (TS==TRUE) { n <- rep(0, K) # Initialize each arm with zero observations # note mu and sigma are initialized at the priors postvar <- t(1/(1/sigma^2 + t(matrix(1:N, nrow=N, ncol=K)) * 1/s^2)) postmean <- postvar * t(mu/sigma^2 + t(apply(y, 2, cumsum)) * 1/s^2) for (i in 1:N) { k <- which.max(rnorm(K, mu, sigma)) # draw a sample from the current posterior for each arm and choose arm with largest draw n[k] <- n[k] + 1 # increase the number of observations used from sampled arm mu[k] <- postmean[n[k], k] # advance mu and sigma sigma[k] <- sqrt(postvar[n[k], k]) } thom_samp <- 0 for (k in 1:K) thom_samp <- thom_samp + sum(y[1:n[k], k]) } return(c(perfect_info=perfect_info, test_roll=test_roll, thom_samp=thom_samp, error = error, deploy_1=deploy_1)) } profit_nn_sim <- function(n, N, s, mu, sigma, K=2, TS=FALSE, R=1000) { # computes the per-customer profit for test & roll with K arms # where response is normal with (assymetric) normal priors # R is the number of simulation replications # n is the sample size for test & roll # N is the size of the total population # s are the (known) std devs of the outcome (vector of length 1 or K) # mu are the means of the priors on the mean response of length (vector of length 1 or K) # sigma are the std devs of the priors on the mean response (vector of length 1 or K) # TS is a switch for computing profit for Thompson Sampling # if s is length 1, then arms are assumed to be symmetric if (length(s) == 1) { s <- rep(s, K); mu <- rep(mu, K); sigma <- rep(sigma, K) } if (length(n) == 1) n <- rep(n, K) stopifnot(length(s) == K, length(mu) == K, length(sigma) == K) stopifnot(sum(sigma <= 0) == 0, sum(s <= 0) == 0) stopifnot(N > K) reps <- foreach(i=1:R) %dopar% one_rep_profit(n, N, s, mu, sigma, K, TS) reps <- as.data.frame(do.call(rbind, reps)) profit <- apply(reps[,1:3], 2, mean) profit <- rbind(exp_profit=profit, apply(reps[,1:3], 2, quantile, probs=c(0.05, 0.95), na.rm=TRUE)) profit <- profit / N regret_draws <- 1 - reps[,1:3] / reps$perfect_info regret <- apply(regret_draws, 2, mean) regret <- rbind(exp_regret=regret, apply(regret_draws, 2, quantile, probs=c(0.05, 0.95), na.rm=TRUE)) error <- mean(reps[,4]) deploy_1 <- mean(reps[,5]) return(list(profit=profit, regret=regret, error_rate=error, deploy_1_rate = deploy_1, profit_draws=reps, regret_draws=regret_draws)) } one_rep_test_size <- function(n_vals, N, s, mu, sigma, K) { # utility function used in test_size_nn_sim() to simulate one set of potential outcomes # and profits for all possible equal sample sizes # potential outcomes m <- rnorm(K, mu, sigma) # draw a true mean for the arm y <- matrix(rnorm(N*K, m, s), nrow=N, ncol=K, byrow=TRUE) # N observations from each arm postmean <- matrix(NA, nrow=length(n_vals), ncol=K) for (k in 1:K) { postvar <- 1/(1/sigma[k]^2 + n_vals/s[k]^2) postmean[,k] <- postvar*(mu[k]/sigma[k]^2 + cumsum(y[1:(floor(N/K)-1),k])/s[k]^2) } delta <- apply(postmean, 1, which.max) # pick the arm with the highest posterior mean error <- delta != which.max(m) profit <- rep(0, length(n_vals)) for (i in seq_along(n_vals)) { for (k in 1:K) profit[i] <- profit[i] + sum(y[1:n_vals[i], k]) # profit from first n[i] observations for each arm profit[i] <- profit[i] + sum(y[(n_vals[i]*K + 1):N, delta[i]]) # profit from remaining observations for selected arm } return(cbind(n=n_vals, profit)) } test_size_nn_sim <- function(N, s, mu, sigma, K=2, R=1000) { # computes the profit-maximizing test size for a multi-armed test & roll # where response is normal with normal priors (possibly asymmetric) # N is the size of the deployment population # K is the number of arms # s is a K-vector of (known) std devs of the response, if length(s)==1, symmetric priors are used # mu is a K-vector of length K the means of the priors on the outcome # sigma is a K-vector of std devs of the priors on the mean response stopifnot(N > 2, sum(s <= 0) == 0, sum(sigma <= 0) == 0) if (length(s==1)) { # symmetric amrs # n is same for all arms; solve by enumeration s <- rep(s, K); mu <- rep(mu, K); sigma <- rep(sigma, K) n_vals <- 1:(floor(N/K)-1) # potential values for n reps <- foreach(i=1:R) %dopar% one_rep_test_size(n_vals, N, s, mu, sigma, K) reps <- as.data.frame(do.call(rbind, reps)) exp_profit <- xtabs(profit ~ n, data=reps) / R n <- rep(n_vals[which.max(exp_profit)], K) } else { # asymmetric stopifnot(length(mu)==K, length(s) == K, length(sigma) == K) # todo:finish this # best option is to use a numeric optimization, but then noise in profit_nn_sim # becomes critical. comments out is one strategy # start values based on two-arm symmetric #n <- ( -3 * s[1]^2 + sqrt( 9*s[1]^4 + 4*N*s[1]^2*sigma[1]^2 ) ) / (4 * sigma[1]^2) #n <- optim(par=log(n), fn=profit_nn_sim, control=list(fnscale=-1), # N=N, s=s, mu=mu, sigma=sigma, K=K, R=1000, log_n=TRUE)$par #n <- optim(par=n, fn=profit_nn_sim, control=list(fnscale=-1), # more precise simulation # N=N, s=s, mu=mu, sigma=sigma, K=K, R=5000, log_n=TRUE)$par n <- NA } return(list(n=n, max(exp_profit)/N)) } # SUMMARY FUNCTION ===== test_eval_nn <- function(n, N, s, mu, sigma) { # provides a complete summary of a test & roll plan # n is a vector of length 2 of sample sizes # N is the size of the deployment population # s is a vector of lenght 2 of the (known) std dev of the outcome # mu is a vector of length 2 of the means of the prior on the mean response # sigma is a vector of length 2 of the std dev of the prior on the mean response # if length(n)=1, equal sample sizes are assumed # if lenght(s)=1 symmetric priors are assumed and only the first elements of mu and sigma are used stopifnot(N >= n[1] + n[2], n[1] > 0, n[2] >0, sum(s <= 0) == 0, sum(sigma <=0) == 0) profit <- profit_nn(n, N, s, mu, sigma)*N if (length(s)==1) { # symmetric test <- mu[1] * (n[1] + n[2]) deploy <- profit - test rand <- mu[1]*N # choose randomly perfect <- profit_perfect_nn(mu, sigma)*N error_rate <- error_rate_nn(n, s, sigma) deploy_1_rate <- 0.5 } else { # assymetric test <- mu[1] * n[1] + mu[2] *n[2] deploy <- profit - test rand <- ((mu[1] + mu[2])*0.5)*N out <- profit_nn_sim(n, N, s, mu, sigma, R=10000) perfect <- out$profit["exp_profit", "perfect_info"]*N error_rate <- out$error_rate deploy_1_rate <- out$deploy_1_rate } gain <- (profit - rand) / (perfect - rand) data.frame(n1=n[1], n2=n[2], profit_per_cust = profit/N, profit = profit, profit_test = test, profit_deploy = deploy, profit_rand = rand, profit_perfect = perfect, profit_gain = gain, regret = 1-profit/perfect, error_rate = error_rate, deploy_1_rate = deploy_1_rate, tie_rate = 0) }