Supplementary R code for reproducing the examples in the paper "On Selecting and Conditioning in Multiple Testing and Selective Inference".
require("readr") || install.packages("readr")
require("truncnorm") || install.packages("truncnorm")
require("glmnet") || install.packages("glmnet")
require("lars") || install.packages("lars")
require("penalized") || install.packages("penalized")
require("plotrix") || install.packages("plotrix")
require("tidyverse") || install.packages("tidyverse")
# Center the columns of X (optionally) and scale each column to unit Euclidean norm
normc = function(X, center = TRUE) {
  X.centered = scale(X, center = center, scale = FALSE)
  X.scaled = scale(X.centered, center = FALSE, scale = sqrt(colSums(X.centered^2)))
  X.scaled[, ]
}
# Solve the lasso problem with glmnet (no intercept, no standardization) and
# return the coefficient vector at penalty lambda_glmnet
solve_problem_glmnet = function(X, y, lambda_glmnet) {
  lasso = glmnet(x = X, y = y, alpha = 1,
                 standardize = FALSE, intercept = FALSE, thresh = 1e-20)
  beta_hat = coef(lasso, s = lambda_glmnet)
  return(beta_hat[-1])  # drop the intercept entry
}
# Solve the lasso problem with the penalized package and return the coefficient vector
solve_problem_penalized = function(X, y, lambda_penalized) {
  lasso = penalized(penalized = X, response = y,
                    lambda1 = lambda_penalized, lambda2 = 0,
                    unpenalized = ~0, standardize = FALSE, trace = FALSE)
  beta_hat = coef(lasso, "penalized")
  return(beta_hat)
}
# CDF of a N(mu, sigma^2) variable conditional on falling outside the interval (a, b),
# evaluated at y; all probabilities are computed on the log scale for numerical stability
tn.cdf = function(y, mu, a, b, sigma) {
  d_right = pnorm(b, mu, sigma, lower.tail = FALSE, log.p = TRUE)  # log P(Y > b)
  d_left = pnorm(a, mu, sigma, lower.tail = TRUE, log.p = TRUE)    # log P(Y < a)
  d_max = max(d_right, d_left)
  d_log = d_max + log(exp(d_left - d_max) + exp(d_right - d_max))  # log P(Y < a or Y > b)
  if (y > a & y < b) {
    n_log = d_left
    return(exp(n_log - d_log))
  } else {
    if (y > b) {
      n_y_tilde = pnorm(y, mu, sigma, lower.tail = FALSE, log.p = TRUE)
      n_b_tilde = pnorm(b, mu, sigma, lower.tail = FALSE, log.p = TRUE)
      n_yb = n_b_tilde + log(1 - exp(n_y_tilde - n_b_tilde))  # log P(b < Y <= y)
      n_a = d_left
      return(exp(n_yb - d_log) + exp(n_a - d_log))
    } else {
      n_log = pnorm(y, mu, sigma, lower.tail = TRUE, log.p = TRUE)
      return(exp(n_log - d_log))
    }
  }
}
# Selective inference for each coefficient: two-sided p-value and (1-alpha)
# confidence limits, conditional on whether the lasso selects the variable
lasso_inference = function(X, y, lambda, alpha, sigma) {
  n = nrow(X)
  m = ncol(X)
  pval = vector(mode = "numeric", length = m)
  lower = upper = pval
  active = vector(mode = "logical", length = m)
  for (j in 1:m) {
    eta_j = X %*% solve(crossprod(X)) %*% (1:m == j)  # contrast giving the j-th least-squares coefficient
    sqlen_eta_j = c(crossprod(eta_j))
    X_j = X[, j, drop = FALSE]
    X_jc = X[, -j, drop = FALSE]
    nu_j = (diag(rep(1, n)) - eta_j %*% t(eta_j) / sqlen_eta_j) %*% y  # part of y orthogonal to eta_j
    beta_jc = solve_problem_penalized(X[, -j, drop = FALSE], nu_j, lambda_penalized = lambda)
    r_j = X_jc %*% beta_jc - nu_j
    z_j = c(t(eta_j) %*% y)
    a_j = c(sqlen_eta_j * (crossprod(X_j, r_j) - lambda))  # truncation limits for z_j
    b_j = c(sqlen_eta_j * (crossprod(X_j, r_j) + lambda))
    isactive = abs(crossprod(X_j, r_j) - crossprod(eta_j, y) / sqlen_eta_j) >= lambda
    if (isactive) {
      # selected: condition on z_j lying outside (a_j, b_j)
      pv = tn.cdf(y = z_j, mu = 0, a = a_j, b = b_j, sigma = sigma * sqrt(sqlen_eta_j))
      up = uniroot(function(x) tn.cdf(y = z_j, mu = x, a = a_j, b = b_j, sigma = sigma * sqrt(sqlen_eta_j)) - alpha / 2,
                   interval = c(a_j, b_j), extendInt = "yes", maxiter = 1000)$root
      lo = uniroot(function(x) 1 - tn.cdf(y = z_j, mu = x, a = a_j, b = b_j, sigma = sigma * sqrt(sqlen_eta_j)) - alpha / 2,
                   interval = c(a_j, b_j), extendInt = "yes", maxiter = 1000)$root
    } else {
      # not selected: condition on z_j lying inside (a_j, b_j)
      pv = ptruncnorm(q = z_j, mean = 0, a = a_j, b = b_j, sd = sigma * sqrt(sqlen_eta_j))
      up = uniroot(function(x) ptruncnorm(q = z_j, mean = x, a = a_j, b = b_j, sd = sigma * sqrt(sqlen_eta_j)) - alpha / 2,
                   interval = c(a_j, b_j), extendInt = "yes")$root
      lo = uniroot(function(x) 1 - ptruncnorm(q = z_j, mean = x, a = a_j, b = b_j, sd = sigma * sqrt(sqlen_eta_j)) - alpha / 2,
                   interval = c(a_j, b_j), extendInt = "yes")$root
    }
    pval[j] = 2 * min(pv, 1 - pv)  # two-sided p-value
    lower[j] = lo
    upper[j] = up
    active[j] = isactive
  }
  return(list(pval = pval, lower = lower, upper = upper, active = active))
}
# Selective two-sided p-values for each coefficient, evaluated at the true
# coefficient vector beta for the selected variables (used in the simulations below)
lasso_pvalue = function(X, y, lambda, alpha, sigma, beta) {
  n = nrow(X)
  m = ncol(X)
  pval = vector(mode = "numeric", length = m)
  active = vector(mode = "logical", length = m)
  for (j in 1:m) {
    eta_j = X %*% solve(crossprod(X)) %*% (1:m == j)
    sqlen_eta_j = c(crossprod(eta_j))
    X_j = X[, j, drop = FALSE]
    X_jc = X[, -j, drop = FALSE]
    nu_j = (diag(rep(1, n)) - eta_j %*% t(eta_j) / sqlen_eta_j) %*% y
    beta_jc = solve_problem_penalized(X = X[, -j, drop = FALSE], y = nu_j, lambda_penalized = lambda)
    r_j = X_jc %*% beta_jc - nu_j
    z_j = c(t(eta_j) %*% y)
    a_j = c(sqlen_eta_j * (crossprod(X_j, r_j) - lambda))
    b_j = c(sqlen_eta_j * (crossprod(X_j, r_j) + lambda))
    isactive = abs(crossprod(X_j, r_j) - crossprod(eta_j, y) / sqlen_eta_j) >= lambda
    if (isactive) {
      pv = tn.cdf(y = z_j, mu = beta[j], a = a_j, b = b_j, sigma = sigma * sqrt(sqlen_eta_j))
    } else {
      pv = ptruncnorm(q = z_j, mean = 0, a = a_j, b = b_j, sd = sigma * sqrt(sqlen_eta_j))
    }
    pval[j] = 2 * min(pv, 1 - pv)
    active[j] = isactive
  }
  return(list(pval = pval, active = active))
}
# Return a logical vector indicating which variables the lasso selects at penalty lambda
lasso_active = function(X, y, lambda) {
  n = nrow(X)
  m = ncol(X)
  active = vector(mode = "logical", length = m)
  for (j in 1:m) {
    eta_j = X %*% solve(crossprod(X)) %*% (1:m == j)
    sqlen_eta_j = c(crossprod(eta_j))
    X_j = X[, j, drop = FALSE]
    X_jc = X[, -j, drop = FALSE]
    nu_j = (diag(rep(1, n)) - eta_j %*% t(eta_j) / sqlen_eta_j) %*% y
    beta_jc = solve_problem_penalized(X = X[, -j, drop = FALSE], y = nu_j, lambda_penalized = lambda)
    r_j = X_jc %*% beta_jc - nu_j
    isactive = abs(crossprod(X_j, r_j) - crossprod(eta_j, y) / sqlen_eta_j) >= lambda
    active[j] = isactive
  }
  return(active)
}
Stamey et al. (1989) were interested in the relationship between prostate-specific antigen (PSA) and several clinical measures, including log cancer volume (lcavol), log prostate weight (lweight), age, log of the amount of benign prostatic hyperplasia (lbph), seminal vesicle invasion (svi), log of capsular penetration (lcp), the Gleason score (gleason), and the percentage of Gleason scores 4 or 5 (pgg45). The dataset consists of measurements on \(n=97\) men who were about to undergo a radical prostatectomy.
# Prostate data: load, then center and scale the predictors and the response
prostate <- read_delim("https://hastie.su.domains/ElemStatLearn/datasets/prostate.data",
                       delim = "\t", escape_double = FALSE, trim_ws = TRUE)[, -1]
X_raw = as.matrix(prostate[, 1:8])
X = normc(X_raw, center = TRUE)
y_raw = prostate[, 9]
y = normc(y_raw, center = TRUE)
sigma_hat = summary(lm(y ~ X))$sigma  # residual SD from the model with all variables
n = nrow(X)
m = ncol(X)
The estimate of \(\sigma^2\) from the regression model with all variables is used as if it were the true value. For this dataset, Liu et al. (2018) used \(\lambda=0.0327\) (chosen by 10-fold cross-validation), which resulted in the selection of 7 variables. The following table compares the selective conditional \(p\)-values (the row labelled lambdacv) with the unconditional \(p\)-values for the hypotheses \(\beta_i=0\) (the row labelled lambda0).
lambdacv=lasso_inference(X, y, lambda=0.03270833, alpha=0.1, sigma=sigma_hat)$pval
lambda0=2*pnorm(abs(summary(lm(y~X))$coef[-1,3]), lower.tail = FALSE)
tab=rbind(lambdacv,lambda0)
colnames(tab)=colnames(X)
round(tab,4)
#>          lcavol lweight    age   lbph    svi    lcp gleason  pgg45
#> lambdacv      0  0.0036 0.0915 0.1779 0.0031 0.4666  0.0073 0.7419
#> lambda0       0  0.0020 0.0552 0.0949 0.0016 0.2380  0.7513 0.3072
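As a small check, not part of the original analysis, the set of variables selected at this value of \(\lambda\) can be listed with the lasso_active helper defined above:
# list the variables the lasso selects at lambda = 0.03270833 (illustrative check only)
sel = lasso_active(X, y, lambda = 0.03270833)
colnames(X)[sel]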
Selective conditional \(p\)-values as a function of \(\lambda\). Black \(p\)-value curves are conditional on selection by the lasso, grey ones are conditional on non-selection. Vertical dashed lines indicate the values of \(\lambda\) at which the active set changes.
alpha = 0.1
lambdas = 10^seq(0.01,-4,length=100)
knots = lars::lars(X,y)$lambda
PV = matrix(NA, nrow=length(lambdas), ncol=m)
LO = UP = AS = PV
# selective p-values and confidence limits over a grid of lambda values
for (i in 1:length(lambdas)) {
  lambda = lambdas[i]
  res = lasso_inference(X, y, lambda, alpha, sigma = sigma_hat)
  PV[i, ] <- res$pval
  AS[i, ] <- res$active
  LO[i, ] <- res$lower
  UP[i, ] <- res$upper
}
pvals = 2*pnorm(abs(summary(lm(y~X))$coef[-1,3]), lower.tail = FALSE)
PV_A = PV_NA = PV
PV_A[!AS] = NA
PV_NA[AS] = NA
#pdf("Figure_pvals.pdf")
matplot(log(c(0, lambdas)), rbind(PV_A, pvals),
        xlim = c(-9.5, 0.5), col = 1, type = "l", lty = 1, lwd = 2,
        xlab = expression(log(lambda)), ylab = "p-value")
for (i in 1:length(knots)) abline(v = log(knots[i]), lty = 2, col = "gray")
text(x = 0.5, y = pvals, labels = colnames(X), cex = 0.5, col = "gray")
text(x = -9.5, y = pvals, labels = colnames(X), cex = 0.5)
matlines(log(c(0, lambdas)), rbind(PV_NA, pvals),
         col = "gray", lty = 1, lwd = 2)
90% confidence intervals for all eight variables of the prostate dataset as a function of \(\lambda\), with intervals for selected coefficients in black and for non-selected ones in grey.
ll = log(lambdas)
l = (ll - min(ll) )/(max(ll)-min(ll)) *0.9
j = 1
plot(j+l,(LO[,j]+UP[,j])/2, col="white", ylim=c(-1,1), xlim=c(1,m+1),ylab="90% confidence interval", xaxt="n", xlab="")
axis(1, at=(1:m)+0.5,labels=names(prostate)[-c(9:10)])
axis(3, at=(1:m)-.05, labels=c(c(-9,0),rep(NA,m-2)))
mtext(expression(log(lambda)),side=3,at=1.5-.05)
cols = c("gray","black")
for (j in 1:m) {
  abline(v = j - .05, lty = 3)
  for (i in 1:length(l)) segments(x0 = j + l[i], x1 = j + l[i], y0 = LO[i, j], y1 = UP[i, j], col = cols[AS[i, j] + 1])
}
abline(v=9-.05, lty=3)
abline(h=0, lty=3)
Plot of lasso selection regions in the \(Y\) space: red, blue, cyan and green regions indicate \(S=\emptyset\), \(S=\{1\}\), \(S=\{2\}\) and \(S=\{1,2\}\), respectively. \(Y\sim N(\mu, I_2)\), and the dashed circle marks the 95% probability region of \(Y\).
m = 2
rho <- .95
Sigma = diag(rep(1-rho,m)) + (rho) * matrix(1, nrow = m, ncol = m)
X = t(eigen(Sigma)$vectors %*% diag(sqrt(eigen(Sigma)$values )) )
beta_true = matrix(c(5,5),ncol=1)
sigma_true = 1
n = nrow(X)
mylambda = .2
n.grid <- 100
grid_y1 <- seq(-2, 10, length = n.grid)
grid_y2 <- seq(-3, 3, length = n.grid)
grid_y <- data.matrix(expand.grid(grid_y1, grid_y2))
RES = matrix(NA, nrow=nrow(grid_y), ncol=2)
for (i in 1:nrow(grid_y)) {
  y = grid_y[i, ]
  RES[i, ] = lasso_active(X, y, lambda = mylambda)
}
plot(grid_y,pch=".", asp=1)
abline(h=0,lty=1)
abline(v=0,lty=1)
points(grid_y[apply(RES,1,sum)==0,],pch=19,col=2, cex=0.25)
points(grid_y[apply(RES,1,sum)==2,],pch=19,col=3, cex=0.25)
points(grid_y[apply(RES,1,sum)==1 & RES[,1]==T,],pch=19,col=4, cex=0.25)
points(grid_y[apply(RES,1,sum)==1 & RES[,2]==T,],pch=19,col=5, cex=0.25)
mu = X %*% beta_true
points(mu[1],mu[2], pch=19)
draw.circle(x = mu[1], y = mu[2], radius = sqrt(qchisq(.95, df = 2)), lty = 2)  # dashed 95% probability circle
The following simulation demonstrates numerically the lack of FCR control at level \(\alpha\) and the lack of simultaneous coverage at confidence level \(1-\alpha/|S|\).
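For reference, the following summary is read off from the code below rather than taken from the paper: non-coverage of a selected coefficient's \(1-\alpha\) selective interval corresponds to its two-sided selective \(p\)-value, evaluated at the true \(\beta_j\), being at most \(\alpha\), so the simulation estimates
\[
\widehat{\mathrm{FCR}} = \frac{1}{B}\sum_{b=1}^{B} \frac{\#\{j \in S_b : p_{j,b} \le \alpha\}}{\max(1, |S_b|)}
\qquad\text{and}\qquad
\widehat{\mathrm{cov}}_{\mathrm{sim}} = \frac{1}{B}\sum_{b=1}^{B} \mathbf{1}\{\, |S_b|\, p_{j,b} > \alpha \ \text{for all}\ j \in S_b \,\},
\]
where \(S_b\) is the set of variables selected in simulation run \(b\) and \(p_{j,b}\) is the selective \(p\)-value of variable \(j\).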
set.seed(123)
m = 2
rho <- 0.95 # correlation coefficient
Sigma = diag(rep(1-rho,m)) + (rho) * matrix(1, nrow = m, ncol = m)
X = t(eigen(Sigma)$vectors %*% diag(sqrt(eigen(Sigma)$values )) )
beta_true = rep(5,m)
sigma_true = 1
n = nrow(X)
B = 5000
mylambda = 0.2
myalpha = 0.1
AS = PV = matrix(NA, nrow=B, ncol=m)
set.seed(123)
for (b in 1:B) {
  y = X %*% beta_true + rnorm(n, mean = 0, sd = sigma_true)
  res = lasso_pvalue(X, y, lambda = mylambda, alpha = myalpha, sigma = sigma_true, beta = beta_true)
  PV[b, ] = res$pval
  AS[b, ] = res$active
}
#simultaneous coverage on the selected
simcov = 1-mean(sapply(1:nrow(PV), function(i) any(p.adjust(PV[i,AS[i,]],"bonf") <= myalpha)))
simcov
#> [1] 0.8802
#FCR
fcr = mean(sapply(1:nrow(PV), function(i) sum( PV[i,AS[i,]] <= myalpha )/pmax(1,sum(AS[i,])) ))
fcr
#> [1] 0.1313
Expected number of rejections for the four procedures defined in Section 7, based on \(n = 100\) hypotheses, \(\alpha = 0.05\), and \(10^4\) simulations.
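In terms of the sorted \(p\)-values \(p_{(1)} \le \dots \le p_{(n)}\), the code below implements the four procedures as follows (this summary is read off from the code, not from the paper): A rejects the hypothesis with the smallest \(p\)-value when \(p_{(1)}/p_{(2)} < \alpha\); B uses the same gate and, when it passes, additionally applies Benjamini-Hochberg at level \(\alpha n/(n-1)\) to the adjusted \(p\)-values \((p_{(i)}-p_{(1)})/(1-p_{(1)})\), \(i \ge 2\), counting the gate rejection as well; C rejects the smallest \(p\)-value when \(p_{(1)} < 1-(1-\alpha)^{1/n}\); D counts the Benjamini-Hochberg rejections at level \(\alpha\).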
B <- 10000
n <- 100
alpha = 0.05
mu=3
set.seed(13579)
nulls <- matrix(runif(B*n), B, n)                    # null p-values
alts <- pnorm(matrix(rnorm(B*n, mean=-mu), B, n))    # non-null p-values
crit <- 1 - (1-alpha)^(1/n:1)  # crit[i] = 1-(1-alpha)^(1/(n-i+1)); note 1/n:1 parses as 1/(n:1)
n1s <- 0:10
res <- sapply(n1s, function(n1) {
  P <- cbind(nulls[, seq_len(n - n1)], alts[, seq_len(n1)])
  P <- do.call(rbind, apply(P, 1, sort, simplify = FALSE))  # sorted p-values, one row per simulation
  gate <- P[, 1] / P[, 2] < alpha
  Si <- apply(P > matrix(crit, B, n, byrow = TRUE), 1, function(p) min(c(which(p), n + 1)) - 1)
  BH <- apply(P, 1, function(p) sum(p.adjust(p, method = "BH") < alpha))
  Si2 <- apply((P[, -1] - P[, 1]) / (1 - P[, 1]) > matrix(crit[-1], B, n - 1, byrow = TRUE), 1, function(p) min(c(which(p), n)))
  BH2 <- apply((P[, -1] - P[, 1]) / (1 - P[, 1]), 1, function(p) sum(p.adjust(p, method = "BH") < alpha * n / (n - 1)))
  # numbers of rejections for the four procedures A-D
  A <- P[, 1] / P[, 2] < alpha
  B <- ifelse(gate, BH2 + 1, 0)
  C <- P[, 1] < crit[1]
  D <- BH
  c(A = mean(A), B = mean(B), C = mean(C), D = mean(D))
})
D <- as.data.frame(cbind(n1 = n1s, t(res))) %>%
  pivot_longer(A:D, values_to = "rate", names_to = "method") %>%
  mutate(n1 = as.factor(n1))
library(ggplot2)
labs <- c("A: cond. sel.",
"B: cond. non-sel.",
"C: uncond. sel.",
"D: uncond. non-sel.")
ggplot(D) + aes(group=method, color=method, x=n1, y=rate, linetype=method,
shape=method) + geom_point() + geom_line() + theme_minimal() +
coord_cartesian(ylim=c(0, 2.5)) + theme(legend.position ="bottom") +
ylab("rejected hypotheses") + xlab("false hypotheses") +
scale_color_discrete(labels = labs) +
scale_linetype_discrete(labels = labs) +
scale_shape_discrete(labels = labs)