On Selecting and Conditioning in Multiple Testing and Selective Inference

2023-12-04

Supplementary R code for reproducing the examples in the paper "On Selecting and Conditioning in Multiple Testing and Selective Inference".

Required packages

require("readr") || install.packages("readr")
require("truncnorm") || install.packages("truncnorm")
require("glmnet") || install.packages("glmnet")
require("lars") || install.packages("lars")
require("penalized") || install.packages("penalized")
require("plotrix") || install.packages("plotrix")
require("tidyverse") || install.packages("tidyverse")

Helper functions

# Column-wise normalization: center each column (optional) and scale it to unit
# Euclidean norm. For a single-column input, the final [,] drops the result to a vector.
normc = function(X, center=T) {
  X.centered = scale(X, center=center, scale=F)
  X.scaled = scale(X.centered, center=F, scale=sqrt(colSums(X.centered^2)))
  X.scaled[,]
}
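
A quick illustrative check, not part of the paper's code: after normc, each column should sum to zero and have unit Euclidean norm. The random matrix below is arbitrary.

Z = normc(matrix(rnorm(20), 5, 4))
round(colSums(Z), 10)
#> [1] 0 0 0 0
round(colSums(Z^2), 10)
#> [1] 1 1 1 1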

# Lasso solution via glmnet (no intercept, no standardization). Note that glmnet
# divides the squared-error loss by n, so lambda_glmnet is not on the same scale
# as the lambda used elsewhere in this script.
solve_problem_glmnet = function(X, y, lambda_glmnet){
  lasso = glmnet(x=X, 
                 y=y, 
                 alpha=1,
                 standardize=FALSE, 
                 intercept=FALSE, 
                 thresh=1e-20)
  beta_hat = coef(lasso, s=lambda_glmnet)
  return(beta_hat[-1])   # drop the intercept entry
}


# Lasso solution via the penalized package (no intercept, no standardization).
solve_problem_penalized = function(X, y, lambda_penalized){
  lasso = penalized(penalized=X, 
                    response=y, 
                    lambda1=lambda_penalized, 
                    lambda2=0, 
                    unpenalized=~0,
                    standardize=FALSE, 
                    trace=FALSE)
  beta_hat = coef(lasso, "penalized")
  return(beta_hat)
}

# CDF of a N(mu, sigma^2) variable truncated to the two-sided region (-Inf, a] U [b, Inf),
# computed on the log scale for numerical stability.
tn.cdf = function(y, mu, a, b, sigma){
  d_right = pnorm(b, mu, sigma, lower.tail=FALSE, log.p=TRUE)   # log P(Z > b)
  d_left  = pnorm(a, mu, sigma, lower.tail=TRUE,  log.p=TRUE)   # log P(Z < a)
  d_max = max(d_right, d_left)
  d_log = d_max + log(exp(d_left - d_max) + exp(d_right - d_max))  # log P(Z < a or Z > b)
  if (y > a & y < b){
    # inside the excluded interval: the truncated CDF is flat at P(Z < a) / denominator
    n_log = d_left
    return (exp(n_log-d_log))
  }else{
    if (y > b){
      # numerator = P(Z < a) + P(b < Z < y), assembled on the log scale
      n_y_tilde = pnorm(y, mu, sigma, lower.tail=FALSE, log.p=TRUE)
      n_b_tilde = pnorm(b, mu, sigma, lower.tail=FALSE, log.p=TRUE)
      n_yb = n_b_tilde + log(1 - exp(n_y_tilde-n_b_tilde))
      n_a = d_left
      return(exp(n_yb-d_log) + exp(n_a-d_log))
    }else{
      # y <= a: numerator = P(Z < y)
      n_log = pnorm(y, mu, sigma, lower.tail=TRUE, log.p=TRUE)
      return (exp(n_log-d_log))
    }
  }
}
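
As an illustrative sanity check, not part of the paper's code, the following compares tn.cdf against a direct, non-log-scale computation of the CDF of a normal distribution truncated to \((-\infty, a] \cup [b, \infty)\); the two values should agree up to floating-point error. The function name direct.cdf and the test point are arbitrary.

direct.cdf = function(y, mu, a, b, sigma){
  # same truncated CDF as tn.cdf, computed on the probability scale
  num = pnorm(min(y, a), mu, sigma) + max(0, pnorm(y, mu, sigma) - pnorm(b, mu, sigma))
  den = pnorm(a, mu, sigma) + pnorm(b, mu, sigma, lower.tail = FALSE)
  num / den
}
c(tn.cdf(2.5, mu=0, a=-1, b=2, sigma=1), direct.cdf(2.5, mu=0, a=-1, b=2, sigma=1))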



# Selective inference for the lasso, one coefficient at a time: for each j the
# statistic eta_j' y (the least-squares estimate of beta_j) is compared with its
# distribution conditional on whether variable j is selected (truncation to the
# outside of (a_j, b_j)) or not selected (truncation to (a_j, b_j)). Returns
# two-sided p-values, (1-alpha) confidence limits, and the selection indicator.
lasso_inference = function(X, y, lambda, alpha, sigma){

  n = nrow(X)
  m = ncol(X)
  
  pval = vector(mode = "numeric", length = m)
  lower = upper = pval
  active = vector(mode = "logical", length = m)
  
  for (j in 1:m){
    
    # direction of interest: eta_j' y is the least-squares estimate of beta_j
    eta_j = X %*% solve(crossprod(X)) %*% (1:m == j)
    sqlen_eta_j = c(crossprod(eta_j))
    X_j = X[,j,drop=FALSE]
    X_jc = X[,-j,drop=FALSE]
    # part of y orthogonal to eta_j; the lasso on the remaining variables is fit to it
    nu_j = ( diag(rep(1,n)) - eta_j %*% t(eta_j) / sqlen_eta_j ) %*% y
    beta_jc = solve_problem_penalized(X[,-j,drop=FALSE], nu_j, lambda_penalized = lambda)
    r_j = X_jc %*% beta_jc - nu_j
    z_j = c( t(eta_j) %*% y )
    
    # truncation limits for eta_j' y
    a_j = c(sqlen_eta_j * ( crossprod( X_j, r_j ) - lambda ))
    b_j = c(sqlen_eta_j * ( crossprod( X_j, r_j ) + lambda ))
    
    isactive = abs( crossprod( X_j, r_j ) - crossprod( eta_j, y ) / sqlen_eta_j ) >= lambda
    
    if ( isactive ){
      # selected: normal truncated to (-Inf, a_j] U [b_j, Inf)
      pv = tn.cdf(y = z_j, mu=0, a=a_j, b=b_j, sigma=sigma*sqrt(sqlen_eta_j))
      up = uniroot(maxiter = 1000, function(x) tn.cdf(y = z_j, mu=x, a=a_j, b=b_j, sigma=sigma*sqrt(sqlen_eta_j)) - alpha/2, interval = c(a_j,b_j), extendInt = "yes" )$root
      lo = uniroot(maxiter = 1000, function(x) 1 - tn.cdf(y = z_j, mu=x, a=a_j, b=b_j, sigma=sigma*sqrt(sqlen_eta_j)) - alpha/2, interval = c(a_j,b_j), extendInt = "yes" )$root
    } else {
      # not selected: normal truncated to (a_j, b_j)
      pv = ptruncnorm(q = z_j, mean=0, a=a_j, b=b_j, sd=sigma*sqrt(sqlen_eta_j)) 
      up = uniroot(function(x) ptruncnorm(q = z_j, mean=x, a=a_j, b=b_j, sd=sigma*sqrt(sqlen_eta_j)) - alpha/2, interval = c(a_j,b_j), extendInt = "yes")$root
      lo = uniroot(function(x) 1 - ptruncnorm(q = z_j, mean=x, a=a_j, b=b_j, sd=sigma*sqrt(sqlen_eta_j)) - alpha/2, interval = c(a_j,b_j), extendInt = "yes" )$root
    }
    pval[j] = 2*min(pv,1-pv)   # two-sided p-value for beta_j = 0
    lower[j] = lo
    upper[j] = up
    active[j] = isactive
  }
  
  return(list(pval = pval, lower=lower, upper=upper, active=active))
}
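
A minimal usage sketch, not part of the paper's analysis, only to illustrate the structure of the list returned by lasso_inference; the design, true coefficients, lambda, and sigma below are arbitrary choices.

set.seed(1)
X_toy = normc(matrix(rnorm(50*3), 50, 3))
y_toy = as.vector(X_toy %*% c(1, 0, 0) + rnorm(50, sd = 0.5))
lasso_inference(X_toy, y_toy, lambda = 0.1, alpha = 0.1, sigma = 0.5)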


# Same as lasso_inference, but returns only p-values. For selected variables the
# truncated-normal CDF is evaluated at the true coefficient beta[j], so a p-value
# below alpha corresponds to a confidence interval that fails to cover the truth
# (used in the coverage simulation further down).
lasso_pvalue = function(X, y, lambda, alpha, sigma, beta){
  
  n = nrow(X)
  m = ncol(X)
  
  pval = vector(mode = "numeric", length = m)
  active = vector(mode = "logical", length = m)
  
  for (j in 1:m){
    
    eta_j = X %*% solve(crossprod(X)) %*% (1:m == j)
    sqlen_eta_j = c(crossprod(eta_j))
    X_j = X[,j,drop=FALSE]
    X_jc = X[,-j,drop=FALSE]
    nu_j = ( diag(rep(1,n)) - eta_j %*% t(eta_j) / sqlen_eta_j ) %*% y
    beta_jc = solve_problem_penalized(X=X[,-j,drop=FALSE], y=nu_j, lambda_penalized = lambda)
    r_j = X_jc %*% beta_jc - nu_j 
    z_j = c( t(eta_j) %*% y )
    
    a_j = c(sqlen_eta_j * ( crossprod( X_j, r_j ) - lambda ))
    b_j = c(sqlen_eta_j * ( crossprod( X_j, r_j ) + lambda ))
    
    isactive = abs( crossprod( X_j, r_j ) - crossprod( eta_j, y ) / sqlen_eta_j ) >= lambda
    
    if ( isactive ){
      pv = tn.cdf(y = z_j, mu=beta[j], a=a_j, b=b_j, sigma=sigma*sqrt(sqlen_eta_j))
    } else {
      pv = ptruncnorm(q = z_j, mean=0, a=a_j, b=b_j, sd=sigma*sqrt(sqlen_eta_j)) 
    }
    pval[j] = 2*min(pv,1-pv)
    active[j] = isactive
  }
  return(list(pval = pval, active=active))
}


# Returns, for each variable j, whether the lasso at penalty lambda selects it
# (the same selection criterion as in lasso_inference, without any inference).
lasso_active = function(X, y, lambda){
  
  n = nrow(X)
  m = ncol(X)
  active = vector(mode = "logical", length = m)
  
  for (j in 1:m){
    eta_j = X %*% solve(crossprod(X)) %*% (1:m == j)
    sqlen_eta_j = c(crossprod(eta_j))
    X_j = X[,j,drop=FALSE]
    X_jc = X[,-j,drop=FALSE]
    nu_j = ( diag(rep(1,n)) - eta_j %*% t(eta_j) / sqlen_eta_j ) %*% y
    beta_jc = solve_problem_penalized(X=X[,-j,drop=FALSE], y=nu_j, lambda_penalized = lambda)
    r_j = X_jc %*% beta_jc - nu_j 
    isactive = abs( crossprod( X_j, r_j ) - crossprod( eta_j, y ) / sqlen_eta_j ) >= lambda
    active[j] = isactive
  }
  return(active=active)
}

Import data

Stamey et al. (1989) studied the relation between prostate-specific antigen (PSA) and several clinical measures, including log cancer volume (lcavol), log prostate weight (lweight), age, log of benign prostatic hyperplasia amount (lbph), seminal vesicle invasion (svi), log of capsular penetration (lcp), the Gleason score (gleason), and the percentage of Gleason scores 4 or 5 (pgg45). The dataset consists of information collected from \(n=97\) men who were about to undergo a radical prostatectomy.

prostate <- read_delim("https://hastie.su.domains/ElemStatLearn/datasets/prostate.data", 
    delim = "\t", escape_double = FALSE, trim_ws = TRUE)[,-1]

X_raw = as.matrix(prostate[,1:8])
X = normc(X_raw,center=T)
y_raw = prostate[,9]
y = normc(y_raw,center=T)
sigma_hat = summary(lm(y~X))$sigma
n = nrow(X)
m = ncol(X)

Analysis

The estimate of \(\sigma^2\) from the regression model with all variables was treated as its true value. For this dataset, Liu et al. (2018) used \(\lambda=0.0327\) (chosen by 10-fold cross-validation), which results in the selection of 7 variables. The following table compares the selective conditional \(p\)-values for the hypotheses \(\beta_j=0\) (row lambdacv) with the unconditional \(p\)-values from the full least-squares fit (row lambda0).
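
For reference, here is a hedged sketch (not from the paper) of how a cross-validated penalty can be obtained with cv.glmnet. Note that glmnet divides the squared-error loss by \(n\), so its \(\lambda\) is on a different scale than the one used by lasso_inference, and the result also depends on the random fold split; this will therefore not reproduce \(\lambda=0.0327\) exactly.

set.seed(1)                 # fold assignment is random
cvfit = cv.glmnet(x=X, y=y, alpha=1, nfolds=10, standardize=FALSE, intercept=FALSE)
cvfit$lambda.min            # cross-validated lambda on the glmnet scale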

lambdacv=lasso_inference(X, y, lambda=0.03270833, alpha=0.1, sigma=sigma_hat)$pval
lambda0=2*pnorm(abs(summary(lm(y~X))$coef[-1,3]), lower.tail = FALSE)
tab=rbind(lambdacv,lambda0)
colnames(tab)=colnames(X)
round(tab,4)
#>          lcavol lweight    age   lbph    svi    lcp gleason  pgg45
#> lambdacv      0  0.0036 0.0915 0.1779 0.0031 0.4666  0.0073 0.7419
#> lambda0       0  0.0020 0.0552 0.0949 0.0016 0.2380  0.7513 0.3072

Figure 7

Selective conditional \(p\)-values as a function of \(\lambda\). Black \(p\)-value curves are conditional on selection by the lasso, grey ones are conditional on non-selection. Vertical dashed lines indicate the values of \(\lambda\) at which the active set changes.

alpha = 0.1
lambdas = 10^seq(0.01,-4,length=100)
knots = lars::lars(X,y)$lambda

PV = matrix(NA, nrow=length(lambdas), ncol=m)
LO = UP = AS = PV 

for (i in 1:length(lambdas)){
  
  lambda = lambdas[i]
  res = lasso_inference(X, y, lambda, alpha, sigma=sigma_hat)
  PV[i,] <- res$pval
  AS[i,] <- res$active
  LO[i,] <- res$lower
  UP[i,] <- res$upper
}

pvals = 2*pnorm(abs(summary(lm(y~X))$coef[-1,3]), lower.tail = FALSE)
PV_A = PV_NA = PV
PV_A[!AS] = NA
PV_NA[AS] = NA

#pdf("Figure_pvals.pdf")
matplot(xlim=c(-9.5,0.5), col=1,
  log(c(0,lambdas)), 
rbind(PV_A,pvals), 
type="l", lty=1, lwd=2, xlab=expression(log(lambda)), ylab="p-value")
for (i in 1:length(knots)) abline(v=log(knots[i]), lty=2, col="gray")
text(x=0.5, y=pvals, labels = colnames(X), cex=0.5, col="gray")
text(x=-9.5, y=pvals, labels = colnames(X), cex=0.5)
matlines(col="gray",
        log(c(0,lambdas)), 
        rbind(PV_NA,pvals),lty=1, lwd=2)

#dev.off()

Figure 5

90% confidence intervals for all eight variables of the famous Prostate data set as a function of \(\lambda\), with intervals for selected coefficients in black and for non-selected ones in grey.

ll = log(lambdas)
l = (ll - min(ll) )/(max(ll)-min(ll)) *0.9

j = 1
plot(j+l,(LO[,j]+UP[,j])/2, col="white", ylim=c(-1,1), xlim=c(1,m+1),ylab="90% confidence interval", xaxt="n", xlab="")
axis(1, at=(1:m)+0.5,labels=names(prostate)[-c(9:10)])
axis(3, at=(1:m)-.05, labels=c(c(-9,0),rep(NA,m-2)))
mtext(expression(log(lambda)),side=3,at=1.5-.05)
cols = c("gray","black")
for (j in 1:m){
  abline(v=j-.05, lty=3)  
  for (i in 1:length(l)) segments(x0=j+l[i],x1=j+l[i],y0=LO[i,j],y1=UP[i,j], col=cols[AS[i,j]+1])
}
abline(v=9-.05, lty=3)
abline(h=0, lty=3)

#dev.off()

Figure 6

Plot of lasso selection regions in the \(Y\) space: red, blue, cyan and green regions indicate \(S=\emptyset\), \(S=\{1\}\), \(S=\{2\}\) and \(S=\{1,2\}\), respectively. \(Y\sim N(\mu, I_2)\) and the dashed circle corresponds to the 95% quantile.

m = 2
rho <- .95                    # correlation between the two predictors
Sigma = diag(rep(1-rho,m)) + (rho) * matrix(1, nrow = m, ncol = m)   # equicorrelation matrix
X = t(eigen(Sigma)$vectors %*% diag(sqrt(eigen(Sigma)$values )) )    # 2 x 2 design with X'X = Sigma

beta_true = matrix(c(5,5),ncol=1) 
sigma_true = 1
n = nrow(X)
mylambda = .2

n.grid <- 100
grid_y1 <- seq(-2, 10, length = n.grid)
grid_y2 <- seq(-3, 3, length = n.grid)
grid_y <- data.matrix(expand.grid(grid_y1, grid_y2))

RES = matrix(NA, nrow=nrow(grid_y), ncol=2)

for (i in 1:nrow(grid_y)){
  y = grid_y[i,]
  RES[i,] = lasso_active(X, y, lambda=mylambda)
}

plot(grid_y,pch=".", asp=1)
abline(h=0,lty=1)
abline(v=0,lty=1)
points(grid_y[apply(RES,1,sum)==0,],pch=19,col=2, cex=0.25)
points(grid_y[apply(RES,1,sum)==2,],pch=19,col=3, cex=0.25)
points(grid_y[apply(RES,1,sum)==1 & RES[,1]==T,],pch=19,col=4, cex=0.25)
points(grid_y[apply(RES,1,sum)==1 & RES[,2]==T,],pch=19,col=5, cex=0.25)

mu = X %*% beta_true
points(mu[1],mu[2], pch=19)
draw.circle(x=mu[1], y=mu[2], radius=sqrt(qchisq(.95, df=2)), lty=2)

The following simulation demonstrates numerically the lack of FCR control at level \(\alpha\) and the lack of simultaneous coverage at confidence level \(1-\alpha/|S|\).

set.seed(123)
m = 2
rho <- 0.95 # correlation coefficient
Sigma = diag(rep(1-rho,m)) + (rho) * matrix(1, nrow = m, ncol = m)
X = t(eigen(Sigma)$vectors %*% diag(sqrt(eigen(Sigma)$values )) )
beta_true = rep(5,m)
sigma_true = 1
n = nrow(X)

B = 5000
mylambda = 0.2
myalpha = 0.1

AS = PV = matrix(NA, nrow=B, ncol=m)
set.seed(123)
for (b in 1:B){
  y = X %*% beta_true + rnorm(n,mean=0,sd=sigma_true)
  res = lasso_pvalue(X, y, lambda=mylambda, alpha=myalpha, sigma=sigma_true, beta = beta_true)
  PV[b,] = res$pval
  AS[b,] =  res$active
}

# simultaneous coverage over the selected variables (Bonferroni-adjusted p-values at the true coefficients)
simcov = 1-mean(sapply(1:nrow(PV), function(i) any(p.adjust(PV[i,AS[i,]],"bonf") <= myalpha)))
simcov
#> [1] 0.8802

# FCR: expected proportion of selected intervals that fail to cover the true coefficient
fcr = mean(sapply(1:nrow(PV), function(i) sum( PV[i,AS[i,]] <= myalpha )/pmax(1,sum(AS[i,])) ))
fcr
#> [1] 0.1313

Figure 4

Expected number of rejections for the four procedures defined in Section 7, based on \(n = 100\) hypotheses, \(\alpha = 0.05\), and \(10^4\) simulations.

B <- 10000                        # number of simulation runs
n <- 100                          # number of hypotheses
alpha = 0.05
mu = 3                            # effect size under the alternative
set.seed(13579)
nulls <- matrix(runif(B*n), B, n)                  # p-values of true null hypotheses
alts <- pnorm(matrix(rnorm(B*n, mean=-mu), B, n))  # p-values of false null hypotheses
crit <- 1 - (1-alpha)^(1/n:1)     # step-down critical values 1-(1-alpha)^(1/(n-j+1))
n1s <- 0:10                       # numbers of false hypotheses considered
res <- sapply(n1s, function(n1) {
  P <- cbind(nulls[,seq_len(n-n1)], alts[, seq_len(n1)])
  P <- do.call(rbind, apply(P, 1, sort, simplify = FALSE))   # sort p-values within each run
  gate <- P[,1]/P[,2] < alpha     # smallest over second-smallest p-value compared with alpha
  # Si and Si2 are computed here but not used in the figure below
  Si <- apply(P > matrix(crit, B, n, byrow=TRUE), 1, function(p) min (c(which(p), n+1))-1)
  BH <- apply(P, 1, function(p) sum(p.adjust(p, method = "BH") < alpha))
  Si2 <- apply((P[,-1]-P[,1])/(1-P[,1]) > matrix(crit[-1], B, n-1, byrow=TRUE), 1, function(p) min (c(which(p), n)))
  BH2 <- apply((P[,-1]-P[,1])/(1-P[,1]), 1, function(p) sum(p.adjust(p, method = "BH") < alpha*n/(n-1)))
  A <- gate                       # identical to the condition computed as `gate` above
  B <- ifelse(gate, BH2+1, 0)
  C <- P[,1] < crit[1]
  D <- BH
  c(A=mean(A), B=mean(B), C=mean(C), D = mean(D))  # average number of rejections per procedure
})

D <- as.data.frame(cbind(n1=n1s,t(res))) %>% pivot_longer(A:D, values_to = "rate", names_to = "method") %>%
  mutate(n1=as.factor(n1))
library(ggplot2)
labs <- c("A: cond. sel.",
          "B: cond. non-sel.",
          "C: uncond. sel.",
          "D: uncond. non-sel.")
ggplot(D) + aes(group=method, color=method, x=n1, y=rate, linetype=method,
                shape=method) + geom_point() + geom_line() + theme_minimal() + 
  coord_cartesian(ylim=c(0, 2.5)) + theme(legend.position ="bottom") +
  ylab("rejected hypotheses") + xlab("false hypotheses") +
  scale_color_discrete(labels = labs) +
  scale_linetype_discrete(labels = labs) +
  scale_shape_discrete(labels = labs)

#ggsave("inference_winner_BH.pdf",       width = 18, height = 14, units = "cm")