## The command
## source("http://parker.ad.siu.edu/Olive/hdpack.txt")
## is an easy way to get these functions into R.
#need to install the following R packages one time
#install.packages("glmnet")
#install.packages("pls")
# hdpack is a collection of R functions
#for Math 583.
AERplot<-function(yhat, y, res, d=1, alph = 0.1){
# Makes a response plot where cases within their PI are not plotted.
# Need alph around 0.3 if n is small. Use alph = 0.01 if n > 100000.
# If alph = 1, the usual response plot of yhat versus y is made.
# The value d is a crude estimate of the model complexity or degrees of freedom.
# Could use d = p if n>>p where n is the sample size and p = no. of predictors.
# For forward selection, lasso, PCR, and PLS, d=number of nonzero
# estimated beta coefficients for model Y = x^T beta + e.
n <- length(y)
val <- 8*n/9
if(alph == 1){
plot(yhat,y)
} else{
ymin <- min(y); ymax <- max(y)
yhmin <- min(yhat); yhmax<-max(yhat)
plot(c(yhmin,yhmax),c(ymin,ymax),type="n", ylab="Y",xlab="YHAT")
#get PI for the residuals
if(d < val) corfac <- (1 + 15/n) * sqrt( (n+2*d)/(n - d) ) else corfac <- 5*(1+15/n)
if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + d/n)}
if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)}
pn <- qn
if(pn < 1 - alph + 0.001) qn <- 1 - alph
alphan <- 1 - qn
sres <- sort(res)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
#Plot points corresponding to Ys not in their PIs
resup <- corfac*rup
reslow <- corfac*rlow
points(yhat[y>yhat+resup],y[y>yhat+resup])
points(yhat[y<yhat+reslow],y[y<yhat+reslow])
abline(0,1)
list(respi=c(reslow,resup))
}
}
AERplot2<-function(yhat, y, res, d=1, alph = 0.1){
# Makes the response plot of yhat versus y with pointwise large sample
# 100(1-alph)% prediction interval bands added.
# The value d is a crude estimate of the model complexity or degrees of freedom.
# Could use d = p if n>>p where n is the sample size and p = no. of predictors.
# For forward selection, lasso, PCR, and PLS, d=number of nonzero
# estimated beta coefficients for model Y = x^T beta + e.
n <- length(y)
val <- 8*n/9
plot(yhat,y)
#get PI for the residuals
if(d < val) corfac <- (1 + 15/n) * sqrt( (n+2*d)/(n - d) ) else corfac <- 5*(1+15/n)
if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + d/n)}
if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)}
pn <- qn
if(pn < 1 - alph + 0.001) qn <- 1 - alph
alphan <- 1 - qn
sres <- sort(res)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
#Plot pointwise PIs
resup <- corfac*rup
reslow <- corfac*rlow
abline(0,1)
abline(resup,1)
abline(reslow,1)
list(respi=c(reslow,resup))
}
aftolssim<-function(n = 100, p = 4, k=1, nruns = 100, psi = 0.0, a=1, gam= 5, clam=0.1, alpha = 0.05){
#Calls olsinf.
# Coverage of nonzero slopes goes to 0 as n increases unless gam >=5.
#Want 1 < k < p. Do not expect the intercept coverage to be good.
#Fits Weibull regression AFT with OLS to the uncensored cases
#assuming the cases are iid and the censoring is independent of the cases.
#Use 1 <= k < p so zeroes are in the model, k = the number of nonnoise variables
#there are p coefficients for beta in the Weibull regression data
#need p > 1, beta_A = -(1/gam, ..., 1/gam, 0, ..., 0)^T with p-k zeroes
# beta_P = (1,...,1,0,...,0)^T with k ones and p-k zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
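# Illustrative check (not part of the functions above or below; assumes only base R):
# with psi = 0.5 and p = 3 predictors built as x %*% A, the off-diagonal sample
# correlations should be near (2*0.5 + (3-2)*0.5^2)/(1 + (3-1)*0.5^2) = 1.25/1.5:
# A <- matrix(0.5, 3, 3); diag(A) <- 1
# round(cor(matrix(rnorm(3e5), ncol = 3) %*% A), 2)  #off-diagonals near 0.83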
# cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. SP~N(0,a^2), and a near 1 is ok. rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 beta <- 0 * 1:p beta[1:k] <- 1 #beta[1:0] acts like beta[1:1] = beta[1] ind <- c(1:k) #reduced model uses the first k nontrivial predictors betaaft <- beta betaaft[1:k] <- -1/gam #OLS beta from the AFT betaaft<-c(0,betaaft) cases <- 1:n q<-p+1 olscov <- 0 * 1:q olslen <- olscov redcov <- 0 nunc <- 0 *1:nruns for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A SP <- x%*%beta #SP_i ~ N(0,a^2) lambdai <- exp(SP) w <- rexp(n, rate = lambdai) y <- w^(1/gam) cen <- rexp(n, rate = clam) timed <- pmin(y, cen) statusd <- as.numeric(cen >= y) #make a Weibull PH data set uncen <- cases[statusd > 0] nunc[i] <- length(uncen) yaft <- log(timed[uncen]) xaft <- x[uncen,] out<-olsinf(xaft,yaft,indices=ind,alph=alpha) olslen <- olslen + (out$olscis[,2] - out$olscis[,1]) for(j in 1:q){ if(out$olscis[j,1] <= betaaft[j] && betaaft[j] <= out$olscis[j,2]) olscov[j] <- olscov[j] + 1 } if(out$pvalred > alpha) #then fail to reject Ho where Ho: reduced model is good redcov <- redcov+1 } olslen <- sqrt(n)*olslen/nruns olscov <- olscov/nruns redcov <- redcov/nruns coef <- out$coef nunc <- mean(nunc) list(olslen=olslen,olscov=olscov,redcov=redcov,betaaft=betaaft,k=k, coef=coef,nunc=nunc)} belimboot<-function(x,y,B = 1000){ #needs library(leaps), n > 5p, p > 2 #bootstrap min Cp model backward elimination regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) #get the full model fullct=0 full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel tem<-regsubsets(x,y,nvmax=vmax,method="backward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) bhatimin0 <- 0*1:p indx <- c(1,1+vin) bhatimin0[indx] <- sub$coef betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="backward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] vin <- vars[mincp[-1]] if(length(vin)==(p-1)) fullct=fullct+1 indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } fullprop<-fullct/B list(full=full,sub=sub,bhatimin0=bhatimin0,betas=betas,fullprop=fullprop) } belimsim<-function(n = 100, p = 5, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){ ##Gets proportion of times backwards elimination for MLR picks the full model. #needs library(leaps), n > 5p, p > 2 #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, k is the number of nonnoise variables #Need k < p-1 so there are some zeroes in the model. #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. 
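# Illustrative (hypothetical) call of belimboot, defined above; assumes library(leaps):
# x <- matrix(rnorm(100*4), ncol = 4); y <- 1 + x[,1] + rnorm(100)
# out <- belimboot(x, y, B = 200)
# out$fullprop  #proportion of bootstrap fits where backward elimination kept all predictors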
# cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) fullct=0 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set #get the minimum Cp submodel tem<-regsubsets(x,y,nvmax=vmax,method="backward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] #do not need the constant in vin vin <- vars[mincp[-1]] if(length(vin)==(p-1)) fullct=fullct+1 } fullct<-fullct/nruns list(fullprop=fullct)} betaci<-function(t, bhat, up){ #computes the prediction region method and Bickel and Ren CIs for beta_i. #t is the bootstrap sample T_1^*,...,T_B^* #for T_n = bhat = hat beta_i from forward selection cent<-mean(t) adist <- abs(t-cent) phlen <- quantile(adist, up) pLn <- cent - phlen pUn <- cent + phlen adist <- abs(t-bhat) brhlen <- quantile(adist, up) brLn <- bhat - brhlen brUn <- bhat + brhlen prcilen<-2*phlen brcilen<-2*brhlen list(prci=c(pLn, pUn),prcilen=prcilen,brci=c(brLn, brUn),brcilen=brcilen) } bicboot<-function(x,y,B = 1000){ #needs library(leaps), n > 5p, p > 2 #bootstrap min BIC model forward selection regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum bic submodel tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) minbic <- out$which[out$bic==min(out$bic)] #do not need the constant in vin vin <- vars[minbic[-1]] sub <- lsfit(x[,vin],y) bhatimin0 <- 0*1:p indx <- c(1,1+vin) bhatimin0[indx] <- sub$coef betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="forward") out<-summary(tem) minbic <- out$which[out$bic==min(out$bic)] vin <- vars[minbic[-1]] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,bhatimin0=bhatimin0,betas=betas) } bicbootsim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #needs library(leaps), calls bicboot, confreg, shorth3 from slpack #Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection using BIC. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. 
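# Illustrative (hypothetical) call of bicboot, defined above; assumes library(leaps):
# x <- matrix(rnorm(100*4), ncol = 4); y <- 1 + x[,1] + rnorm(100)
# out <- bicboot(x, y, B = 200)
# out$bhatimin0                                   #min BIC forward selection coefficients
# apply(out$betas, 2, quantile, c(0.025, 0.975))  #crude percentile CIs for each beta_i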
As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <- bicboot(x,y,B=BB) #bootstrap the forward sel minimum BIC model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- out$bhatimin0[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- out$bhatimin0[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} binregbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, m = 40, B=1000, int=0, a = 5/3, alpha = 0.05){ #calls confreg, shorth3 ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for binomial regression (full model). # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. 
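# Illustrative (hypothetical) simulation call for bicbootsim, defined above; assumes
# library(leaps) and that confreg and shorth3 have been sourced:
# bicbootsim(n = 100, p = 4, k = 1, nruns = 100, type = 1, psi = 0, BB = 200, alph = 0.05)
# #cicov holds the observed coverages (nominal 0.95), avelen the average CI/cutoff lengths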
#set.seed(974) ##need p>2 and want n >= 5p q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] mv <- 0*1:n + m beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rbinom(n,size=m,prob=(exp(SP)/(1+exp(SP)))) ny <- mv-y #ny[i] = mv[i]-y[i] = no. of ``failures" tdata <- as.data.frame(cbind(x,y)) #make a BR data set out <- glm(cbind(y,ny)~., family=binomial, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rbinom(n,size=m,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) betas[i,] <- temp$coef } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} brpisim<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, m = 40, B=1000, J=5, int=0, a = 5/3, alpha = 0.05){ #Needs library(leaps), library(MASS), library(mgcv), and library(glmnet). #Calls shpi and mshpi. #The GAM BR PI is only computed if p = 4. #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Simulates the Olive et al. (2018) PI for Binomial regression witn m trials. #PIs for full model, lasso, lasso variable selection, backward elimination # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. 
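# Illustrative (hypothetical) call of binregbootsim, defined above; assumes confreg and
# shorth3 have been sourced:
# binregbootsim(n = 100, p = 4, k = 1, nruns = 100, m = 40, B = 200, int = 0, a = 5/3)
# #coverages near 0.95 are wanted for the p shorth CIs and for the two tests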
# out <- glm((y/mv)~., family=binomial, data=tdata) #works with warnings # z <- y/mv; zdata <- as.data.frame(cbind(x,z)) #out <- glm(z~., family=binomial, data=zdata) #works with warnings #set.seed(974) ##need p>2 fullpilen <- 1:nruns fullpicov <- 0 gampilen <- 1:nruns gampicov <- 0 ohfspilen <- 1:nruns ohfspicov <- 0 laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 vspilen <- 1:nruns vspicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use for forward selection zz<-1:nc dd <- 1:nruns ddbe <- dd dRL <- dd #lasso variable selection and lasso have the same d mv <- 0*1:n + m for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A xf <- val* rnorm(q) %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rbinom(n,size=m,prob=(exp(SP)/(1+exp(SP)))) ny <- mv-y #ny[i] = mv[i]-y[i] = no. of ``failures" spf <- int + xf%*%b yf <- rbinom(1,size=m,prob = (exp(spf)/(1+exp(spf)))) tdata <- as.data.frame(cbind(x,y)) yn <- c(y,yf) xn <- rbind(x,xf) tdat <- as.data.frame(cbind(xn,yn)) names(tdat)<-names(tdata) #make a BR data set #get full model BR PI #note that glm has the target class first if(n >= 5*p){ out <- glm(cbind(y,ny)~., family=binomial, data=tdata) ESP <- predict(out,newdata = tdat[n+1,]) ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) #tem <- shpi(yf=yf,ydat=ydat,alph=alpha) tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) fullpilen[i] <- tem$up - tem$low fullpicov <- fullpicov + tem$inr } #get backward elimination PR PI if(n >= 5*p){ varnames <- names(out$coef)[-1] outbe <- step(out,trace=0) vinnames <- names(outbe$coef)[-1] vin <- varnames %in% vinnames ddbe[i]<-length(outbe$coef) pp<-ddbe[i] ESP <- outbe$coef[1] + xf[vin] %*% outbe$coef[-1] ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) #tem <- shpi(yf=yf,ydat=ydat,alph=alpha) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) vspilen[i] <- tem$up - tem$low vspicov <- vspicov + tem$inr } #get full model GAM BR PI if p = 4, GAM needs a formula if(n >= 5*p && p == 4){ x1 <- x[,1];x2 <- x[,2];x3 <- x[,3] z<-y/mv out <- gam(z ~ s(x1) + s(x2) + s(x3),family=binomial,weights=mv) ESP <- predict.gam(out,newdata=data.frame(x1=xf[1],x2=xf[2],x3=xf[3])) ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) # tem <- shpi(yf=yf,ydat=ydat,alph=alpha) tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) gampilen[i] <- tem$up - tem$low gampicov <- gampicov + tem$inr } #get lasso GLM PI ##note that lasso has the target class second out<-cv.glmnet(x,cbind(ny,y),family="binomial") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection GLM PI xsub <- x[,vin] sub <- glm(cbind(y,ny)~., family=binomial, data=data.frame(cbind(xsub,y))) dRL[i]<- pp #want these to be near but >= k+1 ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr #get Olive and 
Hawkins forward selection PI if(n >= 5*p){ temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] xsub <- x[,vin] sub <- glm(cbind(y,ny)~., family=binomial, data=data.frame(cbind(xsub,y))) dd[i]<-length(sub$coef)#want these to be near but >= k+1 pp<-dd[i] ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) ohfspilen[i] <- tem$up - tem$low ohfspicov <- ohfspicov + tem$inr } } fullpimnlen <- mean(fullpilen) fullpicov <- fullpicov/nruns gampimnlen <- mean(gampilen) gampicov <- gampicov/nruns laspimnlen <- mean(laspilen) laspicov <- laspicov/nruns RLpimnlen <- mean(RLpilen) RLpicov <- RLpicov/nruns ohfspimnlen <- mean(ohfspilen) ohfspicov <- ohfspicov/nruns vspimnlen <- mean(vspilen) vspicov <- vspicov/nruns mndd <- mean(dd) mndRL <- mean(dRL) mnddbe <- mean(ddbe) #lasso variable selection and lasso have the same d list(mndRL=mndRL,mndd=mndd,mnddbe=mnddbe,int=int,b=b,fullpicov=fullpicov, fullpimenlen=fullpimnlen,gampicov=gampicov,gampimenlen=gampimnlen, laspicov=laspicov, laspimenlen=laspimnlen, RLpicov=RLpicov, RLpimenlen=RLpimnlen,ohfspicov=ohfspicov,ohfspimnlen=ohfspimnlen, vspicov=vspicov,vspimnlen=vspimnlen)} brsplitsim<-function(n = 100, p = 4, k=1, n1=30, J=5, nruns = 100, psi = 0.0, int=1, a=4/3, m=4, B=1000, alpha = 0.05){ #needs library(glmnet), library(mgcv), n1>=30 gets rid of the warnings #J is an integer between 0 and 5, vs program used a = 5/3 #noundfit is the number of times in nruns that lasso did not underfit #nd is the size of the training data set selected by sequential data splitting #ad is the number of predictors, including a constant, selected by lasso for #the selected training data set: want ad not much larger than k+1 #k+1 is the number of beta_i not equal to 0 #n1 is the initial size of the training data set, want n1 <= n/2 #Simulates sequential data splitting for binomial regression using lasso #with 10-fold cross validation. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int - 3a > -4. 
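# Illustrative (hypothetical) call of brpisim, defined above; assumes library(leaps),
# library(MASS), library(mgcv), library(glmnet), and that shpi and mshpi are sourced:
# brpisim(n = 100, p = 4, k = 1, nruns = 100, m = 40, B = 1000, J = 5, int = 0, a = 5/3)
# #returns PI coverages and mean lengths for the full model, GAM, lasso, lasso variable
# #selection, forward selection, and backward elimination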
q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) mv <- 0*1:n + m one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 lsplitpilen <- 1:nruns lsplitpicov <- 0 splitpilen <- 1:nruns splitpicov <- 0 vars <- as.vector(1:(p-1)) noundfit <- 0 indx <- 1:n nds<-1:nruns ads<-nds #check for bad values of n1 fhalf <- floor((n-J)/2) if(n<40) n1 <- max(1,fhalf) else if(floor(n/(2*n1))>1000) n1 <- floor(n/2000) if(n1 > fhalf) n1 <- max(1,fhalf) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val * x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) xf <- val*rnorm(q) %*% A SPf <- int + xf%*%b y <- rbinom(n,size=m,prob=(exp(SP)/(1+exp(SP)))) ny <- mv-y #ny[i] = mv[i]-y[i] = no. of ``failures" yf <- rbinom(1,size=m,prob=(exp(SPf)/(1+exp(SPf)))) #make a BR data set #get lasso PI ##note that lasso has the target class second out<-cv.glmnet(x,cbind(ny,y),family="binomial") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection PI, problems if number of variables > n-1 # glm has target class first if(length(vin) < (n-5)){ xsub <- x[,vin] sub <- glm(cbind(y,ny)~., family=binomial, data=data.frame(cbind(xsub,y))) ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr } #use sequential data splitting perm <- sample(indx,n) H <- perm[1:n1] xH <- x[H,] yH <- y[H] nyH <- mv[H] - yH nd<-n1 out<-cv.glmnet(xH,cbind(nyH,yH),family="binomial") lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 while(nd < fhalf && nd < 10*pp){ if((nd+n1) <= fhalf){ perm<-perm[-(1:n1)] H <- c(H,perm[1:n1]) xH <- x[H,] yH <- y[H] nyH <- mv[H] - yH out<-cv.glmnet(xH,cbind(nyH,yH),family="binomial") lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 nd<-nd+n1 } else nd<-nd+n1 } if(nd > fhalf) nd <- nd-n1 if(nd < fhalf && nd < 10*pp){ md <- fhalf - nd nd <- nd + md perm<-perm[-(1:n1)] H <- c(H,perm[1:md]) xH <- x[H,] yH <- y[H] nyH <- mv[H] - yH out <-cv.glmnet(xH,cbind(nyH,yH),family="binomial") lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 } nds[i] <- nd ads[i] <- pp if(length(vin) >= k){ if(vin[k]==k) noundfit <- noundfit + 1} xV <- x[-H,] yV <- y[-H] #get lasso data splitting PI nyV <- mv[-H] - yV out<-cv.glmnet(xV,cbind(nyV,yV),family="binomial") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- 
mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) lsplitpilen[i] <- tem$up - tem$low lsplitpicov <- lsplitpicov + tem$inr #get lasso variable selection data splitting PI if(length(vin) < (length(yV)-3)){ xsub <- xV[,vin] sub <- glm(cbind(yV,nyV)~., family=binomial, data=data.frame(cbind(xsub,yV))) ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rbinom(B,size=m,prob=(exp(ESP)/(1+exp(ESP)))) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) splitpilen[i] <- tem$up - tem$low splitpicov <- splitpicov + tem$inr } } laspicov <- laspicov/nruns laspilen <- mean(laspilen) RLpicov <- RLpicov/nruns RLpilen <- mean(RLpilen) lsplitpicov <- lsplitpicov/nruns lsplitpilen <- mean(lsplitpilen) splitpicov <- splitpicov/nruns splitpilen <- mean(splitpilen) mnnd<-mean(nds) mnad<-mean(ads) list(mnnd=mnnd,mnad=mnad,laspicov=laspicov,laspilen=laspilen, LVSpicov=RLpicov,LVSpilen=RLpilen,lsplitpicov=lsplitpicov, lsplitpilen=lsplitpilen,splitpicov=splitpicov,splitpilen=splitpilen, beta=beta,k=k,noundfit=noundfit,n1=n1)} ##for mldsim6 cmve<- function(x, csteps = 5) {# gets the cmve, rcmve and mb estimators zx <- x x <- as.matrix(x) p <- dim(x)[2] ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) } if(p == 1) { mns <- mean(zx[md2 <= medd2]) } covs <- var(x[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) medd3 <- medd2 ## get the start if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) } if(p == 1) { mns <- mean(zx[md2 <= medd2]) } covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) } if(p == 1) { mns <- mean(zx[md2 <= medd2]) } covs <- var(x[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get CMVE attractor covf <- covm mnf <- mnm val <- mahalanobis(t(mnd), med, covv) if(val < medd3) { ##crit = [med(D)]^p * square root of det(cov) rd2 <- mahalanobis(x, mnd, covd) critd <- (sqrt(median(rd2)))^p*prod(diag(chol(covd))) rd2 <- mahalanobis(x, mnm, covm) critm <- (sqrt(median(rd2)))^p*prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get CMVE estimator chisqm <- qchisq(0.5, p) rd2 <- mahalanobis(x, mnf, covf) const <- median(rd2)/chisqm covf <- const * covf ##reweight the above CMVE estimator (mnf,covf) to get the ##RCMVE estimator (rmnf,rcovf) rd2 <- mahalanobis(x, mnf, covf) up <- qchisq(0.975, p) if(p > 1){ rmnf <- apply(x[rd2 <= up, ], 2, mean) } if(p == 1){ rmnf = mean(zx[rd2 <= up]) } rcovf <- var(x[rd2 <= up, ]) rd2 <- mahalanobis(x, rmnf, rcovf) const <- median(rd2)/chisqm rcovf <- const * rcovf ## reweight again rd2 <- mahalanobis(x, rmnf, rcovf) if(p > 1){ rmnf <- apply(x[rd2 <= up, ], 2, mean) } if(p == 1){ rmnf = mean(zx[rd2 <= up]) } rcovf <- var(x[rd2 <= up, ]) rd2 <- mahalanobis(x, rmnf, rcovf) const <- median(rd2)/chisqm rcovf <- const * rcovf list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm = mnm, covm = covm) } confreg<-function(x, g = 4, that = 1:4, alpha = 0.05){ # Makes confidence regions for theta from rows of x = Ti* from a bootstrap. # Use fot testing H_0: theta = 0 versus H_1: theta != 0. # The prediction region method, hybrid, and Bickel and Ren regions are used. # If g = 1, the shorth interval should work better. # Also computes the distance for the 0 vector. 
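# Illustrative (hypothetical) call of cmve, defined above, for robust multivariate
# location and dispersion:
# x <- matrix(rnorm(200*3), ncol = 3)
# out <- cmve(x)
# out$center; out$cov   #CMVE estimator
# out$rmnf; out$rcovf   #reweighted RCMVE estimator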
# Need g = dim(x)[2] and T = that, the g by 1 estimator of theta.
# Often that = A betahat(I_min,0).
# Note that center= Tbar* = bagging estimator and cov = S*_T.
x <- as.matrix(x)
that <- as.vector(that)
n <- dim(x)[1]
zero <- 0*(1:g)
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*g/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + g/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical distance
MD <- sqrt(md2)
#get prediction region method cutoff
cuplim <- sqrt(quantile(md2, up))
D0 <- sqrt(mahalanobis(zero, center, cov))
#get hybrid region statistic = Bickel and Ren statistic
br0 <- sqrt(mahalanobis(zero, that, cov))
#get the Bickel and Ren cutoff and test statistic
br2 <- mahalanobis(x,that,cov)
brlim <- sqrt(quantile(br2, up))
list(cuplim=cuplim,brlim=brlim,br0=br0,D0=D0,MD=MD,center=center,cov=cov)
}
corrlar<-function(R, c= 50, tau = 0.05){
#Returns a regularized correlation matrix Rd = R(delta,0) and Rt =
#R(delta,tau) given a positive semidefinite correlation matrix R where
#delta depends on the condition number c >= 1.
#C = var(x) and C = covmb2$cov are common.
p<-dim(R)[2]
out<-eigen(R,only.values=TRUE)
lam1 <- out$values[1]
lamp <- out$values[p]
if(c>1){
delta <- (lam1 - c*lamp)/(c-1)
delta <- max(0,delta)
I <- diag(p)
Rd <- (R + delta*I)/(1+delta)
Rt <- Rd
Rt[abs(Rt) < tau] <- 0
}
list(Rd=Rd,Rt=Rt,delta=delta)
}
covfch<-function(x, csteps = 5)
{# gets the FCH, RFCH and MB estimators
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion
lcut <- medd2
## get the start
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get FCH estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) {
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf = mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf = mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm=mnm, covm=covm)
}
covmb2<-function(x, m=0, k=5, msteps=9){
# Computes the covmb2 estimator with concentration type steps. Needs p > 1.
# Best if p > n. Look at out<-medout(x) to determine how many cases
# m are clean. Use m >= n/2.
# Estimate m if m = 0: use k >= 0, so at least half of the cases are used,
# and do the concentration type msteps to get a weighted median.
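# Illustrative (hypothetical) use of confreg, defined above, on bootstrapped estimates:
# betas <- matrix(rnorm(1000*2), ncol = 2) + 0.1        #fake T_i^* centered near (0.1, 0.1)
# tem <- confreg(betas, g = 2, that = c(0.1, 0.1), alpha = 0.05)
# tem$D0 <= tem$cuplim  #prediction region method: TRUE means fail to reject H0: theta = 0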
# The concentration type steps help the most when the outlier proportion # is high. Try covbm2(x,msteps=0) and covmb2(x,msteps=9). # Using msteps > 0 does slow down the function some. p <- dim(x)[2] #Median Ball start covv <- diag(p) med <- apply(x, 2, median) #Get squared Euclidean distances from coordinatewise median. md2 <- mahalanobis(x, center = med, covv) if(m == 0){ if(msteps > 0){#do concentration type steps for(i in 1:msteps){ medd <- median(md2) medw <- apply(x[md2<=medd,], 2, median) md2 <- mahalanobis(x, center = medw, covv) } } md <- sqrt(md2) mcut <- median(md) + k*mad(md,constant=1) mns <- apply(x[md <= mcut, ], 2, mean) covs <- var(x[md <= mcut, ]) } else{ #Use m cases with the smallest distances. mcut <- sort(md2)[m] mns <- apply(x[md2 <= mcut, ], 2, mean) covs <- var(x[md2 <= mcut, ]) } list(center=mns,cov=covs) } ##for mldsim6 covrmvn<-function(x, csteps = 5, locc = 0.5) {# Needs number of predictors p > 1. # This robust MLD estimator is tailored to estimate the covariance matrix # of the bulk of the data when the bulk of the data is MVN and the outliers # are "not too bad." The FCH and MB estimators are also produced. x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] up <- qchisq(0.975, p) qchi <- qchisq(0.5, p) ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(x, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) to get the cov estimator ## (rmnmvn,rcovmvn) tailored for MVN data rd2 <- mahalanobis(x, mnf, covf) rmnmvn <- apply(x[rd2 <= up, ], 2, mean) rcovmvn <- var(x[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(x, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, p) rcovmvn <- const * rcovmvn ## reweight again rd2 <- mahalanobis(x, rmnmvn, rcovmvn) rmnmvn <- apply(x[rd2 <= up, ], 2, mean) rcovmvn <- var(x[rd2 <= up, ]) d2 <- sum(rd2 <= up) rd2 <- mahalanobis(x, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d2 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, p) rcovmvn <- const * rcovmvn list(center = rmnmvn, cov = rcovmvn, mnf = mnf, covf = covf, mnm = mnm, covm = covm) } covxycis<-function(x,y,alph=0.05,cfac="T"){ ##gets CIs for cov (xj,Y), cases are entered as rows, #if cfac = "T", a correction factor for small n is used, best if n >= 50 x <- as.matrix(x) y <- as.vector(y) n <- length(y) p <- dim(x)[2] z <- x*(y-mean(y)) sds <- apply(z,2,sd) cut <- qt((1-alph/2),(n-1)) # up = 1 - alph/2 if(cfac=="T"){ ao2 <- alph/2 up <- min((1 - ao2/2), (1 - ao2 + 12.5*alph/n)) 
if(ao2 > 0.1) up <- min((1 - ao2 + 0.05), (1 - ao2 + 2.5/n)) qn <- up if(qn < 1 - ao2 + 0.001) up <- 1 - ao2 cut <- qt(up,(n-1)) } # Covxy <- cov(x,y) cis <- matrix(0,nrow=p,ncol=2) val <- cut*sds/sqrt(n) cis[,1] <- Covxy - val cis[,2] <- Covxy + val list(Covxy=Covxy,cis=cis,up=up) } ddplot4<-function(x, alpha = 0.1){ # Makes a DD plot with covrmvn used for the RDi. # Need p > 1. # Semiparametric prediction regions are added. # Click left mouse button to identify points. # Click right mouse button to end the function. # Unix systems turn on graphics device eg enter # command "X11()" or "motif()" before using. p <- dim(x)[2] n <- dim(x)[1] up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) out <- covrmvn(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) # MD is the classical and RD the robust distance MD <- sqrt(md2) RD <- sqrt(rd2) plot(MD, RD) abline(0, 1) #get nonparametric prediction region boundary cuplim <- sqrt(quantile(md2, up)) a <- min(RD) b <- max(RD) lines(c(cuplim, cuplim), c(b, a)) #get semiparametric prediction region boundary ruplim <- sqrt(quantile(rd2, up)) a <- min(MD) b <- max(MD) lines(c(a, b), c(ruplim, ruplim)) #get parametric MVN prediction region boundary mvnlim <- sqrt(qchisq(up, p)) b <- min(b, mvnlim) lines(c(a, b), c(mvnlim, mvnlim)) identify(MD, RD) list(cuplim = cuplim, ruplim = ruplim, mvnlim = mvnlim) } ddplot5<-function(x, mm=0, kk= 5,steps = 9) {# Plots Euclidean distances from the coordinatewise median #vs. those of covmb2 location estimator with 9 concentration type steps. #Good plot for outlier detection even if p > n. #Needs p > 1. x<-as.matrix(x) p <- dim(x)[2] covv <- diag(p) med <- apply(x, 2, median) RDMED <- sqrt(mahalanobis(x, center = med, covv)) RDCOVMB2 <- sqrt(mahalanobis(x,center=covmb2(x,m=mm,k=kk,msteps=steps)$center,covv)) plot(RDMED,RDCOVMB2) #list(RDMED=RDMED,RDCOVMB2=RDCOVMB2) } dpi<-function(yf, yfhat, d, resid, alph = 0.05){ #Gets the Pelawa Watagoda and Olive (2017) PI for Yf. #given yfhat, the `"plug in degrees of freedom" d, and the residuals. # Can work if p > n. Yf is in the PI if inr = 1 #Useful for lasso, PCR, PLS, forward selection, ridge regression. resid<-as.vector(resid) n<-length(resid) val <- 8*n/9 inr <- 0 if(d < val) corfac <- (1 + 15/n) * sqrt( (n+2*d)/(n - d) ) else corfac <- 5*(1+15/n) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + d/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn sres <- sort(resid) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow if(low <= yf && up >= yf) inr <- inr + 1 list(low=low, up=up, inr = inr)} dpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, J = 20, alpha = 0.05){ #Needs library(pls) and library(glmnet). Uses Pelawa Watagoda and Olive (2017) PI. #The function dpisim2 adds the PI for forward selection. #Simulates PIs for PLS, PCR and lasso with d = min( ceil(n/J),p) components including #a constant. For lasso take the smallest lambda with df = d-1. 
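# Illustrative (hypothetical) use of dpi, defined above, with OLS and d = p:
# x <- matrix(rnorm(100*3), ncol = 3); y <- 1 + x %*% c(1, 1, 0) + rnorm(100)
# out <- lsfit(x, y); xf <- rnorm(3); yf <- 1 + sum(xf * c(1, 1, 0)) + rnorm(1)
# yfhat <- out$coef[1] + sum(xf * out$coef[-1])
# dpi(yf = yf, yfhat = yfhat, d = 4, resid = out$resid, alph = 0.05)  #gives low, up, inr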
#Also for lasso variable selection = OLS on lasso active predictors. # Uses PI with crude df estimate d = number of components + 1, p > n possible. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. #set.seed(974) ##need p>2 ##dfmax = nc works better than pcmax = nc val <- 8*n/9 pcrpilen <- 1:nruns pcrpicov <- 0 plspilen <- pcrpilen plspicov <- 0 lassopilen <- pcrpilen lassopicov <- 0 relpilen <- pcrpilen relpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) pp <- nc+1 vars <- 1:q for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PCR estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-pcr(V1~.,data=z,scale=T,ncomp=nc,validation="none") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. 
#bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] res <- out$residuals[,,nc] #get PCR PI tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) pcrpilen[i] <- tem$up - tem$low pcrpicov <- pcrpicov + tem$inr #get PLS PI out<-plsr(V1~.,data=z,scale=T,ncomp=nc,validation="none") yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] res <- out$residuals[,,nc] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) plspilen[i] <- tem$up - tem$low plspicov <- plspicov + tem$inr #get lasso PI out <- glmnet(x,y,dfmax=nc) lam<-out$lambda[length(out$lambda)] fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) lassopilen[i] <- tem$up - tem$low lassopicov <- lassopicov + tem$inr #get lasso variable selection PI lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) relpilen[i] <- tem$up - tem$low relpicov <- relpicov + tem$inr } pcrpimnlen <- mean(pcrpilen) pcrpicov <- pcrpicov/nruns plspimnlen <- mean(plspilen) plspicov <- plspicov/nruns lassopimnlen <- mean(lassopilen) lassopicov <- lassopicov/nruns relpimnlen <- mean(relpilen) relpicov <- relpicov/nruns list(pcrpicov=pcrpicov, pcrpimenlen=pcrpimnlen, plspicov=plspicov, plspimenlen=plspimnlen, lassopicov=lassopicov, lassopimenlen=lassopimnlen, relpicov=relpicov, relpimenlen=relpimnlen)} dpisim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, J = 20, alpha = 0.05){ #Needs library(leaps), library(pls) and library(glmnet). #Uses Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for PLS, PCR, and lasso with d approx min(n/J,p) components including #a constant. For lasso take the smallest lambda with df = d-1. #Also uses lasso variable selection and forward selection. SLOW if p is LARGE. #Forward selection works with warnings even if p > n provided n/J + 1 is less than n. # Uses PI with crude df estimate d = number of components + 1, p > n possible. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
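# Illustrative (hypothetical) call of dpisim, defined above; assumes library(pls),
# library(glmnet), and the dpi function; p > n is allowed since d is about min(n/J, p):
# dpisim(n = 100, p = 200, k = 1, nruns = 100, psi = 0, type = 1, J = 20, alpha = 0.05)
# #returns PI coverages and mean lengths for PCR, PLS, lasso, and lasso variable selection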
#set.seed(974) ##need p>2 ##dfmax = nc works better than pcmax = nc val <- 8*n/9 pcrpilen <- 1:nruns pcrpicov <- 0 plspilen <- pcrpilen plspicov <- 0 lassopilen <- pcrpilen lassopicov <- 0 relpilen <- pcrpilen relpicov <- 0 fselpilen <- pcrpilen fselpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) pp <- nc+1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PCR estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-pcr(V1~.,data=z,scale=T,ncomp=nc,validation="none") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. #bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] res <- out$residuals[,,nc] #get PCR PI tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) pcrpilen[i] <- tem$up - tem$low pcrpicov <- pcrpicov + tem$inr #get PLS PI out<-plsr(V1~.,data=z,scale=T,ncomp=nc,validation="none") yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] res <- out$residuals[,,nc] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) plspilen[i] <- tem$up - tem$low plspicov <- plspicov + tem$inr #get lasso PI out <- glmnet(x,y,dfmax=nc) lam<-out$lambda[length(out$lambda)] fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) lassopilen[i] <- tem$up - tem$low lassopicov <- lassopicov + tem$inr #get lasso variable selection PI lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) relpilen[i] <- tem$up - tem$low relpicov <- relpicov + tem$inr #get forward selection PI temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) num <- length(out$cp) mod <- out$which[num,] #do not need the constant in vin vin <- vars[mod[-1]] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } pcrpimnlen <- mean(pcrpilen) pcrpicov <- pcrpicov/nruns plspimnlen <- mean(plspilen) plspicov <- plspicov/nruns lassopimnlen <- mean(lassopilen) lassopicov <- lassopicov/nruns relpimnlen <- mean(relpilen) relpicov <- relpicov/nruns fselpimnlen <- mean(fselpilen) fselpicov <- fselpicov/nruns list(pcrpicov=pcrpicov, pcrpimenlen=pcrpimnlen, plspicov=plspicov, plspimenlen=plspimnlen, lassopicov=lassopicov, lassopimenlen=lassopimnlen, relpicov=relpicov, relpimenlen=relpimnlen, fselpicov=fselpicov, fselpimenlen=fselpimnlen)} dpisim3<-function(n = 100, p = 4, 
k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, J = 20, alpha = 0.05){ #Needs library(pls). #Simulates PIs for PLS and PCR with d = min(ceil(n/J),p) components #including a constant. # Uses PI with crude df estimate d = number of components + 1, p > n possible. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. #set.seed(974) ##need p>2 ##dfmax = nc works better than pcmax = nc val <- 8*n/9 pcrpilen <- 1:nruns pcrpicov <- 0 plspilen <- pcrpilen plspicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) pp <- nc+1 vars <- 1:q for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PCR estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-pcr(V1~.,data=z,scale=T,ncomp=nc,validation="none") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. #bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] res <- out$residuals[,,nc] #get PCR PI tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) pcrpilen[i] <- tem$up - tem$low pcrpicov <- pcrpicov + tem$inr #get PLS PI out<-plsr(V1~.,data=z,scale=T,ncomp=nc,validation="none") yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] res <- out$residuals[,,nc] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) plspilen[i] <- tem$up - tem$low plspicov <- plspicov + tem$inr } pcrpimnlen <- mean(pcrpilen) pcrpicov <- pcrpicov/nruns plspimnlen <- mean(plspilen) plspicov <- plspicov/nruns list(pcrpicov=pcrpicov, pcrpimenlen=pcrpimnlen, plspicov=plspicov, plspimenlen=plspimnlen)} drelpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, J = 20, alpha = 0.05){ #Needs library(glmnet). Uses the Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for lasso and lasso variable selection with d approx min (n/J,p) #components including a constant. For lasso take the smallest lambda with df #= d-1. #SLOW if p is LARGE. Like dpisim but does not do PLS, PCR, or forward #selection. 
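# Illustrative (hypothetical) call of dpisim3, defined above; assumes library(pls) and
# the dpi function:
# dpisim3(n = 100, p = 200, k = 1, nruns = 100, psi = 0, type = 1, J = 20, alpha = 0.05)
# #returns PI coverages and mean lengths for PCR and PLS only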
# Uses PI with crude df estimate d = number of components + 1, p > n possible. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. #set.seed(974) ##need p>2 ##dfmax = nc works better than pcmax = nc val <- 8*n/9 lassopilen <- 1:nruns lassopicov <- 0 relpilen <- lassopilen relpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) pp <- nc+1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get lasso PI out <- glmnet(x,y,dfmax=nc) lam<-out$lambda[length(out$lambda)] fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) lassopilen[i] <- tem$up - tem$low lassopicov <- lassopicov + tem$inr #get lasso variable selection PI lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) relpilen[i] <- tem$up - tem$low relpicov <- relpicov + tem$inr # print(i) } lassopimnlen <- mean(lassopilen) lassopicov <- lassopicov/nruns relpimnlen <- mean(relpilen) relpicov <- relpicov/nruns list(lassopicov=lassopicov, lassopimenlen=lassopimnlen, relpicov=relpicov, relpimenlen=relpimnlen)} dvspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, J = 20, alpha = 0.05){ #Needs library(leaps). Uses the Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for forward selection variable selection. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. 
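# Illustrative (hypothetical) call of drelpisim, defined above; assumes library(glmnet)
# and the dpi function:
# drelpisim(n = 100, p = 200, k = 1, nruns = 100, psi = 0, type = 1, J = 20, alpha = 0.05)
# #returns PI coverages and mean lengths for lasso and lasso variable selection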
As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 val <- 8*n/9 fselpilen <- 1:nruns fselpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) pp <- nc+1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get forward selection PI temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) num <- length(out$cp) mod <- out$which[num,] #do not need the constant in vin vin <- vars[mod[-1]] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } fselpimnlen <- mean(fselpilen) fselpicov <- fselpicov/nruns list(fselpicov=fselpicov, fselpimenlen=fselpimnlen)} enet<-function(x,y,am=10){ #Gets elastic net output using 10-fold CV using a grid of alpha values #0, 1/am, 2/am, ..., am/am. #Needs library(glmnet). #Use am = 10, 20, 50, or 100. Larger am values take longer. x<-as.matrix(x) out<-cv.glmnet(x,y,alpha=0) lam <- out$lambda.min #value of lambda that minimizes 10 fold CV cvmin <- min(out$cvm) alph = 0 for(i in 1:am){ tem <- cv.glmnet(x,y,alpha=i/am) mcv <- min(tem$cvm) if(mcv < cvmin){ out <- tem lam <- out$lambda.min cvmin <- mcv alph <- i/am} } yhat <- predict(out,s=lam,newx=x) res <- y - yhat plot(yhat,y) abline(0,1) list(out=out, yhat=yhat, res=res, lam = lam, alph = alph) } enet2<-function(x,y,am=10){ #Gets elastic net output using 10-fold CV using a grid of alpha values #0, 1/am, 2/am, ..., am/am. Does not make the response plot made by enet. #Needs library(glmnet). #Use am = 10, 20, 50, or 100. Larger am values take longer. x<-as.matrix(x) out<-cv.glmnet(x,y,alpha=0) lam <- out$lambda.min #value of lambda that minimizes 10 fold CV cvmin <- min(out$cvm) alph = 0 for(i in 1:am){ tem <- cv.glmnet(x,y,alpha=i/am) mcv <- min(tem$cvm) if(mcv < cvmin){ out <- tem lam <- out$lambda.min cvmin <- mcv alph <- i/am} } yhat <- predict(out,s=lam,newx=x) res <- y - yhat list(out=out, yhat=yhat, res=res, lam = lam, alph = alph) } evspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, J = 5, alpha = 0.05){ #Needs library(leaps). Uses the Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for forward selection variable selection using EBIC. #EBIC might need normality to work well. # ebic - 2 p log(2) = out$bic+2*log(dbinom(x=xx,size=p,prob=0.5)) #ebic <- out$bic+2*log(2^p*dbinom(x=xx,size=p,prob=0.5)) #ebic <- out$bic+2*(lgamma(p+1)-lgamma(xx+1)-lgamma(p-xx+1)) #Formula uses EBIC(I) - 2 lgamma(p+1). 
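#Illustrative use of enet with simulated data (needs library(glmnet); values are arbitrary):
#x <- matrix(rnorm(100*4),ncol=4); y <- 1 + x[,1] + rnorm(100)
#out <- enet(x,y,am=10)
#out$alph  #alpha on the grid 0, 1/am, ..., 1 minimizing the 10-fold CV error
#out$lam   #the corresponding lambda.min
#enet2 returns the same output without the response plot.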
# 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 val <- 8*n/9 fselpilen <- 1:nruns fselpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use zz<-1:nc dd <- 1:nruns for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get forward selection PI temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) xx <- 1:min(length(out$bic),p-1)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) #print(ebic) dd[i] <- zz[ebic==min(ebic)] #want these to be near but >= k minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid pp <- length(vin)+1 tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } fselpimnlen <- mean(fselpilen) fselpicov <- fselpicov/nruns list(dd=dd,fselpicov=fselpicov, fselpimenlen=fselpimnlen)} FDAboot<-function(x, group, B = 1000){ #Bootstraps FDA betahat = first eigenvector. #Nominal 95% CIs #group labels are 1,2,...,g #needs library(MASS) g <- max(group) x <- as.matrix(x) p <- dim(x)[2] betas <- matrix(0,nrow=B,ncol=p) nis <- 1:g tindx <- 1:dim(x)[1] out<-lda(x,group) for(i in 1:g){ nis[i] <- sum(group==i) } for(i in 1:B){ indx <- 1:2 for(j in 1:g){ tind <- tindx[group==j] tdx<- sample(tind,nis[j],replace=T) indx <- c(indx,tdx) } indx <- indx[-c(1,2)] betas[i,]<-lda(x[indx,],group[indx])$scaling[,1] } shorci <- apply(betas,2,shorth2) list(betas=betas,shorci=shorci,out=out) } frey <- function(alph = 0.05){ #compare Frey correction with Olive correction nn <- seq(20,500,by = 1) m <- length(nn) fr <- 1:m ol <- fr for(i in 1:m){ fr[i] <- min(1,1 - alph + 1.12*sqrt(alph/nn[i])) ol[i] <- 1 - alph + 10*alph/nn[i] } plot(ol,fr) abline(0,1) } fsel<-function(x,y,J=5){ #forward selection: min Cp if n >= 10 p, EBIC otherwise. #Use J >= 5. 
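#Illustrative use of FDAboot with two simulated groups (needs library(MASS) and
#hdpack's shorth2; data and values below are arbitrary):
#x <- rbind(matrix(rnorm(60*3),ncol=3), matrix(rnorm(60*3,mean=1),ncol=3))
#group <- rep(1:2, each=60)
#out <- FDAboot(x, group, B=200)
#out$shorci  #shorth CIs for the entries of the first discriminant direction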
#library(leaps) x<-as.matrix(x) n <- length(y) q <- dim(x)[2] #q = p-1 p <- q+1 nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use for forward selection temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) if(n < 10*p) { xx <- 1:min(length(out$bic),q)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) #print(ebic) minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]]} #if n >= 10p use min Cp model else { mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]]} sub <- lsfit(x[,vin],y) res <- sub$resid yhat <- y - res list(sub=sub,vinI=vin,resI=res,yhatI=yhat) } fselboot<-function(x,y,B = 1000){ #needs library(leaps), n > 5p, p > 2 #bootstrap min Cp model forward selection regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) bhatimin0 <- 0*1:p indx <- c(1,1+vin) bhatimin0[indx] <- sub$coef betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] vin <- vars[mincp[-1]] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,bhatimin0=bhatimin0,betas=betas) } fselboot2<-function(x,y,B=1000,c=0.01,aug=F){ #needs library(leaps), n > 5p, p > 2 #bootstrap min Cp model forward selection regression #using bhatVS and bhatMIX #If aug not=F, adds cB full model bootstrap samples. #Then S_T^* will be better conditioned. #Does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) bhatimin0 <- 0*1:p indx <- c(1,1+vin) bhatimin0[indx] <- sub$coef betas <- matrix(0,nrow=bpd,ncol=p) btmix <- betas #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] vin <- vars[mincp[-1]] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef yb <- fit + sample(res,n,replace=T) btmix[i,indx] <- lsfit(x[,vin],yb)$coef } if(aug == F) {betas <- betas[1:B,]; btmix <- btmix[1:B,]} else { for(i in bp1:bpd){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } btmix[bp1:bpd,] <- betas[bp1:bpd,] } list(full=full,sub=sub,bhatimin0=bhatimin0,betas=betas,btmix=btmix) } fsprbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, B=1000, int=1, a = 1, alpha = 0.05){ ##Needs library(MASS). #very slow, use B = 50p ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for Poisson regression (forward selection). 
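#Illustrative use of fselboot, the residual bootstrap of the min Cp forward selection
#model (needs library(leaps), n > 5p; data below are simulated, values arbitrary):
#x <- matrix(rnorm(200*4),ncol=4); y <- 1 + x[,1] + rnorm(200)
#out <- fselboot(x, y, B=200)
#apply(out$betas, 2, quantile, probs=c(0.025,0.975))  #rough percentile intervals for beta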
# constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int - 3a > -4. q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p dd <- 1:nruns ddboot <- 1:B for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) tdata <- as.data.frame(cbind(x,y)) #make a PR data set out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) out1 <- glm(y~1, family=poisson, data=tdata) outfs <- stepAIC(out1,trace=0,direction="forward",scope=list(upper=out,lower=out1)) dd[i]<-length(outfs$coef) vinnames <- names(outfs$coef) vin <- varnames %in% vinnames bhat <- zero bhat[vin] <- outfs$coef #bhatImin betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) tem1<-glm(ydat~1, family=poisson, data=tdat) outfs <- stepAIC(tem1,trace=0,direction="forward",scope=list(upper=temp,lower=tem1)) ddboot[i]<-length(outfs$coef) vinnames <- names(outfs$coef) vin <- varnames %in% vinnames bhatimin <- zero bhatimin[vin] <- outfs$coef betas[i,] <- bhatimin } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. 
method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns mndd <- mean(dd) mnddboot <- mean(ddboot) #for the last bootstrap sample list(mndd=mndd,mnddboot=mnddboot,cicov=cicov,avelen=avelen,beta=beta,k=k)} gcor<-function(C){ #Returns a generalized correlation matrix R given #a positive semidefinite dispersion matrix C with positive diagonals. #Also returns D where C = D R D. #C = var(x) and C = covmb2$cov are common. d <- sqrt(diag(C)) di<- 1/d D <- diag(d) Di <- diag(di) #Dinverse R <- Di%*%C%*%Di list(D=D,R=R) } getB<-function(x, m=0, k= 5, msteps = 0){ # Gets the covmb2 subset B and the index of cases indx. # Best if p > n. # You can estimate number of clean cases m > n/2 with plot: out<-medout(x) x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] index <- 1:n covv <- diag(p) med <- apply(x, 2, median) #Get squared Euclidean distances from coordinatewise median. md2 <- mahalanobis(x, center = med, covv) if(m==0){ if(msteps > 0){#do concentration type steps for(i in 1:msteps){ medd <- median(md2) medw <- apply(x[md2<=medd,], 2, median) md2 <- mahalanobis(x, center = medw, covv) } } md <- sqrt(md2) mcut <- median(md) + k*mad(md,constant=1) } else{ #Use m cases with the smallest distances. md <- sqrt(md2) mcut <- sort(md)[m] } B <- x[md <= mcut,] indx <- index[md <= mcut] list(B = B, indx=indx) } getBbig<-function(x, group){ #Gets Bbig for discriminant analysis, binary regression, or one way MANOVA. #Let x contains data, group is vector with group[i] = j #if ith row is a case from the jth group, j = 1, ..., g. #Can be used if p > n. g <- max(group) x <- as.matrix(x) indx <- 1:2 tindx <- 1:dim(x)[1] #get the cases used in the covmb2 set B from each group for(i in 1:g){ xi <- as.matrix(x[group==i,]) tind <- tindx[group==i] tem <- getB(xi) tind <- tind[tem$indx] indx <- c(indx,tind) } indx <- indx[-c(1,2)] Bbig <- x[indx,] grp <- group[indx] list(Bbig=Bbig,indx=indx,grp=grp) } hdhot1sim<-function(n=100,p=10,B=100,nruns=100,xtype=1,eps=0.4,dd=4,delta=0, covtyp=1,psi=0.1,alpha=0.05){ # This R function simulates four one sample Hotelling's T^2 type tests where n/p may be #small. One test uses the m out of n bootstrap with m = 2n/3 where n/p is small. This test #could use more theory. Another test used wbar and S_W^2. Two other tests used theorem 2 # T_n, one with S_w^2, the other with sigmahat_W^2. # Need p > 1. Here p is the dimension of the mean mu. # Coded to reduce memory problems: avoid p by p matrices. # In the literature we sometimes use m instead of p. # xtype = 1 for MVN Np(0,I), # 2 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu0 = (0, ..., 0). # For MVN data, Cov(x) = I for covtyp=1. # Cov(x) = diag(1,...,p) for covtyp=2. # For covtyp = 3, cor(x_i,x_j) = rho for i not= j and # for MVN data this results in a covariance matrix with eigenvector # c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. 
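#Illustrative use of getB and gcor (getB also works when p > n; values are arbitrary):
#x <- matrix(rnorm(50*100),ncol=100)  #n = 50 cases, p = 100 variables
#out <- getB(x, k=5)
#dim(out$B)  #cases kept by the covmb2 criterion
#gcor(var(x[,1:5]))$R  #generalized correlation matrix R with C = D R D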
#A <- sqrt(diag(1:p)) memory problems if p is large #if(covtyp==3){ # A <- matrix(psi,nrow=p,ncol=p) # diag(A) <- 1} cvec2 <- sqrt(1:p) munot <- 0 * (1:p) mu <- delta * (1 + munot) indx <- 1:n Tboot <- 1:B cov <- 0 len <- 1:nruns icov <- cov ilen <- len tcov <- cov tlen <- len zcov <- cov zlen <- len m <- floor(n*2/3) m2 <- floor(n/2) m2m1 <- m2-1 tcut <- qt((1-alpha/2),m2m1) w<-1:m2 k <- n*(n-1) one <- 0*1:p + 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(n*p), ncol = p, nrow = n) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) } if(covtyp==2){ x<-t(t(x)*cvec2) } if(covtyp==3){ rsum <- psi*apply(x,1,sum) x <- (1-psi)*x + rsum%*%t(one) } x <- mu + x a <- apply(x,2,sum) Thd <- (t(a)%*%a - sum(x^2))/k #currently a 1 by 1 matrix, not a scalar Thd <- as.double(Thd) #makes Thd into a scalar #x has mean mu =delta(1,...,1)^T #get bootstrapped T for(j in 1:B){ tem <- sample(indx,m,replace=F) xb <- x[tem,] a <- apply(xb,2,sum) Tboot[j] <- (t(a)%*%a - sum(xb^2))/(m*(m-1)) } tem<-shorth3(Tboot,alpha=alpha) len[i] <- tem$shorth[2] - tem$shorth[1] if(0 >= tem$shorth[1] && 0 <= tem$shorth[2]) cov <- cov + 1 sscp <- x%*%t(x) ss <- sscp - Thd ss <- ss^2 vw1 <- (sum(ss) - sum(diag(ss)))/k val <- tcut*sqrt(2*vw1/k) up <- Thd + val low <- Thd - val tlen[i] <- 2*val if(0 >= low && 0 <= up) tcov <- tcov + 1 for(j in 1:m2){ w[j] <- sscp[(2*j-1),(2*j)] } wbar <- mean(w) sdw <- sd(w) val <- tcut*sdw/sqrt(m2) up <- wbar + val low <- wbar - val ilen[i] <- 2*val if(0 >= low && 0 <= up) icov <- icov + 1 val <- tcut*sdw*sqrt(2/k) up <- Thd + val low <- Thd - val zlen[i] <- 2*val if(0 >= low && 0 <= up) zcov <- zcov + 1 } #prop of times Ho is rejected cov <- 1 - cov/nruns #bootstrap test tcov <- 1 - tcov/nruns #Th. 2 test with hatsigma_W^2 icov <- 1 - icov/nruns #test using wbar zcov <- 1 - zcov/nruns #Th. 2 test with S_W^2 blen <- mean(len) tlen <- mean(tlen) ilen <- mean(ilen) zlen <- mean(zlen) sdw1 <- sqrt(vw1) #list(tboot=Tboot,Thd=Thd,bcov=cov) list(sdw1=sdw1,sdw2=sdw,Thd=Thd,bcov=cov,blen=blen,icov=icov,ilen=ilen, tcov=tcov,tlen=tlen,zcov=zcov,zlen=zlen) } hdhot1sim2<-function(n=100,p=10,B=100,nruns=100,xtype=1,eps=0.4,dd=4,delta=0, covtyp=1,psi=0.1,alpha=0.05){ # Uses the fast estimator of V(W). Uses a Hu and Bai statistic # Need p > 1. Here p is the dimension of the mean mu. # coded to reduce memory problems: avoid p by p matrices # In the literature we sometimes use m instead of p. # xtype = 1 for MVN Np(0,I), # 2 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu0 = (0, ..., 0). # For MVN data, Cov(x) = I for covtyp=1. # Cov(x) = diag(1,...,p) for covtyp=2. # For covtyp = 3, cor(x_i,x_j) = rho for i not= j and # for MVN data this results in a covariance matrix with eigenvector # c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. 
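#Illustrative hdhot1sim call where n/p is small (run interactively). With delta = 0
#the rejection proportions bcov, icov, tcov, and zcov should be near alpha:
#hdhot1sim(n=50, p=200, B=100, nruns=100, xtype=1, delta=0, covtyp=1, alpha=0.05)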
#A <- sqrt(diag(1:p)) memory problems if p is large #if(covtyp==3){ # A <- matrix(psi,nrow=p,ncol=p) # diag(A) <- 1} cvec2 <- sqrt(1:p) munot <- 0 * (1:p) mu <- delta * (1 + munot) cov <- 0 len <- 1:nruns m2 <- floor(n/2) m2m1 <- m2 - 1 tcut <- qt((1-alpha/2),m2m1) k <- n*(n-1) w<-1:m2 one <- 0*1:p + 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(n*p), ncol = p, nrow = n) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) } if(covtyp==2){ x<-t(t(x)*cvec2) } if(covtyp==3){ rsum <- psi*apply(x,1,sum) x <- (1-psi)*x + rsum%*%t(one) } x <- mu + x a <- apply(x,2,sum) Thd <- (t(a)%*%a - sum(x^2))/k #currently a 1 by 1 matrix, not a scalar Thd <- as.double(Thd) #makes Thd into a scalar #x has mean mu =delta(1,...,1)^T #slow method #sscp <- x%*%t(x) #for(j in 1:m2){ # w[j] <- sscp[(2*j-1),(2*j)] #} for(j in 1:m2){ w[j] <- as.double(x[(2*j-1),]%*%x[(2*j),]) } sdw <- sd(w) val <- tcut*sdw*sqrt(2/k) up <- Thd + val low <- Thd - val len[i] <- 2*val if(0 >= low && 0 <= up) cov <- cov + 1 } #prop of times Ho is rejected cov <- 1 - cov/nruns len <- mean(len) list(sdw=sdw,Thd=Thd,cov=cov,len=len) } hdhot1sim3<-function(n=100,p=10,B=100,nruns=100,xtype=1,eps=0.4,dd=4,delta=0, covtyp=1,psi=0.1,alpha=0.05){ # This R function simulates five one sample Hotelling's T^2 type tests where n/p may be #small. One test uses the m out of n bootstrap with m = 2n/3 where n/p is small. This test #could use more theory. Another test used wbar and S_W^2. Two other tests used theorem 2 # T_n, one with S_w^2, the other with sigmahat_W^2. # The fifth test is Tn with S_W^2 applied to the spatial sign vectors. # Need p > 1. Here p is the dimension of the mean mu. # Coded to reduce memory problems: avoid p by p matrices. # In the literature we sometimes use m instead of p. # xtype = 1 for MVN Np(0,I), # 2 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu0 = (0, ..., 0). # For MVN data, Cov(x) = I for covtyp=1. # Cov(x) = diag(1,...,p) for covtyp=2. # For covtyp = 3, cor(x_i,x_j) = rho for i not= j and # for MVN data this results in a covariance matrix with eigenvector # c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. 
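#The fast estimator above pairs the cases: w_j = x_{2j-1}^T x_{2j} for j = 1,...,floor(n/2),
#then uses sd(w). Minimal base R sketch of that computation for a data matrix x:
#m2 <- floor(nrow(x)/2)
#w <- sapply(1:m2, function(j) sum(x[2*j-1,]*x[2*j,]))
#sdw <- sd(w)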
#calls the norms function #A <- sqrt(diag(1:p)) memory problems if p is large #if(covtyp==3){ # A <- matrix(psi,nrow=p,ncol=p) # diag(A) <- 1} cvec2 <- sqrt(1:p) munot <- 0 * (1:p) mu <- delta * (1 + munot) indx <- 1:n Tboot <- 1:B cov <- 0 len <- 1:nruns icov <- cov ilen <- len tcov <- cov tlen <- len zcov <- cov zlen <- len sscov <- cov sslen <- len m <- floor(n*2/3) m2 <- floor(n/2) m2m1 <- m2-1 tcut <- qt((1-alpha/2),m2m1) w<-1:m2 k <- n*(n-1) one <- 0*1:p + 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(n*p), ncol = p, nrow = n) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) } if(covtyp==2){ x<-t(t(x)*cvec2) } if(covtyp==3){ rsum <- psi*apply(x,1,sum) x <- (1-psi)*x + rsum%*%t(one) } x <- mu + x a <- apply(x,2,sum) Thd <- (t(a)%*%a - sum(x^2))/k #currently a 1 by 1 matrix, not a scalar Thd <- as.double(Thd) #makes Thd into a scalar #x has mean mu =delta(1,...,1)^T #get bootstrapped T for(j in 1:B){ tem <- sample(indx,m,replace=F) xb <- x[tem,] a <- apply(xb,2,sum) Tboot[j] <- (t(a)%*%a - sum(xb^2))/(m*(m-1)) } tem<-shorth3(Tboot,alpha=alpha) len[i] <- tem$shorth[2] - tem$shorth[1] if(0 >= tem$shorth[1] && 0 <= tem$shorth[2]) cov <- cov + 1 sscp <- x%*%t(x) ss <- sscp - Thd ss <- ss^2 vw1 <- (sum(ss) - sum(diag(ss)))/k val <- tcut*sqrt(2*vw1/k) up <- Thd + val low <- Thd - val tlen[i] <- 2*val if(0 >= low && 0 <= up) tcov <- tcov + 1 for(j in 1:m2){ w[j] <- sscp[(2*j-1),(2*j)] } wbar <- mean(w) sdw <- sd(w) val <- tcut*sdw/sqrt(m2) up <- wbar + val low <- wbar - val ilen[i] <- 2*val if(0 >= low && 0 <= up) icov <- icov + 1 val <- tcut*sdw*sqrt(2/k) up <- Thd + val low <- Thd - val zlen[i] <- 2*val if(0 >= low && 0 <= up) zcov <- zcov + 1 #test based on spatial sign vectors z <- x/norms(x)$enorms for(j in 1:m2){ w[j] <- as.double(z[(2*j-1),]%*%z[(2*j),]) } sdwss <- sd(w) val <- tcut*sdwss*sqrt(2/k) a <- apply(z,2,sum) Thd <- (t(a)%*%a - sum(z^2))/k #currently a 1 by 1 matrix, not a scalar Thd <- as.double(Thd) up <- Thd + val low <- Thd - val sslen[i] <- 2*val if(0 >= low && 0 <= up) sscov <- sscov + 1 } #prop of times Ho is rejected cov <- 1 - cov/nruns #bootstrap test tcov <- 1 - tcov/nruns #Th. 2 test with hatsigma_W^2 icov <- 1 - icov/nruns #test using wbar zcov <- 1 - zcov/nruns #Th. 2 test with S_W^2 sscov <- 1 - sscov/nruns #above test with spatial sign vectors blen <- mean(len) tlen <- mean(tlen) ilen <- mean(ilen) zlen <- mean(zlen) sslen <- mean(sslen) sdw1 <- sqrt(vw1) #list(tboot=Tboot,Thd=Thd,bcov=cov) list(sdw1=sdw1,sdw2=sdw,Thd=Thd,bcov=cov,blen=blen,icov=icov,ilen=ilen, tcov=tcov,tlen=tlen,zcov=zcov,zlen=zlen,sscov=sscov,sslen=sslen) } hdhot2sim<-function(n1=100,n2=100,p=10,B=100,nruns=100,xtype=1,sig=1, eps=0.4,dd=4,delta=0,covtyp=1,alpha=0.05){ # This R function simulates the two sample Hotelling's T^2 test based on the # m out of n bootstrap where mi=2ni/3 and ni/p is small. Need p > 1. # The one sample test applied to z_i = x_i1 - x_i2 is also used. # xtype = 1 for MVN Np(0,I), # 2 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu2 = (0, ..., 0). # Cov(x) = I, Cov(x2) = sig^2 Cov(x) for covtyp=1. # Cov(x) = diag(1,2,...,p), Cov(x2) = sig^2 Cov(x) for covtyp=2. 
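#The fifth test above uses spatial sign vectors: each case is scaled to unit Euclidean
#length. Assuming hdpack's norms() returns the Euclidean row norms, z <- x/norms(x)$enorms
#matches the base R sketch:
#z <- x/sqrt(rowSums(x^2))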
# Cov(x) = I, Cov(x2) = sig^2 diag(1,2,...,p) for covtyp=3 #coded to avoid p by p matrices # A <- sqrt(diag(1:p)) #memory problems if p is large cvec2 <- sqrt(1:p) munot <- 0 * (1:p) mu <- delta * (1 + munot) indx <- 1:n1 indx2 <- 1:n2 Tboot <- 1:B len <- 1:nruns cov <- 0 cov2 <- 0 m <- min(n1,n2) k <- m*(m-1) mm <- floor(m/2) w<-1:mm mmm1 <- mm - 1 tcut <- qt((1-alpha/2),mmm1) m1 <- floor(n1*2/3) m2 <- floor(n2*2/3) for(i in 1:nruns) { #make data x <- matrix(rnorm(n1 * p), ncol = p, nrow = n1) x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2) if(xtype == 2) { zu <- runif(n1) x[zu < eps, ] <- x[zu < eps, ] * 5 zu <- runif(n2) x2[zu < eps, ] <- x2[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n1, dd)/dd) x <- x/zu zu <- sqrt(rchisq(n2, dd)/dd) x2 <- x2/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) x2 <- exp(x2) x2 <- x2 - exp(0.5) } if(covtyp==2){ x<-t(t(x)*cvec2) #x <- x %*% A x2<-t(t(x2)*cvec2) #x2 <- x2 %*% A } if(covtyp==3){ x2<-t(t(x2)*cvec2)} x2 <- sig * x2 x <- mu + x a <- apply(x,2,sum) b <- apply(x2,2,sum) Thd <- (t(a)%*%a - sum(x^2))/(n1*(n1-1)) + (t(b)%*%b - sum(x2^2))/(n2*(n2-1)) Thd <- Thd - 2*t(a)%*%b/(n1*n2) #x has mean mu =delta(1,...,1)^T, x2 has mean (0,...,0)^T #get bootstrapped T for(j in 1:B){ tem <- sample(indx,m1,replace=F) xb <- x[tem,] tem <- sample(indx2,m2,replace=F) x2b <- x2[tem,] a <- apply(xb,2,sum) b <- apply(x2b,2,sum) Tb <- (t(a)%*%a - sum(xb^2))/(m1*(m1-1)) + (t(b)%*%b - sum(x2b^2))/(m2*(m2-1)) Tboot[j] <- Tb - 2*t(a)%*%b/(m1*m2) } tem<-shorth3(Tboot,alpha=alpha) if(0 >= tem$shorth[1] && 0 <= tem$shorth[2]) cov <- cov + 1 #get the test based on the fast one sample test xx <- x[1:m,] xx2 <- x2[1:m,] diff <- xx - xx2 a <- apply(diff,2,sum) #faster? Thd2 <- (t(a)%*%a - sum(diff^2))/k #currently a 1 by 1 matrix, not a scalar Thd2 <- as.double(Thd2) #makes Thd2 into a scalar for(j in 1:mm){ w[j] <- as.double(diff[(2*j-1),]%*%diff[(2*j),]) } sdw <- sd(w) val <- tcut*sdw*sqrt(2/k) up <- Thd2 + val low <- Thd2 - val len[i] <- 2*val if(0 >= low && 0 <= up) cov2 <- cov2 + 1 } #prop of times Ho is rejected cov <- 1 - cov/nruns cov2 <- 1 - cov2/nruns len <- mean(len) list(Thd=Thd,cov=cov,cov2=cov2,len=len) } lassoboot<-function(x,y,B = 1000,regtype=1){ #needs library(glmnet), n > 5p, p > 2 #bootstrap lasso or ridge regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. #Change regtype = 0 for ridge regression x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res betas <- matrix(0,nrow=B,ncol=p) #bootstrap the lasso or ridge regression model for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-cv.glmnet(x,y=yb,alpha=regtype) betas[i,] <- as.vector(predict(tem,type="coefficients",s=tem$lambda.min)) } list(betas=betas) } lassoboot2<-function(x,y,B = 1000,regtype=1,c=0.01,aug=F){ #needs library(glmnet), n > 5p, p > 2 #bootstrap lasso or ridge regression #If aug not=F, adds cB full model bootstrap samples. #Does not make sense to do variable selection if there #is only one nontrivial predictor. 
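#Illustrative use of lassoboot (needs library(glmnet), n > 5p; regtype = 0 gives ridge
#regression; data below are simulated, values arbitrary):
#x <- matrix(rnorm(200*4),ncol=4); y <- 1 + x[,1] + rnorm(200)
#bet <- lassoboot(x, y, B=200)$betas  #200 x 5 matrix: bootstrap intercept and slopes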
#Change regtype = 0 for ridge regression x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res betas <- matrix(0,nrow=bpd,ncol=p) #bootstrap the lasso or ridge regression model for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-cv.glmnet(x,y=yb,alpha=regtype) betas[i,] <- as.vector(predict(tem,type="coefficients",s=tem$lambda.min)) } if(aug == F) {betas <- betas[1:B,]} else { for(i in bp1:bpd){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } } list(betas=betas) } lassobootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05, regtype=1, BB =1000){ #needs library(glmnet) VERY SLOW #Simulates bootstrap for lasso and ridge regression (regtype=0). #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #predictor variables are iid N(0,1) so uncorrelated # Y = 1 + x2 + ... + x_k + e #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot(x,y,B=BB,regtype=regtype) #bootstrap lasso or ridge regression for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lassobootsim3<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet), calls confreg, shorth3 #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. 
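#lassoboot and lassoboot2 use the residual bootstrap: yb = OLS full model fit plus
#residuals resampled with replacement, then lasso or ridge is refit to (x,yb) with
#10-fold CV. Illustrative (slow) simulation call; needs library(glmnet):
#lassobootsim(n=100, p=4, nruns=5, type=1, regtype=1, BB=200)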
# cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot(x,y,B=BB,regtype=regtype) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lassobootsim4<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet), calls lassoboot, shorth3 #Gets rid of the test so the program does not fail. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) cicov <- 0*(1:p) avelen <- 0*(1:p) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot(x,y,B=BB,regtype=regtype) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lassobootsim5<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, cc=0.01, augm=F, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet), calls lassoboot2, predreg, shorth3 #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #If augm neq F, adds the OLS full model bootstrap samples (lambda = 0) #so S*_T is better conditioned. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot2(x,y,B=BB,regtype=regtype,c=cc,aug=augm) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lassopisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alph = 0.05, pitype = 1){ #Needs library(glmnet). Uses 10 fold CV and the Olive (2013) PI. #Simulates PIs for lasso and ridge regression. #change pitype = 0 for ridge regression # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
set.seed(974) ##need p>2 corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + p/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*p/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn pilen <- 1:nruns lamvec<- pilen opicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the 10 fold CV lasso or ridge regression model out<-cv.glmnet(x,y,alpha=pitype) lam <- out$lambda.min lamvec[i] <- lam fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) fres <- y - fit #get asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } lammn <- mean(lamvec) pimnlen <- mean(pilen) opicov <- opicov/nruns list(lammn=lammn,opicov=opicov, pimenlen = pimnlen)} lassopisim2<-function(n = 100, p = 100, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alph = 0.05, pitype=1){ #Needs library(glmnet). ##slow Uses 10 fold CV and the Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for lasso or ridge regression when p is not necessarily small. # 1 <= k <= p-1, k is the number of active nontrivial predictors # Use pitype = 0 for ridge regression. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
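#Illustrative lassopisim call (needs library(glmnet)); the PI coverage opicov should be
#near 1 - alph = 0.95. Use pitype = 0 for ridge regression:
#lassopisim(n=100, p=4, k=1, nruns=100, psi=0, type=1, alph=0.05, pitype=1)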
set.seed(974) ##need p>2 pilen <- 1:nruns lamvec<- pilen opicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] val <- 8*n/9 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the 10 fold CV lasso or ridge regression model out<-cv.glmnet(x,y,alpha=pitype) lam <- out$lambda.min lamvec[i] <- lam fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) fres <- y - fit #get PI if(pitype == 1) pp <- out$nzero[out$lambda==lam] + 1 #crude df for lasso else{ #x = u in the text #lam = lam_{1,n}/a, not sure what a is, perhaps a = 2n w1 <- scale(x) w <- sqrt(n/(n-1))*w1 #t(w) %*% w = n R_u, u = x svs <- svd(w)$d #singular values of w, pp <- 1 + sum(svs^2/(svs^2+2*n*lam)) } # d for ridge regression if lam = lam_{1,n}/(2n) if(pp < val) corfac <- (1 + 15/n) * sqrt( (n+2*pp)/(n - pp) ) else corfac <- 5*(1+15/n) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + pp/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*pp/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } lammn <- mean(lamvec) pimnlen <- mean(pilen) opicov <- opicov/nruns list(lammn=lammn,opicov=opicov, pimenlen = pimnlen)} LPHboot<-function(x,time,status,B=1000){ #needs library(glmnet), n > 5p, p > 2, want B >= 50p, #bootstraps the Cox regression lasso, takes a few minutes x <- as.matrix(x) n <- length(time) p <- dim(x)[2] y<-cbind(time,status) outlasso<-cv.glmnet(x,y,family="cox") lam <- outlasso$lambda.min betahat <- as.vector(predict(outlasso,type="coefficients",s=lam)) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ samp <- sample(1:n, replace=TRUE) tdat <- tdata[samp,] tx <- x[samp,] ty <- y[samp,] temp<-cv.glmnet(tx,ty,family="cox") lam <- temp$lambda.min betas[i,] <- as.vector(predict(temp,type="coefficients",s=lam)) } list(bhatimin0=betahat,betas=betas) } LRboot<-function(x,y,mv=c(1,1),B = 1000,bin=T){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #mv is the m_i vector of the number of trials; if bin=T #then for binary LR the program provides the number of trials #bootstrap logistic regression full model x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] tdata <- as.data.frame(cbind(x,y)) if(bin==T) mv <- 0*1:n + 1 ny <- mv-y out <- glm(cbind(y,ny)~., family=binomial, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rbinom(n,size=1,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- 
as.data.frame(cbind(x,ydat)) temp<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) betas[i,] <- temp$coef } list(bhat=bhat,betas=betas) } lrpiplot<-function(ESP,y,mv,d=2,B=1000,slices=10,alpha=0.05) {# Makes response plot for logistic regression with PIs for Y_i/m_i added. #0 = data, x = PI #mv is the vector of the number of trial m_i #tdata <- as.data.frame(cbind(x,y)) #ny <- mv-y #ny[i] = mv[i]-y[i] = no. of ``failures" #out <- glm(cbind(y,ny)~., family=binomial, data=tdata) #x<-as.matrix(x); d<-dim(x)[2]+1 #d=no. of predictors #ESP <- predict(out) cov<-0 n <- length(y) low<-1:n up<-low Z <- y/mv plot(ESP, Z) abline(weighted.mean(Z, mv), 0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) fit2 <- fit val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- weighted.mean(Z[ indx[((i - 1) * val + 1):(i * val)]], mv[indx[(( i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- weighted.mean(Z[indx[(( slices - 1) * val + 1):n]], mv[indx[((slices - 1) * val + 1):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2) for(i in 1:n){ ydat <- rbinom(B,size=mv[i],prob=(exp(ESP[i])/(1+exp(ESP[i])))) tem <- mshpi(yf=y[i],ydat=ydat,n=n,d=d,alph=alpha) low[i] <- tem$low/mv[i] up[i] <- tem$up/mv[i] cov<-cov+tem$inr } points(ESP,low,pch=4) points(ESP,up,pch=4) title("Response Plot") cov<-cov/n list(cov=cov) } lrplot3<-function(tem, x=x, slices = 10, ff = 0.99, step = T){ # Makes the response plot for binary logistic regression. # If step = T use step function, else use lowess. # ind <- as.data.frame(cbind(y,x[,vin])) #lasso variable selection GLM #tem <- glm(y~.,family="binomial",data=ind) x<-as.matrix(x) ESP <- x %*% tem$coef[-1] + tem$coef[1] Y <- y plot(ESP, Y) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T){ fit2 <- fit n <- length(y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(y[indx[((slices - 1) * val + 1 ):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2)} else lines(lowess(ESP, Y, f = ff), type = "s") title("Response Plot") } lrplot4<-function(x, y, mv, slices = 10, ff = 0.97, step = T) {# Makes response plot and OD plot for binomial logistic regression. # mv = (m1, ..., mn) where yi is bin(mi,p(SP)) # If step = T use step function, else use lowess. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # needs n >> p #ESP <- x %*% out$coef[-1] + out$coef[1] # n <- length(y) tdata <- as.data.frame(cbind(x,y)) ny <- mv-y #ny[i] = mv[i]-y[i] = no. 
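#Illustrative use of LRboot, the parametric bootstrap of the full binary logistic
#regression model (data below are simulated, values arbitrary):
#x <- matrix(rnorm(200*3),ncol=3)
#y <- rbinom(200, 1, exp(x[,1])/(1+exp(x[,1])))
#out <- LRboot(x, y, B=200)
#apply(out$betas, 2, quantile, probs=c(0.025,0.975))  #rough percentile intervals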
of ``failures" out <- glm(cbind(y,ny)~., family=binomial, data=tdata) ESP <- predict(out) par(mfrow = c(1, 2),pty="s") Z <- y/mv plot(ESP, Z) abline(weighted.mean(Z, mv), 0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T) { fit2 <- fit val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- weighted.mean(Z[ indx[((i - 1) * val + 1):(i * val)]], mv[indx[(( i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- weighted.mean(Z[indx[(( slices - 1) * val + 1):n]], mv[indx[((slices - 1) * val + 1):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2) } else lines(lowess(ESP, Z, f = ff), type = "s") title("a) Response Plot") #get OD plot val <- exp(ESP)/(1 + exp(ESP)) Ehat <- mv * val Vmodhat <- Ehat * (1 - val) Vhat <- (y - Ehat)^2 plot(Vmodhat, Vhat) abline(0, 1) abline(0, 4) abline(lsfit(Vmodhat, Vhat)$coef) title("b) OD Plot") } medbootsim<-function(n = 100, p = 2, nruns = 100, xtype=1, eps=0.25, dd= 1, delta = 0, B=1000, alpha=0.05){ # Simulates coord. median bootstrap Hotelling's T^2 type test. # Gets coverages and avearage cutoff using pred reg method, # and calibration method using nV=49,99, or B. # Need p > 1. Want n > 20p and number of bootstrap samples B > 20p. # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # 10 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu_o = 0*mu. # calls predreg A <- sqrt(diag(1:p)) cv <- 0 inr <- 0 indx <- 1:n munot <- 0 * (1:p) mu <- delta * (1 + munot) B2 <-B*2 Bp1 <- B+1 Bp49 <- B+49 Bp99 <- B+99 mus <- matrix(0,nrow=B2,ncol=p) DUB <- 1:nruns DU49 <- 1:nruns uv49 <- min(49, ceiling( (1 - alpha)*(1+49) ) ) up49 <- uv49/(49+1) #quantile for prediction region cv49 <- 0 DU99 <- 1:nruns uv99 <- min(99, ceiling( (1 - alpha)*(1+99) ) ) up99 <- uv99/(99+1) #quantile for prediction region cv99 <- 0 DUBB <- 1:nruns uvB <- min(B, ceiling( (1 - alpha)*(1+B) ) ) upB <- uvB/(B+1) #quantile for prediction region cvB <- 0 for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), ncol = p, nrow = n) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10){ #Want pop coord med(x) = 0. x <- exp(x) x <- x - 1 } x <- x %*% A x <- mu + x #get bootstrapped coord. 
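#Illustrative use of lrplot4 for binomial data, where mv gives the numbers of trials
#(data below are simulated, values arbitrary):
#n <- 100; x <- matrix(rnorm(n*2),ncol=2); mv <- rep(10,n)
#y <- rbinom(n, size=mv, prob=exp(x[,1])/(1+exp(x[,1])))
#lrplot4(x, y, mv)  #left panel: response plot; right panel: OD plot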
median for(j in 1:B2){ tem <- sample(indx,n,replace=T) mus[j,] <- apply(x[tem,],2,median) } out <- predreg(mus[1:B,]) cv <- cv + out$inr DUB[i] <- out$cuplim Tstar <- out$center Sstar <- out$cov xfd <- mahalanobis(munot, center=Tstar, cov=Sstar) mu49 <- mus[Bp1:Bp49,] md2 <- mahalanobis(mu49, center=Tstar, cov=Sstar) DU49[i] <- quantile(md2, up49, type=1) if(xfd <= DU49[i]) cv49 <- 1 + cv49 mu99 <- mus[Bp1:Bp99,] md2 <- mahalanobis(mu99, center=Tstar, cov=Sstar) DU99[i] <- quantile(md2, up99, type=1) if(xfd <= DU99[i]) cv99 <- 1 + cv99 muB <- mus[Bp1:B2,] md2 <- mahalanobis(muB, center=Tstar, cov=Sstar) DUBB[i] <- quantile(md2, upB, type=1) if(xfd <= DUBB[i]) cvB <- 1 + cvB } cv <- cv/nruns #prop of times Ho is not rejected DB <- mean(DUB) D49 <- mean(sqrt(DU49)) cv49 <- cv49/nruns D99 <- mean(sqrt(DU99)) cv99 <- cv99/nruns DBB <- mean(sqrt(DUBB)) cvB <- cvB/nruns list(cv=cv,DB=DB,cv49=cv49,D49=D49,cv99=cv99,D99=D99,cvB=cvB,DBB=DBB) } mldsim6<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 100, outliers = 0, pm = 10, kk=5, osteps = 0){ # This R function compares the # FCH, RFCH, CMVE, RCMVE, RMVN, COVMB2, and MB estimators. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # Calls cmve, covfch, covrmvn, covmb2 A <- sqrt(diag(1:p)) fchct <- 0 rfchct <- 0 cmvect <- 0 rcmvect <- 0 rmvnct <- 0 mbct <- 0 covmb2ct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val ) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val ) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } out <- covfch(x, csteps = steps) rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1 rd2 <- mahalanobis(x, out$rmnf, out$rcovf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1 out <- cmve(x, csteps = steps) rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1 rd2 <- mahalanobis(x, out$rmnf, out$rcovf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rcmvect <- rcmvect + 1 rd2 <- mahalanobis(x, out$mnm, out$covm) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 out <- covrmvn(x, csteps = steps) rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1 out <- covmb2(x, k=kk, msteps=osteps) rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1 } list(fchct = fchct, rfchct = rfchct, cmvect = cmvect, rcmvect = rcmvect, rmvnct = rmvnct, covmb2ct = covmb2ct, mbct = mbct) } mldsim7<-function(n = 100, p = 2, gam = 0.4, runs = 100, outliers = 0, pm = 10, kk=5, osteps = 0){ # This R function examines the COVMB2 estimators. #The function mldsim6 uses the weighted covariance matrix out$cov #The function mldsim7 uses squared Euclidian distances based on Ip are used, #or the squared Mahalanobis distances using diag(out$cov) # Counts number of times all outlier distances > clean distances. 
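#Illustrative mldsim6 call (assumes the cmve, covfch, covrmvn, and covmb2 functions from
#hdpack are sourced). The counts, out of runs, record how often every outlier distance
#exceeded every clean distance for each estimator:
#mldsim6(n=100, p=10, gam=0.4, runs=20, outliers=1, pm=10)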
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # Calls covmb2 A <- sqrt(diag(1:p)) covv <- diag(p) #identity matrix Ip covmb2ct <- 0 diagct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val ) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val ) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } out <- covmb2(x, k=kk, msteps=osteps) rd2 <- mahalanobis(x, out$center, covv) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1 rd2 <- mahalanobis(x, out$center, diag(diag(out$cov))) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) diagct <- diagct + 1 } list( covmb2ct = covmb2ct, diagct = diagct) } MLRplot<-function(x, Y){ # Response plot and residual plot. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)" if a robust estimator replaces lsfit. # Advance the view with the right mouse button. x <- as.matrix(x) out <- lsfit(x, Y) cook <- ls.diag(out)$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 tem <- cook > min(0.5, (2 * p)/n) bhat <- out$coef FIT <- Y - out$res cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) plot(FIT, Y) abline(0, 1) points(FIT[tem], Y[tem], pch = 15) title("Response Plot") identify(FIT, Y) RES <- Y - FIT plot(FIT, RES) points(FIT[tem], RES[tem], pch = 15) title("Residual Plot") identify(FIT, RES) par(mfrow = c(1, 1)) par(mar=cmar) } mlrsim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, indices=c(1,2), alph = 0.05){ #Simulates CIs for OLS MLR for unscaled and scaled predictors. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, k is the number of nonnoise variables #Need k < p-1 so there are some zeroes in the model. #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
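#Illustrative use of MLRplot (data below are simulated, values arbitrary). Left click
#points to identify them; use the right mouse button (Stop) to advance from the
#response plot to the residual plot:
#x <- matrix(rnorm(100*3),ncol=3); y <- 1 + x[,1] + rnorm(100)
#MLRplot(x, y)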
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) cicov <- 0*(1:p) avelen <- 0*(1:p) cicovs <- 0*(1:p) avelens <- 0*(1:p) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 covx <- A %*% t(A) sds <- sqrt(diag(covx)) #pop sds for the predictors sds<- c(1,sds) betas <- beta*sds #pop beta for OLS using scaled predictors val <- 1-alph/2 valf <- 1 - alph dfd <- n-p dfn <- length(indices) cut <- qt(val,df=dfd) fcut <- dfn*qf(valf,df1=dfn,df2=dfd) Up <- 1:p Lp <- Up Ups <- Up Lps <- Up one <- 1 + 0*1:n testcov <- 0 testcovs <- 0 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lsfit(x,y) # temp <- ls.diag(out) ses <- temp$std.err mse <- temp$std.dev^2 bhat <- out$coef tem <- cut*ses Up <- bhat+tem Lp <- bhat-tem w<-scale(x, center = FALSE, scale = apply(x, 2, sd, na.rm = TRUE)) outs <- lsfit(w,y) temps <- ls.diag(outs) sess <- temps$std.err mses <- temps$std.dev^2 bhats <- outs$coef tems <- cut*sess Ups <- bhats+tems Lps <- bhats-tems for (j in 1:p){ if(beta[j] >= Lp[j] && beta[j] <= Up[j]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + Up[j] - Lp[j] #"CIs for OLS" using scaled predictors if(betas[j] >= Lps[j] && betas[j] <= Ups[j]) cicovs[j] <- cicovs[j] + 1 avelens[j] <- avelens[j] + Ups[j] - Lps[j] } #test whether beta_I = 0 with I = indices x1 <- cbind(one,x) xtx <- t(x1)%*%x1 cinv <- solve(xtx) diff <- bhat[indices] - beta[indices] dsq <- t(diff) %*% solve(cinv[indices,indices]) %*% diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < mse*fcut) testcov <- testcov +1 #test whether beta_I = 0 with I = indices and scaled predictors w1 <- cbind(one,w) wtw <- t(w1)%*%w1 cinv <- solve(wtw) diff <- bhats[indices] - betas[indices] dsq <- t(diff) %*% solve(cinv[indices,indices]) %*% diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < mses*fcut) testcovs <- testcovs +1 } cicov <- cicov/nruns avelen <- sqrt(n)*avelen/nruns cicovs <- cicovs/nruns avelens <- sqrt(n)*avelens/nruns testcov <- testcov/nruns testcovs <- testcovs/nruns list(cicov=cicov,avelen=avelen,testcov=testcov,beta=beta, cicovs=cicovs,avelens=avelens,testcovs=testcovs,betas=betas,k=k)} MLRsim<-function(n = 100, q = 7, nruns = 4, eps = 0.1, shift = 9, type = 1){ #Right click Stop for each plot. #Generates response and residual plots for MLR #for a few iid error distributions: # if type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
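#A minimal usage sketch (assumption: hdpack has been sourced; the plots are interactive):
# MLRsim(n=100, q=7, nruns=4, type=3)  #right click Stop on each plot to advance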
# constant = 1 so there are p = q+1 coefficients b <- 0 * 1:q + 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set MLRplot(x,y) #get the response and residual plots }} mlrsplitsim<-function(n = 100, p = 4, k=1, n1=30, J=5, nruns = 100, eps = 0.1, shift = 9, dtype = 1, psi = 0.0, alpha = 0.05){ #needs library(glmnet), n1>=30 gets rid of the warnings #J is an integer between 0 and 5, dtype=2 and 5 have problems for nruns > 100 #noundfit is the number of times in nruns that lasso did not underfit #nd is the size of the training data set selected by sequential data splitting #ad is the number of predictors, including a constant, selected by lasso for #the selected training data set: want ad not much larger than k+1 #k+1 is the number of beta_i not equal to 0 #n1 is the intitial size of the training data set, want n1 <= n/2 #Simulates sequential data splitting for multiple linear regression using lasso #with 10-fold cross validation. #Uses five iid error distributions: # dtype = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
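#A hedged usage sketch (assumption: hdpack and library(glmnet) are loaded; argument values are illustrative):
# out <- mlrsplitsim(n=200, p=20, k=2, n1=30, J=5, nruns=100, dtype=1, psi=0.0, alpha=0.05)
# out$mnnd; out$mnad            #mean selected training set size and mean selected model size
# out$laspicov; out$splitpicov  #PI coverages, want near 1 - alpha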
#nd < 10 ad is possible if nd approx n/2 q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 lsplitpilen <- 1:nruns lsplitpicov <- 0 splitpilen <- 1:nruns splitpicov <- 0 vars <- as.vector(1:(p-1)) noundfit <- 0 indx <- 1:n nds<-1:nruns ads<-nds #check for bad values of n1 fhalf <- floor((n-J)/2) if(n<40) n1 <- max(1,fhalf) else if(floor(n/(2*n1))>1000) n1 <- floor(n/2000) if(n1 > fhalf) n1 <- max(1,fhalf) for(i in 1:nruns) { ## ##print(i) x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(dtype == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(dtype == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(dtype == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(dtype == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(dtype == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get lasso PI out<-cv.glmnet(x,y) lam <- out$lambda.min fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection PI, problems if number of variables > n-1 if(length(vin) < (n-5)){ sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr } #use sequential data splitting perm <- sample(indx,n) H <- perm[1:n1] xH <- x[H,] yH <- y[H] nd<-n1 out<-cv.glmnet(xH,yH) lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 while(nd < fhalf && nd < 10*pp){ if((nd+n1) <= fhalf){ perm<-perm[-(1:n1)] H <- c(H,perm[1:n1]) xH <- x[H,] yH <- y[H] out<-cv.glmnet(xH,yH) lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 nd<-nd+n1 } else nd<-nd+n1 } if(nd > fhalf) nd <- nd-n1 if(nd < fhalf && nd < 10*pp){ md <- fhalf - nd nd <- nd + md perm<-perm[-(1:n1)] H <- c(H,perm[1:md]) xH <- x[H,] yH <- y[H] out<-cv.glmnet(xH,yH) lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 } nds[i] <- nd ads[i] <- pp if(length(vin) >= k){ if(vin[k]==k) noundfit <- noundfit + 1} xV <- x[-H,] yV <- y[-H] #get lasso data splitting PI out<-cv.glmnet(xV,yV) lam <- out$lambda.min fit <- predict(out,s=lam,newx=xV) yfhat <- predict(out,s=lam,newx=xf) res <- yV - fit lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) lsplitpilen[i] <- tem$up - tem$low lsplitpicov <- lsplitpicov + tem$inr #get lasso variable selection data splitting PI if(length(vin) < (length(yV)-3)){ sub <- 
lsfit(xV[,vin],yV) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) splitpilen[i] <- tem$up - tem$low splitpicov <- splitpicov + tem$inr } } laspicov <- laspicov/nruns laspilen <- mean(laspilen) RLpicov <- RLpicov/nruns RLpilen <- mean(RLpilen) lsplitpicov <- lsplitpicov/nruns lsplitpilen <- mean(lsplitpilen) splitpicov <- splitpicov/nruns splitpilen <- mean(splitpilen) mnnd<-mean(nds) mnad<-mean(ads) list(mnnd=mnnd,mnad=mnad,laspicov=laspicov,laspilen=laspilen, LVSpicov=RLpicov,LVSpilen=RLpilen,lsplitpicov=lsplitpicov, lsplitpilen=lsplitpilen,splitpicov=splitpicov,splitpilen=splitpilen, beta=beta,k=k,noundfit=noundfit,n1=n1)} mmlesim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, etype = 1, wtype=1, psi = 0.0, cfac="T", indices=c(1,2), alph = 0.05){ #Tests if beta_i,MMLE = 0 #Generates MLR data Y = alpha + x^T beta + e where V(Y_i) = sigma_i^2 so beta = beta_OLS. #Hence cov(x,Y) = eta = eta_OPLS = cov(x) beta #tests beta_i,MMLE eta_i/(sigma_i^2) = 0 and whether beta_I,MMLE = 0 where I is given by #indices provides I has r of the x_i and n > 5 r with r > 1. #k+1 is the number of beta_i not equal to 0 #Uses five iid error distributions: # etype = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, if k < p-1, zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes #wtype = 1 for OLS, 2 if err = abs(SP - 5) * e, 3 if sqrt(1 + 0.5* x[,1]^2)*e #4 for exp[1 + log(|x_2|) + ... + log(|x_p|] * e, #5 for [1 + log(|x_2|) + ... + log(|x_p|] * e # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. # Calls oplscis. 
q <- p-1 b <- 0 * 1:q dd <- b + 1 dq <- dd/q if(k > 0) b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 covx <- A %*% t(A) vars<-diag(covx) varinvs <- 1/vars etaopls <- covx %*% b #pop cov(x,y) theta <- etaopls*varinvs mmlecov <- 0 * 1:q mmlelen <- mmlecov r <- length(indices) etaI <- etaopls[indices] testcov <- 0 val <- 1-alph cut <- qchisq(val,df=r) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A SP <- 1 + x %*% b if(etype == 1) err <- rnorm(n) if(etype == 2) err <- rt(n, df = 3) if(etype == 3) err <- rexp(n) - 1 if(etype == 4) err <- runif(n, min = -1, max = 1) if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) *shift) if(wtype == 2) err <- abs(SP - 5) * err if(wtype == 3) err <- sqrt(1 + x[,1]^2)*err if(wtype == 4) err <- exp(1 + log(abs(x))%*%dd) * err if(wtype == 5) err <- (1 + log(abs(x))%*%dd) * err if(wtype == 6) #geometric mean of the |x_i| err <- exp(log(abs(x))%*%dq) * err if(wtype == 7) #mean of the log(|x_i|) err <- (log(abs(x))%*%dq) * err #make an MLR data set, usually with heterogeneity y <- SP + err out <- covxycis(x,y,alph,cfac=cfac) vhats <-apply(x,2,var) vhatinvs <- 1/vhats cis <- out$cis*vhatinvs mmlelen <- mmlelen + (cis[,2] - cis[,1]) for(j in 1:q){ if(out$cis[j,1] <= theta[j] && theta[j] <= out$cis[j,2]) mmlecov[j] <- mmlecov[j] + 1 } if(r > 1 && n > 5*r) { xI <- x[,indices] covxIy <- cov(xI,y) diff <- covxIy*vhatinvs[indices] #hatbeta_I,MMLE z<-xI*c(y-mean(y)) DhatIsq <- diag(vhatinvs[indices]) acov <- DhatIsq%*%cov(z)%*%DhatIsq dsq <- n*t(diff)%*%solve(acov)%*%diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < cut) testcov <- testcov +1 } } mmlelen <- mmlelen/nruns mmlecov <- mmlecov/nruns covxy <- t(out$Covxy) up=out$up minlen <- min(mmlelen) meanlen <- mean(mmlelen) maxlen <- max(mmlelen) lens <- c(minlen,meanlen,maxlen) mincov <- min(mmlecov) cov90 <- mean(mmlecov>=0.9) cov92 <- mean(mmlecov>=0.92) cov93 <- mean(mmlecov>=0.93) cov94 <- mean(mmlecov>=0.94) cov96 <- mean(mmlecov>=0.96) covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96) testcov <- testcov/nruns if(p > 400) list(mmlelen=mmlelen,mmlecov=mmlecov,lens=lens,covprop=covprop,testcov=testcov,up=up) else list(covxy=covxy,etaopls=t(etaopls),mmlelen=mmlelen,mmlecov=mmlecov, lens=lens,covprop=covprop,testcov=testcov,up=up) } mmlesim2<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, etype = 1, wtype=1, psi = 0.0, cfac="T", indices=c(1,2), alph = 0.05){ #For standardized predictors t, let beta_MMLE(t,T) = eta(t) = cov(t,Y) = eta_OPLS(t,Y). #Generates MLR data Y = alpha + x^T beta + e where V(Y_i) = sigma_i^2 so beta = beta_OLS. #Hence cov(x,Y) = eta = eta_OPLS = cov(x) beta #tests eta_i(t)=cov(t_i,Y)=eta_i/sigma_i=0 and whether eta_I(t) = 0 where I is given by #indices provides I has r of the x_i and n > 5 r with r > 1. #k+1 is the number of beta_i not equal to 0 #Uses five iid error distributions: # etype = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, if k < p-1, zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes #wtype = 1 for OLS, 2 if err = abs(SP - 5) * e, 3 if sqrt(1 + 0.5* x[,1]^2)*e #4 for exp[1 + log(|x_2|) + ... + log(|x_p|] * e, #5 for [1 + log(|x_2|) + ... 
+ log(|x_p|] * e # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. # Calls oplscis. q <- p-1 b <- 0 * 1:q dd <- b + 1 dq <- dd/q if(k > 0) b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 covx <- A %*% t(A) vars<-diag(covx) varinvs <- 1/vars sdinvs <- sqrt(varinvs) etaopls <- covx %*% b #pop cov(x,y) theta <- etaopls*sqrt(varinvs) mmlecov <- 0 * 1:q mmlelen <- mmlecov r <- length(indices) etaI <- etaopls[indices] testcov <- 0 val <- 1-alph cut <- qchisq(val,df=r) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A SP <- 1 + x %*% b if(etype == 1) err <- rnorm(n) if(etype == 2) err <- rt(n, df = 3) if(etype == 3) err <- rexp(n) - 1 if(etype == 4) err <- runif(n, min = -1, max = 1) if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) *shift) if(wtype == 2) err <- abs(SP - 5) * err if(wtype == 3) err <- sqrt(1 + x[,1]^2)*err if(wtype == 4) err <- exp(1 + log(abs(x))%*%dd) * err if(wtype == 5) err <- (1 + log(abs(x))%*%dd) * err if(wtype == 6) #geometric mean of the |x_i| err <- exp(log(abs(x))%*%dq) * err if(wtype == 7) #mean of the log(|x_i|) err <- (log(abs(x))%*%dq) * err #make an MLR data set, usually with heterogeneity y <- SP + err out <- covxycis(x,y,alph,cfac=cfac) sdhats <-apply(x,2,sd) sdhatinvs <- 1/sdhats cis <- out$cis*sdhatinvs mmlelen <- mmlelen + (cis[,2] - cis[,1]) for(j in 1:q){ if(out$cis[j,1] <= theta[j] && theta[j] <= out$cis[j,2]) mmlecov[j] <- mmlecov[j] + 1 } if(r > 1 && n > 5*r) { xI <- x[,indices] covxIy <- cov(xI,y) diff <- covxIy*sdhatinvs[indices] #hatbeta_I,MMLE(t) = hat(eta)_I(t) z<-xI*c(y-mean(y)) DhatI <- diag(sdhatinvs[indices]) acov <- DhatI%*%cov(z)%*%DhatI dsq <- n*t(diff)%*%solve(acov)%*%diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < cut) testcov <- testcov +1 } } mmlelen <- mmlelen/nruns mmlecov <- mmlecov/nruns covxy <- t(out$Covxy) up=out$up minlen <- min(mmlelen) meanlen <- mean(mmlelen) maxlen <- max(mmlelen) lens <- c(minlen,meanlen,maxlen) mincov <- min(mmlecov) cov90 <- mean(mmlecov>=0.9) cov92 <- mean(mmlecov>=0.92) cov93 <- mean(mmlecov>=0.93) cov94 <- mean(mmlecov>=0.94) cov96 <- mean(mmlecov>=0.96) covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96) testcov <- testcov/nruns if(p > 400) list(mmlelen=mmlelen,mmlecov=mmlecov,lens=lens,covprop=covprop,testcov=testcov,up=up) else list(covxy=covxy,etaopls=t(etaopls),mmlelen=mmlelen,mmlecov=mmlecov, lens=lens,covprop=covprop,testcov=testcov,up=up) } modIboot<-function(x,y,B = 1000){ #needs library(leaps), n > 5p, p > 2 #bootstrap the I_I model for forward selection regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. 
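#A hedged usage sketch (assumption: hdpack and library(leaps) are loaded; x and y below are illustrative simulated data):
# x <- matrix(rnorm(100*4), ncol=4); y <- 1 + x[,1] + rnorm(100)
# out <- modIboot(x, y, B=1000)
# apply(out$betas, 2, quantile, probs=c(0.025,0.975))  #crude percentile intervals for each coefficient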
x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the I_I submodel tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) num <- 1:length(out$cp) tnum <- num[out$cp <= min(out$cp)+1] Icp <- out$cp[min(tnum)] modI <- out$which[out$cp==Icp] #do not need the constant in vin vin <- vars[modI[-1]] sub <- lsfit(x[,vin],y) betas <- matrix(0,nrow=B,ncol=p) #bootstrap the I_I submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="forward") out<-summary(tem) num <- 1:length(out$cp) tnum <- num[out$cp <= min(out$cp)+1] Icp <- out$cp[min(tnum)] modI <- out$which[out$cp==Icp] vin <- vars[modI[-1]] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,betas=betas) } modIpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(leaps). Uses Olive (2013) PI. #Simulates PIs for forward selection variable selection for model I_I. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
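#A minimal usage sketch (assumption: hdpack and library(leaps) are loaded):
# out <- modIpisim(n=100, p=4, k=1, nruns=100, type=1, psi=0.0, alpha=0.05)
# out$opicov  #PI coverage, want near 1 - alpha = 0.95
# out$psmn    #mean number of coefficients in the selected model minus k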
set.seed(974) ##need p>2 corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn pilen <- 1:nruns ps <- pilen opicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the forward sel model I_I model tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) num <- 1:length(out$cp) tnum <- num[out$cp <= min(out$cp)+1] Icp <- out$cp[min(tnum)] modI <- out$which[out$cp==Icp,] #do not need the constant in vin vin <- vars[modI[-1]] sub <- lsfit(x[,vin],y) ps[i]<-length(sub$coef) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] fres <- sub$resid #get asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } psmn <- mean(ps)-k #0 if subset is selecting optimal subset pimnlen <- mean(pilen) opicov <- opicov/nruns list(psmn=psmn, opicov=opicov, pimenlen = pimnlen)} mshpi<-function(yf=0, ydat, n, d, alph = 0.05){ #Gets the Olive et al (2018) modified shorth PI for Yf given xf for GLMs and 1D regression where #the B x 1 vector ydat is the training data, e.g Poisson regression has yi ind ~ Poisson(exp(ESP)) and #d = ``plug in degrees of freedom." Can work if p > n. Yf is in the PI if #inr = 1. Useful for GLM, GAM, lasso, and forward selection. ydat<-as.vector(ydat) B<-length(ydat) inr <- 0 if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + d/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph cc <- ceiling(B * (qn + 1.12*sqrt(alph/B))) cc <- min(B,cc) sy <- sort(ydat) up <- sy[cc] low <- sy[1] olen <- up - low if(cc < B){ for(j in (cc + 1):B){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen up <- sy[j] low <- sy[j - cc + 1] } } } if(low <= yf && up >= yf) inr <- inr + 1 list(low=low, up=up, inr = inr)} mspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, J = 5, alpha = 0.05){ #Needs library(leaps), library(pls), and library(glmnet). #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Uses the Pelawa Watagoda and Olive (2017) PI. SLOW. #Simulates PIs for forward selection variable selection using EBIC if n<10p # and min Cp if n >=10p. PIs for lasso, ridge regression, lasso var sel, PLS, #and PCR use 10-fold cross validation. 
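#A hedged usage sketch (assumption: hdpack plus library(leaps), library(pls), and library(glmnet) are loaded; SLOW;
#the EBIC/Cp details used by forward selection are in the notes below):
# out <- mspisim(n=100, p=10, k=2, nruns=100, type=1, psi=0.0, J=5, alpha=0.05)
# out$fselpicov; out$laspicov; out$plspicov; out$pcrpicov  #PI coverages, want near 1 - alpha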
# ebic - 2 p log(2) = out$bic+2*log(dbinom(x=xx,size=p,prob=0.5)) #ebic <- out$bic+2*log(2^p*dbinom(x=xx,size=p,prob=0.5)) #ebic <- out$bic+2*(lgamma(p+1)-lgamma(xx+1)-lgamma(p-xx+1)) #Formula uses EBIC(I) - 2 lgamma(p+1). #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 fselpilen <- 1:nruns fselpicov <- 0 laspilen <- 1:nruns laspicov <- 0 rrpilen <- 1:nruns rrpicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 plspilen <- 1:nruns plspicov <- 0 pcrpilen <- 1:nruns pcrpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use for forward selection zz<-1:nc vmax <- min(p,as.integer(n/5))#max no. of variables to use for lasso dd <- 1:nruns dRL <- dd # lasso variable selection and lasso have the same d dpls <- dd dpcr<-dd for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get forward selection PI temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) if(n < 10*p) { xx <- 1:min(length(out$bic),p-1)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) #print(ebic) minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid dd[i]<-length(sub$coef)#want these to be near but >= k+1 pp <- dd[i] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } #if n >= 10p use min Cp model else { mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) dd[i]<-length(sub$coef)#want these to be near but >= k+1 pp<-dd[i] yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } #get lasso PI out<-cv.glmnet(x,y) lam <- out$lambda.min #lamvec[i] <- lam fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 tem <- 
dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection PI sub <- lsfit(x[,vin],y) dRL[i]<- pp #want these to be near but >= k+1 yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr #get ridge regression PI out<-cv.glmnet(x,y,alpha=0) lam <- out$lambda.min fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit #very crude df is using lasso df tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) rrpilen[i] <- tem$up - tem$low rrpicov <- rrpicov + tem$inr #get 10 fold CV PLS PI z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-plsr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] npls <-max(which.min(cvmse)-1,1) #bug if npls=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=npls)[1,1,1] pp <- npls+1 dpls[i] <- pp #get the number of components to use #if npls=p-1=q, same as OLS res <- out$residuals[,,npls] #get asymptotically optimal PI tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) plspilen[i] <- tem$up - tem$low plspicov <- plspicov + tem$inr #get 10 fold CV PCR PI out<-pcr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and npcr tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] npcr <-max(which.min(cvmse)-1,1) #bug if npcr=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=npcr)[1,1,1] pp <- npcr+1 dpcr[i] <- pp #get the number of components to use #if npcr=p-1=q, same as OLS res <- out$residuals[,,nc] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) pcrpilen[i] <- tem$up - tem$low pcrpicov <- pcrpicov + tem$inr } fselpimnlen <- mean(fselpilen) fselpicov <- fselpicov/nruns laspimnlen <- mean(laspilen) laspicov <- laspicov/nruns RLpimnlen <- mean(RLpilen) RLpicov <- RLpicov/nruns rrpimnlen <- mean(rrpilen) rrpicov <- rrpicov/nruns plspimnlen <- mean(plspilen) plspicov <- plspicov/nruns pcrpimnlen <- mean(pcrpilen) pcrpicov <- pcrpicov/nruns #lasso variable selection and lasso have the same d list(dfsel=dd,dRL=dRL,dpls=dpls,dpcr=dpcr,fselpicov=fselpicov, fselpimenlen=fselpimnlen,laspicov=laspicov, laspimenlen=laspimnlen, RLpicov=RLpicov, RLpimenlen=RLpimnlen, rrpicov=rrpicov, rrpimenlen=rrpimnlen, plspicov=plspicov, plspimenlen=plspimnlen, pcrpicov=pcrpicov, pcrpimenlen=pcrpimnlen)} nbinroplssim<-function(n=100,p=4,k=1,nruns=100,psi=0.0,int=1,a=1,cfac="T",kappa=1, indices=c(1,2), alph=0.05){ ###Simulates negative binomial regression (NBR full model). #as kappa goes to infinity, the model goes to a Poisson regression model # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. 
# cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. #Then does the Z = log(Y) (or log(0.5) if Y = 0) transformation. #Does inference on beta_Z = c_Z beta = c_Z beta_NBR provided psi=0 and k < p-1. ##Use indices = c(p-2,p-1) in the simulation, so Ho is true. ##Hence for p=100: indices = c(98,99), for p=4: indices = c(2,3). #set.seed(974) ##need p>2 and want n >= 5p q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) prmat <- matrix(0,nrow=nruns,ncol=p) zmat <- prmat omat <- prmat oplscov <- 0 * 1:q oplslen <- oplscov testcov <- 0 r <- length(indices) eta0 <- 0*1:r val <- 1-alph cut <- qchisq(val,df=r) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rnbinom(n,mu=exp(SP),size=kappa) tdata <- as.data.frame(cbind(x,y)) #make a negative binomial regression data set z<-y z[y==0]<-0.5 z <- log(z) if(n > 5*p){ out <- glm(y~., family=poisson, data=tdata) prmat[i,] <-out$coef #beta_PR is supposed to be a consistent est of beta outo <- lm(y~., data=tdata) omat[i,] <-coef(outo) zdata <- as.data.frame(cbind(x,z)) outz <- lm(z~., data=zdata) zmat[i,] <-coef(outz) } if(k < p-1){ out <- covxycis(x,z,alph,cfac=cfac) oplslen <- oplslen + (out$cis[,2] - out$cis[,1]) for(j in 1:q){ if(out$cis[j,1] <= 0 && 0 <= out$cis[j,2]) oplscov[j] <- oplscov[j] + 1 } } if(r > 1 && n > 5*r) { xI <- x[,indices] covxIz <- cov(xI,z) diff <- covxIz - eta0 w<-xI*c(z-mean(z)) dsq <- n*t(diff)%*%solve(cov(w))%*%diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < cut) testcov <- testcov +1 } } prbhat <- apply(prmat,2,mean) olsbhat <- apply(omat,2,mean) zbhat <- apply(zmat,2,mean) oplslen <- oplslen/nruns oplscov <- oplscov/nruns minlen <- min(oplslen) meanlen <- mean(oplslen) maxlen <- max(oplslen) lens <- c(minlen,meanlen,maxlen) mincov <- min(oplscov) cov90 <- mean(oplscov>=0.9) cov92 <- mean(oplscov>=0.92) cov93 <- mean(oplscov>=0.93) cov94 <- mean(oplscov>=0.94) cov96 <- mean(oplscov>=0.96) covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96) testcov <- testcov/nruns #MLRplot(x,z) list(prbhat=prbhat,olsbhat=olsbhat,zbhat=zbhat,beta=beta,k=k, oplslen=oplslen,oplscov=oplscov,lens=lens,covprop=covprop,testcov=testcov) } norms <- function(x){ #gets the Euclidean norm of each row vector of x x <- as.matrix(x) n <- dim(x)[1] enorms <- 1:n for(i in 1:n) enorms[i] <- norm(x[i,],"2" ) list(enorms=enorms) } nparPLSboot<-function(x,y,B = 10){ ##Needs library(pls), often want B >= 50p, n > 5p, p > 2 takes a few seconds #bootstraps the PLS model with the nonparametric bootstrap x <- as.matrix(x) n <- length(y) p <- dim(x)[2] + 1 z <- as.data.frame(cbind(y,x)) out <- plsr(y~.,data=z,method="simpls",validation="CV") bhat<- as.vector(coef(out,intercept=TRUE)) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ samp <- sample(1:n, replace=TRUE) yb <- y[samp] xb <- x[samp,] z <- as.data.frame(cbind(yb,xb)) temp<-plsr(yb~.,data=z,method="simpls",validation="CV") betas[i,] <- as.vector(coef(temp,intercept=TRUE)) } list(bhat=bhat,betas=betas) } olsinf<-function(x, y, indices = c(1,2), alph=0.05){ # Does OLS multiple linear regression inference for partial F test. # cis for slopes. OLS needs iid errors and want n >= 5p. 
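#A hedged usage sketch for olsinf (the role of indices is described in the next comment;
#x and y below are illustrative simulated data):
# x <- matrix(rnorm(100*4), ncol=4); y <- 1 + x[,1] + rnorm(100)
# out <- olsinf(x, y, indices=c(1,2), alph=0.05)
# out$olscis   #CIs for the intercept and the slopes
# out$pvalred  #p-value of the partial F test for the reduced model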
# The indices are the variables in the reduced model for the partial F test. x <- as.matrix(x) y <- as.vector(y) indices <- as.vector(indices) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 #number of predictors including intercept r <- length(indices) + 1 df <- p-r #assume reduced model contains an intercept dendf <- n-p olscis <- matrix(0,nrow=p,ncol=2) tcut <- qt((1-alph/2),(n-p)) out <- lsfit(x,y) tem<-ls.diag(out) #get CIs for beta_i val <- tcut*tem$std.err coef<-out$coef olscis[,1] <- coef - val olscis[,2] <- coef + val # partial F test outr <- lsfit(x[,indices],y) mse <- tem$std.dev^2 resr <- outr$res Fr <- (resr%*%resr - mse*dendf)/(mse*df) if(df == 0) pvalred <- 1 else pvalred <- 1 - pf(Fr,df,dendf) list(coef=coef,olscis=olscis,Fr=Fr,pvalred=pvalred) } olssim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, alpha = 0.05){ #calls olsinf #need k < p #k+1 is the number of beta_i not equal to 0 #tests whether the reduced model using first k+1 predictors, including constant #is good. Ho: reduced model is good is true, so want cov approx 1-alpha #Uses five iid error distributions: # type = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
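#A minimal usage sketch (assumption: hdpack is sourced so olssim can call olsinf above):
# out <- olssim(n=100, p=4, k=1, nruns=100, type=1, psi=0.0, alpha=0.05)
# out$olscov  #CI coverage for each coefficient, want near 1 - alpha
# out$redcov  #proportion of runs where the true reduced model is not rejected, want near 1 - alpha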
q <- p-1 b <- 0 * 1:q if(k > 0) b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 olscov <- 0 * 1:p olslen <- olscov redcov <- 0 ind <- 1:k for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) y <- 1 + x %*% b + rnorm(n) if(type == 2) y <- 1 + x %*% b + rt(n, df = 3) if(type == 3) y <- 1 + x %*% b + rexp(n) - 1 if(type == 4) y <- 1 + x %*% b + runif(n, min = -1, max = 1) if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out<-olsinf(x,y,indices=ind,alph=alpha) olslen <- olslen + (out$olscis[,2] - out$olscis[,1]) for(j in 1:p){ if(out$olscis[j,1] <= beta[j] && beta[j] <= out$olscis[j,2]) olscov[j] <- olscov[j] + 1 } if(out$pvalred > alpha) #then fail to reject Ho where Ho: reduced model is good redcov <- redcov+1 } olslen <- sqrt(n)*olslen/nruns olscov <- olscov/nruns redcov <- redcov/nruns list(olslen=olslen,olscov=olscov,redcov=redcov) } oplscis<-function(x,y,alph=0.05,cfac="T"){ ##gets CIs for beta_i where beta = beta_OPLS, cases are entered as rows, #if cfac = "T", a correction factor for small n is used, best if n >= 50 #may work poorly in high dimensions because lamhat is then a poor estimate of lambda x <- as.matrix(x) y <- as.vector(y) n <- length(y) p <- dim(x)[2] z <- x*(y-mean(y)) sds <- apply(z,2,sd) cut <- qt((1-alph/2),(n-1)) # up = 1 - alph/2 if(cfac=="T"){ ao2 <- alph/2 up <- min((1 - ao2/2), (1 - ao2 + 12.5*alph/n)) if(ao2 > 0.1) up <- min((1 - ao2 + 0.05), (1 - ao2 + 2.5/n)) qn <- up if(qn < 1 - ao2 + 0.001) up <- 1 - ao2 cut <- qt(up,(n-1)) } # Covxy <- cov(x,y) #lamhat <- t(Covxy)%*%Covxy/(t(Covxy)%*%cov(x)%*%Covxy) # expensive way etaesp <- x%*%Covxy #W's lamhat <- lsfit(etaesp,y)$coef[2] cis <- matrix(0,nrow=p,ncol=2) val <- cut*sds/sqrt(n) cis[,1] <- lamhat*(Covxy - val) cis[,2] <- lamhat*(Covxy + val) list(Covxy=Covxy,cis=cis,up=up,lamhat=lamhat) } OPLSEEplot<-function(x, Y){ # plots OPLSESP versus OLSESP # one component partial least squares. # Identify points with the right mouse button. Right click stop. #needs n > p to compute OLS x <- as.matrix(x) etahat <- cov(x,Y) etaesp <- x%*%etahat opls <- lsfit(etaesp,Y) FITopls <- Y-opls$res out <- lsfit(x, Y) FITols <- Y - out$res plot(FITopls,FITols) abline(0,1) identify(FITopls, FITols) } OPLSplot<-function(x, Y){ # Response plot and residual plot based on OPLS # one component partial least squares. # Advance the view with the right mouse button. Right click stop. x <- as.matrix(x) etahat <- cov(x,Y) etaesp <- x%*%etahat opls <- lsfit(etaesp,Y) FIT <- Y-opls$res cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) plot(FIT, Y) abline(0, 1) title("Response Plot") identify(FIT, Y) RES <- Y - FIT plot(FIT, RES) title("Residual Plot") identify(FIT, RES) par(mfrow = c(1, 1)) par(mar=cmar) } oplssim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, cfac="T", indices=c(1,2), alph = 0.05){ #Generates MLR data Y = alpha + x^T beta + e so beta = beta_OLS #Hence cov(x,Y) = eta = eta_OPLS = cov(x) beta #tests whether eta_i = 0 and whether cov(x_I,y) = 0 where I is given by #indices provides I has r of the x_i and n > 5 r with r > 1. #k+1 is the number of beta_i not equal to 0 #Uses five iid error distributions: # type = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). 
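#A minimal usage sketch (assumption: hdpack is sourced so oplssim can call covxycis; remaining header notes follow below):
# out <- oplssim(n=100, p=4, k=1, nruns=100, type=1, psi=0.0, indices=c(1,2), alph=0.05)
# out$oplscov; out$testcov  #CI and test coverages for eta = cov(x,Y), want near 1 - alph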
# constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, if k < p-1, zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. Calls covxycis. q <- p-1 b <- 0 * 1:q if(k > 0) b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 covx <- A %*% t(A) etaopls <- covx %*% b r <- length(indices) etaI <- etaopls[indices] oplscov <- 0 * 1:q testcov <- 0 val <- 1-alph cut <- qchisq(val,df=r) oplslen <- oplscov for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) y <- 1 + x %*% b + rnorm(n) if(type == 2) y <- 1 + x %*% b + rt(n, df = 3) if(type == 3) y <- 1 + x %*% b + rexp(n) - 1 if(type == 4) y <- 1 + x %*% b + runif(n, min = -1, max = 1) if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <- covxycis(x,y,alph,cfac=cfac) oplslen <- oplslen + (out$cis[,2] - out$cis[,1]) for(j in 1:q){ if(out$cis[j,1] <= etaopls[j] && etaopls[j] <= out$cis[j,2]) oplscov[j] <- oplscov[j] + 1 } if(r > 1 && n > 5*r) { xI <- x[,indices] covxIy <- cov(xI,y) diff <- covxIy - etaI z<-xI*c(y-mean(y)) dsq <- n*t(diff)%*%solve(cov(z))%*%diff dsq<-dsq[1,1] #make dsq a real instead of a matrix ### print(dsq) if(dsq < cut) testcov <- testcov +1 } } oplslen <- oplslen/nruns oplscov <- oplscov/nruns covxy <- t(out$Covxy) up=out$up minlen <- min(oplslen) meanlen <- mean(oplslen) maxlen <- max(oplslen) lens <- c(minlen,meanlen,maxlen) mincov <- min(oplscov) cov90 <- mean(oplscov>=0.9) cov92 <- mean(oplscov>=0.92) cov93 <- mean(oplscov>=0.93) cov94 <- mean(oplscov>=0.94) cov96 <- mean(oplscov>=0.96) covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96) testcov <- testcov/nruns if(p > 400) list(oplslen=oplslen,oplscov=oplscov,lens=lens,covprop=covprop,testcov=testcov,up=up) else list(covxy=covxy,etaopls=t(etaopls),oplslen=oplslen,oplscov=oplscov, lens=lens,covprop=covprop,testcov=testcov,up=up) } oplssim2<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, etype = 1, wtype=1, psi = 0.0, cfac="T", indices=c(1,2), alph = 0.05){ #Gets CIs for beta_i for beta_OPLS, oplssim gets CIs for cov(x_i,Y) = eta_i ##likely poor in high dimensions if lambdahat is a poor estimate of lambda #Generates MLR data Y = alpha + x^T beta + e where V(Y_i) = sigma_i^2 so beta = beta_OLS. #Hence cov(x,Y) = eta = eta_OPLS = cov(x) beta #tests whether lambda eta_i = 0 and whether lambda cov(x_I,y) = 0 where I is given by #indices provides I has r of the x_i and n > 5 r with r > 1. #k+1 is the number of beta_i not equal to 0 #Uses five iid error distributions: # etype = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, if k < p-1, zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes #wtype = 1 for OLS, 2 if err = abs(SP - 5) * e, 3 if sqrt(1 + 0.5* x[,1]^2)*e #4 for exp[1 + log(|x_2|) + ... + log(|x_p|] * e, #5 for [1 + log(|x_2|) + ... 
+ log(|x_p|] * e # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. # Calls oplscis. q <- p-1 b <- 0 * 1:q dd <- b + 1 dq <- dd/q if(k > 0) b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 covx <- A %*% t(A) etaopls <- covx %*% b #pop cov(x,y) lambda <- t(etaopls)%*%etaopls/(t(etaopls)%*%covx%*%etaopls) #storage problems lambda<-lambda[1,1] #make lambda a real instead of a matrix betaopls <- lambda*etaopls oplscov <- 0 * 1:q oplslen <- oplscov r <- length(indices) etaI <- etaopls[indices] testcov <- 0 val <- 1-alph cut <- qchisq(val,df=r) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A SP <- 1 + x %*% b if(etype == 1) err <- rnorm(n) if(etype == 2) err <- rt(n, df = 3) if(etype == 3) err <- rexp(n) - 1 if(etype == 4) err <- runif(n, min = -1, max = 1) if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) *shift) if(wtype == 2) err <- abs(SP - 5) * err if(wtype == 3) err <- sqrt(1 + x[,1]^2)*err if(wtype == 4) err <- exp(1 + log(abs(x))%*%dd) * err if(wtype == 5) err <- (1 + log(abs(x))%*%dd) * err if(wtype == 6) #geometric mean of the |x_i| err <- exp(log(abs(x))%*%dq) * err if(wtype == 7) #mean of the log(|x_i|) err <- (log(abs(x))%*%dq) * err #make an MLR data set, usually with heterogeneity y <- SP + err out <- oplscis(x,y,alph,cfac=cfac) oplslen <- oplslen + (out$cis[,2] - out$cis[,1]) for(j in 1:q){ if(out$cis[j,1] <= betaopls[j] && betaopls[j] <= out$cis[j,2]) oplscov[j] <- oplscov[j] + 1 } if(r > 1 && n > 5*r) { lamhat<-out$lamhat xI <- x[,indices] covxIy <- cov(xI,y) diff <- covxIy #OPLS test wants pop cov = 0, so do not use covxIy - eta_I z<-xI*c(y-mean(y)) dsq <- n*t(diff)%*%solve(cov(z))%*%diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < cut) testcov <- testcov +1 } } oplslen <- oplslen/nruns oplscov <- oplscov/nruns covxy <- t(out$Covxy) lamhat <-out$lamhat up=out$up minlen <- min(oplslen) meanlen <- mean(oplslen) maxlen <- max(oplslen) lens <- c(minlen,meanlen,maxlen) mincov <- min(oplscov) cov90 <- mean(oplscov>=0.9) cov92 <- mean(oplscov>=0.92) cov93 <- mean(oplscov>=0.93) cov94 <- mean(oplscov>=0.94) cov96 <- mean(oplscov>=0.96) covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96) testcov <- testcov/nruns if(p > 400) list(oplslen=oplslen,oplscov=oplscov,lens=lens,covprop=covprop,testcov=testcov,up=up, lambda=lambda, lamhat=lamhat) else list(covxy=covxy,etaopls=t(etaopls),oplslen=oplslen,oplscov=oplscov, lens=lens,covprop=covprop,testcov=testcov,up=up,lambda=lambda,lamhat=lamhat) } oplswls<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, etype = 1, wtype=1, psi = 0.0, alpha = 0.05){ #right click Stop twice #generates weighted least squares (WLS) data sets #and fits OLS and OPLS to the data sets, making the response plots # Use 0 <= psi < 1. Want n/nslices > 13. #Uses five iid error distributions: # etype = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. #wtype = 1 for OLS, 2 if err = abs(SP - 5) * e, 3 if sqrt(1 + 0.5* x[,1]^2)*e #4 for exp[1 + log(|x_2|) + ... + log(|x_p|] * e, #5 for [1 + log(|x_2|) + ... 
+ log(|x_p|] * e # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-2, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(p-3)psi^2]/[1 + (p-2)psi^2], i not = j, p > 2 # when the correlation exists. q <- p-1 b <- 0 * 1:q dd <- b + 1 dq <- dd/q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A SP <- 1 + x %*% b if(etype == 1) err <- rnorm(n) if(etype == 2) err <- rt(n, df = 3) if(etype == 3) err <- rexp(n) - 1 if(etype == 4) err <- runif(n, min = -1, max = 1) if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) *shift) if(wtype == 2) err <- abs(SP - 5) * err if(wtype == 3) err <- sqrt(1 + x[,1]^2)*err if(wtype == 4) err <- exp(1 + log(abs(x))%*%dd) * err if(wtype == 5) err <- (1 + log(abs(x))%*%dd) * err if(wtype == 6) #geometric mean of the |x_i| err <- exp(log(abs(x))%*%dq) * err if(wtype == 7) #mean of the log(|x_i|) err <- (log(abs(x))%*%dq) * err #make an MLR data set y <- SP + err etahat <- cov(x,y) etaesp <- x%*%etahat opls <- lsfit(etaesp,y) OPLSFIT <- y-opls$res plot(OPLSFIT, y) abline(0, 1) title("OPLS Response Plot") identify(OPLSFIT,y)#right click Stop if(n > (p+5)){ out <- lsfit(x, y) FIT <- y-out$res plot(FIT, y) abline(0, 1) title("OLS Response Plot") identify(FIT,y) #right click Stop plot(OPLSFIT,FIT) abline(0,1) } } oplswsim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, etype = 1, wtype=1, psi = 0.0, cfac="T", indices=c(1,2), alph = 0.05){ #Generates MLR data Y = alpha + x^T beta + e where V(Y_i) = sigma_i^2 so beta = beta_OLS. #Hence cov(x,Y) = eta = eta_OPLS = cov(x) beta #tests whether eta_i = 0 and whether cov(x_I,y) = 0 where I is given by #indices provides I has r of the x_i and n > 5 r with r > 1. #k+1 is the number of beta_i not equal to 0 #Cov(x) is a p by p matrix, so storage problems if p > 10000. #If cfac="T", a small sample correction factor is used that is best for n > 50. #Uses five iid error distributions: # type = 1 for N(0,1), 2 for t3, 3 for exp(1) - 1 # 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2). # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, if k < p-1, zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes #wtype = 1 for OLS, 2 if err = abs(SP - 5) * e, 3 if sqrt(1 + 0.5* x[,1]^2)*e #4 for exp[1 + log(|x_2|) + ... + log(|x_p|] * e, #5 for [1 + log(|x_2|) + ... + log(|x_p|] * e # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. Calls covxycis. 
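#A hedged usage sketch (assumption: hdpack is sourced; wtype = 3 gives heteroskedastic errors):
# out <- oplswsim(n=200, p=4, k=1, nruns=100, etype=1, wtype=3, psi=0.0, indices=c(1,2), alph=0.05)
# out$oplscov; out$testcov  #CI and test coverages for eta = cov(x,Y), want near 1 - alph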
q <- p-1 b <- 0 * 1:q dd <- b + 1 dq <- dd/q if(k > 0) b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 covx <- A %*% t(A) etaopls <- covx %*% b oplscov <- 0 * 1:q oplslen <- oplscov testcov <- 0 r <- length(indices) etaI <- etaopls[indices] val <- 1-alph cut <- qchisq(val,df=r) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A SP <- 1 + x %*% b if(etype == 1) err <- rnorm(n) if(etype == 2) err <- rt(n, df = 3) if(etype == 3) err <- rexp(n) - 1 if(etype == 4) err <- runif(n, min = -1, max = 1) if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) *shift) if(wtype == 2) err <- abs(SP - 5) * err if(wtype == 3) err <- sqrt(1 + x[,1]^2)*err if(wtype == 4) err <- exp(1 + log(abs(x))%*%dd) * err if(wtype == 5) err <- (1 + log(abs(x))%*%dd) * err if(wtype == 6) #geometric mean of the |x_i| err <- exp(log(abs(x))%*%dq) * err if(wtype == 7) #mean of the log(|x_i|) err <- (log(abs(x))%*%dq) * err #make an MLR data set, usually with heterogeneity y <- SP + err out <- covxycis(x,y,alph,cfac=cfac) oplslen <- oplslen + (out$cis[,2] - out$cis[,1]) for(j in 1:q){ if(out$cis[j,1] <= etaopls[j] && etaopls[j] <= out$cis[j,2]) oplscov[j] <- oplscov[j] + 1 } if(r > 1 && n > 5*r) { xI <- x[,indices] covxIy <- cov(xI,y) diff <- covxIy - etaI z<-xI*c(y-mean(y)) dsq <- n*t(diff)%*%solve(cov(z))%*%diff dsq<-dsq[1,1] #make dsq a real instead of a matrix if(dsq < cut) testcov <- testcov +1 } } oplslen <- oplslen/nruns oplscov <- oplscov/nruns covxy <- t(out$Covxy) up=out$up minlen <- min(oplslen) meanlen <- mean(oplslen) maxlen <- max(oplslen) lens <- c(minlen,meanlen,maxlen) mincov <- min(oplscov) cov90 <- mean(oplscov>=0.9) cov92 <- mean(oplscov>=0.92) cov93 <- mean(oplscov>=0.93) cov94 <- mean(oplscov>=0.94) cov96 <- mean(oplscov>=0.96) covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96) testcov <- testcov/nruns if(p > 400) list(oplslen=oplslen,oplscov=oplscov,lens=lens,covprop=covprop,testcov=testcov,up=up) else list(covxy=covxy,etaopls=t(etaopls),oplslen=oplslen,oplscov=oplscov, lens=lens,covprop=covprop,testcov=testcov,up=up) } pcaboot<-function(x, corr=T, rob=F, B = 1000){ #Bootstraps PCA. Likely only accurate for positive eigenvalues. #If rob = T, bootstrap the robust PCA. #Nominal 95% CIs x <- as.matrix(x) n <- dim(x)[1] p <- dim(x)[2] lsciL <- 1:p lsciU <- 1:p indx <- 1:n lams <- matrix(0,nrow=B,ncol=p) if(rob==F){ for(i in 1:B){ tem <- sample(indx,n,replace=T) lams[i,] <- prcomp(x[tem,], scale = corr)$sd^2 } } else{ for(i in 1:B){ tem <- sample(indx,n,replace=T) lams[i,] <- rprcomp(x[tem,], corr = corr)$out$sd^2 } } shorci <- apply(lams,2,shorth2) covhat <- var(lams) for(i in 1:p){ lsciL[i] <- mean(lams[,i]) - 2*sqrt(covhat[i,i]) lsciU[i] <- mean(lams[,i]) + 2*sqrt(covhat[i,i]) } lscis <- cbind(lsciL,lsciU) list(lams=lams,covhat=covhat,shorci=shorci,lscis=lscis) } pcabootsim<-function(n = 100, p = 4, nruns = 1, xtype = 1, dd = 1, eps = 0.25, corr=F,rob=F,B=1000){ # Generates a simulated data set for the PCA bootstrap CIs. # Need n > 2q. # MAY NOT WORK IF p = 1 # If corr = F, then eigenvalues = c(1,2,...,p). # If corr = T, then eigenvalues = c(1,1,...,1). # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, 3, 4 and 5 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # 10 for lognormal. # Can't get accurate counts if xtype = 9. 
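#A hedged usage sketch (assumption: hdpack is sourced, so pcaboot and the shorth2 helper are available):
# out <- pcabootsim(n=200, p=4, nruns=100, xtype=1, corr=F, rob=F, B=1000)
# out$lscicv  #coverage of the nominal 95% (mean +/- 2 SE) CIs for the eigenvalues
# out$shcicv  #coverage of the shorth CIs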
set.seed(974) A <- sqrt(diag(1:p)) lscict <- 0*(1:p) shcict <- lscict sigsq <- 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), nrow = n, ncol = p) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 sigsq <- 10.6 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 sigsq <- 15.4 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 sigsq <- 3.4 } if(xtype == 5) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 sigsq <- 1 +eps*24 } if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu sigsq <- 3 } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu sigsq <- 5/3 } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu sigsq <- 19/17 } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu #sigsq <- dd/(dd-2) } if(xtype == 10){ x <- exp(x) sigsq <- exp(1)*(exp(1)-1) } x <- x %*% A out <- pcaboot(x,corr=corr,rob=rob,B=B) lam <- sigsq*(1:p) for(j in 1:p){ if(out$lscis[j,1] <= lam[p+1-j] & lam[p+1-j] <= out$lscis[j,2]) lscict[j] <- lscict[j] + 1 if(out$shorci[[j]]$shorth[1] <= lam[p+1-j] & lam[p+1-j] <= out$shorci[[j]]$shorth[2]) shcict[j] <- shcict[j]+1 } } covh <- cov(x) lscict <- lscict/nruns shcict <- shcict/nruns list(out=out,covh=covh,lscicv=lscict,shcicv=shcict) } pcrpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, alpha = 0.05){ #Needs library(pls). Uses 10 fold CV and the Olive (2013) PI. #Simulates PIs for principle components regression with 10 fold CV. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
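#A minimal usage sketch (assumption: hdpack and library(pls) are loaded):
# out <- pcrpisim(n=100, p=4, k=1, nruns=100, type=1, psi=0.0, alpha=0.05)
# out$opicov  #PI coverage, want near 1 - alpha = 0.95
# out[[1]]    #q minus the mean number of components used; 0 means PCR acted like OLS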
set.seed(974) ##need p>2 corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PCR 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-pcr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <- max(which.min(cvmse)-1,1) #bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PCR is equivalent to OLS. list(qminncmn, opicov=opicov, pimenlen = pimnlen)} pcrpisim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, alph = 0.05){ #Needs library(pls). Uses 10 fold CV and the Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for principle components regression with 10 fold CV. # Uses PI with crude df estimate d = number of components + 1, p > n possible. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
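#A minimal usage sketch (assumption: hdpack and library(pls) are loaded; this version uses the crude d = ncomp + 1 PI):
# out <- pcrpisim2(n=100, p=4, k=1, nruns=100, type=1, psi=0.0, alph=0.05)
# out$opicov  #PI coverage, want near 1 - alph = 0.95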
set.seed(974) ##need p>2 val <- 8*n/9 ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PCR 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-pcr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <-max(which.min(cvmse)-1,1) #bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI #get PI pp <- nc+1 if(pp < val) corfac <- (1 + 15/n) * sqrt( (n+2*pp)/(n - pp) ) else corfac <- 5*(1+15/n) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + pp/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*pp/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PCR is equivalent to OLS. list(qminncmn, opicov=opicov, pimenlen = pimnlen)} PHboot<-function(x,time,status,B = 1000){ #needs library(survival), n > 5p, p > 2, #want B >= 50p, takes a few seconds #bootstraps the Cox PH regression full model with the nonparametric bootstrap x <- as.matrix(x) n <- length(time) p <- dim(x)[2] tdata <- as.data.frame(cbind(x,time,status)) out <- coxph(Surv(time, status) ~., data=tdata) bhat<-out$coef betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ samp <- sample(1:n, replace=TRUE) tdat <- tdata[samp,] temp <- coxph(Surv(time, status) ~., data=tdat) betas[i,] <- temp$coef } list(bhat=bhat,betas=betas) } PHbootsim<-function(n=100,p=4,k=1,nruns=100,psi=0.0,B=1000,a=1,gam=1, clam=0.1,alpha=0.05){ #needs library(survival), n > 5p, p > 2, want B >= 50p, #bootstraps the Cox regression full model, takes a long time #Use 1 <= k < p, so zeroes are in the model, k is the number of nonnoise variables. 
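#Illustrative call for PHboot above (values are arbitrary): with
#library(survival) loaded, PH data can be simulated with phdata2 from this file,
#  dat <- phdata2(n=100,p=4,k=1,gam=1,clam=0.1)
#  out <- PHboot(dat$x,dat$time,dat$status,B=200)  #want B >= 50p
#  out$bhat        #full model Cox coefficients
#  dim(out$betas)  #B x p matrix of bootstrapped coefficients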
#there are p coefficients for beta in the Weibull regression data #need K < p so zeroes are in the model #need p > 1, beta_A = -(1/gam, ..., 1/gam, 0, ..., 0)^T with p-k zeroes # beta_P = (1,...,1,0,...,0)^T with k ones and p-k zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. SP~N(0,a^2), and a near 1 is ok. rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 beta <- 0 * 1:p beta[1:k] <- 1 #beta[1:0] acts like beta[1:1] = beta[1] pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) zero <- 0 * 1:p one <- as.vector(0*1:k + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A SP <- x%*%beta #SP_i ~ N(0,a^2) lambdai <- exp(SP) w <- rexp(n, rate = lambdai) y <- w^(1/gam) cen <- rexp(n, rate = clam) time <- pmin(y, cen) status <- as.numeric(cen >= y) tdata <- as.data.frame(cbind(x,time,status)) out <- coxph(Surv(time, status) ~., data=tdata) bhat<-out$coef betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ samp <- sample(1:n, replace=TRUE) tdat <- tdata[samp,] temp <- coxph(Surv(time, status) ~., data=tdat) betas[i,] <- temp$coef } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k values of beta are 0 gg <- p - k tstat <- bhat[(k+1):p] tem <- confreg(betas[,(k+1):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k values of beta are (1,...,1) gg <- k tstat <- bhat[1:k] tem <- confreg(betas[,1:k],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k) } phdata2<-function(n=100,p=4,k=1,psi=0.0,a=1,gam=1,clam=0.1){ #Use 1 <= k <= p, where k is the number of nonnoise variables. #there are p coefficients for beta in the Weibull regression data #need p > 1, beta_A = -(1/gam, ..., 1/gam, 0, ..., 0)^T with p-k zeroes # beta_P = (1,...,1,0,...,0)^T with k ones and p-k zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. 
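#Illustrative call for PHbootsim above (small nruns and B only to keep the
#runtime down; the function is slow): with library(survival) loaded,
#  out <- PHbootsim(n=100,p=4,k=1,nruns=10,B=200,gam=1,clam=0.1,alpha=0.05)
#  out$cicov   #shorth CI coverages for beta_1,...,beta_p and the 6 test coverages
#  out$avelen  #corresponding average CI/cutoff lengths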
SP~N(0,a^2), and a near 1 is ok. #set.seed(974) ##need p>2 rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 b <- 0 * 1:p b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A SP <- x%*%b #SP_i ~ N(0,a^2) lambdai <- exp(SP) w <- rexp(n, rate = lambdai) y <- w^(1/gam) cen <- rexp(n, rate = clam) ceny <- pmin(y, cen) status <- as.numeric(cen >= y) tdata <- as.data.frame(cbind(x,y,status)) #out <- coxph(Surv(y, status) ~ ., data=tdata) #ESP <- x %*% out$coef list(betaP=b,x = x, time = ceny, status = status) } PHsplitsim<-function(n = 100, p = 4, k=1, n1=30, J=5, nruns = 100, psi = 0.0, a=1, gam= 1, B=1000, clam=0.1, alpha = 0.05){ #needs library(glmnet), library(survival), n1>=30 gets rid of the warnings #J is an integer between 0 and 5 #noundfit is the number of times in nruns that lasso did not underfit #nd is the size of the training data set selected by sequential data splitting #ad is the number of predictors, including a constant, selected by lasso for #the selected training data set: want ad not much larger than k #n1 is the initial size of the training data set, want n1 <= n/2 #Simulates sequential data splitting for Cox proportional hazards regression #using lasso #with 10-fold cross validation. #Use 1 <= k < p so zeroes are in the model, k= the number of nonnoise variables #there are p coefficients for beta in the Weibull regression data #need p > 1, beta_A = -(1/gam, ..., 1/gam, 0, ..., 0)^T with p-k zeroes # beta_P = (1,...,1,0,...,0)^T with k ones and p-k zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. SP~N(0,a^2), and a near 1 is ok. 
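#Illustrative use of phdata2 above (values are arbitrary): generates a
#censored Weibull proportional hazards data set,
#  dat <- phdata2(n=200,p=4,k=2,psi=0.5,gam=1,clam=0.1)
#  sum(dat$status)/length(dat$time)  #proportion of uncensored cases
#  dat$betaP                         #the PH coefficient vector used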
rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 beta <- 0 * 1:p beta[1:k] <- 1 #beta[1:0] acts like beta[1:1] = beta[1] vars <- 1:p zero <- 0 * 1:p one <- as.vector(0*1:k + 1) RLpilen <- 1:nruns RLpicov <- 0 splitpilen <- 1:nruns splitpicov <- 0 noundfit <- 0 indx <- 1:n nds<-1:nruns ads<-nds #check for bad values of n1 fhalf <- floor((n-J)/2) if(n<40) n1 <- max(1,fhalf) else if(floor(n/(2*n1))>1000) n1 <- floor(n/2000) if(n1 > fhalf) n1 <- max(1,fhalf) for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A xf <- val* rnorm(p) %*% A SP <- x%*%beta #SP_i ~ N(0,a^2) SPf <- xf%*%beta lambdai <- exp(SP) lambdaf <- exp(SPf) w <- rexp(n, rate = lambdai) y <- w^(1/gam) wf <- rexp(1, rate = lambdaf) yf <- wf^(1/gam) #uncensored cen <- rexp(n, rate = clam) timed <- pmin(y, cen) statusd <- as.numeric(cen >= y) #make a Weibull PH data set #get lasso variable selection PI, #if n >= 5*pp time <- timed status <- statusd ycen <- cbind(time,status) out<-cv.glmnet(x,ycen,family="cox") lam <- out$lambda.min lcoef <- as.vector(predict(out,type="coefficients",s=lam)) vin <- vars[lcoef!=0] pp <- length(vin) if(n >= 5*pp ){ tdata <- as.data.frame(cbind(x[,vin],time,status)) outw <- survreg(Surv(time, status) ~ ., data = tdata) int <- outw$coef[1] bhat <- outw$coef[-1] sig <- outw$scale ghat=1/sig espw <- -xf[vin]%*%bhat/sig lamxf <- exp(-int/sig)*exp(espw) sc <- 1/lamxf^(1/ghat) ydat <- rweibull(B,shape=ghat,scale=sc) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr } #use sequential data splitting perm <- sample(indx,n) H <- perm[1:n1] xH <- x[H,] yH <- y[H] time <- timed[H] status <- statusd[H] ycenH <- cbind(time,status) nd<-n1 #the names need to be "time" and "status" out<-cv.glmnet(xH,ycenH,family="cox") lam <- out$lambda.min lcoef <- as.vector(predict(out,type="coefficients",s=lam)) vin <- vars[lcoef!=0] pp <- length(vin) while(nd < fhalf && nd < 10*pp){ if((nd+n1) <= fhalf){ perm<-perm[-(1:n1)] H <- c(H,perm[1:n1]) xH <- x[H,] yH <- y[H] time <- timed[H] status <- statusd[H] ycenH <- cbind(time,status) out<-cv.glmnet(xH,ycenH,family="cox") lam <- out$lambda.min # fit <- predict(out,s=lam,newx=xH) lcoef <- as.vector(predict(out,type="coefficients",s=lam)) vin <- vars[lcoef!=0] pp <- length(vin) nd<-nd+n1 } else nd<-nd+n1 } if(nd > fhalf) nd <- nd-n1 if(nd < fhalf && nd < 10*pp){ md <- fhalf - nd nd <- nd + md perm<-perm[-(1:n1)] H <- c(H,perm[1:md]) xH <- x[H,] yH <- y[H] time <- timed[H] status <- statusd[H] ycenH <- cbind(time,status) out<-cv.glmnet(xH,ycenH,family="cox") lam <- out$lambda.min #fit <- predict(out,s=lam,newx=xH) lcoef <- as.vector(predict(out,type="coefficients",s=lam)) vin <- vars[lcoef!=0] pp <- length(vin) } nds[i] <- nd ads[i] <- pp if(length(vin) >= k){ if(vin[k]==k) noundfit <- noundfit + 1} xV <- x[-H,] #get lasso variable selection data splitting PI time <- timed[-H] status <- statusd[-H] ycen <- cbind(time,status) out<-cv.glmnet(xV,ycen,family="cox") lam <- out$lambda.min lcoef <- as.vector(predict(out,type="coefficients",s=lam)) vin <- vars[lcoef!=0] pp <- length(vin) nV <- n - nd if(nV >= 5*pp ){ tdata <- as.data.frame(cbind(xV[,vin],time,status)) outw <- survreg(Surv(time, status) ~ ., data = tdata) int <- outw$coef[1] bhat <- outw$coef[-1] sig <- outw$scale ghat=1/sig espw <- -xf[vin]%*%bhat/sig lamxf <- exp(-int/sig)*exp(espw) sc <- 1/lamxf^(1/ghat) ydat <- 
rweibull(B,shape=ghat,scale=sc)
tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha)
splitpilen[i] <- tem$up - tem$low
splitpicov <- splitpicov + tem$inr
}
}
RLpicov <- RLpicov/nruns
RLpilen <- mean(RLpilen)
splitpicov <- splitpicov/nruns
splitpilen <- mean(splitpilen)
mnnd<-mean(nds)
mnad<-mean(ads)
list(mnnd=mnnd,mnad=mnad,LVSpicov=RLpicov,LVSpilen=RLpilen,
splitpicov=splitpicov,splitpilen=splitpilen,
beta=beta,k=k,noundfit=noundfit,n1=n1)}

pifold<-function(x,y,k=5,alph = 0.05){
#Does k-fold CV using PI coverage and average length.
#A y case in the jth fold is in its PI iff the ``residual" of the case
#is in the PI for the residuals of the cases not in the jth fold.
#Use for k << n: very inefficient for leave one out CV = n-fold CV.
x<-as.matrix(x)
n<-nrow(x)
cov <- 0
len <- 0
d <- ncol(x) + 1
folds<-rand(n,k)$groups
#folds<-sample(1:k,n,replace=TRUE) #could be quicker
for(j in 1:k){
tem<-lsfit(x[folds!=j,],y[folds!=j])
res <-tem$resid
#want PI for the residuals of cases not in the jth fold
temp <- dpi(yf=0,yfhat=0,d=d,resid=res,alph=alph)
yfhat <- tem$coef[1] + as.matrix(x[folds==j,]) %*% tem$coef[-1]
fres <- y[folds==j] - yfhat
cov <- cov + sum((temp$low <= fres) * (temp$up >= fres))
len <- len + sum(folds==j)*(temp$up - temp$low)
}
cov <- cov/n
alen <- len/n
list(cov=cov,alen=alen)
}

pisimspline<-function(n = 100, nruns = 100, alpha = 0.05, eps = 0.1,
shift = 9, type = 1, modt=1){
#Compares asymptotically optimal PIs for GCV cubic spline, 1 predictor x
#applied to Y = m(x) + e. Olive (2013) PIs, best if n > 10 df.
#type = 1 for N(0,1) errors, 2 for t3 errors,
#3 for exp(1) - 1 errors
#4 for uniform(-1,1) errors,
#5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors
#modt = 1: Y = x + x^2
#modt = 2: Y = sin(x) + cos(x) + log(|x|)
#modt = 3: Y = 3 sqrt(abs(x))
p<-1
alph <- alpha
npicov <- 0
opicov <- 0
dpicov <- 0
val <- 8*n/9
val3 <- 1:nruns
val4 <- val3
df <- val3
pilen <- matrix(0, nrow = nruns, ncol = 3)
corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) )
if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} else {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)}
pn <- qn
if(pn < 1 - alpha + 0.001) qn <- 1 - alpha
alphan <- 1 - qn
for(i in 1:nruns) {
x <- rnorm(n)
xf <- rnorm(1)
if(modt==1){ymn <- x + x^2
yfmn <- xf + xf^2}
if(modt==2){ymn <- sin(x) + cos(x) + log(abs(x))
yfmn <- sin(xf) + cos(xf) + log(abs(xf))}
if(modt==3){ymn <- 3 * sqrt(abs(x))
yfmn <- 3 * sqrt(abs(xf)) }
if(type == 1) {
y <- ymn + rnorm(n)
yf <- yfmn + rnorm(1) }
if(type == 2) {
y <- ymn + rt(n, df = 3)
yf <- yfmn + rt(1, df = 3) }
if(type == 3) {
y <- ymn + rexp(n) - 1
yf <- yfmn + rexp(1) - 1 }
if(type == 4) {
y <- ymn + runif(n, min = -1, max = 1)
yf <- yfmn + runif(1, min = -1, max = 1) }
if(type == 5) {
err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
y <- ymn + err
yf <- yfmn + rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) }
out <- smooth.spline(x,y)
df[i] <- out$df
yhat <- predict(out,x)
yhat<-yhat$y
yfhat<-predict(out,xf)
yfhat<-yfhat$y
res <- y - yhat
#get semiparametric PI
val2 <- quantile(res, c(alphan/2, 1 - alphan/2))
val3[i] <- val2[1]
val4[i] <- val2[2]
up <- yfhat + corfac*val4[i]
low <- yfhat + corfac*val3[i]
pilen[i, 1] <- up - low
if(low < yf && up > yf) npicov <- npicov + 1
# asymptotically optimal PI
sres <- sort(res)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n) {
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
up <- yfhat + corfac*rup
low <- yfhat +
corfac*rlow pilen[i, 2] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 #newer PI using df pp <- df[i] if(pp < val) cfac <- (1 + 15/n) * sqrt( (n+2*pp)/(n - pp) ) else cfac <- 5*(1+15/n) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + pp/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*pp/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + cfac*rup low <- yfhat + cfac*rlow pilen[i, 3] <- up - low if(low < yf && up > yf) dpicov <- dpicov + 1 } pimnlen <- apply(pilen, 2, mean) lcut <- mean(val3) hcut <- mean(val4) npicov <- npicov/nruns opicov <- opicov/nruns dpicov <- dpicov/nruns adf <- mean(df) list(adf = adf, pimenlen = pimnlen, spicov = npicov, opicov = opicov, dpicov = dpicov, lcut = lcut, hcut = hcut) } plsbootsim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #needs library(pls), calls nparPLSboot, predreg, shorth3 #Takes about a minute per run so really slow. #Simulates nonparametric bootstrap for pls #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-nparPLSboot(x,y,B=BB) #bootstrap 10 fold CV simpls model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} plspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, alpha = 0.05){ #Needs library(pls). ##errors if p >= n/2 Uses the Olive (2013) PI. 
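#Illustrative call for pifold above (data are arbitrary): k-fold CV scored by
#PI coverage and average PI length for OLS; uses the rand and dpi functions
#from this collection,
#  x <- matrix(rnorm(100*3),ncol=3)
#  y <- 1 + x[,1] + x[,2] + rnorm(100)
#  pifold(x,y,k=5,alph=0.05)  #$cov should be near 0.95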
#Simulates PIs for partial least squares with 10 fold CV. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PLS 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-plsr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <- max(which.min(cvmse)-1,1) #bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PLS is equivalent to OLS. list(qminncmn, opicov=opicov, pimenlen = pimnlen)} plspisim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, alph = 0.05){ #Needs library(pls). ##Undercoverage if p > n. Uses Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for partial least squares with 10 fold CV lasso when p is not #necessarily small. 
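#Illustrative call for plspisim above (argument values are arbitrary; keep
#p < n/2 as noted): with library(pls) loaded,
#  out <- plspisim(n=100,p=10,k=3,nruns=20,psi=0.5,type=1,alpha=0.05)
#  out$opicov   #PI coverage, should be near 0.95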
# 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 val <- 8*n/9 ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PLS 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-plsr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <-max(which.min(cvmse)-1,1) #bug if nc=0, p large #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI #get PI pp <- nc + 1 if(pp < val) corfac <- (1 + 15/n) * sqrt( (n+2*pp)/(n - pp) ) else corfac <- 5*(1+15/n) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + pp/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*pp/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PLS is equivalent to OLS. 
list(qminncmn, opicov=opicov, pimenlen = pimnlen)} PRboot<-function(x,y,B = 1000){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #bootstraps the Poisson regression full model x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] tdata <- as.data.frame(cbind(x,y)) out <- glm(y~., family=poisson, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) betas[i,] <- temp$coef } list(bhat=bhat,betas=betas) } predreg<-function(x, alpha = 0.05){ # Makes a prediction region for the rows of x. # If p = 1, the shorth interval should work better. #Also computes the distance for the 0 vector. x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] zero <- 0*(1:p) up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) # MD is the classical distance MD <- sqrt(md2) #get nonparametric prediction region boundary cuplim <- sqrt(quantile(md2, up)) D0 <- sqrt(mahalanobis(zero, center, cov)) list(cuplim = cuplim, D0=D0, MD=MD, center=center, cov=cov) } predrgn <-function(x, xf, alpha = 0.05){ # Makes a prediction region for xf when cases are rows of x. # If p = 1, the shorth interval should work better. x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] inr <- 0 up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) # md2 is the classical squared Mahalanobis distance #get nonparametric prediction region boundary cuplim <- quantile(md2, up) #inr <- 1 if xf is in the prediction region xfd <- mahalanobis(xf, center, cov) if(xfd <= cuplim) inr <- 1 list(md2=md2, center=center,cov=cov, cuplim = cuplim, xfd, inr=inr) } predsim<-function(n = 100, p = 4, nruns = 100, xtype = 1, dd = 1, eps = 0.25, alpha = 0.1) {# MAY NOT WORK IF p = 1. # Gets coverages of nonparametric, semiparametric and # parametric MVN prediction regions. 
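#Illustrative calls for predreg and predrgn above (data are arbitrary):
#  x <- matrix(rnorm(100*3),ncol=3)
#  predreg(x,alpha=0.05)$cuplim           #cutoff for the nonparametric prediction region
#  predrgn(x,xf=rnorm(3),alpha=0.05)$inr  #1 if xf falls inside the prediction region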
# Multiply x by A where xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 (with delta = eps) for delta Np(0,I) + (1-delta) Np(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # mahalanobis gives squared Maha distances # set.seed(974) ccvr <- 0 scvr <- 0 rcvr <- 0 volc <- 1:nruns vols <- volc volr <- volc #up <- 1 - alpha up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha A <- sqrt(diag(1:p)) m <- n + 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(m * p), nrow = m, ncol = p) if(xtype == 2) { zu <- runif(m) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(m) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(m) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(m, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(m, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(m, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(m, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(m) x[zu < eps, ] <- x[zu < eps, ] * 5 } x <- x %*% A xx <- x[ - m, ] center <- apply(xx, 2, mean) cov <- var(xx) md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) if(mahalanobis(t(x[m, ]), center, cov) <= hsq) ccvr <- ccvr + 1 volc[i] <- sqrt(hsq)^p * prod(diag(chol(cov))) out <- covrmvn(xx) center <- out$center cov <- out$cov md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) dsq <- mahalanobis(t(x[m, ]), center, cov) if(dsq <= hsq) scvr <- scvr + 1 sqrtdet <- prod(diag(chol(cov))) vols[i] <- sqrt(hsq)^p * sqrtdet hsq <- qchisq(up, p) if(dsq <= hsq) rcvr <- rcvr + 1 volr[i] <- sqrt(hsq)^p * sqrtdet } ccvr <- ccvr/nruns scvr <- scvr/nruns rcvr <- rcvr/nruns #get a measure of efficiency wrt vols, so eff(vols) = 1 vols <- mean(vols) volc <- mean(volc)/vols volr <- mean(volr)/vols vols <- 1 list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln = volc, vols = vols, volm = volr, up = up) } predsim2<-function(n = 50, p = 50, nv=19, nruns = 100, xtype = 1, dtype = 1, alpha = 0.05) {# MAY NOT WORK IF p = 1. # Gets coverages of the data splitting prediction region. # Multiply x by A where xtype = 1 for MVN Nq(0,I), 2 for lognormal #dtype = 1 if (T,C) = (xbar, Ip ), dtype = 2 if (T,C) = (coord median, Ip) # mahalanobis gives squared Maha distances #want n/2 > nv #coverage is very sensitive to quantile type. Use type = 1: inverse quantile #function so percentile = Y(ceiling(nv*(1-alpha)) cvr <- 0 nv <- min(nv, floor(n/2)) nH <- n - nv uv <- min(nv, ceiling( (1 - alpha)*(1+nv) ) ) up <- uv/(nv+1) #quantile for prediction region A <- sqrt(diag(1:p)) m <- n + 1 C <- diag(p) indx <- 1:n for(i in 1:nruns) { #make data x <- matrix(rnorm(m * p), nrow = m, ncol = p) if(xtype == 2) x <- exp(x) x <- x %*% A xx <- x[ - m, ] #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- xx[H,] xV <- xx[-H,] if(dtype==1){ center <- apply(xH, 2, mean) disp = C} if(dtype==2){ center <- apply(xH, 2, median) disp = C} md2 <- mahalanobis(xV, center, cov=disp) hsq <- quantile(md2, up, type=1) if(mahalanobis(t(x[m, ]), center, cov=disp) <= hsq) cvr <- cvr + 1 } cvr <- cvr/nruns list(cvr = cvr, up=up) } pregbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, B=1000, int=1, a = 1, alpha = 0.05){ ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for Poisson regression (full model). 
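#Illustrative call for predsim2 above (values are arbitrary): coverage of the
#data splitting prediction region with (T,C) = (coordinatewise median, I_p),
#  predsim2(n=50,p=50,nv=19,nruns=100,xtype=1,dtype=2,alpha=0.05)$cvr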
# constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so there are zeroes in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. #set.seed(974) ##need p>2 and want n >= 5p q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1] <- int for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) tdata <- as.data.frame(cbind(x,y)) #make a PR data set out <- glm(y~., family=poisson, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) betas[i,] <- temp$coef } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} proplssim<-function(n=100,p=4,k=1,nruns=100,psi=0.0,int=1,a=1,cfac="T",indices=c(1,2), alph=0.05){ ###Simulates Poisson regression (full model). # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. 
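#Illustrative call for pregbootsim above (small nruns and B only for speed;
#want n >= 5p): uses shorth3 and confreg from this collection,
#  out <- pregbootsim(n=100,p=4,k=1,nruns=10,B=200,int=1,a=1,alpha=0.05)
#  out$cicov   #CI coverages for beta plus the 6 test coverages
#  out$avelen  #corresponding average lengths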
# cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j
# when the correlation exists. SP~N(int,a^2). Want
# with int + 3a <=5, int -3a >= -5.
#Then does the Z = log(Y) (or log(0.5) if Y = 0) transformation.
#Does inference on beta_Z = c_Z beta = c_Z beta_PR provided psi=0 and k < p-1.
##Use indices = c(p-2,p-1) in the simulation, so Ho is true.
##Hence for p=100: indices = c(98,99), for p=4: indices = c(2,3).
#set.seed(974) ##need p>2 and want n >= 5p
q <- p-1
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2))
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
beta<-c(int,b)
prmat <- matrix(0,nrow=nruns,ncol=p)
zmat <- prmat
omat <- prmat
oplscov <- 0 * 1:q
oplslen <- oplscov
testcov <- 0
r <- length(indices)
eta0 <- 0*1:r
cval <- 1-alph #confidence level for the chi^2 cutoff: do not overwrite val
cut <- qchisq(cval,df=r)
for(i in 1:nruns) {
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
x <- val* x %*% A
SP <- int + x%*%b #SP_i ~ N(int,a^2)
y <- rpois(n,lambda=exp(SP))
tdata <- as.data.frame(cbind(x,y)) #make a PR data set
z<-y
z[y==0]<-0.5
z <- log(z)
if(n > 5*p){
out <- glm(y~., family=poisson, data=tdata)
prmat[i,] <-out$coef
outo <- lm(y~., data=tdata)
omat[i,] <-coef(outo)
zdata <- as.data.frame(cbind(x,z))
outz <- lm(z~., data=zdata)
zmat[i,] <-coef(outz)
}
if(k < p-1){
out <- covxycis(x,z,alph,cfac=cfac)
oplslen <- oplslen + (out$cis[,2] - out$cis[,1])
for(j in 1:q){
if(out$cis[j,1] <= 0 && 0 <= out$cis[j,2])
oplscov[j] <- oplscov[j] + 1
}
}
if(r > 1 && n > 5*r) {
xI <- x[,indices]
covxIz <- cov(xI,z)
diff <- covxIz - eta0
w<-xI*c(z-mean(z))
dsq <- n*t(diff)%*%solve(cov(w))%*%diff
dsq<-dsq[1,1] #make dsq a real instead of a matrix
if(dsq < cut) testcov <- testcov +1
}
}
prbhat <- apply(prmat,2,mean)
olsbhat <- apply(omat,2,mean)
zbhat <- apply(zmat,2,mean)
oplslen <- oplslen/nruns
oplscov <- oplscov/nruns
minlen <- min(oplslen)
meanlen <- mean(oplslen)
maxlen <- max(oplslen)
lens <- c(minlen,meanlen,maxlen)
mincov <- min(oplscov)
cov90 <- mean(oplscov>=0.9)
cov92 <- mean(oplscov>=0.92)
cov93 <- mean(oplscov>=0.93)
cov94 <- mean(oplscov>=0.94)
cov96 <- mean(oplscov>=0.96)
covprop <- c(mincov,cov90,cov92,cov93,cov94,cov96)
testcov <- testcov/nruns
#MLRplot(x,z)
list(prbhat=prbhat,olsbhat=olsbhat,zbhat=zbhat,beta=beta,k=k,
oplslen=oplslen,oplscov=oplscov,lens=lens,covprop=covprop,testcov=testcov)
}

prpiplot<- function(x, y, B = 1000, alpha=0.05)
{# Makes response plot for full model Poisson regression with PIs added.
#0 = data, x = PI
tdata <- as.data.frame(cbind(x,y))
out <- glm(y~., family=poisson, data=tdata)
x<-as.matrix(x)
p<-dim(x)[2]+1
ESP <- predict(out)
Y <- as.vector(y)
n <- length(y)
low<-1:n
up<-low
cov<-0
plot(ESP, Y)
abline(mean(Y), 0)
Ehat <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx], Ehat[indx])
lines(lowess(ESP, Y), type = "s")
for(i in 1:n){
ydat <- rpois(B,lambda=exp(ESP[i]))
tem <- mshpi(yf=y[i],ydat=ydat,n=n,d=p,alph=alpha)
low[i] <- tem$low
up[i] <- tem$up
cov<-cov+tem$inr
}
points(ESP,low,pch=4)
points(ESP,up,pch=4)
title("Response Plot")
cov<-cov/n
list(cov=cov)
}

prpiplot2<- function(ESP, y, d=2, B = 1000, alpha=0.05)
{# Makes response plot for Poisson regression with PIs added.
#0 = data, x = PI
#tdata <- as.data.frame(cbind(x,y))
#out <- glm(y~., family=poisson, data=tdata)
#x<-as.matrix(x); d<-dim(x)[2]+1 #d=no.
of predictors #ESP <- predict(out) Y <- as.vector(y) n <- length(y) low<-1:n up<-low cov<-0 plot(ESP, Y) abline(mean(Y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, Y), type = "s") for(i in 1:n){ ydat <- rpois(B,lambda=exp(ESP[i])) tem <- mshpi(yf=y[i],ydat=ydat,n=n,d=d,alph=alpha) low[i] <- tem$low up[i] <- tem$up cov<-cov+tem$inr } points(ESP,low,pch=4) points(ESP,up,pch=4) title("Response Plot") cov<-cov/n list(cov=cov) } prpisim<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, J = 5, B=1000, int=1, a = 1, alpha = 0.05){ #Needs library(leaps), library(mgcv), and library(glmnet). #Calls shpi and mshpi. #The GAM PR PI is only computed if p = 4. #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Simulates the Olive et al. (2018) PI for Poisson regression. #PIs for full model, lasso, lasso variable selection, backward elimination # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int -3a > -4. set.seed(974) ##need p>2 fullpilen <- 1:nruns fullpicov <- 0 gampilen <- 1:nruns gampicov <- 0 ohfspilen <- 1:nruns ohfspicov <- 0 laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use for forward selection zz<-1:nc vmax <- min(p,as.integer(n/5))#max no. 
of variables to use for lasso dd <- 1:nruns dRL <- dd #lasso variable selection and lasso have the same d for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A xf <- val* rnorm(q) %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) yf <- rpois(1,lambda=exp(int + xf%*%b)) tdata <- as.data.frame(cbind(x,y)) yn <- c(y,yf) xn <- rbind(x,xf) tdat <- as.data.frame(cbind(xn,yn)) #make a PR data set #get full model PR PI if(n >= 5*p){ out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdat[n+1,]) ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) fullpilen[i] <- tem$up - tem$low fullpicov <- fullpicov + tem$inr } #get full model GAM PR PI if p = 4, GAM needs a formula if(n >= 5*p && p == 4){ x1 <- x[,1];x2 <- x[,2];x3 <- x[,3] out <- gam(y ~ s(x1) + s(x2) + s(x3),family=poisson) ESP <- predict.gam(out,newdata=data.frame(x1=xf[1],x2=xf[2],x3=xf[3])) ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) gampilen[i] <- tem$up - tem$low gampicov <- gampicov + tem$inr } #get lasso GLM PI out<-cv.glmnet(x,y,family="poisson") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection GLM PI xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) dRL[i]<- pp #want these to be near but >= k+1 ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr #get Olive and Hawkins forward selection PI if(n >= 5*p){ temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) dd[i]<-length(sub$coef)#want these to be near but >= k+1 pp<-dd[i] ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) ohfspilen[i] <- tem$up - tem$low ohfspicov <- ohfspicov + tem$inr } } fullpimnlen <- mean(fullpilen) fullpicov <- fullpicov/nruns gampimnlen <- mean(gampilen) gampicov <- gampicov/nruns laspimnlen <- mean(laspilen) laspicov <- laspicov/nruns RLpimnlen <- mean(RLpilen) RLpicov <- RLpicov/nruns ohfspimnlen <- mean(ohfspilen) ohfspicov <- ohfspicov/nruns mndd <- mean(dd) mndRL <- mean(dRL) #lasso variable selection and lasso have the same d list(mndRL=mndRL,mndd=mndd,int=int,b=b,fullpicov=fullpicov, fullpimenlen=fullpimnlen,gampicov=gampicov,gampimenlen=gampimnlen, laspicov=laspicov, laspimenlen=laspimnlen, RLpicov=RLpicov, RLpimenlen=RLpimnlen,ohfspicov=ohfspicov,ohfspimnlen=ohfspimnlen)} prpisim2<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, J = 5, B=1000, int=1, a = 1, alpha = 0.05){ #Needs library(leaps), library(MASS), library(mgcv), and library(glmnet). #Calls shpi and mshpi. Adds backwards elimination to prpisim. #The GAM PR PI is only computed if p = 4. #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Simulates the Olive et al. (2018) PI for Poisson regression. 
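#Illustrative call for prpisim above (values are arbitrary; the GAM PI is only
#computed when p = 4): with library(leaps), library(mgcv), and library(glmnet)
#loaded,
#  out <- prpisim(n=100,p=4,k=1,nruns=10,B=1000,int=1,a=1,alpha=0.05)
#  out$fullpicov; out$laspicov   #PI coverages, should be near 0.95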
#PIs for full model, lasso, lasso variable selection, backward elimination # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int -3a > -4. set.seed(974) ##need p>2 fullpilen <- 1:nruns fullpicov <- 0 gampilen <- 1:nruns gampicov <- 0 ohfspilen <- 1:nruns ohfspicov <- 0 laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 vspilen <- 1:nruns vspicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use for forward selection zz<-1:nc vmax <- min(p,as.integer(n/5))#max no. of variables to use for lasso dd <- 1:nruns ddbe <- dd dRL <- dd #lasso variable selection and lasso have the same d for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A xf <- val* rnorm(q) %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) yf <- rpois(1,lambda=exp(int + xf%*%b)) tdata <- as.data.frame(cbind(x,y)) yn <- c(y,yf) xn <- rbind(x,xf) tdat <- as.data.frame(cbind(xn,yn)) #make a PR data set #get full model PR PI if(n >= 5*p){ out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdat[n+1,]) ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) #tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) fullpilen[i] <- tem$up - tem$low fullpicov <- fullpicov + tem$inr } #get backward elimination PR PI, could use mshpi if(n >= 5*p){ varnames <- names(out$coef)[-1] outbe <- step(out,trace=0) vinnames <- names(outbe$coef)[-1] vin <- varnames %in% vinnames ddbe[i]<-length(outbe$coef) pp<-ddbe[i] ESP <- outbe$coef[1] + xf[vin] %*% outbe$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) #tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) vspilen[i] <- tem$up - tem$low vspicov <- vspicov + tem$inr } #get full model GAM PR PI if p = 4, GAM needs a formula if(n >= 5*p && p == 4){ x1 <- x[,1];x2 <- x[,2];x3 <- x[,3] out <- gam(y ~ s(x1) + s(x2) + s(x3),family=poisson) ESP <- predict.gam(out,newdata=data.frame(x1=xf[1],x2=xf[2],x3=xf[3])) ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) #tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) gampilen[i] <- tem$up - tem$low gampicov <- gampicov + tem$inr } #get lasso GLM PI out<-cv.glmnet(x,y,family="poisson") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection GLM PI xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) dRL[i]<- pp #want these to be near but >= k+1 ESP <- sub$coef[1] + xf[vin] %*% 
sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr #get Olive and Hawkins forward selection PI if(n >= 5*p){ temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) dd[i]<-length(sub$coef)#want these to be near but >= k+1 pp<-dd[i] ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) ohfspilen[i] <- tem$up - tem$low ohfspicov <- ohfspicov + tem$inr } } fullpimnlen <- mean(fullpilen) fullpicov <- fullpicov/nruns gampimnlen <- mean(gampilen) gampicov <- gampicov/nruns laspimnlen <- mean(laspilen) laspicov <- laspicov/nruns RLpimnlen <- mean(RLpilen) RLpicov <- RLpicov/nruns ohfspimnlen <- mean(ohfspilen) ohfspicov <- ohfspicov/nruns vspimnlen <- mean(vspilen) vspicov <- vspicov/nruns mndd <- mean(dd) mndRL <- mean(dRL) mnddbe <- mean(ddbe) #lasso variable selection and lasso have the same d list(mndRL=mndRL,mndd=mndd,mnddbe=mnddbe,int=int,b=b,fullpicov=fullpicov, fullpimenlen=fullpimnlen,gampicov=gampicov,gampimenlen=gampimnlen, laspicov=laspicov, laspimenlen=laspimnlen, RLpicov=RLpicov, RLpimenlen=RLpimnlen,ohfspicov=ohfspicov,ohfspimnlen=ohfspimnlen, vspicov=vspicov,vspimnlen=vspimnlen)} prplot<- function(x, y) {# Makes response plot, OD plot, weighted forward response # and residual plots for Poisson regression. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # #If q is changed, change the formula in the glm statement. #q <- 5 # change formula to x[,1]+ ... + x[,q] with q #out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson) out <- glm(y~., family=poisson, data=data.frame(cbind(x,y))) x<-as.matrix(x) ESP <- predict(out) par(mfrow = c(2, 2)) Y <- y plot(ESP, Y) abline(mean(Y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, Y), type = "s") title("a) Response Plot") Vhat <- (Y - Ehat)^2 plot(Ehat, Vhat) abline(0, 1) abline(0, 4) #lines(lowess(Ehat, Vhat)) #abline(lsfit(Ehat, Vhat)$coef) title("b) OD Plot") Z <- Y Z[Y < 1] <- Z[Y < 1] + 0.5 WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1]) WFIT <- sqrt(Z) * log(Z) - WRES plot(WFIT, sqrt(Z) * log(Z)) abline(0, 1) #abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef) title("c) WFRP") plot(WFIT, WRES) title("d) Wtd Residual Plot") par(mfrow = c(1, 1)) } prplot2 <- function(ESP, x, y) {# Makes response plot and OD plot for Poisson regression. #Can be used for a Poisson GAM or GLM. # If t<-1:13 # y<-c(12,14,33,50,67,74,123,141,165,204,253,246,240) # If out <- glm(y~t+I(t^2),family=poisson) # use ESP <- predict(out). # If outgam <- gam(y~s(t),poisson) # use ESP <- predict.gam(outgam). # Workstation: need to activate a graphics # device with command "X11()" or "motif()." 
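#Illustrative call for prplot above (data are arbitrary Poisson regression data):
#  x <- matrix(rnorm(100*3),ncol=3)
#  y <- rpois(100,lambda=exp(1 + x[,1]))
#  prplot(x,y)   #response plot, OD plot, and weighted fit/residual plots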
#WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1])#for a GLM x<-as.matrix(x) par(mfrow = c(2, 2)) Y <- y plot(ESP, Y) abline(mean(Y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, Y), type = "s") title("a) Response Plot") Vhat <- (Y - Ehat)^2 plot(Ehat, Vhat) abline(0, 1) abline(0, 4) #lines(lowess(Ehat, Vhat)) #abline(lsfit(Ehat, Vhat)$coef) title("b) OD Plot") Z <- Y Z[Y < 1] <- Z[Y < 1] + 0.5 WRES <- sqrt(Z) * (log(Z) - ESP) WFIT <- sqrt(Z) * log(Z) - WRES plot(WFIT, sqrt(Z) * log(Z)) abline(0, 1) #abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef) title("c) WFRP") plot(WFIT, WRES) title("d) Wtd Residual Plot") par(mfrow = c(1, 1)) } prsplit<-function(n = 100, p = 4, k=1, n1=30, J=5, nruns = 100, psi = 0.0, int=1, a=1, B=1000, alpha = 0.05){ #needs library(glmnet), library(mgcv), n1>=30 gets rid of the warnings #J is an integer between 0 and 5 #noundfit is the number of times in nruns that lasso did not underfit #nd is the size of the training data set selected by sequential data splitting #ad is the number of predictors, including a constant, selected by lasso for #the selected training data set: want ad not much larger than k+1 #k+1 is the number of beta_i not equal to 0 #n1 is the initial size of the training data set, want n1 <= n/2 #Simulates sequential data splitting for Poisson regression using lasso #with 10-fold cross validation. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int - 3a > -4. q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 lsplitpilen <- 1:nruns lsplitpicov <- 0 splitpilen <- 1:nruns splitpicov <- 0 vars <- as.vector(1:(p-1)) noundfit <- 0 indx <- 1:n nds<-1:nruns ads<-nds #check for bad values of n1 fhalf <- floor((n-J)/2) #to copy and paste an if else statement, need braces {if ... 
else...} if(n<40) n1 <- max(1,fhalf) else if(floor(n/(2*n1))>1000) n1 <- floor(n/2000) if(n1 > fhalf) n1 <- max(1,fhalf) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) xf <- val * rnorm(q) %*% A SPf <- int + xf%*%b yf <- rpois(1,lambda=exp(SPf)) #make a PR data set #get lasso PI out<-cv.glmnet(x,y,family="poisson") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get lasso variable selection PI, problems if number of variables > n-1 if(length(vin) < (n-5)){ xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr } #use sequential data splitting perm <- sample(indx,n) H <- perm[1:n1] xH <- x[H,] yH <- y[H] nd<-n1 out<-cv.glmnet(xH,yH,family="poisson") lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 while(nd < fhalf && nd < 10*pp){ if((nd+n1) <= fhalf){ perm<-perm[-(1:n1)] H <- c(H,perm[1:n1]) xH <- x[H,] yH <- y[H] out<-cv.glmnet(xH,yH,family="poisson") lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 nd<-nd+n1 } else nd<-nd+n1 } if(nd > fhalf) nd <- nd-n1 if(nd < fhalf && nd < 10*pp){ md <- fhalf - nd nd <- nd + md perm<-perm[-(1:n1)] H <- c(H,perm[1:md]) xH <- x[H,] yH <- y[H] out<-cv.glmnet(xH,yH,family="poisson") lam <- out$lambda.min fit <- predict(out,s=lam,newx=xH) lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 } nds[i] <- nd ads[i] <- pp if(length(vin) >= k){ if(vin[k]==k) noundfit <- noundfit + 1} xV <- x[-H,] yV <- y[-H] #get lasso data splitting PI out<-cv.glmnet(xV,yV,family="poisson") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) lsplitpilen[i] <- tem$up - tem$low lsplitpicov <- lsplitpicov + tem$inr #get lasso variable selection data splitting PI if(length(vin) < (length(yV)-3)){ xsub <- xV[,vin] sub <- glm(yV~., family=poisson, data=data.frame(cbind(xsub,yV))) ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) splitpilen[i] <- tem$up - tem$low splitpicov <- splitpicov + tem$inr } } laspicov <- laspicov/nruns laspilen <- mean(laspilen) RLpicov <- RLpicov/nruns RLpilen <- mean(RLpilen) lsplitpicov <- lsplitpicov/nruns lsplitpilen <- mean(lsplitpilen) splitpicov <- splitpicov/nruns splitpilen <- mean(splitpilen) mnnd<-mean(nds) mnad<-mean(ads) list(mnnd=mnnd,mnad=mnad,laspicov=laspicov,laspilen=laspilen, LVSpicov=RLpicov,LVSpilen=RLpilen,lsplitpicov=lsplitpicov, lsplitpilen=lsplitpilen,splitpicov=splitpicov,splitpilen=splitpilen, beta=beta,k=k,noundfit=noundfit,n1=n1)} rand 
<- function(n = 100, k = 5){ #randomize n units into k treatment groups of nearly equal size #groups[i] gives the group=fold for the ith unit groups <- 1:n gsize <- as.integer(n/k) z <- sample(n) for(i in 1:k) { lo <- (i - 1) * gsize + 1 hi <- i * gsize if(i == k) hi <- n groups[z[lo:hi]] <- i } list(perm = z, groups = groups) } rcovxy<-function(x,y){ ##get 3 robust estimators of cov(x,Y) and make plots #RMVN, covmb2, and an estimator based on the MAD #only 2 robust estimators if n <= 2.5p x <- as.matrix(x) y <- as.vector(y) u <- cbind(x,y) tem <- x+y r1 <- apply(tem,2,mad) tem <- x-y r2 <- apply(tem,2,mad) rmadxy <- (r1^2 - r2^2)/4 n <- length(y) p <- dim(x)[2] q <- p+1 etahat <- cov(x,y) etaESP <- x%*% etahat rmadESP <- x%*%rmadxy cmb2xy<- as.vector(covmb2(u)$cov[(1:p),q]) rmb2ESP <- x%*%cmb2xy if(n > 2.5*p) {crmvnxy<- as.vector(covrmvn(u)$cov[(1:p),q]) rmvnESP <- x%*%crmvnxy pairs(cbind(y,etaESP,rmadESP,rmb2ESP,rmvnESP), labels = c("Y", "etaESP", "rmadESP", "rmb2ESP", "rmvnESP")) list(etahat=etahat,rmadxy=rmadxy,cmb2xy=cmb2xy,crmvnxy=crmvnxy)} else {pairs(cbind(y,etaESP,rmadESP,rmb2ESP), labels = c("Y", "etaESP", "rmadESP", "rmb2ESP")) list(etahat=etahat,rmadxy=rmadxy,cmb2xy=cmb2xy)} } regboot<-function(x,y, B = 1000){ #residual bootstrap for MLR #asymptotically the same as MSE (X'X)^(-1), so use to compare vselboot #out <- regboot(belx,bely) #ddplot4(out$betas) #this does not work for vselboot: too many 0s #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } list(betas=betas) } regbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Simulates residual bootstrap for regression. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} regbootsim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){ #Simulates residual bootstrap for regression (full model). 
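##Example call (illustration only, not part of the original hdpack comments):
#out <- regbootsim2(n=100,p=4,k=1,nruns=100,type=1,psi=0.5,BB=1000)
#out$cicov has p+1 entries: observed coverage of the p shorth CIs and of the
#prediction region test that the last p-k-1 slopes are 0; nominal level is 1-alph.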
#Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y,B=BB) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} regbootsim3<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){ ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates residual bootstrap for regression (full model). #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
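##Illustration (added sketch, not from the original comments): with p = 4, so q = 3
##nontrivial predictors, and psi = 0.5, the formula above gives
#rho = (2*0.5 + 1*0.5^2)/(1 + 2*0.5^2) = 1.25/1.5, about 0.833,
#so the predictors are highly correlated even for moderate psi.
#A hypothetical call: out <- regbootsim3(n=100,p=4,k=1,nruns=100,psi=0.5,BB=1000)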
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y,B=BB) #residual bootstrap bhat <- lsfit(x,y)$coef #bhat_OLS for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} relpisim2<-function(n = 100, p = 100, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alph = 0.05){ #Needs library(glmnet). Uses 10 fold CV and the Pelawa Watagoda and Olive (2017) PI. ##slow #Simulates PIs for lasso variable selection when p is not necessarily small. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
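##Example call (illustration only; slow since lasso CV is refit nruns times):
#out <- relpisim2(n=100,p=100,k=1,nruns=100,psi=0,type=1,alph=0.05)
#If the PI works, out$opicov should be near 1 - alph = 0.95, and out$qncmn is
#q + 1 minus the average number of lasso coefficients (counting the intercept),
#a crude measure of how much lasso shrinks the model.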
set.seed(974) ##need p>2 pilen <- 1:nruns ncvec<- pilen opicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] val <- 8*n/9 vars <- 1:q for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the 10 fold CV lasso model out<-cv.glmnet(x,y) lam <- out$lambda.min #get the lasso variable selection model lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] fres <- sub$resid #get PI pp <- out$nzero[out$lambda==lam] + 1 #crude df for lasso variable selection ncvec[i] <- pp if(pp < val) corfac <- (1 + 15/n) * sqrt( (n+2*pp)/(n - pp) ) else corfac <- 5*(1+15/n) if (alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + pp/n)} if (alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*pp/n)} pn <- qn if(pn < 1 - alph + 0.001) qn <- 1 - alph alphan <- 1 - qn sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } qncmn <- q + 1 - mean(ncvec) pimnlen <- mean(pilen) opicov <- opicov/nruns list(qncmn=qncmn,opicov=opicov, pimenlen = pimnlen)} ridgetype<-function(x,y,k=0.1,c=0.5){ #computes some ridge type regression estimators. #brr is the ridge regression estimator #bkl is the Kibria and Lukman (2020) estimator #bl is the Liu (1993) estimator using OLS #bkd is the Liu (2003) estimator with d = -k with ridge regression #need n > p to compute OLS #k = floor(n^(1/4)) may be a good choice x <- as.matrix(x) y <- as.vector(y) n <- length(y) q <- dim(x)[2] #q nontrivial predictors p <- q+1 one <- 0*1:n + 1 x1 <- cbind(one,x) I <- diag(p) #I_n = p x p identity matrix rrinv <- solve ( t(x1)%*%x1 + k * I) brr <- rrinv %*% t(x1)%*% y bols <- lsfit(x,y)$coef bkl <- rrinv %*% ( t(x1)%*%x1 - k * I) %*% bols bl <- solve ( t(x1)%*%x1 + I) %*% ( t(x1)%*%y + c * bols) bkd <- rrinv%*%(t(x1)%*% y + k * brr) fols <- x1%*%bols frr <- x1%*%brr fkl <- x1 %*% bkl fl <- x1 %*% bl fkd <- x1 %*% bkd pairs(cbind(y,fols,frr,fkl,fl,fkd),labels=c("Y","fOLS","fRR","fKL","fL","fkd")) list(brr=brr,bkl=bkl,bl=bl,bkd=bkd) } rinvrlar<-function(C, c=50, tau=0.05){ #Returns a regularized ``inverse" correlation matrix Rinvd = Ri(delta,0) and #Rinvt = Ri(delta,tau) given a positive semidefinite dispersion matrix C where #delta depends on the condition number c > = 1. #Ri is a generalized correlation matrix corresponding #to an analog of R inverse. #C = var(x), C = cor(x) and C = covmb2$cov are common. #If R is not well conditioned then regularize R slightly. 
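##Example call (illustration with hypothetical data; gcor and corrlar are other
##hdpack functions used below):
#C <- cor(matrix(rnorm(200),ncol=4))
#out <- rinvrlar(C,c=50,tau=0.05)
#out$Rinvd and out$Rinvt are regularized analogs of the inverse of the
#correlation matrix corresponding to C.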
R <- gcor(C)$R p<-dim(C)[2] out<-eigen(R,only.values=TRUE) if(out$values[p] ==0) { I <- diag(p) Rd <- (R + 0.01*I)/1.01 } else{ if(out$values[1]/out$values[p] <= 500){ Rd <- R} else{ I <- diag(p) Rd <- (R + 0.01*I)/1.01 }} A <- solve(Rd) d <- sqrt(diag(A)) di<- 1/d Da <- diag(d) Dai <- diag(di) #Dainverse Ri <- Dai %*% A %*% Dai #generalized correlation matrix tem <- corrlar(Ri,c=c,tau=tau) Rinvd <- Da%*% tem$Rd %*% Da Rinvt <- Da%*% tem$Rt %*% Da list(Rinvd=Rinvd,Rinvt=Rinvt) } RLPHboot<-function(x,time,status,B=1000){ #needs library(glmnet), library(survival), n > 5p, p > 2, want B >= 50p, #bootstraps the Cox regression lasso variable selection, takes a few minutes x <- as.matrix(x) n <- length(time) p <- dim(x)[2] vars <- 1:p y<-cbind(time,status) outlasso<-cv.glmnet(x,y,family="cox") lam <- outlasso$lambda.min lcoef <- as.vector(predict(outlasso,type="coefficients",s=lam)) vin <- vars[lcoef!=0] tx <- x[,vin] tdat <- as.data.frame(cbind(tx,time,status)) sub <- coxph(Surv(time, status) ~., data=tdat) bhatimin0 <- 0 * 1:p bhatimin0[vin] <- sub$coef betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ samp <- sample(1:n, replace=TRUE) tx <- x[samp,] ty <- y[samp,] ttime <- time[samp] tstatus <- status[samp] temp<-cv.glmnet(tx,ty,family="cox") lam <- temp$lambda.min lcoef <- as.vector(predict(temp,type="coefficients",s=lam)) vin <- vars[lcoef!=0] tx <- tx[,vin] tdat <- as.data.frame(cbind(tx,ttime,tstatus)) sub <- coxph(Surv(ttime, tstatus) ~., data=tdat) betas[i,vin] <- sub$coef } list(bhatimin0=bhatimin0,betas=betas) } RLPHboot2<-function(x,time,status,c=0.01,aug=F,B=1000){ #needs library(glmnet), library(survival), n > 5p, p > 2, want B >= 50p, #bootstraps the Cox regression lasso variable selection, #using bhatVS and bhatMIX, takes a few minutes #If augm neq F, adds cB full model bootstrap samples so S*_T is better #conditioned. 
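##Example call (illustration only; needs library(glmnet) and library(survival)):
#out <- RLPHboot2(x,time,status,c=0.01,aug=F,B=1000)
##Note: when aug is not F, the augmentation step samples rows from a data frame
##tdata; as in RLPHbootsim, tdata should be as.data.frame(cbind(x,time,status))
##and must be available before aug=T is used.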
x <- as.matrix(x) n <- length(time) p <- dim(x)[2] vars <- 1:p d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 y<-cbind(time,status) outlasso<-cv.glmnet(x,y,family="cox") lam <- outlasso$lambda.min lcoef <- as.vector(predict(outlasso,type="coefficients",s=lam)) vin <- vars[lcoef!=0] tx <- x[,vin] tdat <- as.data.frame(cbind(tx,time,status)) sub <- coxph(Surv(time, status) ~., data=tdat) bhatimin0 <- 0 * 1:p bhatimin0[vin] <- sub$coef betas <- matrix(0,nrow=B,ncol=p) btmix <- betas for(i in 1:B){ samp <- sample(1:n, replace=TRUE) tx <- x[samp,] ty <- y[samp,] ttime <- time[samp] tstatus <- status[samp] temp<-cv.glmnet(tx,ty,family="cox") lam <- temp$lambda.min lcoef <- as.vector(predict(temp,type="coefficients",s=lam)) vin <- vars[lcoef!=0] tx <- tx[,vin] tdat <- as.data.frame(cbind(tx,ttime,tstatus)) sub <- coxph(Surv(ttime, tstatus) ~., data=tdat) betas[i,vin] <- sub$coef # get betahatMIX* samp <- sample(1:n, replace=TRUE) tx <- x[samp,] ty <- y[samp,] ttime <- time[samp] tstatus <- status[samp] tx <- tx[,vin] tdat <- as.data.frame(cbind(tx,ttime,tstatus)) sub <- coxph(Surv(ttime, tstatus) ~., data=tdat) btmix[i,vin] <- sub$coef } if(aug == F) {betas <- betas[1:B,]; btmix <- btmix[1:B,]} else { for(i in bp1:bpd){ samp <- sample(1:n, replace=TRUE) tdat <- tdata[samp,] temp <- coxph(Surv(time, status) ~., data=tdat) betas[i,]<-temp$coef } btmix[bp1:bpd,] <- betas[bp1:bpd,] } list(bhatimin0=bhatimin0,betas=betas,btmix=btmix) } RLPHbootsim<-function(n=100,p=4,k=1,nruns=100,psi=0.0,B=1000,a=1,gam=1, clam=0.1,alpha=0.05){ #needs library(glmnet), library(survival), n > 5p, p > 2, want B >= 50p, #bootstraps the Cox regression lasso variable selection, takes a long time #Use 1 <= k < p so zeroes are in the model, k is the number of nonnoise variables. #there are p coefficients for beta in the Weibull regression data #need p > 1, beta_A = -(1/gam, ..., 1/gam, 0, ..., 0)^T with p-k zeroes # beta_P = (1,...,1,0,...,0)^T with k ones and p-k zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. SP~N(0,a^2), and a near 1 is ok. 
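##Sketch of why val gives SP ~ N(0,a^2) (added derivation, using the notation above):
#each row of x is val*z%*%A with z ~ N_p(0,I), so Cov(x) = val^2*A^2, which has
#diagonal entries 1+(p-1)psi^2 and off-diagonal entries 2psi+(p-2)psi^2 (times val^2).
#Since beta has k ones, Var(SP) = val^2*[k*(1+(p-1)psi^2)+k*(k-1)*(2psi+(p-2)psi^2)] = a^2
#by the definition of val below.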
rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 beta <- 0 * 1:p beta[1:k] <- 1 #beta[1:0] acts like beta[1:1] = beta[1] vars <- 1:p pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) zero <- 0 * 1:p one <- as.vector(0*1:k + 1) dd <- 1:nruns for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A SP <- x%*%beta #SP_i ~ N(0,a^2) lambdai <- exp(SP) w <- rexp(n, rate = lambdai) y <- w^(1/gam) cen <- rexp(n, rate = clam) time <- pmin(y, cen) status <- as.numeric(cen >= y) tdata <- as.data.frame(cbind(x,time,status)) y<-cbind(time,status) outlasso<-cv.glmnet(x,y,family="cox") lam <- outlasso$lambda.min lcoef <- as.vector(predict(outlasso,type="coefficients",s=lam)) vin <- vars[lcoef!=0] tx <- x[,vin] tdat <- as.data.frame(cbind(tx,time,status)) sub <- coxph(Surv(time, status) ~., data=tdat) bhat <- zero bhat[vin] <- sub$coef dd[i]<-length(sub$coef) betas <- matrix(0,nrow=B,ncol=p) #nonparametric bootstrap for(m in 1:B){ samp <- sample(1:n, replace=TRUE) tx <- x[samp,] ty <- y[samp,] ttime <- time[samp] tstatus <- status[samp] temp<-cv.glmnet(tx,ty,family="cox") lam <- temp$lambda.min lcoef <- as.vector(predict(temp,type="coefficients",s=lam)) vin <- vars[lcoef!=0] tx <- tx[,vin] tdat <- as.data.frame(cbind(tx,ttime,tstatus)) sub <- coxph(Surv(ttime, tstatus) ~., data=tdat) betas[m,vin] <- sub$coef } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k values of beta are 0 gg <- p - k tstat <- bhat[(k+1):p] tem <- confreg(betas[,(k+1):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k values of beta are (1,...,1) gg <- k tstat <- bhat[1:k] tem <- confreg(betas[,1:k],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns mndd <- mean(dd) list(mndd=mndd,cicov=cicov,avelen=avelen,beta=beta,k=k) } RLPHbootsim2<-function(n=100,p=4,k=2,nruns=100,psi=0.0,BB=1000,a=1,gam=1, clam=0.1,cc=0.01,augm=F,alph=0.05){ #needs library(glmnet), library(survival), n > 5p, p > 2, want B >= 50p ##Convergence problems is psi much larger than 1/sqrt(p). #Bootstraps the Cox regression lasso variable selection and bhatMIX. # takes a long time #Use 1 <= k < p so zeroes are in the model, k is the number of nonnoise variables. 
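##Example call (illustration only; takes a long time):
#out <- RLPHbootsim2(n=100,p=4,k=2,nruns=100,psi=0,BB=1000,gam=1,clam=0.1)
#out$cicov and out$cicovmix give observed coverages for lasso variable selection
#and for bhatMIX; nominal coverage is 1-alph.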
#there are p coefficients for beta in the Weibull regression data #need p > 1, beta_A = -(1/gam, ..., 1/gam, 0, ..., 0)^T with p-k zeroes # beta_P = (1,...,1,0,...,0)^T with k ones and p-k zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. SP~N(0,a^2), and a near 1 is ok. rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 beta <- 0 * 1:p beta[1:k] <- 1 #beta[1:0] acts like beta[1:1] = beta[1] vars <- 1:p pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) cicovmix <- cicov avelenmix <- avelen zero <- 0 * 1:p one <- as.vector(0*1:k + 1) dd <- 1:nruns for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A SP <- x%*%beta #SP_i ~ N(0,a^2) lambdai <- exp(SP) w <- rexp(n, rate = lambdai) y <- w^(1/gam) cen <- rexp(n, rate = clam) time <- pmin(y, cen) status <- as.numeric(cen >= y) #make a Weibull PH model out <-RLPHboot2(x,time,status,c=cc,aug=augm,B=BB) #bootstrap the lasso variable selection model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] tem <- shorth3(out$btmix[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicovmix[j] <- cicovmix[j] + 1 avelenmix[j] <- avelenmix[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k values of beta are 0 gg <- p - k tstat <- out$bhatimin0[(k+1):p] tem <- confreg(out$betas[,(k+1):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim tem <- confreg(out$btmix[,(k+1):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicovmix[pp1] <- cicovmix[pp1] + 1 avelenmix[pp1] <- avelenmix[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicovmix[pp2] <- cicovmix[pp2] + 1 avelenmix[pp2] <- avelenmix[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicovmix[pp3] <- cicovmix[pp3] + 1 avelenmix[pp3] <- avelenmix[pp3] + tem$brlim #test whether the first k values of beta are (1,...,1) gg <- k tstat <- out$bhatimin0[1:k] tem <- confreg(out$betas[,1:k],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim tem <- confreg(out$btmix[,1:k],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. 
method cicovmix[pp4] <- cicovmix[pp4] + 1 avelenmix[pp4] <- avelenmix[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicovmix[pp5] <- cicovmix[pp5] + 1 avelenmix[pp5] <- avelenmix[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicovmix[pp6] <- cicovmix[pp6] + 1 avelenmix[pp6] <- avelenmix[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns cicovmix <- cicovmix/nruns avelenmix <- avelenmix/nruns list(cicov=cicov,avelen=avelen,cicovmix=cicovmix,avelenmix=avelenmix, beta=beta,k=k) } rowboot<-function(x,y, B = 1000){ #rowwise nonparametric bootstrap for MLR #use if (y^T, x^T) are iid from some distribution #out <- rowboot(belx,bely) #ddplot4(out$betas) #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 indx <- 1:n betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ tem <- sample(indx,n,replace=T) betas[i,] <- lsfit(x[tem,],y[tem])$coef } list(betas=betas,full=full) } Rsqboot<-function(x, y, B = 1000){ #bootstraps the coefficient of determination R^2 #the residual bootstrap is used n <- length(y) x <- as.matrix(x) Rsqs <- 1:B full <- lsfit(x,y) res <- full$resid fit <- y - res for(i in 1:B){ yb <- fit + sample(res,n,replace=T) ybfit <- yb - lsfit(x,yb)$resid Rsqs[i] <- (cor(yb,ybfit))^2 } list(Rsqs=Rsqs) } Rsqbootsim<-function(n = 100, p = 4, BB=1000, nruns = 100, type = 1, psi = 0.0, dd=3, eps = 0.1, cc=0.0, shift = 9.0, alph = 0.05){ #Simulates bootstrap for R^2. # Use 0 <= psi < 1. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients # want p > 2 so pop R^2 formula is correct # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(p-3)psi^2]/[1 + (p-2)psi^2], i not = j, p > 2 # when the correlation exists. #Makes the shorth CI [Ln,Un], the lower CI [0,U] and the upper CI [L,1] #Only the lower CI should work well if b = 0. cicov <- 0 avelen <- 0 ucicov <- 0 uavelen <- 0 lcicov <- 0 lavelen <- 0 q <- p-1 b <- 0 * 1:q + cc #b = 0 means R^2 -> 0 as n -> oo #Y = 1 + cc x2 + ... 
+ cc xp + e = a + b^T u + e, b = (cc, ..., cc)^T rho <- (2*psi + (p-3)*psi^2)/(1 + (p-2)*psi^2) sigLsq <- cc^2*(p-1 + 2*rho*p*(p-1)/2) sigesq <- 1 if(type == 2) sigesq <- 1 if(type == 3) sigesq <- 3 if(type == 4) sigesq <- 1/3 if(type == 5) sigesq <- 1 - eps + eps*(1+shift)^2 #10.9 is the default poprsq <- sigLsq/(sigesq + sigLsq) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = dd) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } rsqs <-Rsqboot(x, y, B= BB)$Rsqs #residual bootstrap tem <- shorthLU(rsqs,alpha=alph) if(poprsq >= tem$shorth[1] && poprsq <= tem$shorth[2]) cicov <- cicov + 1 avelen <- avelen + tem$shorth[2] - tem$shorth[1] if(poprsq <= tem$right) lcicov <- lcicov + 1 lavelen <- lavelen + tem$right if(poprsq >= tem$left) ucicov <- ucicov + 1 uavelen <- uavelen + 1 - tem$left } cicov <- cicov/nruns avelen <- avelen/nruns lcicov <- lcicov/nruns lavelen <- lavelen/nruns ucicov <- ucicov/nruns uavelen <- uavelen/nruns list(rho=rho, sigesq=sigesq, sigLsq = sigLsq, poprsq = poprsq, cicov=cicov, avelen=avelen,lcicov=lcicov,lavelen=lavelen, ucicov=ucicov,uavelen=uavelen)} shorth2<-function(y, alpha = 0.05){ # computes the shorth(c) interval [Ln,Un] for c = ceiling[n(1-alpha)]. #shorth3 is superior with the Frey correction n <- length(y) cc <- ceiling(n * (1 - alpha)) sy <- sort(y) rup <- sy[cc] rlow <- sy[1] olen <- rup - rlow if(cc < n){ for(j in (cc + 1):n){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sy[j] rlow <- sy[j - cc + 1] } } } Ln <- rlow; Un <- rup list(shorth=c(Ln, Un)) } shorth3<-function(y, alpha = 0.05){ # computes the shorth(c) interval [Ln,Un] for c = cc. #shorth lists Ln and Un using Frey's correction n <- length(y) cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n))) cc <- min(n,cc) sy <- sort(y) rup <- sy[cc] rlow <- sy[1] olen <- rup - rlow if(cc < n){ for(j in (cc + 1):n){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sy[j] rlow <- sy[j - cc + 1] } } } Ln <- rlow; Un <- rup list(shorth=c(Ln, Un)) } shorthLU<-function(y, alpha = 0.05){ # computes the shorth(c) interval [Ln,Un] for c = cc. #shorth lists Ln and Un using Frey's correction #also gets left endpoint and right endpoints for one sided lower and upper CIs #[left,infty) and (-infty,right]. #So left is a lower bound and right an upper bound for the parameter theta. n <- length(y) cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n))) cc <- min(n,cc) sy <- sort(y) rup <- sy[cc] rlow <- sy[1] olen <- rup - rlow if(cc < n){ for(j in (cc + 1):n){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sy[j] rlow <- sy[j - cc + 1] } } } Ln <- rlow; Un <- rup left <- sy[n-cc+1] right <- sy[cc] list(shorth=c(Ln, Un),left=left,right=right) } shpi<-function(yf=0, ydat, alph = 0.05){ #Gets the Frey (2013) PI for Yf given ydata. 
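##Example (illustration only): the Frey correction enlarges the usual shorth.
#With B = 1000 and alph = 0.05, cc = ceiling(1000*(0.95 + 1.12*sqrt(0.05/1000))) = 958,
#so the PI is the shortest interval covering 958 of the 1000 sorted ydat values.
#A hypothetical call: shpi(yf=0, ydat=rnorm(1000), alph=0.05)$inr is 1 when yf
#falls inside the PI.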
ydat<-as.vector(ydat) B<-length(ydat) inr <- 0 cc <- ceiling(B * (1 - alph + 1.12*sqrt(alph/B))) cc <- min(B,cc) sy <- sort(ydat) up <- sy[cc] low <- sy[1] olen <- up - low if(cc < B){ for(j in (cc + 1):B){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen up <- sy[j] low <- sy[j - cc + 1] } } } if(low <= yf && up >= yf) inr <- inr + 1 list(low=low, up=up, inr = inr)} srrpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(glmnet). #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Uses the two Pelawa Watagoda and Olive (2018) PIs. SLOW. #Simulates PIs for lasso, scaled lasso, ridge regression, and scaled #ridge regression. Lasso and RR use 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 laspilen <- 1:nruns laspicov <- 0 slaspilen <- 1:nruns slaspicov <- 0 vlaspilen <- 1:nruns vlaspicov <- 0 vslaspilen <- 1:nruns vslaspicov <- 0 rrpilen <- 1:nruns rrpicov <- 0 srrpilen <- 1:nruns srrpicov <- 0 vrrpilen <- 1:nruns vrrpicov <- 0 vsrrpilen <- 1:nruns vsrrpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) dd <- 1:nruns vdd <- dd indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get lasso PI out<-cv.glmnet(x,y) lam <- out$lambda.min fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit lcoef2 <- predict(out,type="coefficients",s=lam) lcoef2 <- as.vector(lcoef2) #bhat lcoef<-lcoef2[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 dd[i]<- pp #want these to be near but >= k+1 tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get scaled lasso PI temp <- lsfit(fit,y) res <- temp$resid #scaled lasso residuals bhat <- temp$coef[2]*lcoef2 bhat[1] <- bhat[1] + temp$coef[1] #scaled bhat yfhat <- bhat[1] + xf%*%bhat[-1] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) slaspilen[i] <- tem$up - tem$low slaspicov <- slaspicov + tem$inr # get the validation PI for lasso #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #find the 10 fold CV lasso model 
out<-cv.glmnet(xH,yH) lam <- out$lambda.min vp <- out$nzero[out$lambda==lam] + 1 #d for half set lasso vdd[i] <- vp yfhat <- predict(out,s=lam,newx=xf) valfit <- predict(out,s=lam,newx=x[-H,]) valres <- y[-H] - valfit #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] vlaspilen[i] <- up - low if(low < yf && up > yf) vlaspicov <- vlaspicov + 1 #get the validation PI for scaled lasso halffit <- predict(out,s=lam,newx=x[H,]) #yhat for H cases temp <- lsfit(halffit,yH) lcoef2 <- predict(out,type="coefficients",s=lam) #bhat_H bhat <- temp$coef[2]*lcoef2 bhat[1] <- bhat[1] + temp$coef[1] #scaled bhat_H yfhat <- bhat[1] + xf%*%bhat[-1] valfit <- bhat[1] + x[-H,]%*%bhat[-1] valres <- y[-H] - valfit #scaled lasso validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] vslaspilen[i] <- up - low if(low < yf && up > yf) vslaspicov <- vslaspicov + 1 #get ridge regression PI out<-cv.glmnet(x,y,alpha=0) lcoef2 <- predict(out,type="coefficients",s=lam) lcoef2 <- as.vector(lcoef2) #bhat lam <- out$lambda.min fit <- predict(out,s=lam,newx=x) yfhat <- predict(out,s=lam,newx=xf) res <- y - fit #crude d is using lasso d tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) rrpilen[i] <- tem$up - tem$low rrpicov <- rrpicov + tem$inr #get scaled RR PI temp <- lsfit(fit,y) res <- temp$resid #scaled RR residuals bhat <- temp$coef[2]*lcoef2 bhat[1] <- bhat[1] + temp$coef[1] #scaled bhat yfhat <- bhat[1] + xf%*%bhat[-1] tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) srrpilen[i] <- tem$up - tem$low srrpicov <- srrpicov + tem$inr #get the validation PI for ridge regression out<-cv.glmnet(xH,yH,alpha=0) lcoef2 <- predict(out,type="coefficients",s=lam) lcoef2 <- as.vector(lcoef2) #bhat_H lam <- out$lambda.min yfhat <- predict(out,s=lam,newx=xf) valfit <- predict(out,s=lam,newx=x[-H,]) valres <- y[-H] - valfit #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] vrrpilen[i] <- up - low if(low < yf && up > yf) vrrpicov <- vrrpicov + 1 #get the validation PI for scaled ridge regression halffit <- predict(out,s=lam,newx=x[H,]) #yhat for H cases temp <- lsfit(halffit,yH) bhat <- temp$coef[2]*lcoef2 bhat[1] <- bhat[1] + temp$coef[1] #scaled bhat_H yfhat <- bhat[1] + xf%*%bhat[-1] valfit <- bhat[1] + x[-H,]%*%bhat[-1] valres <- y[-H] - valfit #scaled RR validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] vsrrpilen[i] <- up - low if(low < yf && up > yf) vsrrpicov <- vsrrpicov + 1 } laspimnlen <- mean(laspilen) laspicov <- laspicov/nruns slaspimnlen <- mean(slaspilen) slaspicov <- slaspicov/nruns vlaspimnlen <- mean(vlaspilen) vlaspicov <- vlaspicov/nruns vslaspimnlen <- mean(vslaspilen) vslaspicov <- vslaspicov/nruns rrpimnlen <- mean(rrpilen) rrpicov <- rrpicov/nruns srrpimnlen <- mean(srrpilen) srrpicov <- srrpicov/nruns vrrpimnlen <- mean(vrrpilen) vrrpicov <- vrrpicov/nruns vsrrpimnlen <- mean(vsrrpilen) vsrrpicov <- vsrrpicov/nruns dd<-mean(dd) vdd<-mean(vdd) #lasso and scaled lasso have the same d list(dlas=dd,dvlas=vdd,laspicov=laspicov,laspimenlen=laspimnlen,slaspicov= slaspicov,slaspimenlen=slaspimnlen,vlaspicov=vlaspicov,vlaspimenlen= vlaspimnlen,vslaspicov=vslaspicov,vslaspimenlen=vslaspimnlen,rrpicov=rrpicov, 
rrpimenlen=rrpimnlen,srrpicov=srrpicov,srrpimenlen=srrpimnlen,vrrpicov= vrrpicov,vrrpimenlen=vrrpimnlen,vsrrpicov=vsrrpicov,vsrrpimenlen=vsrrpimnlen)} subfullboot<-function(x,y,vin=c(1,2), B = 1000){ #bootstrap submodel with full OLS model residuals for MLR #asymptotic cov matrix for beta_I is (n-p)/n MSE (X_I'X_I)^(-1) #out <- subfullboot(x,y,vin=c(1,2)) #ddplot4(out$betas[vin,]) #mplot(out$betas) #cov(out$betas[vin,]) full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 psub <- length(vin)+1 betas <- matrix(0,nrow=B,ncol=psub) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x[,vin],yb)$coef } list(betas=betas) } tplot2<-function(x, y, type = 1){ #library(glmnet), library(pls), library(leaps) # For type = 1 want n >= Jp with J >= 5 and preferably J >= 10. # Use the rightmost mouse button to advance # the plot, and in R, highlight ``stop." # Values of y should be positive. # Assume the MLR model contains a constant, but # x is the design matrix without the vector of ones. # This function makes transformation plots from the MLR # of y^L on x where L = -1, -0.5, -1/3, 0, 1/3, 0.5, or 1 # except log(Y) is used if L = 0. #type = 1 full model OLS, type = 2, elastic net, type = 3 lasso #type = 4 ridge regression, type = 5 PLS, type = 6 PCR #type = 7 forward selection using EBIC if n<10p and min Cp if n >=10p. #elastic net, lasso, ridge regression, PLS and PCR use 10 fold CV # If plot is linear for L, use y^L (or log(Y) for L = 0) instead # of Y. This is a graphical method for a response transform. x<- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 nc <- ceiling(n/5)-1 nc <- min(nc,q) nc <- max(nc,1) vars <- as.vector(1:(p-1)) lad <- c(-1, -1/2, -1/3, 0, 1/3, 1/2, 1) xl <- c("1/Z", "1/sqrt(Z))", "Z**(-1/3)", "LOG(Z)", "Z**(1/3)", "sqrt(Z)", "Z") for(i in 1:length(lad)) { if(lad[i] == 0) ytem <- log(y) else ytem <- y^lad[i] if(type ==1) #full model OLS fit <- ytem - lsfit(x,ytem)$resid if(type == 2) #elastic net with CV fit <- enet2(x,ytem)$yhat if(type==3){out <- cv.glmnet(x,ytem,alpha=1) #lasso lam <- out$lambda.min fit <- predict(out,s=lam,newx=x)} if(type==4){out <- cv.glmnet(x,ytem,alpha=0) #ridge regression lam <- out$lambda.min fit <- predict(out,s=lam,newx=x)} if(type==5){w <- as.data.frame(cbind(ytem,x)) out<-plsr(ytem~.,data=w,scale=T,validation="CV") tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <-max(which.min(cvmse)-1,1) fit<-ytem-out$residuals[,,nc]} if(type==6){w <- as.data.frame(cbind(ytem,x)) out<-pcr(ytem~.,data=w,scale=T,validation="CV") tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <-max(which.min(cvmse)-1,1) fit<-ytem-out$residuals[,,nc]} if(type==7){temp<-regsubsets(x,ytem,nvmax=nc,method="forward") out<-summary(temp) if(n < 10*p) { xx <- 1:min(length(out$bic),p-1)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]] } else { mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]]} sub <- lsfit(x[,vin],ytem) fit <- ytem - sub$resid } plot(fit, ytem, xlab = "TZHAT", ylab = xl[i]) abline(0, 1) if(type==1) title("Full OLS") if(type==2) title("Elastic Net") if(type==3) title("Lasso") if(type==4) title("Ridge Regression") if(type==5) title("PLS") if(type==6) title("PCR") if(type==7) title("Forward Selection") identify(fit, ytem) } } tpls<-function(x,y){ #find the two component PLS estimator for MLR #needs library(pls) x<-as.matrix(x) #best if x is not a 
vector if(dim(x)[2] ==1){ out <- lsfit(x,y) b2pls <- out$coef[-1] b2 <- b2pls } else{ w1 <- cov(x,y) w2 <- cov(x) %*% w1 Ahat <- t(cbind(w1,w2)) w <- x %*% cbind(w1,w2) out <- lsfit(w,y) gammahat <- out$coef[-1] b2pls <- t(Ahat)%*%gammahat z <- as.data.frame(cbind(y,x)) out<-plsr(y~.,ncomp=2,data=z) b2 <- coef(out,intercept=TRUE)[-1]} list(b2pls=b2pls,b2=b2) } valdvspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, J = 20, alpha = 0.05){ #Needs library(leaps). #Uses d approx min(p,n/J) variables. Does not need n/p large. #Uses the Pelawa Watagoda and Olive (2017) validation residuals PI. #Simulates PIs for forward selection variable selection. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 val <- 8*n/9 fselpilen <- 1:nruns fselpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) pp <- nc+1 indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #get forward selection PI temp<-regsubsets(xH,yH,nvmax=nc,method="forward") out<-summary(temp) num <- length(out$cp) mod <- out$which[num,] #do not need the constant in vin vin <- vars[mod[-1]] sub <- lsfit(xH[,vin],yH) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] valres <- y[-H] - sub$coef[1] - as.matrix(x[-H,vin]) %*% sub$coef[-1] #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] fselpilen[i] <- up - low if(low < yf && up > yf) fselpicov <- fselpicov + 1 } fselpimnlen <- mean(fselpilen) fselpicov <- fselpicov/nruns list(dvalfselpicov=fselpicov, dvalfselpimenlen=fselpimnlen)} valrelpisim<-function(n = 100, p = 100, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(glmnet). 
##slow 10 fold CV, #### can fail to run if nruns=5000 #Uses the Pelawa Watagoda and Olive (2017) validation residuals PI #and split conformal PI for lasso variable selection when p is not necessarily small. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 pilen <- 1:nruns ncvec<- pilen opicov <- 0 splitpilen <- pilen splitpicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- 1:q indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #find the 10 fold CV lasso model out<-cv.glmnet(xH,yH) lam <- out$lambda.min pp <- out$nzero[out$lambda==lam] + 1 #crude df for lasso variable selection ncvec[i] <- pp #get the lasso variable selection model lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] sub <- lsfit(xH[,vin],yH) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] valres <- y[-H] - sub$coef[1] - as.matrix(x[-H,vin]) %*% sub$coef[-1] #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 #get the split conformal PI absres <- abs(valres) aq <- quantile(absres, 1-alpha) up <- yfhat + aq low <- yfhat - aq splitpilen[i] <- up - low if(low < yf && up > yf) splitpicov <- splitpicov + 1 } qncmn <- q + 1 - mean(ncvec) pimnlen <- mean(pilen) opicov <- opicov/nruns splitpimnlen <- mean(splitpilen) splitpicov <- splitpicov/nruns list(qncmn=qncmn,valrelpicov=opicov, valrelpimenlen = pimnlen,splitpicov=splitpicov,splitpimenlen=splitpimnlen)} valrrpisim<-function(n = 100, p = 100, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05, pitype = 1){ #Needs library(glmnet). Does lasso or RR. ##slow 10 fold CV, #Uses the Pelawa Watagoda and Olive (2019) validation residuals PI. 
#Simulates PIs for lasso if pitype = 1, ridge regression if pitype = 0 # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 pilen <- 1:nruns ncvec<- pilen opicov <- 0 splitpilen <- pilen splitpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #find the 10 fold CV lasso or ridge regression model out<-cv.glmnet(xH,yH,alpha=pitype) lam <- out$lambda.min pp <- out$nzero[out$lambda==lam] + 1 #d for half set lasso ncvec[i] <- pp yfhat <- predict(out,s=lam,newx=xf) valfit <- predict(out,s=lam,newx=x[-H,]) valres <- y[-H] - valfit #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 #get the split conformal PI absres <- abs(valres) aq <- quantile(absres, 1-alpha) up <- yfhat + aq low <- yfhat - aq splitpilen[i] <- up - low if(low < yf && up > yf) splitpicov <- splitpicov + 1 } qncmn <- q + 1 - mean(ncvec) pimnlen <- mean(pilen) opicov <- opicov/nruns splitpimnlen <- mean(splitpilen) splitpicov <- splitpicov/nruns list(qncmn=qncmn,valpicov=opicov,valpimenlen=pimnlen,splitpicov=splitpicov, splitpimenlen = splitpimnlen)} valrrpisim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(glmnet). Does lasso and RR. #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Uses the Pelawa Watagoda and Olive (2019) shorth PI and split conformal PI #with validation residuals for lasso and RR with 10-fold CV. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. 
# constant = 1 so there are p = q+1 coefficients #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 claspilen <- 1:nruns claspicov <- 0 vlaspilen <- 1:nruns vlaspicov <- 0 vrrpilen <- 1:nruns vrrpicov <- 0 crrpilen <- 1:nruns crrpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) dd <- 1:nruns vdd <- dd indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set # get the validation PI for lasso #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #find the 10 fold CV lasso model out<-cv.glmnet(xH,yH) lam <- out$lambda.min vp <- out$nzero[out$lambda==lam] + 1 #d for half set lasso vdd[i] <- vp yfhat <- predict(out,s=lam,newx=xf) valfit <- predict(out,s=lam,newx=x[-H,]) valres <- y[-H] - valfit #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] vlaspilen[i] <- up - low if(low < yf && up > yf) vlaspicov <- vlaspicov + 1 #get split conformal PI for lasso absres <- abs(valres) aq <- quantile(absres, 1-alpha) up <- yfhat + aq low <- yfhat - aq claspilen[i] <- up - low if(low < yf && up > yf) claspicov <- claspicov + 1 #get the validation PI for ridge regression out<-cv.glmnet(xH,yH,alpha=0) # lcoef2 <- predict(out,type="coefficients",s=lam) # lcoef2 <- as.vector(lcoef2) #bhat_H lam <- out$lambda.min yfhat <- predict(out,s=lam,newx=xf) valfit <- predict(out,s=lam,newx=x[-H,]) valres <- y[-H] - valfit #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] vrrpilen[i] <- up - low if(low < yf && up > yf) vrrpicov <- vrrpicov + 1 #get the split conformal PI for RR absres <- abs(valres) aq <- quantile(absres, 1-alpha) up <- yfhat + aq low <- yfhat - aq crrpilen[i] <- up - low if(low < yf && up > yf) crrpicov <- crrpicov + 1 } vlaspimnlen <- mean(vlaspilen) vlaspicov <- vlaspicov/nruns claspimnlen <- mean(claspilen) claspicov <- claspicov/nruns vrrpimnlen <- mean(vrrpilen) vrrpicov <- vrrpicov/nruns crrpimnlen <- mean(crrpilen) crrpicov <- crrpicov/nruns vdd<-mean(vdd) #lasso and scaled lasso have the same d 
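#The two PIs above use the same validation residuals: the shorth PI
#[yfhat+vpi[1], yfhat+vpi[2]] can be asymmetric about yfhat, while the split
#conformal PI yfhat +/- the (1-alpha) quantile of |valres| is symmetric.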
list(dvlas=vdd,vlaspicov=vlaspicov,vlaspimenlen=vlaspimnlen,claspicov=claspicov,claspimenlen=claspimnlen,vrrpicov=vrrpicov,vrrpimenlen=vrrpimnlen, crrpicov=crrpicov,crrpimenlen=crrpimnlen)} valvspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(leaps). min Cp Olive (2017) validation residuals PI #Simulates PIs for forward selection variable selection. #Needs p > 2. #Needs n/p large (n > 2p) since the min Cp criterion is used. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. #set.seed(974) pilen <- 1:nruns ps <- pilen opicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #find the forward sel minimum Cp model on the training half set H tem<-regsubsets(xH,yH,nvmax=vmax,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(xH[,vin],yH) ps[i]<-length(sub$coef) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] valres <- y[-H] - sub$coef[1] - as.matrix(x[-H,vin]) %*% sub$coef[-1] #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } psmn <- mean(ps)-k #0 if subset is selecting optimal subset pimnlen <- mean(pilen) opicov <- opicov/nruns list(psmn=psmn,opicov=opicov, pimenlen = pimnlen)} valvspisim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(leaps). #EBIC or min Cp: Pelawa Watagoda and Olive (2019) validation residuals PI #and split conformal PI #Simulates PIs for forward selection variable selection. #Needs p > 2. 
# 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors #4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) pilen <- 1:nruns ps <- pilen opicov <- 0 splitpilen <- pilen splitpicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) indx <- 1:n nH <- ceiling(n/2) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find sets H and V perm <- sample(indx,n) H <- perm[1:nH] xH <- x[H,] yH <- y[H] #find the forward sel minimum Cp or EBIC model on the training half set H tem<-regsubsets(xH,yH,nvmax=vmax,method="forward") out<-summary(tem) if(nH < 10*p) { xx <- 1:min(length(out$bic),p-1)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]] } #if nH >= 10p use min Cp model else { mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] } sub <- lsfit(xH[,vin],yH) ps[i]<-length(sub$coef) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] valres <- y[-H] - sub$coef[1] - as.matrix(x[-H,vin]) %*% sub$coef[-1] #need as.matrix to prevent an error if vin has 1 variable #get asymptotically optimal PI using validation residuals vpi <- shorth3(valres,alpha=alpha)$shorth up <- yfhat + vpi[2] low <- yfhat + vpi[1] pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 #get the split conformal PI absres <- abs(valres) aq <- quantile(absres, 1-alpha) up <- yfhat + aq low <- yfhat - aq splitpilen[i] <- up - low if(low < yf && up > yf) splitpicov <- splitpicov + 1 } psmn <- mean(ps)-k #0 if subset is selecting optimal subset pimnlen <- mean(pilen) opicov <- opicov/nruns splitpimnlen <- mean(splitpilen) splitpicov <- splitpicov/nruns list(psmn=psmn,opicov=opicov,pimenlen=pimnlen,splitpicov=splitpicov, splitpimenlen = splitpimnlen)} vecw <- function(z){ #Computes vec(z): stacks the columns of z into a column vector. z <- as.matrix(z) vecz <- as.vector(z[,1]) p <- dim(z)[2] if(p > 1){ for(i in 2:p) vecz <- c(vecz,as.vector(z[,i])) } list(vecz=vecz) } vsbiccisim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). 
Has psi and used 3 confidence regions for CIs. #Gets CIs for beta_i with shorth, pred reg, Bickel and Ren methods. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) scicov <- 0*(1:p) savelen <- scicov prcicov <- scicov pravelen <- scicov brcicov <- scicov bravelen <- scicov rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) up <- min((1 - alph/2), (1 - alph + 10*alph/BB)) if(alph > 0.1) up <- min((1 - alph + 0.05), (1 - alph + 1/BB)) qB <- up if(qB < 1 - alph + 0.001) up <- 1 - alph for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-bicboot(x,y,B=BB) #bootstrap the forward sel minimum BIC model for (j in 1:p){ #get shorth CIs tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) scicov[j] <- scicov[j] + 1 savelen[j] <- savelen[j] + tem$shorth[2] - tem$shorth[1] #get prediction region CIs tem <- betaci(t=out$betas[,j],bhat=out$bhatimin0[j],up=up) if(beta[j] >= tem$prci[1] && beta[j] <= tem$prci[2]) prcicov[j] <- prcicov[j] + 1 pravelen[j] <- pravelen[j] + tem$prcilen #get Bickel and Ren CIs if(beta[j] >= tem$brci[1] && beta[j] <= tem$brci[2]) brcicov[j] <- brcicov[j] + 1 bravelen[j] <- bravelen[j] + tem$brcilen } } scicov <- scicov/nruns savelen <- savelen/nruns prcicov <- prcicov/nruns pravelen <- pravelen/nruns brcicov <- brcicov/nruns bravelen <- bravelen/nruns list(scicov=scicov,savelen=savelen,prcicov=prcicov,pravelen=pravelen, brcicov=brcicov,bravelen=bravelen,beta=beta,k=k)} vsbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Needs library(leaps). For all subsets variable selection so very slow. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for variable selection using all subsets variable #selection. So need p small. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
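##Example call (sketch): all subsets variable selection, so keep p small.
#library(leaps)
#out <- vsbootsim(n=100, p=4, nruns=100, type=1)
#out$cicov  #shorth CI coverages for the p coefficients; the last entry is the
#  #proportion of runs where Ho: the last p - k betas are 0 was not rejected
#out$avelen #average CI lengths; the last entry is the average test cutoff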
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-vselboot(x,y) #bootstrap the minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim2<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Needs library(leaps). No psi. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim3<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). Has psi and uses the prediction region method. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection. 
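##In the test below, D0 from predreg measures how far the hypothesized value
##(the zero vector) is from the bootstrap sample of coefficient estimates,
##and Ho is not rejected when D0 <= the cutoff cuplim, that is, when 0 lies
##inside the bootstrap prediction region.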
#Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y,B=BB) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim4<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). Has psi and used 3 confidence regions. #Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
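##For example, with q = 3 and psi = 0.5, rho = (1 + 0.25)/(1 + 0.5) = 0.833,
##while psi = 0.9 gives rho = 2.61/2.62 = 0.996: the predictors are nearly
##collinear as psi approaches 1.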
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y,B=BB) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- out$bhatimin0[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- out$bhatimin0[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim5<-function(n=100,p=4,k=1,nruns=100, eps=0.1,shift=9, cc=0.01, augm=F,type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). Has psi and used 3 confidence regions. #Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection and bhatMIX. # Needs fselboot2 and confreg. Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. If augm neq F, adds cB full model bootstrap samples so S*_T is better #conditioned. The constant = 1 so there are p = q+1 coefficients. #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. 
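##With the defaults cc = 0.01 and BB = 1000, setting augm not equal to F adds
##cc*BB = 10 full model bootstrap samples to the BB variable selection
##bootstrap samples so that S*_T is better conditioned.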
# cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) cicovmix <- cicov avelenmix <- avelen rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot2(x,y,B=BB,c=cc,aug=augm) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] tem <- shorth3(out$btmix[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicovmix[j] <- cicovmix[j] + 1 avelenmix[j] <- avelenmix[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- out$bhatimin0[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim tem <- confreg(out$btmix[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicovmix[pp1] <- cicovmix[pp1] + 1 avelenmix[pp1] <- avelenmix[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicovmix[pp2] <- cicovmix[pp2] + 1 avelenmix[pp2] <- avelenmix[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicovmix[pp3] <- cicovmix[pp3] + 1 avelenmix[pp3] <- avelenmix[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- out$bhatimin0[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim tem <- confreg(out$btmix[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. 
method cicovmix[pp4] <- cicovmix[pp4] + 1 avelenmix[pp4] <- avelenmix[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicovmix[pp5] <- cicovmix[pp5] + 1 avelenmix[pp5] <- avelenmix[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicovmix[pp6] <- cicovmix[pp6] + 1 avelenmix[pp6] <- avelenmix[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns cicovmix <- cicovmix/nruns avelenmix <- avelenmix/nruns list(cicov=cicov,avelen=avelen,cicovmix=cicovmix,avelenmix=avelenmix, beta=beta,k=k)} vsbrbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, m = 10, B=1000, int=0, a = 5/3, alpha = 0.05){ ##Needs library(MASS). ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for binomial regression with backward #elimination. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. # If m=1, a = 4/3 may work better. #set.seed(974) ##need p>2 and want n >= 5p q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] mv <- 0*1:n + m beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p dd <- 1:nruns ddboot <- 1:B for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rbinom(n,size=m,prob=(exp(SP)/(1+exp(SP)))) ny <- mv-y #ny[i] = mv[i]-y[i] = no. of ``failures" tdata <- as.data.frame(cbind(x,y)) #make a BR data set out <- glm(cbind(y,ny)~., family=binomial, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) outbe <- step(out,trace=0) #backward elimination dd[i]<-length(outbe$coef) vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhat <- zero bhat[vin] <- outbe$coef #bhatImin0 from x and y betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rbinom(n,size=m,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) outbe <- step(temp,trace=0) #backward elimination ddboot[i]<-length(outbe$coef) vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhatimin0 <- zero bhatimin0[vin] <- outbe$coef betas[i,] <- bhatimin0 #from x and y* } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] #bhatImin0 from x and y tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. 
method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- bhat[1:(k+1)] #bhatImin0 from x and y tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns mndd <- mean(dd) mnddboot <- mean(ddboot) #for the last bootstrap sample list(mndd=mndd,mnddboot=mnddboot,cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbrbootsim2<-function(n=100,p=4,k=1,nruns=100,psi=0.0,cc=0.01,augm=F,m=10, BB=1000,int=0,a=5/3,alph=0.05,binary=F){ ##Needs library(MASS). ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for binomial regression with backward #elimination. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. # If m=1, a = 4/3 may work better. #set.seed(974) ##need p>2 and want n >= 5p q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) cicovmix <- cicov avelenmix <- avelen rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] mvv <- 0*1:n + m beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rbinom(n,size=m,prob=(exp(SP)/(1+exp(SP)))) out <-vsLRboot2(x,y,mv=mvv,c=cc,aug=augm,B=BB,bin=binary) #bootstrap the backward elimination minimum AIC model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] tem <- shorth3(out$btmix[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicovmix[j] <- cicovmix[j] + 1 avelenmix[j] <- avelenmix[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- out$bhatimin0[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. 
method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim tem <- confreg(out$btmix[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicovmix[pp1] <- cicovmix[pp1] + 1 avelenmix[pp1] <- avelenmix[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicovmix[pp2] <- cicovmix[pp2] + 1 avelenmix[pp2] <- avelenmix[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicovmix[pp3] <- cicovmix[pp3] + 1 avelenmix[pp3] <- avelenmix[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- out$bhatimin0[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim tem <- confreg(out$btmix[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicovmix[pp4] <- cicovmix[pp4] + 1 avelenmix[pp4] <- avelenmix[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicovmix[pp5] <- cicovmix[pp5] + 1 avelenmix[pp5] <- avelenmix[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicovmix[pp6] <- cicovmix[pp6] + 1 avelenmix[pp6] <- avelenmix[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns cicovmix <- cicovmix/nruns avelenmix <- avelenmix/nruns list(cicov=cicov,avelen=avelen,cicovmix=cicovmix,avelenmix=avelenmix,beta=beta,k=k)} vscisim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). Has psi and used 3 confidence regions for CIs. #Gets CIs for beta_i with shorth, pred reg, Bickel and Ren methods. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, k is the number of nonnoise variables #Need k < p-1 so there are some zeroes in the model. #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
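##Example call (sketch; needs library(leaps), fselboot, shorth3, and betaci):
#out <- vscisim(n=100, p=4, k=1, nruns=100, BB=1000, type=1)
#cbind(out$scicov, out$prcicov, out$brcicov)  #coverages of the shorth,
#  #prediction region, and Bickel and Ren CIs for beta_1,...,beta_p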
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) scicov <- 0*(1:p) savelen <- scicov prcicov <- scicov pravelen <- scicov brcicov <- scicov bravelen <- scicov rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) up <- min((1 - alph/2), (1 - alph + 10*alph/BB)) if(alph > 0.1) up <- min((1 - alph + 0.05), (1 - alph + 1/BB)) qB <- up if(qB < 1 - alph + 0.001) up <- 1 - alph for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y,B=BB) #bootstrap the forward sel minimum Cp model for (j in 1:p){ #get shorth CIs tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) scicov[j] <- scicov[j] + 1 savelen[j] <- savelen[j] + tem$shorth[2] - tem$shorth[1] #get prediction region CIs tem <- betaci(t=out$betas[,j],bhat=out$bhatimin0[j],up=up) if(beta[j] >= tem$prci[1] && beta[j] <= tem$prci[2]) prcicov[j] <- prcicov[j] + 1 pravelen[j] <- pravelen[j] + tem$prcilen #get Bickel and Ren CIs if(beta[j] >= tem$brci[1] && beta[j] <= tem$brci[2]) brcicov[j] <- brcicov[j] + 1 bravelen[j] <- bravelen[j] + tem$brcilen } } scicov <- scicov/nruns savelen <- savelen/nruns prcicov <- prcicov/nruns pravelen <- pravelen/nruns brcicov <- brcicov/nruns bravelen <- bravelen/nruns list(scicov=scicov,savelen=savelen,prcicov=prcicov,pravelen=pravelen, brcicov=brcicov,bravelen=bravelen,beta=beta,k=k)} vselboot<-function(x,y,B = 1000){ #needs library(leaps) #need n and p small, 2 < p < 30 #allsubsets minimum Cp regression #does not make sense to do variable selection if there #is only one nontrivial predictor x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel out<-leaps(x,y) mincp <- out$which[out$Cp==min(out$Cp)] vin <- vars[mincp] sub <- lsfit(x[,vin],y) betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) out<-leaps(x,y=yb) mincp <- out$which[out$Cp==min(out$Cp)] vin <- vars[mincp] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,betas=betas) } vsLRboot<-function(x,y,mv=c(1,1),B = 1000,bin=T){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #mv is the m_i vector of the number of trials; if bin=T #then for binary LR the program provides the number of trials #bootstrap min AIC logistic regression backward elimination model #Does not make sense to do variable selection if there #is only one nontrivial predictor. 
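##Example call (sketch): x = predictor matrix, y = vector of 0-1 responses
##when bin=T; the m_i = 1 trials are then filled in by the program.
#out <- vsLRboot(x, y, B=1000, bin=T)
#out$bhatimin0  #backward elimination min AIC estimator from the data
#out$betas      #B x p matrix of parametric bootstrap estimators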
x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] zero <- 0 * 1:p bhat <- zero tdata <- as.data.frame(cbind(x,y)) if(bin==T) mv <- 0*1:n + 1 ny <- mv-y out <- glm(cbind(y,ny)~., family=binomial, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) outbe <- step(out,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhat[vin] <- outbe$coef #bhatImin0 from X and Y betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rbinom(n,size=mv,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) outbe <- step(temp,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhatimin0 <- zero bhatimin0[vin] <- outbe$coef betas[i,] <- bhatimin0 #from X and Y* } list(bhatimin0=bhat,betas=betas) } vsLRboot2<-function(x,y,mv=c(1,1),c=0.01,aug=F,B=1000,bin=T){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #mv is the m_i vector of the number of trials; if bin=T #then for binary LR the program provides the number of trials #bootstraps min AIC logistic regression backward elimination model #using bhatVS and bhatMIX #If augm neq F, adds cB full model bootstrap samples so S*_T is better #conditioned. #Does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] zero <- 0 * 1:p d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 bhat <- zero tdata <- as.data.frame(cbind(x,y)) if(bin==T) mv <- 0*1:n + 1 ny <- mv-y out <- glm(cbind(y,ny)~., family=binomial, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) outbe <- step(out,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhat[vin] <- outbe$coef #bhatImin0 from X and Y betas <- matrix(0,nrow=bpd,ncol=p) btmix <- betas for(i in 1:B){ ydat <- rbinom(n,size=mv,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) outbe <- step(temp,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames betas[i,vin] <- outbe$coef #bhatimin0 from X and Y* ydat <- rbinom(n,size=mv,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- as.data.frame(cbind(x[,vin[-1]],ydat)) outmix<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) btmix[i,vin]<- outmix$coef #bhatimin0 from random selection } if(aug == F) {betas <- betas[1:B,]; btmix <- btmix[1:B,]} else { for(i in bp1:bpd){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) betas[i,]<-glm(ydat~., family=poisson, data=tdat)$coef } btmix[bp1:bpd,] <- betas[bp1:bpd,] } list(bhatimin0=bhat,betas=betas,btmix=btmix) } vspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(leaps). Uses min Cp and the Olive (2013) PI. #Simulates PIs for forward selection variable selection. # 1 <= k <= p-1 is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. 
As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) ##need p>2 corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn pilen <- 1:nruns ps <- pilen opicov <- 0 q <- p-1 vmax <- min(p,as.integer(n/5)) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the forward sel minimum Cp model tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) ps[i]<-length(sub$coef) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] fres <- sub$resid #get asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } psmn <- mean(ps)-k #near 0 if subset is selecting optimal subset pimnlen <- mean(pilen) opicov <- opicov/nruns list(psmn=psmn,opicov=opicov, pimenlen = pimnlen)} vspisim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, J = 5, alpha = 0.05){ #Needs library(leaps). Uses the Pelawa Watagoda and Olive (2017) PI. #Simulates PIs for forward selection variable selection using EBIC if n<10p # and min Cp if n >=10p. # ebic - 2 p log(2) = out$bic+2*log(dbinom(x=xx,size=p,prob=0.5)) #ebic <- out$bic+2*log(2^p*dbinom(x=xx,size=p,prob=0.5)) #ebic <- out$bic+2*(lgamma(p+1)-lgamma(xx+1)-lgamma(p-xx+1)) #Formula uses EBIC(I) - 2 lgamma(p+1). # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
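##Since log choose(p,a) = lgamma(p+1) - lgamma(a+1) - lgamma(p-a+1), dropping
##the constant 2*lgamma(p+1) from EBIC(I) = BIC(I) + 2 log choose(p,|I|) does
##not change which submodel minimizes the criterion.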
set.seed(974) ##need p>2 val <- 8*n/9 fselpilen <- 1:nruns fselpicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use zz<-1:nc dd <- 1:nruns for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #get forward selection PI temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) if(n < 10*p) { xx <- 1:min(length(out$bic),p-1)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) #print(ebic) dd[i] <- zz[ebic==min(ebic)] #want these to be near but >= k minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]] sub <- lsfit(x[,vin],y) yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid pp <- length(vin)+1 tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } #if n >= 10p use min Cp model else { mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) dd[i]<-length(sub$coef)#want these to be near but >= k pp<-dd[i] yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1] res <- sub$resid tem <- dpi(yf=yf,yfhat=yfhat,d=pp,resid=res,alph=alpha) fselpilen[i] <- tem$up - tem$low fselpicov <- fselpicov + tem$inr } } fselpimnlen <- mean(fselpilen) fselpicov <- fselpicov/nruns list(dfsel=dd,fselpicov=fselpicov, fselpimenlen=fselpimnlen)} vsPRboot<-function(x,y,B = 1000){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #bootstraps the Poisson regression backward elimination x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] zero <- 0 * 1:p tdata <- as.data.frame(cbind(x,y)) out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) outbe <- step(out,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhat <- zero bhat[vin] <- outbe$coef #bhatImin0 from X and Y betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) outbe <- step(temp,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhatimin0 <- zero bhatimin0[vin] <- outbe$coef betas[i,] <- bhatimin0 #from X and Y* } list(bhatimin0=bhat,betas=betas) } vsPRboot2<-function(x,y,B=1000,c=0.01,aug=F) {#needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #bootstraps the Poisson regression backward elimination #using bhatVS and bhatMIX #If augm neq F, adds cB full model bootstrap samples so S*_T is better #conditioned. 
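##Example call (sketch): x = predictor matrix, y = vector of Poisson counts.
#out <- vsPRboot2(x, y, B=1000, c=0.01, aug=T)
#out$bhatimin0  #backward elimination min AIC estimator from the data
#out$betas      #bootstrapped bhatVS; out$btmix contains bootstrapped bhatMIX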
x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] zero <- 0 * 1:p d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 tdata <- as.data.frame(cbind(x,y)) out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) outbe <- step(out,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhat <- zero bhat[vin] <- outbe$coef #bhatImin0 from x and y betas <- matrix(0,nrow=bpd,ncol=p) btmix <- betas for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) outbe <- step(temp,trace=0) #backward elimination vinnames <- names(outbe$coef) vin <- varnames %in% vinnames betas[i,vin] <- outbe$coef #bhatimin0 from x and y* ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x[,vin[-1]],ydat)) outmix<-glm(ydat~., family=poisson, data=tdat) btmix[i,vin]<- outmix$coef #bhatimin0 from random selection } if(aug == F) {betas <- betas[1:B,]; btmix <- btmix[1:B,]} else { for(i in bp1:bpd){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) betas[i,]<-glm(ydat~., family=poisson, data=tdat)$coef } btmix[bp1:bpd,] <- betas[bp1:bpd,] } list(bhatimin0=bhat,betas=betas,btmix=btmix) } vsprbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, B=1000, int=1, a = 1, alpha = 0.05){ ##Needs library(MASS). ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for Poisson regression (backwards elimination). # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int - 3a > -4. 
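##With the defaults int = 1 and a = 1, SP is roughly in int - 3a = -2 to
##int + 3a = 4, so the Poisson means exp(SP) range from about exp(-2) = 0.14
##to exp(4) = 54.6.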
q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p dd <- 1:nruns ddboot <- 1:B for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) tdata <- as.data.frame(cbind(x,y)) #make a PR data set out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdata) varnames <- names(out$coef) outbe <- step(out,trace=0) #backward elimination dd[i]<-length(outbe$coef) vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhat <- zero bhat[vin] <- outbe$coef #bhatImin0 from x and y betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) outbe <- step(temp,trace=0) #backward elimination ddboot[i]<-length(outbe$coef) vinnames <- names(outbe$coef) vin <- varnames %in% vinnames bhatimin0 <- zero bhatimin0[vin] <- outbe$coef betas[i,] <- bhatimin0 #bhatImin0 from x and y* } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] #bhatImin0 from x and y tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- bhat[1:(k+1)] #bhatImin0 from x and y tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns mndd <- mean(dd) mnddboot <- mean(ddboot) #for the last bootstrap sample list(mndd=mndd,mnddboot=mnddboot,cicov=cicov,avelen=avelen,beta=beta,k=k)} vsprbootsim2<-function(n=100,p=4,k=1,nruns=100,psi=0.0,cc=0.01,augm=F, BB=1000,int=1,a=1,alph=0.05){ ##Needs library(MASS), confreg, shorth3, vsPRboot2. ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for Poisson regression (backwards elimination) # with variable selection and bhatMIX. 
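##Example call (sketch; needs library(MASS), confreg, shorth3, and vsPRboot2):
#out <- vsprbootsim2(n=100, p=4, k=1, nruns=100, BB=1000, augm=T)
#out$cicov; out$cicovmix  #CI and test coverages for bhatVS and bhatMIX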
# constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int - 3a > -4. q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) cicovmix <- cicov avelenmix <- avelen rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1]<-int zero <- 0 * 1:p for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) #make a PR data set out <-vsPRboot2(x,y,B=BB,c=cc,aug=augm) #bootstrap the backward elimination minimum AIC model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] tem <- shorth3(out$btmix[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicovmix[j] <- cicovmix[j] + 1 avelenmix[j] <- avelenmix[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- out$bhatimin0[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim tem <- confreg(out$btmix[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicovmix[pp1] <- cicovmix[pp1] + 1 avelenmix[pp1] <- avelenmix[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicovmix[pp2] <- cicovmix[pp2] + 1 avelenmix[pp2] <- avelenmix[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicovmix[pp3] <- cicovmix[pp3] + 1 avelenmix[pp3] <- avelenmix[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- out$bhatimin0[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim tem <- confreg(out$btmix[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. 
method cicovmix[pp4] <- cicovmix[pp4] + 1 avelenmix[pp4] <- avelenmix[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicovmix[pp5] <- cicovmix[pp5] + 1 avelenmix[pp5] <- avelenmix[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicovmix[pp6] <- cicovmix[pp6] + 1 avelenmix[pp6] <- avelenmix[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns cicovmix <- cicovmix/nruns avelenmix <- avelenmix/nruns list(cicov=cicov,avelen=avelen,cicovmix=cicovmix,avelenmix=avelenmix,beta=beta,k=k)} wpisim<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, J = 5, B=1000, a = 1, gam=1, clam=0.1, alpha = 0.05){ #Program can fail if n/p is small and nruns > 100. #Use 1 <= k <= p, where k is the number of nonnoise variables. #Simulates the Olive et al. (2019) PIs for Weibull regression. #PIs for full model, need library(survival) #there are p coefficients #need p > 1, beta = (1, ..., 1, 0, ..., 0) with k ones, p-k zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. SP~N(0,a^2), and a near 1 is ok. #set.seed(974) ##need p>2 fullpilen <- 1:nruns fullpicov <- 0 rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) val <- a/sqrt(k*(1 + (p-1)*psi^2) + k*(k-1)*(2*psi + (p-2)*psi^2)) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 b <- 0 * 1:p b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) x <- val* x %*% A xf <- val* rnorm(p) %*% A SP <- x%*%b #SP_i ~ N(0,a^2) lambdai <- exp(SP) lambdaf <- exp(xf%*%b) w <- rexp(n, rate = lambdai) wf <- rexp(1, rate = lambdaf) y <- w^(1/gam) yf <- wf^(1/gam) #uncensored cen <- rexp(n, rate = clam) ceny <- pmin(y, cen) status <- as.numeric(cen >= y) statusf <- c(status,1) tdata <- as.data.frame(cbind(x,ceny,status)) cenyn <- c(y,yf) xn <- rbind(x,xf) tdat <- as.data.frame(cbind(xn,cenyn,statusf)) #make a WR data set #get full model WR PI if(n >= 5*p){ outw <- survreg(Surv(ceny, status) ~ ., data = tdata) int <- outw$coef[1] bhat <- outw$coef[-1] sig <- outw$scale ghat=1/sig espw <- -xf%*%bhat/sig lamxf <- exp(-int/sig)*exp(espw) sc <- 1/lamxf^(1/ghat) ydat <- rweibull(B,shape=ghat,scale=sc) #tem <- shpi(yf=yf,ydat=ydat,alph=alpha) tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) fullpilen[i] <- tem$up - tem$low fullpicov <- fullpicov + tem$inr } } fullpimnlen <- mean(fullpilen) fullpicov <- fullpicov/nruns list(int=int,beta=b,fullpicov=fullpicov,fullpimenlen=fullpimnlen) }
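##Example call (sketch; needs library(survival) and mshpi):
#library(survival)
#out <- wpisim(n=200, p=4, k=1, nruns=100, gam=1, clam=0.1)
#out$fullpicov     #simulated coverage of the full model Weibull regression PI
#out$fullpimenlen  #mean PI length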