accisimf<- function(n = 50, N = 500, nruns = 500, p = 0.5, alpha = 0.05){
#Simulates the classical, Agresti Coull type, and modified CIs
#for p for finite samples.
#The AC CI seems to work well for n < 0.4 N, N large. The classical CI
#seems to work well for p = 0.01 if n > 0.3 N and N is large.
#Changes the CI [L,U] to [max(0,L),min(1,U)] to get shorter lengths.
  ccov <- 0
  accov <- 0
  mcov <- 0
  cut <- 1 - alpha/2
  clow <- 1:nruns
  cup <- clow
  aclow <- clow
  acup <- clow
  mlow <- clow
  mup <- cup
  cp <- clow
  acp <- clow
  val <- qnorm(cut)
  valsq <- val^2
  nt <- n + valsq
  sqrtfpc <- sqrt((N - n)/N)
  pop <- 0*1:N
  #make the population proportion p approximately the nominal p
  pp <- as.integer(p*N)
  pop[1:pp] <- 1
  p <- mean(pop)
  for(i in 1:nruns) {
    x <- sample(pop, n)
    #get the classical CI
    phat <- mean(x)
    se <- sqrt((phat * (1 - phat))/(n - 1)) * sqrtfpc
    tem <- val * se
    clow[i] <- phat - tem
    cup[i] <- phat + tem
    clow[i] <- max(0, clow[i])
    cup[i] <- min(1, cup[i])
    if(clow[i] <= p && cup[i] >= p) ccov <- ccov + 1
    cp[i] <- phat
    #get the AC CI
    pt <- (n * phat + 0.5 * valsq)/nt
    se <- sqrt((pt * (1 - pt))/nt) * sqrtfpc
    tem <- val * se
    aclow[i] <- pt - tem
    acup[i] <- pt + tem
    aclow[i] <- max(0, aclow[i])
    acup[i] <- min(1, acup[i])
    if(aclow[i] <= p && acup[i] >= p) accov <- accov + 1
    acp[i] <- pt
    #get the modified CI
    mlow[i] <- max(0, min(clow[i], aclow[i]))
    mup[i] <- min(1, max(cup[i], acup[i]))
    if(mlow[i] <= p && mup[i] >= p) mcov <- mcov + 1
  }
  ccov <- ccov/nruns
  clen <- sqrt(n) * mean(cup - clow)
  accov <- accov/nruns
  aclen <- sqrt(n) * mean(acup - aclow)
  mcov <- mcov/nruns
  mlen <- sqrt(n) * mean(mup - mlow)
  print("p,mean(phat),mean(ptilde)")
  print(p)
  print(mean(cp))
  print(mean(acp))
  list(ccov = ccov, clen = clen, accov = accov, aclen = aclen,
    mcov = mcov, mlen = mlen)
}

AERplot<-function(yhat, y, res, d = 1, alph = 0.1){
# Makes a response plot where cases within their PI are not plotted.
# Need alph around 0.3 if n is small. Use alph = 0.01 if n > 100000.
# If alph = 1, the usual response plot of yhat versus y is made.
# The value d is a crude estimate of the model complexity or degrees of freedom.
# Could use d = p if n >> p where n is the sample size and p = no. of predictors.
# For forward selection, lasso, PCR, and PLS, d = number of nonzero
# estimated beta coefficients for the model Y = x^T beta + e.
  n <- length(y)
  val <- 8*n/9
  if(alph == 1){
    plot(yhat, y)
  }
  else{
    ymin <- min(y)
    ymax <- max(y)
    yhmin <- min(yhat)
    yhmax <- max(yhat)
    plot(c(yhmin, yhmax), c(ymin, ymax), type = "n", ylab = "Y", xlab = "YHAT")
    #get PI for the residuals
    if(d < val) corfac <- (1 + 15/n) * sqrt((n + 2*d)/(n - d))
    else corfac <- 5*(1 + 15/n)
    if(alph > 0.1){qn <- min(1 - alph + 0.05, 1 - alph + d/n)}
    if(alph <= 0.1){qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)}
    pn <- qn
    if(pn < 1 - alph + 0.001) qn <- 1 - alph
    alphan <- 1 - qn
    sres <- sort(res)
    cc <- ceiling(n * (1 - alphan))
    rup <- sres[cc]
    rlow <- sres[1]
    olen <- rup - rlow
    if(cc < n){
      for(j in (cc + 1):n){
        zlen <- sres[j] - sres[j - cc + 1]
        if(zlen < olen){
          olen <- zlen
          rup <- sres[j]
          rlow <- sres[j - cc + 1]
        }
      }
    }
    #Plot points corresponding to Ys not in their PIs.
    #The lower-tail call and the closing lines were garbled in the source;
    #they are reconstructed by symmetry with the upper-tail call and AERplot2.
    resup <- corfac*rup
    reslow <- corfac*rlow
    points(yhat[y > yhat + resup], y[y > yhat + resup])
    points(yhat[y < yhat + reslow], y[y < yhat + reslow])
    abline(0, 1)
    abline(resup, 1)
    abline(reslow, 1)
    list(respi = c(reslow, resup))
  }
}
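## Example sketch: the settings below are illustrative, not recommendations.
## accisimf prints p, mean(phat), mean(ptilde), then returns coverages/lengths:
# out <- accisimf(n = 50, N = 500, nruns = 500, p = 0.5)
# out$ccov; out$accov; out$mcov   #want coverage near 1 - alpha = 0.95
## AERplot takes fitted values, responses, and residuals, e.g. from lsfit;
## the toy data x, y below are assumed, not from the original file:
# x <- matrix(rnorm(100*3), ncol = 3); y <- 1 + x[,1] + rnorm(100)
# fit <- lsfit(x, y)
# AERplot(y - fit$residuals, y, fit$residuals, d = 4)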
AERplot2<-function(yhat, y, res, d = 1, alph = 0.1){
# Makes a response plot with the pointwise prediction interval bands added.
# (The header of this function was garbled in the source; the argument list
# is reconstructed to match AERplot.)
# The value d is a crude estimate of the model complexity or degrees of freedom.
# Could use d = p if n >> p where n is the sample size and p = no. of predictors.
# For forward selection, lasso, PCR, and PLS, d = number of nonzero
# estimated beta coefficients for the model Y = x^T beta + e.
  n <- length(y)
  val <- 8*n/9
  plot(yhat, y)
  #get PI for the residuals
  if(d < val) corfac <- (1 + 15/n) * sqrt((n + 2*d)/(n - d))
  else corfac <- 5*(1 + 15/n)
  if(alph > 0.1){qn <- min(1 - alph + 0.05, 1 - alph + d/n)}
  if(alph <= 0.1){qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)}
  pn <- qn
  if(pn < 1 - alph + 0.001) qn <- 1 - alph
  alphan <- 1 - qn
  sres <- sort(res)
  cc <- ceiling(n * (1 - alphan))
  rup <- sres[cc]
  rlow <- sres[1]
  olen <- rup - rlow
  if(cc < n){
    for(j in (cc + 1):n){
      zlen <- sres[j] - sres[j - cc + 1]
      if(zlen < olen){
        olen <- zlen
        rup <- sres[j]
        rlow <- sres[j - cc + 1]
      }
    }
  }
  #Plot pointwise PIs.
  resup <- corfac*rup
  reslow <- corfac*rlow
  abline(0, 1)
  abline(resup, 1)
  abline(reslow, 1)
  list(respi = c(reslow, resup))
}

bcisim<- function(n = 100, nruns = 500, p = 0.5, alpha = 0.05){
# Simulates the classical, Agresti Coull, and exact CIs for p.
# Changes the CI (L,U) to (max(0,L),min(1,U)) to get shorter lengths.
  ccov <- 0
  accov <- 0
  ecov <- 0
  cut <- 1 - alpha/2
  cut2 <- alpha/2
  clow <- 1:nruns
  cup <- clow
  aclow <- clow
  acup <- clow
  elow <- clow
  eup <- clow
  val <- qnorm(cut)
  valsq <- val^2
  nt <- n + valsq
  zup <- 1/(1 + n * qf(alpha, 2 * n, 2))
  zlow <- n/(n + qf((1 - alpha), 2, 2 * n))
  for(i in 1:nruns) {
    x <- rbinom(n, 1, p)
    #get the classical CI
    phat <- mean(x)
    se <- sqrt((phat * (1 - phat))/n)
    tem <- val * se
    clow[i] <- max(0, phat - tem)
    cup[i] <- min(1, phat + tem)
    if(clow[i] < p && cup[i] > p) ccov <- ccov + 1
    #get the AC CI
    pt <- (n * phat + 0.5 * valsq)/nt
    se <- sqrt((pt * (1 - pt))/nt)
    tem <- val * se
    aclow[i] <- max(0, pt - tem)
    acup[i] <- min(1, pt + tem)
    if(aclow[i] < p && acup[i] > p) accov <- accov + 1
    #get the exact CI
    w <- sum(x)
    if(w == 0) {
      elow[i] <- 0
      eup[i] <- zup
    }
    else if(w == n) {
      elow[i] <- zlow
      eup[i] <- 1
    }
    else {
      val2 <- w + (n - w + 1) * qf(cut, (2 * (n - w + 1)), (2 * w))
      elow[i] <- w/val2
      val2 <- w + 1 + (n - w) * qf(cut2, (2 * (n - w)), (2 * (w + 1)))
      eup[i] <- (w + 1)/val2
    }
    if(elow[i] < p && eup[i] > p) ecov <- ecov + 1
  }
  ccov <- ccov/nruns
  clen <- sqrt(n) * mean(cup - clow)
  accov <- accov/nruns
  aclen <- sqrt(n) * mean(acup - aclow)
  ecov <- ecov/nruns
  elen <- sqrt(n) * mean(eup - elow)
  list(ccov = ccov, clen = clen, accov = accov, aclen = aclen,
    ecov = ecov, elen = elen)
}

bicboot<-function(x, y, B = 1000){
#needs library(leaps), n > 5p, p > 2
#Bootstraps the min BIC forward selection regression model.
#Does not make sense to do variable selection if there
#is only one nontrivial predictor.
  x <- as.matrix(x)
  n <- length(y)
  p <- 1 + dim(x)[2]
  vmax <- min(p, as.integer(n/5))
  vars <- as.vector(1:(p-1))
  #get the full model
  full <- lsfit(x, y)
  res <- full$resid
  fit <- y - res
  #get the minimum BIC submodel
  tem <- regsubsets(x, y, nvmax = vmax, method = "forward")
  out <- summary(tem)
  minbic <- out$which[out$bic == min(out$bic), ] #row of the min BIC model
  #do not need the constant in vin
  vin <- vars[minbic[-1]]
  sub <- lsfit(x[, vin], y)
  bhatimin0 <- 0*1:p
  indx <- c(1, 1 + vin)
  bhatimin0[indx] <- sub$coef
  betas <- matrix(0, nrow = B, ncol = p)
  #bootstrap the minimum BIC submodel
  for(i in 1:B){
    yb <- fit + sample(res, n, replace = T)
    tem <- regsubsets(x, y = yb, method = "forward")
    out <- summary(tem)
    minbic <- out$which[out$bic == min(out$bic), ]
    vin <- vars[minbic[-1]]
    indx <- c(1, 1 + vin)
    betas[i, indx] <- lsfit(x[, vin], yb)$coef
  }
  list(full = full, sub = sub, bhatimin0 = bhatimin0, betas = betas)
}
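## Example sketch (assumes library(leaps) is loaded; the toy data are not
## from the original file):
# library(leaps)
# x <- matrix(rnorm(100*3), ncol = 3); y <- 1 + x[,1] + rnorm(100)
# out <- bicboot(x, y, B = 200)
# out$bhatimin0              #min BIC forward selection coefficients, zero padded
# apply(out$betas, 2, sd)    #bootstrap SDs for the p coefficients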
bicbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
  type = 1, psi = 0.0, BB = 1000, alph = 0.05){
#needs library(leaps); calls bicboot, confreg, and shorth3 from slpack
#Gets CIs and does tests with the pred reg, hybrid, and Bickel and Ren methods.
#PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS.
#Simulates the bootstrap for forward selection variable selection using BIC.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
# constant = 1 so there are p = q+1 coefficients
#1 <= k < p-1 so zeroes are in the model; k is the number of nonnoise variables
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists.
  q <- p - 1
  b <- 0 * 1:q
  b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
  beta <- c(1, b)
  pp6 <- p+6; pp5 <- p+5; pp4 <- p+4; pp3 <- p+3; pp1 <- p+1; pp2 <- p+2
  cicov <- 0*(1:pp6)
  avelen <- 0*(1:pp6)
  rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
  A <- matrix(psi, nrow = q, ncol = q)
  diag(A) <- 1
  one <- as.vector(0*1:(k+1) + 1)
  for(i in 1:nruns) {
    x <- matrix(rnorm(n * q), nrow = n, ncol = q)
    x <- x %*% A
    if(type == 1) {
      y <- 1 + x %*% b + rnorm(n)
    }
    if(type == 2) {
      y <- 1 + x %*% b + rt(n, df = 3)
    }
    if(type == 3) {
      y <- 1 + x %*% b + rexp(n) - 1
    }
    if(type == 4) {
      y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    }
    if(type == 5) {
      err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
      y <- 1 + x %*% b + err
    } #make an MLR data set
    out <- bicboot(x, y, B = BB) #bootstrap the forward sel minimum BIC model
    for(j in 1:p){
      tem <- shorth3(out$betas[, j], alpha = alph)
      if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
      #tem <- locpi2(out$betas[,j],alpha=alph)
      #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2])
        cicov[j] <- cicov[j] + 1
      avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
      #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1]
    }
    #test whether the last p-k-1 values of beta are 0
    gg <- p - k - 1
    tstat <- out$bhatimin0[(k+2):p]
    tem <- confreg(out$betas[, (k+2):p], g = gg, that = tstat, alpha = alph)
    if(tem$D0 <= tem$cuplim) #pred. reg. method
      cicov[pp1] <- cicov[pp1] + 1
    avelen[pp1] <- avelen[pp1] + tem$cuplim
    if(tem$br0 <= tem$cuplim) #hybrid method
      cicov[pp2] <- cicov[pp2] + 1
    avelen[pp2] <- avelen[pp1] #same cutoff so same length
    if(tem$br0 <= tem$brlim) #Bickel and Ren method
      cicov[pp3] <- cicov[pp3] + 1
    avelen[pp3] <- avelen[pp3] + tem$brlim
    #test whether the first k+1 values of beta are 1
    gg <- k + 1
    tstat <- out$bhatimin0[1:(k+1)]
    tem <- confreg(out$betas[, 1:(k+1)], g = gg, that = tstat, alpha = alph)
    D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
    if(D0 <= tem$cuplim) #pred. reg. method
      cicov[pp4] <- cicov[pp4] + 1
    avelen[pp4] <- avelen[pp4] + tem$cuplim
    D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
    if(D1 <= tem$cuplim) #hybrid method
      cicov[pp5] <- cicov[pp5] + 1
    avelen[pp5] <- avelen[pp4] #same cutoff so same length
    if(D1 <= tem$brlim) #Bickel and Ren method
      cicov[pp6] <- cicov[pp6] + 1
    avelen[pp6] <- avelen[pp6] + tem$brlim
  }
  cicov <- cicov/nruns
  avelen <- avelen/nruns
  list(cicov = cicov, avelen = avelen, beta = beta, k = k)
}

binregbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, m = 40,
  B = 1000, int = 0, a = 5/3, alpha = 0.05){
#calls confreg, shorth3
##Gets CIs and does tests with the pred reg, hybrid, and Bickel and Ren methods.
#Simulates the parametric bootstrap for binomial regression (full model).
# constant = 1 so there are p = q+1 coefficients
#1 <= k < p-1 so zeroes are in the model; k is the number of nonnoise variables
#need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists. SP ~ N(int,a^2). Want
# int + 3a <= 5 and int - 3a >= -5.
#set.seed(974)
##need p > 2 and want n >= 5p
  q <- p - 1
  pp6 <- p+6; pp5 <- p+5; pp4 <- p+4; pp3 <- p+3; pp1 <- p+1; pp2 <- p+2
  cicov <- 0*(1:pp6)
  avelen <- 0*(1:pp6)
  rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
  val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2))
  A <- matrix(psi, nrow = q, ncol = q)
  diag(A) <- 1
  b <- 0 * 1:q
  b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
  mv <- 0*1:n + m
  beta <- c(int, b)
  one <- as.vector(0*1:(k+1) + 1)
  one[1] <- int
  for(i in 1:nruns) {
    x <- matrix(rnorm(n * q), nrow = n, ncol = q)
    x <- val * x %*% A
    SP <- int + x %*% b #SP_i ~ N(int,a^2)
    y <- rbinom(n, size = m, prob = (exp(SP)/(1 + exp(SP))))
    ny <- mv - y #ny[i] = mv[i] - y[i] = no. of "failures"
    tdata <- as.data.frame(cbind(x, y)) #make a BR data set
    out <- glm(cbind(y, ny) ~ ., family = binomial, data = tdata)
    bhat <- out$coef
    ESP <- predict(out, newdata = tdata)
    betas <- matrix(0, nrow = B, ncol = p)
    for(j in 1:B){ #parametric bootstrap
      ydat <- rbinom(n, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
      nydat <- mv - ydat
      tdat <- as.data.frame(cbind(x, ydat))
      temp <- glm(cbind(ydat, nydat) ~ ., family = binomial, data = tdat)
      betas[j, ] <- temp$coef
    }
    for(j in 1:p){
      tem <- shorth3(betas[, j], alpha = alpha)
      if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
        cicov[j] <- cicov[j] + 1
      avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
    }
    #test whether the last p-k-1 values of beta are 0
    gg <- p - k - 1
    tstat <- bhat[(k+2):p]
    tem <- confreg(betas[, (k+2):p], g = gg, that = tstat, alpha = alpha)
    if(tem$D0 <= tem$cuplim) #pred. reg. method
      cicov[pp1] <- cicov[pp1] + 1
    avelen[pp1] <- avelen[pp1] + tem$cuplim
    if(tem$br0 <= tem$cuplim) #hybrid method
      cicov[pp2] <- cicov[pp2] + 1
    avelen[pp2] <- avelen[pp1] #same cutoff so same length
    if(tem$br0 <= tem$brlim) #Bickel and Ren method
      cicov[pp3] <- cicov[pp3] + 1
    avelen[pp3] <- avelen[pp3] + tem$brlim
    #test whether the first k+1 values of beta are (int,1,...,1)
    gg <- k + 1
    tstat <- bhat[1:(k+1)]
    tem <- confreg(betas[, 1:(k+1)], g = gg, that = tstat, alpha = alpha)
    D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
    if(D0 <= tem$cuplim) #pred. reg. method
      cicov[pp4] <- cicov[pp4] + 1
    avelen[pp4] <- avelen[pp4] + tem$cuplim
    D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
    if(D1 <= tem$cuplim) #hybrid method
      cicov[pp5] <- cicov[pp5] + 1
    avelen[pp5] <- avelen[pp4] #same cutoff so same length
    if(D1 <= tem$brlim) #Bickel and Ren method
      cicov[pp6] <- cicov[pp6] + 1
    avelen[pp6] <- avelen[pp6] + tem$brlim
  }
  cicov <- cicov/nruns
  avelen <- avelen/nruns
  list(cicov = cicov, avelen = avelen, beta = beta, k = k)
}
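## Example sketch: a short run with small nruns and B to keep the illustration
## fast; assumes the confreg and shorth3 helpers from this collection are loaded.
# out <- binregbootsim(n = 100, p = 4, k = 1, nruns = 10, B = 200)
# out$cicov    #entries 1:p are shorth CI coverages; want about 1 - alpha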
brpisim<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, m = 40,
  B = 1000, J = 5, int = 0, a = 5/3, alpha = 0.05){
#Needs library(leaps), library(MASS), library(mgcv), and library(glmnet).
#Calls shpi and mshpi.
#The GAM BR PI is only computed if p = 4.
#Use 1 <= k <= p-1, where k is the number of nonnoise variables.
#Simulates the Olive et al. (2018) PI for binomial regression with m trials.
#PIs for the full model, lasso, relaxed lasso, and backward elimination.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists. SP ~ N(int,a^2). Want
# int + 3a <= 5 and int - 3a >= -5.
# out <- glm((y/mv)~., family=binomial, data=tdata) #works with warnings
# z <- y/mv; zdata <- as.data.frame(cbind(x,z))
#out <- glm(z~., family=binomial, data=zdata) #works with warnings
#set.seed(974)
##need p > 2
  fullpilen <- 1:nruns
  fullpicov <- 0
  gampilen <- 1:nruns
  gampicov <- 0
  ohfspilen <- 1:nruns
  ohfspicov <- 0
  laspilen <- 1:nruns
  laspicov <- 0
  RLpilen <- 1:nruns
  RLpicov <- 0
  vspilen <- 1:nruns
  vspicov <- 0
  q <- p - 1
  rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
  val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2))
  A <- matrix(psi, nrow = q, ncol = q)
  diag(A) <- 1
  b <- 0 * 1:q
  b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
  vars <- as.vector(1:(p-1))
  nc <- ceiling(n/J) - 1
  nc <- min(nc, q)
  nc <- max(nc, 1) #the maximum number of variables to use for forward selection
  zz <- 1:nc
  dd <- 1:nruns
  ddbe <- dd
  dRL <- dd #relaxed lasso and lasso have the same d
  mv <- 0*1:n + m
  for(i in 1:nruns) {
    x <- matrix(rnorm(n * q), nrow = n, ncol = q)
    x <- val * x %*% A
    xf <- val * rnorm(q) %*% A
    SP <- int + x %*% b #SP_i ~ N(int,a^2)
    y <- rbinom(n, size = m, prob = (exp(SP)/(1 + exp(SP))))
    ny <- mv - y #ny[i] = mv[i] - y[i] = no. of "failures"
    spf <- int + xf %*% b
    yf <- rbinom(1, size = m, prob = (exp(spf)/(1 + exp(spf))))
    tdata <- as.data.frame(cbind(x, y))
    yn <- c(y, yf)
    xn <- rbind(x, xf)
    tdat <- as.data.frame(cbind(xn, yn))
    names(tdat) <- names(tdata) #make a BR data set
    #get the full model BR PI
    #note that glm has the target class first
    if(n >= 5*p){
      out <- glm(cbind(y, ny) ~ ., family = binomial, data = tdata)
      ESP <- predict(out, newdata = tdat[n+1, ])
      ydat <- rbinom(B, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
      #tem <- shpi(yf=yf, ydat=ydat, alph=alpha)
      tem <- mshpi(yf = yf, ydat = ydat, n, d = p, alph = alpha)
      fullpilen[i] <- tem$up - tem$low
      fullpicov <- fullpicov + tem$inr
    }
    #get the backward elimination BR PI
    if(n >= 5*p){
      varnames <- names(out$coef)[-1]
      outbe <- step(out, trace = 0)
      vinnames <- names(outbe$coef)[-1]
      vin <- varnames %in% vinnames
      ddbe[i] <- length(outbe$coef)
      pp <- ddbe[i]
      ESP <- outbe$coef[1] + xf[vin] %*% outbe$coef[-1]
      ydat <- rbinom(B, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
      #tem <- shpi(yf=yf, ydat=ydat, alph=alpha)
      tem <- mshpi(yf = yf, ydat = ydat, n, d = pp, alph = alpha)
      vspilen[i] <- tem$up - tem$low
      vspicov <- vspicov + tem$inr
    }
    #get the full model GAM BR PI if p = 4; GAM needs a formula
    if(n >= 5*p && p == 4){
      x1 <- x[, 1]; x2 <- x[, 2]; x3 <- x[, 3]
      z <- y/mv
      out <- gam(z ~ s(x1) + s(x2) + s(x3), family = binomial, weights = mv)
      ESP <- predict.gam(out, newdata = data.frame(x1 = xf[1], x2 = xf[2],
        x3 = xf[3]))
      ydat <- rbinom(B, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
      #tem <- shpi(yf=yf, ydat=ydat, alph=alpha)
      tem <- mshpi(yf = yf, ydat = ydat, n, d = p, alph = alpha)
      gampilen[i] <- tem$up - tem$low
      gampicov <- gampicov + tem$inr
    }
    #get the lasso GLM PI
    ##note that lasso has the target class second
    out <- cv.glmnet(x, cbind(ny, y), family = "binomial")
    lam <- out$lambda.min
    ESP <- predict(out, s = lam, newx = xf)
    #now get d
    lcoef <- predict(out, type = "coefficients", s = lam)
    lcoef <- as.vector(lcoef)[-1]
    vin <- vars[lcoef != 0]
    pp <- length(vin) + 1 #d = pp
    ydat <- rbinom(B, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
    tem <- mshpi(yf = yf, ydat = ydat, n, d = pp, alph = alpha)
    laspilen[i] <- tem$up - tem$low
    laspicov <- laspicov + tem$inr
    #get the relaxed lasso GLM PI
    xsub <- x[, vin]
    sub <- glm(cbind(y, ny) ~ ., family = binomial,
      data = data.frame(cbind(xsub, y)))
    dRL[i] <- pp #want these to be near but >= k+1
    ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
    ydat <- rbinom(B, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
    tem <- mshpi(yf = yf, ydat = ydat, n, d = pp, alph = alpha)
    RLpilen[i] <- tem$up - tem$low
    RLpicov <- RLpicov + tem$inr
    #get the Olive and Hawkins forward selection PI
    if(n >= 5*p){
      temp <- regsubsets(x, y, nvmax = nc, method = "forward")
      out <- summary(temp)
      mincp <- out$which[out$cp == min(out$cp), ]
      #do not need the constant in vin
      vin <- vars[mincp[-1]]
      xsub <- x[, vin]
      sub <- glm(cbind(y, ny) ~ ., family = binomial,
        data = data.frame(cbind(xsub, y)))
      dd[i] <- length(sub$coef) #want these to be near but >= k+1
      pp <- dd[i]
      ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
      ydat <- rbinom(B, size = m, prob = (exp(ESP)/(1 + exp(ESP))))
      tem <- mshpi(yf = yf, ydat = ydat, n, d = pp, alph = alpha)
      ohfspilen[i] <- tem$up - tem$low
      ohfspicov <- ohfspicov + tem$inr
    }
  }
  fullpimnlen <- mean(fullpilen)
  fullpicov <- fullpicov/nruns
  gampimnlen <- mean(gampilen)
  gampicov <- gampicov/nruns
  laspimnlen <- mean(laspilen)
  laspicov <- laspicov/nruns
  RLpimnlen <- mean(RLpilen)
  RLpicov <- RLpicov/nruns
  ohfspimnlen <- mean(ohfspilen)
  ohfspicov <- ohfspicov/nruns
  vspimnlen <- mean(vspilen)
  vspicov <- vspicov/nruns
  mndd <- mean(dd)
  mndRL <- mean(dRL) #relaxed lasso and lasso have the same d
  mnddbe <- mean(ddbe)
  list(mndRL = mndRL, mndd = mndd, mnddbe = mnddbe, int = int, b = b,
    fullpicov = fullpicov, fullpimnlen = fullpimnlen, gampicov = gampicov,
    gampimnlen = gampimnlen, laspicov = laspicov, laspimnlen = laspimnlen,
    RLpicov = RLpicov, RLpimnlen = RLpimnlen, ohfspicov = ohfspicov,
    ohfspimnlen = ohfspimnlen, vspicov = vspicov, vspimnlen = vspimnlen)
}
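## Example sketch: brpisim needs library(leaps), library(MASS), library(mgcv),
## and library(glmnet), plus the shpi and mshpi helpers from this collection.
# out <- brpisim(n = 100, p = 4, k = 1, nruns = 10, m = 40, B = 200)
# out$fullpicov; out$laspicov    #PI coverages; want about 1 - alpha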
cmve<- function(x, csteps = 5){
# Gets the CMVE, RCMVE, and MB estimators.
  zx <- x
  x <- as.matrix(x)
  p <- dim(x)[2]
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covd <- covs
  mnd <- mns
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  medd3 <- medd2
  ## get the start
  if(p > 1) {
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1) {
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covm <- covs
  mnm <- mns
  ##get the CMVE attractor
  covf <- covm
  mnf <- mnm
  val <- mahalanobis(t(mnd), med, covv)
  if(val < medd3) {
    ##crit = [med(D)]^p * square root of det(cov)
    rd2 <- mahalanobis(x, mnd, covd)
    critd <- (sqrt(median(rd2)))^p * prod(diag(chol(covd)))
    rd2 <- mahalanobis(x, mnm, covm)
    critm <- (sqrt(median(rd2)))^p * prod(diag(chol(covm)))
    if(critd < critm) {
      covf <- covd
      mnf <- mnd
    }
  }
  ## get the CMVE estimator
  chisqm <- qchisq(0.5, p)
  rd2 <- mahalanobis(x, mnf, covf)
  const <- median(rd2)/chisqm
  covf <- const * covf
  ##reweight the above CMVE estimator (mnf,covf) to get the
  ##RCMVE estimator (rmnf,rcovf)
  rd2 <- mahalanobis(x, mnf, covf)
  up <- qchisq(0.975, p)
  if(p > 1){
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  ## reweight again
  rd2 <- mahalanobis(x, rmnf, rcovf)
  if(p > 1){
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf,
    mnm = mnm, covm = covm)
}

concmv<-function(n = 100, csteps = 5, gam = 0.4, outliers = T, start = 3){
#Shows how concentration works when p = 2.
# Use start = 1 for DGK, start = 2 for MBA sphere = FCH sphere = MB estimator,
# start = 3 for the coordinatewise median, diag([MAD(X1)]^2,...,[MAD(Xp)]^2)
# start.
  p <- 2
  #A <- cbind(c(1, 0.9), c(0.9, 1))
  x <- matrix(rnorm(n * p), ncol = p, nrow = n)
  #A <- diag(sqrt(1:p))
  #if(outliers == T) {
  #  val <- floor(gam * n)
  #  tem <- 10 + 0 * 1:p
  #  x[1:val, ] <- x[1:val, ] + tem
  #}
  #x <- x %*% A
  A <- cbind(c(1, 0.4), c(0.4, 1))
  B <- cbind(c(0.5, 0), c(0, 0.5))
  if(outliers == T) {
    val <- floor(gam * n)
    x[(val + 1):n, ] <- x[(val + 1):n, ] %*% A
    x[1:val, ] <- x[1:val, ] %*% B
    x[1:val, 1] <- x[1:val, 1] + 0
    x[1:val, 2] <- x[1:val, 2] + 6
  }
  else {
    x <- x %*% A
  }
  if(start == 1) {
    covs <- var(x)
    mns <- apply(x, 2, mean)
  }
  if(start == 2) {
    covv <- diag(p)
    med <- apply(x, 2, median)
    md2 <- mahalanobis(x, center = med, covv)
    medd2 <- median(md2)
    ## get the start
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  if(start > 2) {
    tem <- apply(x, 2, mad)^2
    covv <- diag(tem)
    med <- apply(x, 2, median)
    md2 <- mahalanobis(x, center = med, covv)
    medd2 <- median(md2)
    ## get the start
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
    plot(x[, 1], x[, 2])
    points(x[md2 <= medd2, 1], x[md2 <= medd2, 2], pch = 15)
    identify(x[, 1], x[, 2])
  }
}
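## Example sketch: CMVE, RCMVE, and MB fits on toy data with a 10% mean shift;
## the data are assumed, not from the original file.
# x <- matrix(rnorm(200*4), ncol = 4); x[1:20, ] <- x[1:20, ] + 10
# out <- cmve(x)
# sqrt(mahalanobis(x, out$rmnf, out$rcovf))   #RCMVE distances flag cases 1:20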
concsim<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20){
# Need n > 2p, p > 1. This function is used to determine when the DD
# plot separates outliers from non-outliers for various starts.
# R users need to type "library(MASS)" or "library(lqs)."
  A <- sqrt(diag(1:p))
  mbact <- 0
  fmcdct <- 0
  mbct <- 0
  madct <- 0
  dgkct <- 0
  fchct <- 0
  for(i in 1:runs) {
    x <- matrix(rnorm(n * p), ncol = p, nrow = n)
    ## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
    val <- floor(gam * n)
    tem <- 10 + 0 * 1:p
    x[1:val, ] <- x[1:val, ] + tem
    x <- x %*% A
    #MBA
    out <- covmba(x, csteps = steps)
    center <- out$center
    cov <- out$cov
    rd2 <- mahalanobis(x, center, cov)
    if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1
    #DGK
    covs <- var(x)
    mns <- apply(x, 2, mean)
    ## concentrate
    for(j in 1:steps) {
      md2 <- mahalanobis(x, mns, covs)
      medd2 <- median(md2)
      mns <- apply(x[md2 <= medd2, ], 2, mean)
      covs <- var(x[md2 <= medd2, ])
    }
    rd2 <- mahalanobis(x, mns, covs)
    if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1
    #Median Ball start
    covv <- diag(p)
    med <- apply(x, 2, median)
    md2 <- mahalanobis(x, center = med, covv)
    medd2 <- median(md2)
    ## get the start
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
    ## concentrate
    for(j in 1:steps) {
      md2 <- mahalanobis(x, mns, covs)
      medd2 <- median(md2)
      mns <- apply(x[md2 <= medd2, ], 2, mean)
      covs <- var(x[md2 <= medd2, ])
    }
    rd2 <- mahalanobis(x, mns, covs)
    if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
    #MAD start
    tem <- apply(x, 2, mad)^2
    covv <- diag(tem)
    md2 <- mahalanobis(x, center = med, covv)
    medd2 <- median(md2)
    ## get the start
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
    ## concentrate
    for(j in 1:steps) {
      md2 <- mahalanobis(x, mns, covs)
      medd2 <- median(md2)
      mns <- apply(x[md2 <= medd2, ], 2, mean)
      covs <- var(x[md2 <= medd2, ])
    }
    rd2 <- mahalanobis(x, mns, covs)
    if(min(rd2[1:val]) > max(rd2[(val + 1):n])) madct <- madct + 1
    #FMCD
    out <- cov.mcd(x)
    center <- out$center
    cov <- out$cov
    rd2 <- mahalanobis(x, center, cov)
    if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1
    #FCH
    out <- covfch(x, csteps = steps)
    center <- out$center
    cov <- out$cov
    rd2 <- mahalanobis(x, center, cov)
    if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
  }
  list(mbact = mbact, fmcdct = fmcdct, dgkct = dgkct, mbct = mbct,
    madct = madct, fchct = fchct)
}

confreg<-function(x, g = 4, that = 1:4, alpha = 0.05){
# Makes confidence regions for theta from the rows of x = Ti* from a bootstrap.
# Use for testing H_0: theta = 0 versus H_1: theta != 0.
# The prediction region method, hybrid, and Bickel and Ren regions are used.
# If g = 1, the shorth interval should work better.
# Also computes the distance for the 0 vector.
# Need g = dim(x)[2] and T = that, the g by 1 estimator of theta.
# Often that = A betahat(I_min,0).
# Note that center = Tbar* = bagging estimator and cov = S*_T.
  x <- as.matrix(x)
  that <- as.vector(that)
  n <- dim(x)[1]
  zero <- 0*(1:g)
  up <- min((1 - alpha/2), (1 - alpha + 10*alpha*g/n))
  if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + g/n))
  qn <- up
  if(qn < 1 - alpha + 0.001) up <- 1 - alpha
  center <- apply(x, 2, mean)
  cov <- var(x)
  md2 <- mahalanobis(x, center, cov)
  # MD is the classical distance
  MD <- sqrt(md2)
  #get the prediction region method cutoff
  cuplim <- sqrt(quantile(md2, up))
  D0 <- sqrt(mahalanobis(zero, center, cov))
  #get the hybrid region statistic = Bickel and Ren statistic
  br0 <- sqrt(mahalanobis(zero, that, cov))
  #get the Bickel and Ren cutoff and test statistic
  br2 <- mahalanobis(x, that, cov)
  brlim <- sqrt(quantile(br2, up))
  list(cuplim = cuplim, brlim = brlim, br0 = br0, D0 = D0, MD = MD,
    center = center, cov = cov)
}
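## Example sketch: prediction region method test of H_0: theta = 0 from a toy
## bootstrap matrix (rows = Ti*); reject H_0 if D0 > cuplim, and use
## br0 > brlim for the Bickel and Ren version.
# bstar <- matrix(rnorm(1000*4, mean = 1), ncol = 4)
# out <- confreg(bstar, g = 4, that = apply(bstar, 2, mean))
# out$D0 > out$cuplim; out$br0 > out$brlim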
covdgk<-function(x, csteps = 10){
#Computes the scaled DGK multivariate estimator; need p > 1.
  p <- dim(x)[2]
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  ##scale for consistency at the MVN distribution
  rd2 <- mahalanobis(x, mns, covs)
  const <- median(rd2)/(qchisq(0.5, p))
  covs <- const * covs
  list(center = mns, cov = covs)
}

covfch<-function(x, csteps = 5){
# Gets the FCH and RFCH estimators and the MB attractor.
# Works for p = 1.
  zx <- x
  x <- as.matrix(x)
  p <- dim(x)[2]
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covd <- covs
  mnd <- mns
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ##get the location criterion
  lcut <- medd2
  ## get the start
  if(p > 1) {
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1) {
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covm <- covs
  mnm <- mns
  ##get the FCH attractor
  covf <- covm
  mnf <- mnm
  val <- mahalanobis(t(mnd), med, covv)
  if(val < lcut) {
    ##crit = square root of det(cov)
    critd <- prod(diag(chol(covd)))
    critm <- prod(diag(chol(covm)))
    if(critd < critm) {
      covf <- covd
      mnf <- mnd
    }
  }
  ## get the FCH estimator
  chisqm <- qchisq(0.5, p)
  rd2 <- mahalanobis(x, mnf, covf)
  const <- median(rd2)/chisqm
  covf <- const * covf
  ##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
  ## (rmnf,rcovf)
  rd2 <- mahalanobis(x, mnf, covf)
  up <- qchisq(0.975, p)
  if(p > 1) {
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  ## reweight again
  rd2 <- mahalanobis(x, rmnf, rcovf)
  if(p > 1){
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf,
    mnm = mnm, covm = covm)
}
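## Example sketch: FCH and reweighted FCH on data with 20% mean-shift outliers;
## the RFCH distances should separate cases 1:40 from the rest.
# x <- matrix(rnorm(200*3), ncol = 3); x[1:40, ] <- x[1:40, ] + 10
# out <- covfch(x)
# sqrt(mahalanobis(x, out$rmnf, out$rcovf))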
covfch2<-function(x, csteps = 5, locc = 0.5){
# Gets the FCH and RFCH estimators and the MB attractor.
# Allows the location criterion cutoff to be varied.
  zx <- x
  x <- as.matrix(x)
  p <- dim(x)[2]
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covd <- covs
  mnd <- mns
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ##get the location criterion cutoff
  lcut <- medd2
  if(locc != 0.5) lcut <- quantile(md2, locc)
  ## get the start
  if(p > 1) {
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1) {
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covm <- covs
  mnm <- mns
  ##get the FCH attractor
  covf <- covm
  mnf <- mnm
  val <- mahalanobis(t(mnd), med, covv)
  if(val < lcut) {
    ##crit = square root of det(cov)
    critd <- prod(diag(chol(covd)))
    critm <- prod(diag(chol(covm)))
    if(critd < critm) {
      covf <- covd
      mnf <- mnd
    }
  }
  ## get the FCH estimator
  chisqm <- qchisq(0.5, p)
  rd2 <- mahalanobis(x, mnf, covf)
  const <- median(rd2)/chisqm
  covf <- const * covf
  ##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
  ## (rmnf,rcovf)
  rd2 <- mahalanobis(x, mnf, covf)
  up <- qchisq(0.975, p)
  if(p > 1) {
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  ## reweight again
  rd2 <- mahalanobis(x, rmnf, rcovf)
  if(p > 1){
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf,
    mnm = mnm, covm = covm)
}

covmb<-function(x, steps = 5, scale = F){
# Computes the median ball estimator.
# Needs p > 1. If scale = T, the plotted points will roughly
# scatter about the identity line if the data is MVN
# and spherical about mu.
  p <- dim(x)[2]
  #Median Ball start
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ## get the half set start
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:steps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  if(scale == T){
    rd2 <- mahalanobis(x, mns, covs)
    const <- median(rd2)/(qchisq(0.5, p))
    covs <- const * covs
  }
  list(center = mns, cov = covs)
}

covmb2<-function(x, m = 0, k = 5, msteps = 9){
# Computes the covmb2 estimator with concentration type steps. Needs p > 1.
# Use even if p > n. Look at out<-medout(x) to determine how many cases
# m are clean. Use m >= n/2.
# Estimate m if m = 0: use k >= 0, so at least half of the cases are used,
# and do msteps concentration type steps to get a weighted median.
# The concentration type steps help the most when the outlier proportion
# is high. Try covmb2(x,msteps=0) and covmb2(x,msteps=9).
# Using msteps > 0 does slow down the function some.
  p <- dim(x)[2]
  #Median Ball start
  covv <- diag(p)
  med <- apply(x, 2, median)
  #Get squared Euclidean distances from the coordinatewise median.
  md2 <- mahalanobis(x, center = med, covv)
  if(m == 0){
    if(msteps > 0){ #do concentration type steps
      for(i in 1:msteps){
        medd <- median(md2)
        medw <- apply(x[md2 <= medd, ], 2, median)
        md2 <- mahalanobis(x, center = medw, covv)
      }
    }
    md <- sqrt(md2)
    mcut <- median(md) + k*mad(md, constant = 1)
    mns <- apply(x[md <= mcut, ], 2, mean)
    covs <- var(x[md <= mcut, ])
  }
  else{ #Use the m cases with the smallest distances.
    mcut <- sort(md2)[m]
    mns <- apply(x[md2 <= mcut, ], 2, mean)
    covs <- var(x[md2 <= mcut, ])
  }
  list(center = mns, cov = covs)
}

covmba <- function(x, csteps = 5){
# Gets the MBA estimator; works for p = 1.
  zx <- x
  x <- as.matrix(x)
  p <- dim(x)[2]
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1){
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1){
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covb <- covs
  mnb <- mns
  ##get the square root of det(covb)
  critb <- prod(diag(chol(covb)))
  ##get the resistant estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ## get the start
  if(p > 1){
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1){
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1){
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1){
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  crit <- prod(diag(chol(covs)))
  if(crit < critb) {
    critb <- crit
    covb <- covs
    mnb <- mns
  }
  ##scale for better performance at MVN
  rd2 <- mahalanobis(x, mnb, covb)
  const <- median(rd2)/(qchisq(0.5, p))
  covb <- const * covb
  list(center = mnb, cov = covb, mbL = mns, mbc = covs)
}

covmba2<-function(x, csteps = 5){
# Gets the MBA estimator; use covmba2 instead of covmba if p > 1.
  p <- dim(x)[2]
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  covb <- covs
  mnb <- mns
  ##get the square root of det(covb)
  critb <- prod(diag(chol(covb)))
  ##get the resistant estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ## get the start
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  crit <- prod(diag(chol(covs)))
  if(crit < critb) {
    critb <- crit
    covb <- covs
    mnb <- mns
  }
  ##scale for better performance at MVN
  rd2 <- mahalanobis(x, mnb, covb)
  const <- median(rd2)/(qchisq(0.5, p))
  covb <- const * covb
  list(center = mnb, cov = covb)
}
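## Example sketch: covmb2 is one of the few estimators here that can be used
## when p > n; the toy data are assumed.
# x <- matrix(rnorm(50*100), ncol = 100); x[1:10, ] <- x[1:10, ] + 5
# out <- covmb2(x, msteps = 9)
# out$center[1:5]   #should be near 0 despite the 20% contamination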
covrmb<-function(x, csteps = 5){
# Need p > 1. Produces the median ball and reweighted median ball (RMB)
# estimators. RMB is tailored to estimate the covariance matrix
# of the bulk of the data if the bulk of the data is MVN
# and the outliers are "not too bad."
# This function is like covrmvn, except only the MB attractor is used.
  x <- as.matrix(x)
  p <- dim(x)[2]
  n <- dim(x)[1]
  up <- qchisq(0.975, p)
  qchi <- qchisq(0.5, p)
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ## get the half subset start
  mnm <- apply(x[md2 <= medd2, ], 2, mean)
  covm <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mnm, covm)
    medd2 <- median(md2)
    mnm <- apply(x[md2 <= medd2, ], 2, mean)
    covm <- var(x[md2 <= medd2, ])
  }
  ## scale the MB estimator to behave well for MVN data
  rd2 <- mahalanobis(x, mnm, covm)
  const <- median(rd2)/qchi
  covm <- const * covm
  ## reweight the scaled MB estimator (mnm,covm) twice
  ## to get the covrmb estimator (mnrmb,covrmb) tailored for MVN data
  rd2 <- mahalanobis(x, mnm, covm)
  mnrmb <- apply(x[rd2 <= up, ], 2, mean)
  covrmb <- var(x[rd2 <= up, ])
  d1 <- sum(rd2 <= up)
  rd2 <- mahalanobis(x, mnrmb, covrmb)
  qchi2 <- (0.5 * 0.975 * n)/d1
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd2)/qchisq(qchi2, p)
  covrmb <- const * covrmb
  ## reweight again
  rd2 <- mahalanobis(x, mnrmb, covrmb)
  mnrmb <- apply(x[rd2 <= up, ], 2, mean)
  covrmb <- var(x[rd2 <= up, ])
  d2 <- sum(rd2 <= up)
  rd2 <- mahalanobis(x, mnrmb, covrmb)
  qchi2 <- (0.5 * 0.975 * n)/d2
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd2)/qchisq(qchi2, p)
  covrmb <- const * covrmb
  list(center = mnrmb, cov = covrmb, mnm = mnm, covm = covm)
}
covrmvn<-function(x, csteps = 5, locc = 0.5){
# Needs the number of predictors p > 1.
# This robust MLD estimator is tailored to estimate the covariance matrix
# of the bulk of the data when the bulk of the data is MVN and the outliers
# are "not too bad." The FCH and MB estimators are also produced.
  x <- as.matrix(x)
  p <- dim(x)[2]
  n <- dim(x)[1]
  up <- qchisq(0.975, p)
  qchi <- qchisq(0.5, p)
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  covd <- covs
  mnd <- mns
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ##get the location criterion cutoff
  lcut <- medd2
  if(locc != 0.5) lcut <- quantile(md2, locc)
  ## get the start
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  covm <- covs
  mnm <- mns
  ##get the FCH attractor
  covf <- covm
  mnf <- mnm
  val2 <- mahalanobis(t(mnd), med, covv)
  if(val2 < lcut) {
    ##crit = square root of det(cov)
    critd <- prod(diag(chol(covd)))
    critm <- prod(diag(chol(covm)))
    if(critd < critm) {
      covf <- covd
      mnf <- mnd
    }
  }
  ## get the FCH estimator
  rd2 <- mahalanobis(x, mnf, covf)
  const <- median(rd2)/qchi
  covf <- const * covf
  ##reweight the above FCH estimator (mnf,covf) to get the cov estimator
  ## (rmnmvn,rcovmvn) tailored for MVN data
  rd2 <- mahalanobis(x, mnf, covf)
  rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
  rcovmvn <- var(x[rd2 <= up, ])
  d1 <- sum(rd2 <= up)
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  qchi2 <- (0.5 * 0.975 * n)/d1
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd2)/qchisq(qchi2, p)
  rcovmvn <- const * rcovmvn
  ## reweight again
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
  rcovmvn <- var(x[rd2 <= up, ])
  d2 <- sum(rd2 <= up)
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  qchi2 <- (0.5 * 0.975 * n)/d2
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd2)/qchisq(qchi2, p)
  rcovmvn <- const * rcovmvn
  list(center = rmnmvn, cov = rcovmvn, mnf = mnf, covf = covf,
    mnm = mnm, covm = covm)
}
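## Example sketch: the RMVN estimator on a clean sample; out$cov should be
## close to diag(1:4) since the estimator is scaled for MVN data.
# x <- matrix(rnorm(300*4), ncol = 4) %*% diag(sqrt(1:4))
# out <- covrmvn(x)
# out$center; out$cov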
covrob<-function(x, csteps = 5, locc = 0.5){
# Gets the MBA, FCH, RFCH, RMVN, and CMVE estimators
# and the MB attractor.
  zx <- x
  x <- as.matrix(x)
  p <- dim(x)[2]
  n <- dim(x)[1]
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covd <- covs
  mnd <- mns
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ##get the location criterion cutoff
  lcut <- medd2
  if(locc != 0.5) lcut <- quantile(md2, locc)
  ## get the start
  if(p > 1) {
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1) {
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    if(p > 1) {
      mns <- apply(x[md2 <= medd2, ], 2, mean)
    }
    if(p == 1) {
      mns <- mean(zx[md2 <= medd2])
    }
    covs <- var(x[md2 <= medd2, ])
  }
  covm <- covs
  mnm <- mns
  ##crit = square root of det(cov)
  critd <- prod(diag(chol(covd)))
  critm <- prod(diag(chol(covm)))
  ##get the MBA attractor
  covb <- covm
  mnb <- mnm
  if(critd < critm) {
    covb <- covd
    mnb <- mnd
  }
  ##get the FCH and CMVE attractors
  covf <- covm
  mnf <- mnm
  covcmv <- covm
  mncmv <- mnm
  val <- mahalanobis(t(mnd), med, covv)
  if(val < lcut) {
    if(critd < critm) {
      covf <- covd
      mnf <- mnd
    }
    rd2 <- mahalanobis(x, mnd, covd)
    critdm <- (sqrt(median(rd2)))^p * critd
    rd2 <- mahalanobis(x, mnm, covm)
    critmm <- (sqrt(median(rd2)))^p * critm
    if(critdm < critmm) {
      covcmv <- covd
      mncmv <- mnd
    }
  }
  ##scale for better performance at MVN
  ## get the MBA estimator
  chisqm <- qchisq(0.5, p)
  rd2 <- mahalanobis(x, mnb, covb)
  const <- median(rd2)/chisqm
  covb <- const * covb
  ## get the FCH estimator
  rd2 <- mahalanobis(x, mnf, covf)
  const <- median(rd2)/chisqm
  covf <- const * covf
  ## get the 1st step of the RFCH estimator
  rd2 <- mahalanobis(x, mnf, covf)
  up <- qchisq(0.975, p)
  if(p > 1) {
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  ##use the following 3 commands for the RMVN estimator
  rd3 <- rd2
  rmnmvn <- rmnf
  rcovmvn <- rcovf
  ## use rd2 for RFCH and rd4 for RMVN
  rd2 <- mahalanobis(x, rmnf, rcovf)
  rd4 <- rd2
  ##use the following command for the RFCH estimator
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  ## reweight again
  rd2 <- mahalanobis(x, rmnf, rcovf)
  if(p > 1) {
    rmnf <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnf <- mean(zx[rd2 <= up])
  }
  rcovf <- var(x[rd2 <= up, ])
  rd2 <- mahalanobis(x, rmnf, rcovf)
  const <- median(rd2)/chisqm
  rcovf <- const * rcovf
  ## get the RMVN estimator
  d1 <- sum(rd3 <= up)
  qchi2 <- (0.5 * 0.975 * n)/d1
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd4)/qchisq(qchi2, p)
  rcovmvn <- const * rcovmvn
  ## reweight again
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  if(p > 1){
    rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
  }
  if(p == 1){
    rmnmvn <- mean(zx[rd2 <= up])
  }
  rcovmvn <- var(x[rd2 <= up, ])
  d2 <- sum(rd2 <= up)
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  qchi2 <- (0.5 * 0.975 * n)/d2
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd2)/qchisq(qchi2, p)
  rcovmvn <- const * rcovmvn
  ## get the CMVE estimator
  rd2 <- mahalanobis(x, mncmv, covcmv)
  const <- median(rd2)/(qchisq(0.5, p))
  covcmv <- const * covcmv
  list(center = mnb, cov = covb, mnf = mnf, covf = covf, rmnf = rmnf,
    rcovf = rcovf, rmnmvn = rmnmvn, rcovmvn = rcovmvn, mncmv = mncmv,
    covcmv = covcmv, mnm = mnm, covm = covm)
}

ddcomp<-function(x, steps = 5){
# Need p > 1.
# Makes 4 DD plots using the DGK, FCH, FMCD, and MB estimators.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function.
# Unix systems turn on the graphics device, e.g. enter the
# command "X11()" or "motif()", before using.
# R users need to type "library(MASS)" or "library(lqs)."
  p <- dim(x)[2]
  par(mfrow = c(2, 2))
  center <- apply(x, 2, mean)
  cov <- var(x)
  md2 <- mahalanobis(x, center, cov)
  # MD is the classical and RD the robust distance
  MD <- sqrt(md2)
  #DGK start
  md2 <- mahalanobis(x, center, cov)
  medd2 <- median(md2)
  ## get the start
  mns <- center
  covs <- cov
  ## concentrate
  for(i in 1:steps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  rd2 <- mahalanobis(x, mns, covs)
  rd <- sqrt(rd2)
  #Scale the RD so the plot follows the 0-1 line
  #if the data is multivariate normal.
  const <- sqrt(qchisq(0.5, p))/median(rd)
  RDdgk <- const * rd
  plot(MD, RDdgk)
  abline(0, 1)
  identify(MD, RDdgk)
  title("DGK DD Plot")
  #FCH
  out <- covfch(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  rd <- sqrt(rd2)
  RDfch <- rd
  plot(MD, RDfch)
  abline(0, 1)
  identify(MD, RDfch)
  title("FCH DD Plot")
  #FMCD
  out <- cov.mcd(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  rd <- sqrt(rd2)
  #Scale the RD so the plot follows the 0-1 line
  #if the data is multivariate normal.
  const <- sqrt(qchisq(0.5, p))/median(rd)
  RDf <- const * rd
  plot(MD, RDf)
  abline(0, 1)
  identify(MD, RDf)
  title("FMCD DD Plot")
  #Median Ball start
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ## get the start
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:steps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  rd2 <- mahalanobis(x, mns, covs)
  rd <- sqrt(rd2)
  #Scale the RD so the plot follows the 0-1 line
  #if the data is multivariate normal.
  const <- sqrt(qchisq(0.5, p))/median(rd)
  RDmb <- const * rd
  plot(MD, RDmb)
  abline(0, 1)
  identify(MD, RDmb)
  title("Med Ball DD Plot")
  par(mfrow = c(1, 1))
}
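## Example sketch: DD plots for data with 40% mean-shift outliers; interactive,
## so left click to identify points and right click to advance each panel.
## Needs library(MASS) for cov.mcd.
# x <- matrix(rnorm(100*3), ncol = 3); x[1:40, ] <- x[1:40, ] + 10
# ddcomp(x)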
ddcomp2<-function(x, steps = 5){
# Need p > 1.
# Makes 4 DD plots using the MBA, FCH, RFCH, and MB estimators.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function; in R, highlight "stop."
# Unix systems turn on the graphics device, e.g. enter the
# command "X11()" or "motif()", before using.
  p <- dim(x)[2]
  par(mfrow = c(2, 2))
  center <- apply(x, 2, mean)
  cov <- var(x)
  md2 <- mahalanobis(x, center, cov)
  # MD is the classical and RD the robust distance
  MD <- sqrt(md2)
  #MBA
  out <- covmba(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  RDm <- sqrt(rd2)
  plot(MD, RDm)
  abline(0, 1)
  identify(MD, RDm)
  title("MBA DD Plot")
  #FCH
  out <- covfch(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  RDf <- sqrt(rd2)
  plot(MD, RDf)
  abline(0, 1)
  identify(MD, RDf)
  title("FCH DD Plot")
  #RFCH
  center <- out$rmnf
  cov <- out$rcovf
  rd2 <- mahalanobis(x, center, cov)
  RDr <- sqrt(rd2)
  plot(MD, RDr)
  abline(0, 1)
  identify(MD, RDr)
  title("RFCH DD Plot")
  #MB
  center <- out$mnm
  cov <- out$covm
  rd2 <- mahalanobis(x, center, cov)
  RDmb <- sqrt(rd2)
  plot(MD, RDmb)
  identify(MD, RDmb)
  title("MB DD Plot")
  par(mfrow = c(1, 1))
}

ddmv<-function(n = 100, p = 2, steps = 5, gam = 0.4, outtype = 2, est = 1){
# Need p > 1.
# This function is used to determine when the DD
# plot separates outliers from non-outliers for various starts.
# A workstation needs to activate a graphics
# device with the command "X11()" or "motif()."
# Advance the view with the right mouse button; in R, highlight "stop."
## est = 1 for DGK, 2 for median ball, 3 for MAD
  A <- sqrt(diag(1:p))
  x <- matrix(rnorm(n * p), ncol = p, nrow = n)
  val <- floor(gam * n)
  tem <- 10 + 0 * 1:p
  x[1:val, ] <- x[1:val, ] + tem
  #if outtype = 1, outliers are Np(10 1, Ip), nonoutliers Np(0,Ip)
  if(outtype == 2) x <- x %*% A
  ## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
  ## get the start
  if(est == 1) {
    #DGK classical start
    covs <- var(x)
    mns <- apply(x, 2, mean)
  }
  if(est == 2) {
    #Median Ball high breakdown start
    covv <- diag(p)
    med <- apply(x, 2, median)
    md2 <- mahalanobis(x, center = med, covv)
    medd2 <- median(md2)
    ## get the start
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  if(est == 3) {
    #MAD high breakdown start
    tem <- apply(x, 2, mad)^2
    covv <- diag(tem)
    med <- apply(x, 2, median)
    md2 <- mahalanobis(x, center = med, covv)
    medd2 <- median(md2)
    ## get the start
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  ## concentrate and plot, highlighting outliers
  MD <- sqrt(mahalanobis(x, mns, covs))
  for(i in 1:steps) {
    md <- sqrt(mahalanobis(x, mns, covs))
    medd <- median(md)
    mns <- apply(x[md <= medd, ], 2, mean)
    covs <- var(x[md <= medd, ])
    rd <- sqrt(mahalanobis(x, mns, covs))
    plot(MD, rd)
    points(MD[1:val], rd[1:val], pch = 15)
    identify(MD, rd)
  }
}

ddplot4<-function(x, alpha = 0.1){
# Makes a DD plot with covrmvn used for the RDi.
# Need p > 1.
# Semiparametric prediction regions are added.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function.
# Unix systems turn on the graphics device, e.g. enter the
# command "X11()" or "motif()", before using.
  p <- dim(x)[2]
  n <- dim(x)[1]
  up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
  if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
  qn <- up
  if(qn < 1 - alpha + 0.001) up <- 1 - alpha
  center <- apply(x, 2, mean)
  cov <- var(x)
  md2 <- mahalanobis(x, center, cov)
  out <- covrmvn(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  # MD is the classical and RD the robust distance
  MD <- sqrt(md2)
  RD <- sqrt(rd2)
  plot(MD, RD)
  abline(0, 1)
  #get the nonparametric prediction region boundary
  cuplim <- sqrt(quantile(md2, up))
  a <- min(RD)
  b <- max(RD)
  lines(c(cuplim, cuplim), c(b, a))
  #get the semiparametric prediction region boundary
  ruplim <- sqrt(quantile(rd2, up))
  a <- min(MD)
  b <- max(MD)
  lines(c(a, b), c(ruplim, ruplim))
  #get the parametric MVN prediction region boundary
  mvnlim <- sqrt(qchisq(up, p))
  b <- min(b, mvnlim)
  lines(c(a, b), c(mvnlim, mvnlim))
  identify(MD, RD)
  list(cuplim = cuplim, ruplim = ruplim, mvnlim = mvnlim)
}

ddplot5<-function(x, mm = 0, kk = 5, steps = 9){
# Plots Euclidean distances from the coordinatewise median
# vs. those of the covmb2 location estimator with 9 concentration type steps.
# Good plot for outlier detection even if p > n.
# Needs p > 1.
  x <- as.matrix(x)
  p <- dim(x)[2]
  covv <- diag(p)
  med <- apply(x, 2, median)
  RDMED <- sqrt(mahalanobis(x, center = med, covv))
  RDCOVMB2 <- sqrt(mahalanobis(x, center = covmb2(x, m = mm, k = kk,
    msteps = steps)$center, covv))
  plot(RDMED, RDCOVMB2)
  #list(RDMED = RDMED, RDCOVMB2 = RDCOVMB2)
}

ducisim<-function(n = 100, nruns = 5000, alpha = 0.05, eta = 1000){
# Simulates the 100(1-alpha)% CI for eta for discrete uniform (1, ..., eta)
# data. The coverage and the scaled CI length n(U - L) are also given.
# The CI is [max(Y), max(Y)/alpha^(1/n)).
# Notice that max(Y) <= eta with probability 1.
  cov <- 0
  low <- 1:nruns
  up <- low
  cut <- alpha^(1/n)
  pop <- 1:eta
  for(i in 1:nruns) {
    y <- sample(x = pop, size = n, replace = T)
    maxy <- max(y)
    up[i] <- maxy/cut
    low[i] <- maxy
    if(low[i] <= eta && up[i] > eta) cov <- cov + 1
  }
  cov <- cov/nruns
  nlen <- n * mean(up - low)
  ups <- sum(up < eta)/nruns
  list(ups = ups, cov = cov, nlen = nlen)
}

enet<-function(x, y, am = 10){
#Gets elastic net output using 10-fold CV over a grid of alpha values
#0, 1/am, 2/am, ..., am/am.
#Needs library(glmnet).
#Use am = 10, 20, 50, or 100. Larger am values take longer.
  x <- as.matrix(x)
  out <- cv.glmnet(x, y, alpha = 0)
  lam <- out$lambda.min #value of lambda that minimizes the 10-fold CV error
  cvmin <- min(out$cvm)
  alph <- 0
  for(i in 1:am){
    tem <- cv.glmnet(x, y, alpha = i/am)
    mcv <- min(tem$cvm)
    if(mcv < cvmin){
      out <- tem
      lam <- out$lambda.min
      cvmin <- mcv
      alph <- i/am
    }
  }
  yhat <- predict(out, s = lam, newx = x)
  res <- y - yhat
  plot(yhat, y)
  abline(0, 1)
  list(out = out, yhat = yhat, res = res, lam = lam, alph = alph)
}

enet2<-function(x, y, am = 10){
#Gets elastic net output using 10-fold CV over a grid of alpha values
#0, 1/am, 2/am, ..., am/am. Does not make the response plot made by enet.
#Needs library(glmnet).
#Use am = 10, 20, 50, or 100. Larger am values take longer.
  x <- as.matrix(x)
  out <- cv.glmnet(x, y, alpha = 0)
  lam <- out$lambda.min #value of lambda that minimizes the 10-fold CV error
  cvmin <- min(out$cvm)
  alph <- 0
  for(i in 1:am){
    tem <- cv.glmnet(x, y, alpha = i/am)
    mcv <- min(tem$cvm)
    if(mcv < cvmin){
      out <- tem
      lam <- out$lambda.min
      cvmin <- mcv
      alph <- i/am
    }
  }
  yhat <- predict(out, s = lam, newx = x)
  res <- y - yhat
  list(out = out, yhat = yhat, res = res, lam = lam, alph = alph)
}
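## Example sketch: elastic net over the alpha grid; needs library(glmnet).
# library(glmnet)
# x <- matrix(rnorm(100*5), ncol = 5); y <- 1 + x[,1] + rnorm(100)
# out <- enet2(x, y, am = 10)
# out$alph; out$lam   #selected mixing parameter alpha and lambda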
expsim<-function(n = 10, nruns = 5000, theta = 0, lambda = 1, alpha = 0.05,
  p = 1){
#Simulates the exponential 100(1-alpha)% CI for lambda, the CI for theta,
#and the CI for lambda with theta known.
  scov <- 0
  ccov <- 0
  mcov <- 0
  slow <- 1:nruns
  sup <- slow
  mlow <- slow
  mup <- slow
  clow <- slow
  cup <- slow
  ucut <- alpha/2
  lcut <- 1 - ucut
  d <- 2 * (n - p)
  d2 <- 2 * n
  lval <- log(alpha/2)/n
  uval <- log(1 - alpha/2)/n
  mlval <- alpha^(-1/(n - 1)) - 1
  for(i in 1:nruns) {
    y <- theta + lambda * rexp(n)
    miny <- min(y)
    dn <- sum(y - miny)
    #get the CI for lambda
    lamhat <- dn/n
    num <- 2 * dn
    slow[i] <- num/qchisq(lcut, df = d)
    sup[i] <- num/qchisq(ucut, df = d)
    if(slow[i] < lambda && sup[i] > lambda) scov <- scov + 1
    #get the Mann et al CI for theta
    mlow[i] <- miny - lamhat * mlval
    mup[i] <- miny
    if(mlow[i] < theta) mcov <- mcov + 1
    #get the CI for lambda when theta is known
    tn <- sum(y - theta)
    num <- 2 * tn
    clow[i] <- num/qchisq(lcut, df = d2)
    cup[i] <- num/qchisq(ucut, df = d2)
    if(clow[i] < lambda && cup[i] > lambda) ccov <- ccov + 1
  }
  scov <- scov/nruns
  slen <- sqrt(n) * mean(sup - slow)
  mcov <- mcov/nruns
  mlen <- n * mean(mup - mlow)
  ccov <- ccov/nruns
  clen <- sqrt(n) * mean(cup - clow)
  list(d = d, scov = scov, slen = slen, mcov = mcov, mlen = mlen,
    ccov = ccov, clen = clen)
}

ffplot2<-function(x, y, nsamps = 7){
# For Unix, use X11() to turn on the graphics device before
# using this function. For R, first type library(MASS).
# Makes an FF plot with several resistant estimators.
# Needs hbreg, mbareg, mbalata, and rmreg3.
  n <- length(y)
  x <- as.matrix(x)
  rmat <- matrix(nrow = n, ncol = 8)
  lsfit <- y - lsfit(x, y)$residuals
  print("got OLS")
  almsfit <- y - lmsreg(x, y)$resid
  print("got ALMS")
  altsfit <- y - ltsreg(x, y)$residuals
  print("got ALTS")
  mbacoef <- mbareg(x, y, nsamp = nsamps)$coef
  MBAfit <- mbacoef[1] + x %*% mbacoef[-1]
  print("got MBA")
  bbcoef <- hbreg(x, y)$bbcoef
  BBfit <- bbcoef[1] + x %*% bbcoef[-1]
  print("got BB")
  mbalcoef <- mbalata(x, y, nsamp = nsamps)$coef
  MBALfit <- mbalcoef[1] + x %*% mbalcoef[-1]
  print("got MBALATA")
  RMREG2 <- y - rmreg3(x, y)$res
  rmat[, 1] <- y
  rmat[, 2] <- lsfit
  rmat[, 3] <- almsfit
  rmat[, 4] <- altsfit
  rmat[, 5] <- MBAfit
  rmat[, 6] <- BBfit
  rmat[, 7] <- MBALfit
  rmat[, 8] <- RMREG2
  pairs(rmat, labels = c("Y", "OLS Fit", "ALMS Fit", "ALTS Fit", "MBA Fit",
    "BB Fit", "MBALATA", "RMREG2"))
}

fselboot<-function(x, y, B = 1000){
#needs library(leaps), n > 5p, p > 2
#Bootstraps the min Cp forward selection regression model.
#Does not make sense to do variable selection if there
#is only one nontrivial predictor.
  x <- as.matrix(x)
  n <- length(y)
  p <- 1 + dim(x)[2]
  vmax <- min(p, as.integer(n/5))
  vars <- as.vector(1:(p-1))
  #get the full model
  full <- lsfit(x, y)
  res <- full$resid
  fit <- y - res
  #get the minimum Cp submodel
  tem <- regsubsets(x, y, nvmax = vmax, method = "forward")
  out <- summary(tem)
  mincp <- out$which[out$cp == min(out$cp), ] #row of the min Cp model
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
  sub <- lsfit(x[, vin], y)
  bhatimin0 <- 0*1:p
  indx <- c(1, 1 + vin)
  bhatimin0[indx] <- sub$coef
  betas <- matrix(0, nrow = B, ncol = p)
  #bootstrap the minimum Cp submodel
  for(i in 1:B){
    yb <- fit + sample(res, n, replace = T)
    tem <- regsubsets(x, y = yb, method = "forward")
    out <- summary(tem)
    mincp <- out$which[out$cp == min(out$cp), ]
    vin <- vars[mincp[-1]]
    indx <- c(1, 1 + vin)
    betas[i, indx] <- lsfit(x[, vin], yb)$coef
  }
  list(full = full, sub = sub, bhatimin0 = bhatimin0, betas = betas)
}

gamper<-function(h, k = 500){
#Estimates the amount of contamination FLTS can tolerate.
  n <- 10000
  c <- 5000
  gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/h))) * 100
  print(gam0)
}

gamper2<-function(p, k = 500){
#Estimates the amount of contamination FMCD can tolerate.
  n <- 10000
  c <- 5000
  h <- p + 1
  gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/h))) * 100
  print(gam0)
}

getB<-function(x, m = 0, k = 5, msteps = 0){
# Gets the covmb2 subset B and the index of cases indx.
# Best if p > n.
# You can estimate the number of clean cases m > n/2 with the plot
# out<-medout(x).
  x <- as.matrix(x)
  p <- dim(x)[2]
  n <- dim(x)[1]
  index <- 1:n
  covv <- diag(p)
  med <- apply(x, 2, median)
  #Get squared Euclidean distances from the coordinatewise median.
  md2 <- mahalanobis(x, center = med, covv)
  if(m == 0){
    if(msteps > 0){ #do concentration type steps
      for(i in 1:msteps){
        medd <- median(md2)
        medw <- apply(x[md2 <= medd, ], 2, median)
        md2 <- mahalanobis(x, center = medw, covv)
      }
    }
    md <- sqrt(md2)
    mcut <- median(md) + k*mad(md, constant = 1)
  }
  else{ #Use the m cases with the smallest distances.
    md <- sqrt(md2)
    mcut <- sort(md)[m]
  }
  B <- x[md <= mcut, ]
  indx <- index[md <= mcut]
  list(B = B, indx = indx)
}
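## Example sketch: get the covmb2 subset B and the retained case indices;
## works even when p > n, and the planted outliers 1:8 should be dropped.
# x <- matrix(rnorm(40*60), ncol = 60); x[1:8, ] <- x[1:8, ] + 10
# out <- getB(x, k = 5)
# out$indx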
getu<-function(x, csteps = 5, locc = 0.5){
# Gets the RMVN subset U and the index of cases indx.
# Needs the number of predictors > 1.
  x <- as.matrix(x)
  p <- dim(x)[2]
  n <- dim(x)[1]
  index <- 1:n
  up <- qchisq(0.975, p)
  qchi <- qchisq(0.5, p)
  ##get the DGK estimator
  covs <- var(x)
  mns <- apply(x, 2, mean)
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  covd <- covs
  mnd <- mns
  ##get the MB estimator
  covv <- diag(p)
  med <- apply(x, 2, median)
  md2 <- mahalanobis(x, center = med, covv)
  medd2 <- median(md2)
  ##get the location criterion cutoff
  lcut <- medd2
  if(locc != 0.5) lcut <- quantile(md2, locc)
  ## get the start
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
  ## concentrate
  for(i in 1:csteps) {
    md2 <- mahalanobis(x, mns, covs)
    medd2 <- median(md2)
    mns <- apply(x[md2 <= medd2, ], 2, mean)
    covs <- var(x[md2 <= medd2, ])
  }
  covm <- covs
  mnm <- mns
  ##get the FCH attractor
  covf <- covm
  mnf <- mnm
  val2 <- mahalanobis(t(mnd), med, covv)
  if(val2 < lcut) {
    ##crit = square root of det(cov)
    critd <- prod(diag(chol(covd)))
    critm <- prod(diag(chol(covm)))
    if(critd < critm) {
      covf <- covd
      mnf <- mnd
    }
  }
  ## get the FCH estimator
  rd2 <- mahalanobis(x, mnf, covf)
  const <- median(rd2)/qchi
  covf <- const * covf
  ##reweight the above FCH estimator (mnf,covf) to get the cov estimator
  ## (rmnmvn,rcovmvn) tailored for MVN data
  rd2 <- mahalanobis(x, mnf, covf)
  rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
  rcovmvn <- var(x[rd2 <= up, ])
  d1 <- sum(rd2 <= up)
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  qchi2 <- (0.5 * 0.975 * n)/d1
  qchi2 <- min(qchi2, 0.995)
  const <- median(rd2)/qchisq(qchi2, p)
  rcovmvn <- const * rcovmvn
  ## get U
  rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
  U <- x[rd2 <= up, ]
  indx <- index[rd2 <= up]
  list(U = U, indx = indx)
}

getubig<-function(x, group){
# Gets Ubig for discriminant analysis, binary regression, or one way MANOVA.
# Here x contains the data and group is a vector with group[i] = j
# if the ith row is a case from the jth group, j = 1, ..., g.
# Need each group size nh > 2(p+1). Would like nh > 20p.
  g <- max(group)
  x <- as.matrix(x)
  indx <- 1:2
  tindx <- 1:dim(x)[1]
  #get the cases used in the RMVN set from each group
  for(i in 1:g){
    xi <- as.matrix(x[group == i, ])
    tind <- tindx[group == i]
    tem <- getu(xi)
    tind <- tind[tem$indx]
    indx <- c(indx, tind)
  }
  indx <- indx[-c(1, 2)]
  Ubig <- x[indx, ]
  grp <- group[indx]
  list(Ubig = Ubig, indx = indx, grp = grp)
}

hbreg<-function(x, y, csteps = 10, a = 1.4, rreg = T, type = 1){
# Gets the hbreg estimator with 10 concentration steps applied to
# the HB attractor BB. Uses the LTA criterion to screen the 3 attractors.
# For R, type "library(MASS)" before using this function.
# If rreg = F, arobcoef = olscoef; if T, arobcoef uses
# mbareg if type = 1, ltsreg if type = 2, mbalata if type = 3,
# rmreg2 if type = 4.
# Calls mbareg, mbalata, and rmreg3, which is rmreg2 without plots.
med <- median(y) madd <- mad(y, constant = 1) x <- as.matrix(x) n <- dim(x)[1] nc <- dim(x)[2] + 1 #nc is number of predictors including intercept temp <- lsfit(x, y) hbf <- temp$coef absres <- abs(temp$residuals) indx <- (absres <= median(absres)) critf <- sum(absres[indx]) #mbareg or lmsreg may work better than ltsreg if(rreg == T){ if(type == 1){ tem <- mbareg(x, y) temres <- y - tem$coef[1] - x %*% tem$coef[-1] absres <- abs(temres)} if(type == 2){ tem <- ltsreg(x, y) absres <- abs(tem$residuals)} if(type == 3){ tem <- mbalata(x,y) temres <- y - tem$coef[1] - x %*% tem$coef[-1] absres <- abs(temres)} if(type == 4){ tem <- rmreg3(x,y) temres <- y - tem$Bhat[1] - x %*% tem$Bhat[-1] absres <- abs(temres)} indx <- (absres <= median(absres)) crit <- a*sum(absres[indx]) if(crit < critf) { critf <- crit hbf <- tem$coef} } #get y's closest to median y indx <- (y >= (med - madd) & y <= (med + madd)) #get the HB attractor bk = BB bk <- lsfit(x[indx, ], y[indx])$coef res <- y - (x %*% bk[2:nc] + bk[1]) ressq <- res^2 m <- median(ressq) for(i in 1:csteps) { indx <- (ressq <= m) bk <- lsfit(x[indx, ], y[indx])$coef res <- y - (x %*% bk[2:nc] + bk[1]) ressq <- res^2 m <- median(ressq) } bb <- bk res <- y - (x %*% bb[2:nc] + bb[1]) absres <- abs(res) indx <- (absres <= median(absres)) crit <- a*sum(absres[indx]) if(crit < critf) hbf <- bb if(rreg == T && type != 4) arobcoef <- tem$coef else if (rreg == T && type == 4) arobcoef <- as.vector(tem$Bhat) else arobcoef <- temp$coef list(coef = hbf, olscoef = temp$coef, arobcoef = arobcoef, bbcoef = bb) } hbregsim<-function(n = 100, p = 4, csteps = 10, nruns = 10, aa = 1.4, rr = T, typ = 1, sige = 1, sigx = 1, etype = 1){ # Simulates MLR model and gets fits from hbreg. # For R, type "library(MASS)" before using this function. # For MLR, want mnbhat = (1, ..., 1), sd(i) = 1\sqrt(n). # If rr = F, arobcoef = olscoef, if T, arobcoef uses # mbareg if typ = 1, ltsreg if typ = 2, mbalata if typ = 3, rmreg3 if typ = 4. # errors are normal if etype = 1, shifted exponential for etype=2, shifted lognormal if etype=3 # q <- p - 1 beta <- 1 + 0 * 1:q # beta <- 1:q hbhat <- matrix(0, nrow = nruns, ncol = p) olshat <- hbhat arobhat <- hbhat bbhat <- hbhat for(i in 1:nruns) { x <- matrix(rnorm(n * q, sd = sige), nrow = n, ncol = q) if (etype == 1) err <- rnorm(n, sd = sigx) if (etype == 2) err <- sige*(rexp(n) - 1) if (etype == 3) err <- sige*(exp(rnorm(n)) - exp(0.5)) y <- 1 + x %*% beta + err out <- hbreg(x, y, csteps = csteps, a = aa, rreg = rr, type = typ) hbhat[i, ] <- out$coef olshat[i, ] <- out$olscoef arobhat[i, ] <- out$arobcoef bbhat[i, ] <- out$bbcoef } mnhbhat <- apply(hbhat, 2, mean) sdhbhat <- sqrt(apply(hbhat, 2, var)) mnolshat <- apply(olshat, 2, mean) sdolshat <- sqrt(apply(olshat, 2, var)) mnarobhat <- apply(arobhat, 2, mean) sdarobhat <- sqrt(apply(arobhat, 2, var)) mnbbhat <- apply(bbhat, 2, mean) sdbbhat <- sqrt(apply(bbhat, 2, var)) list(mnhbhat = mnhbhat, sdhbhat = sdhbhat, mnolshat = mnolshat, sdolshat = sdolshat, mnarobhat = mnarobhat, sdarobhat = sdarobhat, mnbbhat = mnbbhat, sdbbhat = sdbbhat) } hnsim<- function(n = 10, nruns = 5000, mu = 0, sigma = 1, alpha = 0.05, p = 1) { #Simulates Pewsey HN 100(1-alpha)% CI for sigma^2, CI for mu, #and CI for sigma^2 with mu known. 
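#Illustrative call (added sketch): with alpha = 0.05 the simulated
#coverages scov, lcov, ccov should be near 0.95.
# hnsim(n=50,nruns=1000,mu=0,sigma=2,alpha=0.05)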
scov <- 0 lcov <- 0 ccov <- 0 slow <- 1:nruns sup <- slow llow <- slow lup <- slow clow <- slow cup <- sup ucut <- alpha/2 lcut <- 1 - ucut sigsq <- sigma^2 d <- n - p phiinv <- qnorm((0.5 + 1/(2 * n))) lval <- log(alpha) * phiinv * (1 + 13/n^2) for(i in 1:nruns) { y <- mu + sigma * abs(rnorm(n)) miny <- min(y) dn <- sum((y - miny)^2) #get CI for sigma^2 slow[i] <- dn/qchisq(lcut, df = d) sup[i] <- dn/qchisq(ucut, df = d) if(slow[i] < sigsq && sup[i] > sigsq) scov <- scov + 1 #get CI for mu shat <- sqrt(dn/n) llow[i] <- miny + shat * lval lup[i] <- miny if(llow[i] < mu) lcov <- lcov + 1 #get CI for sigma^2 if mu is known tn <- sum((y - mu)^2) clow[i] <- tn/qchisq(lcut, df = n) cup[i] <- tn/qchisq(ucut, df = n) if(clow[i] < sigsq && cup[i] > sigsq) ccov <- ccov + 1 } scov <- scov/nruns slen <- sqrt(n) * mean(sup - slow) lcov <- lcov/nruns llen <- n * mean(lup - llow) ccov <- ccov/nruns clen <- sqrt(n) * mean(cup - clow) list(d = d, scov = scov, slen = slen, lcov = lcov, llen = llen, ccov = ccov, clen = clen) } lassoboot<-function(x,y,B = 1000,regtype=1){ #needs library(glmnet), n > 5p, p > 2 #bootstrap lasso or ridge regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. #Change regtype = 0 for ridge regression x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res betas <- matrix(0,nrow=B,ncol=p) #bootstrap the lasso or ridge regression model for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-cv.glmnet(x,y=yb,alpha=regtype) betas[i,] <- as.vector(predict(tem,type="coefficients",s=tem$lambda.min)) } list(betas=betas) } lassoboot2<-function(x,y,B = 1000,regtype=1,c=0.01,aug=F){ #needs library(glmnet), n > 5p, p > 2 #bootstrap lasso or ridge regression #If aug not=F, adds cB full model bootstrap samples. #Does not make sense to do variable selection if there #is only one nontrivial predictor. #Change regtype = 0 for ridge regression x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res betas <- matrix(0,nrow=bpd,ncol=p) #bootstrap the lasso or ridge regression model for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-cv.glmnet(x,y=yb,alpha=regtype) betas[i,] <- as.vector(predict(tem,type="coefficients",s=tem$lambda.min)) } if(aug == F) {betas <- betas[1:B,]} else { for(i in bp1:bpd){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } } list(betas=betas) } lassobootsim3<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet), calls confreg, shorth3 #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. 
As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot(x,y,B=BB,regtype=regtype) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lassobootsim4<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet), calls lassoboot, shorth3 #Gets rid of the test so the program does not fail. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
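#Illustrative calls (added sketch, assumes library(glmnet) and the slpack
#functions lassoboot and shorth3 are sourced); small nruns and BB keep
#the run time down:
# lassobootsim4(n=100,p=4,k=1,nruns=5,BB=200,regtype=1) #lasso
# lassobootsim4(n=100,p=4,k=1,nruns=5,BB=200,regtype=0) #ridge regression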
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) cicov <- 0*(1:p) avelen <- 0*(1:p) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot(x,y,B=BB,regtype=regtype) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lassobootsim5<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, cc=0.01, augm=F, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet), calls lassoboot2, predreg, shorth3 #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #If augm neq F, adds the OLS full model bootstrap samples (lambda = 0) #so S*_T is better conditioned. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
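#Worked example of the above correlation formula (added): with q = 3 and
#psi = 0.9, cor(X_i,X_j) = [2(0.9) + 0.81]/[1 + 2(0.81)] = 2.61/2.62,
#which is approximately 0.996, so the predictors are nearly collinear.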
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot2(x,y,B=BB,regtype=regtype,c=cc,aug=augm) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} LRboot<-function(x,y,mv=c(1,1),B = 1000,bin=T){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #mv is the m_i vector of the number of trials; if bin=T #then for binary LR the program provides the number of trials #bootstrap logistic regression full model x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] tdata <- as.data.frame(cbind(x,y)) if(bin==T) mv <- 0*1:n + 1 ny <- mv-y out <- glm(cbind(y,ny)~., family=binomial, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rbinom(n,size=1,prob=(exp(ESP)/(1+exp(ESP)))) nydat <- mv-ydat tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(cbind(ydat,nydat)~., family=binomial, data=tdat) betas[i,] <- temp$coef } list(bhat=bhat,betas=betas) } lrpiplot<-function(ESP,y,mv,d=2,B=1000,slices=10,alpha=0.05) {# Makes response plot for logistic regression with PIs for Y_i/m_i added. #0 = data, x = PI #mv is the vector of the number of trial m_i #tdata <- as.data.frame(cbind(x,y)) #ny <- mv-y #ny[i] = mv[i]-y[i] = no. of ``failures" #out <- glm(cbind(y,ny)~., family=binomial, data=tdata) #x<-as.matrix(x); d<-dim(x)[2]+1 #d=no. 
of predictors #ESP <- predict(out) cov<-0 n <- length(y) low<-1:n up<-low Z <- y/mv plot(ESP, Z) abline(weighted.mean(Z, mv), 0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) fit2 <- fit val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- weighted.mean(Z[ indx[((i - 1) * val + 1):(i * val)]], mv[indx[(( i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- weighted.mean(Z[indx[(( slices - 1) * val + 1):n]], mv[indx[((slices - 1) * val + 1):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2) for(i in 1:n){ ydat <- rbinom(B,size=mv[i],prob=(exp(ESP[i])/(1+exp(ESP[i])))) tem <- mshpi(yf=y[i],ydat=ydat,n=n,d=d,alph=alpha) low[i] <- tem$low/mv[i] up[i] <- tem$up/mv[i] cov<-cov+tem$inr } points(ESP,low,pch=4) points(ESP,up,pch=4) title("Response Plot") cov<-cov/n list(cov=cov) } manbtsim<-function(n = 100, n2 = 100, n3 = 100, p = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, xtype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta = 0){ # This R function simulates one way Manova type bootstrap test for # Ho: mu_1 = ... = mu_g where g = 3. # Here x2 is clean, x = x1 may have outliers. # Uses the coordinatewise mean, median, 25% trimmed mean, #####and RMVN location estimators. SO VERY SLOW. # Need p > 1. Want n > 20p, n2 > 20p, n3 > 20p, B > 20p. # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # For the point mass outlier types 4 and 5, need gam much smaller than 0.4. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu2 = (0, ..., 0). # Cov(x) = diag(1,2,...,p), Cov(x2) = sig^2 Cov(x) for clean data. # For outliers=0, want hquant and rquant approx 1. A <- sqrt(diag(1:p)) munot <- 0 * (1:p) mu <- delta * (1 + munot) val <- floor(gam * n) indx <- 1:n indx2 <- 1:n2 indx3 <- 1:n3 medmus <- matrix(0,nrow=B,ncol=2*p) mnmus <- medmus tmnmus <- medmus rmvnmus <- medmus medcv <- 0 mncv <- 0 tmncv <- 0 rmvncv <- 0 chisqcut <- qchisq(p=0.95,df=2*p) cutoffs <- matrix(0,nrow=nruns,ncol=4) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), ncol = p, nrow = n) x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2) x3 <- matrix(rnorm(n3 * p), ncol = p, nrow = n3) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 zu <- runif(n2) x2[zu < eps, ] <- x2[zu < eps, ] * 5 zu <- runif(n3) x3[zu < eps, ] <- x3[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu zu <- sqrt(rchisq(n2, dd)/dd) x2 <- x2/zu zu <- sqrt(rchisq(n3, dd)/dd) x3 <- x3/zu } if(xtype == 4){ #Want pop coord med(x) = 0. 
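#Added note: if Z ~ N(0,1) then median(exp(Z)) = exp(0) = 1, so
#subtracting 1 below gives lognormal data with pop coord median 0.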
x <- exp(x) x <- x - 1 x2 <- exp(x2) x2 <- x2 - 1 x3 <- exp(x3) x3 <- x3 - 1 } x <- x %*% A x2 <- x2 %*% A x2 <- sig2 * x2 x3 <- x3 %*% A x3 <- sig3 * x3 if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } x <- mu + x #clean x has mean mu =delta(1,...,1)^T, x2 and x3 have mean (0,...,0)^T #get bootstrapped Tx - Tx3, Tx2 - Tx3 for various statistics T for(j in 1:B){ tem <- sample(indx,n,replace=T) medx <- apply(x[tem,],2,median) mnx <- apply(x[tem,],2,mean) tmnx <- apply(x[tem,],2,tmn) rmvnx <- covrmvn(x[tem,])$center tem <- sample(indx2,n2,replace=T) medx2 <- apply(x2[tem,],2,median) mnx2 <- apply(x2[tem,],2,mean) tmnx2 <- apply(x2[tem,],2,tmn) rmvnx2 <- covrmvn(x2[tem,])$center tem <- sample(indx3,n3,replace=T) medx3 <- apply(x3[tem,],2,median) mnx3 <- apply(x3[tem,],2,mean) tmnx3 <- apply(x3[tem,],2,tmn) rmvnx3 <- covrmvn(x3[tem,])$center medmus[j,] <- c(medx-medx3,medx2-medx3) mnmus[j,] <- c(mnx-mnx3,mnx2-mnx3) tmnmus[j,] <- c(tmnx-tmnx3,tmnx2-tmnx3) rmvnmus[j,] <- c(rmvnx-rmvnx3,rmvnx2-rmvnx3) } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr outrmvn <- predreg(rmvnmus) rmvncv <- rmvncv + outrmvn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim,outrmvn$cuplim)^2 } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns rmvncv <- 1 - rmvncv/nruns mncut <- apply(cutoffs,2,mean) list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,tmncv=tmncv,rmvncv=rmvncv) } manbtsim2<-function(n = 100, n2 = 100, n3 = 100, m = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, ytype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta = 0, delta3 = 0, cov3I = F){ #needs library(MASS) # This R function simulates a one way Manova type bootstrap test for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. #Here y2 and y3 are clean, y = y1 may have outliers, and y is m by 1. # Uses the coordinatewise mean, median, and 25% trimmed mean. #Also does the classical Hotelling Lawley one way MANOVA test. #Partially written by Hasthika S. Rupasinghe Arachchige Don. ## Does not use the slow RMVN location estimators. # Need m > 1. Want n > 20m, n2 > 20m, n3 > 20m, B > 20m. # Multiply y by A where ytype = 1 for MVN Nm(0,I), # 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I), # 3 for multivariate t_d with d = dd, # 4 for lognormal. # outliers = 0 for no outliers and Y~N(0,diag(1,...,m)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers Y~N((pm,...,pm)',diag(1,...,m)) # 4 for outliers Y[i,m] = pm # 5 for outliers Y[i,1] = pm # For the point mass outlier types 4 and 5, need gam much smaller than 0.4. # Power can be estimated by increasing delta so mu1 = delta(1,...,1) # and mu2 = (0, ..., 0), mu3 = delta3(1,...,1). # Cov(y) = diag(1,2,...,m), Cov(y2) = sig^2 Cov(y) for clean data. # Cov(y3) = sig^3 Cov(y) for clean data if cov3I = F, # or Cov(y3) = cI_3 if cov3I = T. # For outliers=0, want hquant and rquant approx 1. 
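#Illustrative calls (added sketch): under Ho (delta = delta3 = 0) the
#rejection proportions medcv, mncv, tmncv, ccv should be near 0.05;
#increasing delta estimates power.
# manbtsim2(n=100,n2=100,n3=100,m=2,B=200,nruns=100,delta=0)
# manbtsim2(n=100,n2=100,n3=100,m=2,B=200,nruns=100,delta=0.5)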
A <- sqrt(diag(1:m)) munot <- 0 * (1:m) mu <- delta * (1 + munot) mu3 <- delta3 * (1 + munot) val <- floor(gam * n) indx <- 1:n indx2 <- 1:n2 indx3 <- 1:n3 medmus <- matrix(0,nrow=B,ncol=2*m) mnmus <- medmus tmnmus <- medmus medcv <- 0 mncv <- 0 tmncv <- 0 crej <- 0 gp <- 0 grp <- 0 pval <- 0 out <- 0 chisqcut <- qchisq(p=0.95,df=2*m) cutoffs <- matrix(0,nrow=nruns,ncol=3) for(i in 1:nruns) { #make data y <- matrix(rnorm(n * m), ncol = m, nrow = n) y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2) y3 <- matrix(rnorm(n3 * m), ncol = m, nrow =n3) if(ytype == 2) { zu <- runif(n) y[zu < eps, ] <- y[zu < eps, ] * 5 zu <- runif(n2) y2[zu < eps, ] <- y2[zu < eps, ] * 5 zu <- runif(n3) y3[zu < eps, ] <- y3[zu < eps, ] * 5 } if(ytype == 3) { zu <- sqrt(rchisq(n, dd)/dd) y <- y/zu zu <- sqrt(rchisq(n2, dd)/dd) y2 <- y2/zu zu <- sqrt(rchisq(n3, dd)/dd) y3 <- y3/zu } if(ytype == 4){ #Want pop coord med(y) = 0. y <- exp(y) y <- y - 1 y2 <- exp(y2) y2 <- y2 - 1 y3 <- exp(y3) y3 <- y3 - 1 } y <- y %*% A y2 <- y2 %*% A y2 <- sig2 * y2 if( cov3I != T){ y3 <- y3 %*% A y3 <- sig3 * y3} if(outliers == 1) { y[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y[1:val, m] <- y[1:val, m] + pm } if(outliers == 2) { y[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y[1:val, 1] <- y[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:m y[1:val, ] <- y[1:val, ] + tem } if(outliers == 4) { y[1:val, m] <- pm } if(outliers == 5) { y[1:val, 1] <- pm } y <- mu + y y3 <- mu3 + y3 #clean y has mean mu =delta(1,...,1)^T, #y2 has mean (0,..,0)^T and y3 has mean delta3 (1,...,1)^T #get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T for(j in 1:B){ tem <- sample(indx,n,replace=T) medy <- apply(y[tem,],2,median) mny <- apply(y[tem,],2,mean) tmny <- apply(y[tem,],2,tmn) tem <- sample(indx2,n2,replace=T) medy2 <- apply(y2[tem,],2,median) mny2 <- apply(y2[tem,],2,mean) tmny2 <- apply(y2[tem,],2,tmn) tem <- sample(indx3,n3,replace=T) medy3 <- apply(y3[tem,],2,median) mny3 <- apply(y3[tem,],2,mean) tmny3 <- apply(y3[tem,],2,tmn) medmus[j,] <- c(medy-medy3,medy2-medy3) mnmus[j,] <- c(mny-mny3,mny2-mny3) tmnmus[j,] <- c(tmny-tmny3,tmny2-tmny3) } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim)^2 #Get the classical test coverage yall<-rbind(y,y2,y3) yall<-as.matrix(yall) gp <- c(rep(1, n),rep(2, n2),rep(3, n3)) grp<-factor(gp) out<-manova(yall~grp) pval <- summary(out,test="Hotelling-Lawley")$stats[1,6] #pvalue for Hotelling-Lawley's test if(pval < 0.05){ crej <- crej +1 } } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns mncut <- apply(cutoffs,2,mean) ccv <- crej/nruns list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,tmncv=tmncv,ccv=ccv) } manbtsim3<-function(n1 = 100, n2 = 100, n3 = 100, m = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, ytype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta1 = 0, delta2 = 0, delta3 = 0, cov3I = F){ #needs library(MASS) # This R function simulates a one way Manova type bootstrap test for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. #Can vary the mean vectors mui better than in manbtsim2. #Here y2 and y3 are clean, y1 may have outliers, and y1 is m by 1. # Uses the coordinatewise mean, median, and 25% trimmed mean. #Also does the classical Hotelling Lawley one way MANOVA test. 
#Partially written by Hasthika S. Rupasinghe Arachchige Don.
## Does not use the slow RMVN location estimators.
# Need m > 1. Want n1 > 20m, n2 > 20m, n3 > 20m, B > 20m.
# Multiply y by A where ytype = 1 for MVN Nm(0,I),
# 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I),
# 3 for multivariate t_d with d = dd,
# 4 for lognormal.
# outliers = 0 for no outliers and Y~N(0,diag(1,...,m)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers Y~N((pm,...,pm)',diag(1,...,m))
# 4 for outliers Y[i,m] = pm
# 5 for outliers Y[i,1] = pm
# For the point mass outlier types 4 and 5, need gam much smaller than 0.4.
# Power can be estimated by using unequal deltai so mu1 = delta1(1,...,1)
# and mu2 = delta2(1, ..., 1), mu3 = delta3(1,...,1).
# Cov(y1) = diag(1,2,...,m), Cov(y2) = sig2^2 Cov(y1) for clean data.
# Cov(y3) = sig3^2 Cov(y1) for clean data if cov3I = F,
# or Cov(y3) = cI_3 if cov3I = T.
# For outliers=0, want hquant and rquant approx 1.
A <- sqrt(diag(1:m))
munot <- 0 * (1:m)
mu1 <- delta1 * (1 + munot)
mu2 <- delta2 * (1 + munot)
mu3 <- delta3 * (1 + munot)
val <- floor(gam * n1)
indx1 <- 1:n1
indx2 <- 1:n2
indx3 <- 1:n3
medmus <- matrix(0,nrow=B,ncol=2*m)
mnmus <- medmus
tmnmus <- medmus
medcv <- 0
mncv <- 0
tmncv <- 0
crej <- 0
gp <- 0
grp <- 0
pval <- 0
out <- 0
chisqcut <- qchisq(p=0.95,df=2*m)
cutoffs <- matrix(0,nrow=nruns,ncol=3)
for(i in 1:nruns) {
#make data
y1 <- matrix(rnorm(n1 * m), ncol = m, nrow = n1)
y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2)
y3 <- matrix(rnorm(n3 * m), ncol = m, nrow = n3)
if(ytype == 2) {
zu <- runif(n1)
y1[zu < eps, ] <- y1[zu < eps, ] * 5
zu <- runif(n2)
y2[zu < eps, ] <- y2[zu < eps, ] * 5
zu <- runif(n3)
y3[zu < eps, ] <- y3[zu < eps, ] * 5
}
if(ytype == 3) {
zu <- sqrt(rchisq(n1, dd)/dd)
y1 <- y1/zu
zu <- sqrt(rchisq(n2, dd)/dd)
y2 <- y2/zu
zu <- sqrt(rchisq(n3, dd)/dd)
y3 <- y3/zu
}
if(ytype == 4){ #Want pop coord med(y) = 0.
y1 <- exp(y1) y1 <- y1 - 1 y2 <- exp(y2) y2 <- y2 - 1 y3 <- exp(y3) y3 <- y3 - 1 } y1 <- y1 %*% A y2 <- y2 %*% A y2 <- sig2 * y2 if( cov3I != T){ y3 <- y3 %*% A y3 <- sig3 * y3} if(outliers == 1) { y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y1[1:val, m] <- y1[1:val, m] + pm } if(outliers == 2) { y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y1[1:val, 1] <- y1[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:m y1[1:val, ] <- y1[1:val, ] + tem } if(outliers == 4) { y1[1:val, m] <- pm } if(outliers == 5) { y1[1:val, 1] <- pm } y1 <- mu1 + y1 y2 <- mu2 + y2 y3 <- mu3 + y3 #clean y1 has mean mu1 =delta1(1,...,1)^T, #y2 has mean delta2(1,..,1)^T and y3 has mean delta3 (1,...,1)^T #get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T for(j in 1:B){ tem <- sample(indx1,n1,replace=T) medy <- apply(y1[tem,],2,median) mny <- apply(y1[tem,],2,mean) tmny <- apply(y1[tem,],2,tmn) tem <- sample(indx2,n2,replace=T) medy2 <- apply(y2[tem,],2,median) mny2 <- apply(y2[tem,],2,mean) tmny2 <- apply(y2[tem,],2,tmn) tem <- sample(indx3,n3,replace=T) medy3 <- apply(y3[tem,],2,median) mny3 <- apply(y3[tem,],2,mean) tmny3 <- apply(y3[tem,],2,tmn) medmus[j,] <- c(medy-medy3,medy2-medy3) mnmus[j,] <- c(mny-mny3,mny2-mny3) tmnmus[j,] <- c(tmny-tmny3,tmny2-tmny3) } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim)^2 #Get the classical test coverage yall<-rbind(y1,y2,y3) yall<-as.matrix(yall) gp <- c(rep(1, n1),rep(2, n2),rep(3, n3)) grp<-factor(gp) out<-manova(yall~grp) pval <- summary(out,test="Hotelling-Lawley")$stats[1,6] #pvalue for Hotelling-Lawley's test if(pval < 0.05){ crej <- crej +1 } } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns mncut <- apply(cutoffs,2,mean) ccv <- crej/nruns list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,tmncv=tmncv,ccv=ccv) } manbtsim4<-function(n1 = 100, n2 = 100, n3 = 100, m = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, ytype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta1 = 0, delta2 = 0, delta3 = 0, cov3I = F){ #needs library(MASS) and library(Matrix) # This R function simulates one way Manova type tests for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. #Can vary the mean vectors mui better than in manbtsim2. #Has one more test than manbtsim3: #the large sample Manova type test based on the sample mean. #Here y2 and y3 are clean, y1 may have outliers, and y1 is m by 1. # Uses the coordinatewise mean, median, and 25% trimmed mean. #Also does the classical Hotelling Lawley one way MANOVA test. #Partially written by Hasthika S. Rupasinghe Arachchige Don. ## Does not use the slow RMVN location estimators. # Need m > 1. Want n > 20m, n2 > 20m, n3 > 20m, B > 20m. # Multiply y by A where ytype = 1 for MVN Nm(0,I), # 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I), # 3 for multivariate t_d with d = dd, # 4 for lognormal. # outliers = 0 for no outliers and Y~N(0,diag(1,...,m)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers Y~N((pm,...,pm)',diag(1,...,m)) # 4 for outliers Y[i,m] = pm # 5 for outliers Y[i,1] = pm # For the point mass outlier types 4 and 5, need gam much smaller than 0.4. 
# Power can be estimated by using unequal deltai so mu1 = delta1(1,...,1)
# and mu2 = delta2(1, ..., 1), mu3 = delta3(1,...,1).
# Cov(y1) = diag(1,2,...,m), Cov(y2) = sig2^2 Cov(y1) for clean data.
# Cov(y3) = sig3^2 Cov(y1) for clean data if cov3I = F,
# or Cov(y3) = cI_3 if cov3I = T.
# For outliers=0, want hquant and rquant approx 1.
A <- sqrt(diag(1:m))
munot <- 0 * (1:m)
mu1 <- delta1 * (1 + munot)
mu2 <- delta2 * (1 + munot)
mu3 <- delta3 * (1 + munot)
val <- floor(gam * n1)
indx1 <- 1:n1
indx2 <- 1:n2
indx3 <- 1:n3
medmus <- matrix(0,nrow=B,ncol=2*m)
mnmus <- medmus
tmnmus <- medmus
medcv <- 0
mncv <- 0
tmncv <- 0
mantcov <- 0
crej <- 0
gp <- 0
grp <- 0
pval <- 0
out <- 0
chisqcut <- qchisq(p=0.95,df=2*m)
cutoffs <- matrix(0,nrow=nruns,ncol=3)
for(i in 1:nruns) {
#make data
y1 <- matrix(rnorm(n1 * m), ncol = m, nrow = n1)
y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2)
y3 <- matrix(rnorm(n3 * m), ncol = m, nrow = n3)
if(ytype == 2) {
zu <- runif(n1)
y1[zu < eps, ] <- y1[zu < eps, ] * 5
zu <- runif(n2)
y2[zu < eps, ] <- y2[zu < eps, ] * 5
zu <- runif(n3)
y3[zu < eps, ] <- y3[zu < eps, ] * 5
}
if(ytype == 3) {
zu <- sqrt(rchisq(n1, dd)/dd)
y1 <- y1/zu
zu <- sqrt(rchisq(n2, dd)/dd)
y2 <- y2/zu
zu <- sqrt(rchisq(n3, dd)/dd)
y3 <- y3/zu
}
if(ytype == 4){ #Want pop coord med(y) = 0.
y1 <- exp(y1)
y1 <- y1 - 1
y2 <- exp(y2)
y2 <- y2 - 1
y3 <- exp(y3)
y3 <- y3 - 1
}
y1 <- y1 %*% A
y2 <- y2 %*% A
y2 <- sig2 * y2
if( cov3I != T){
y3 <- y3 %*% A
y3 <- sig3 * y3}
if(outliers == 1) {
y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val)
y1[1:val, m] <- y1[1:val, m] + pm
}
if(outliers == 2) {
y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val)
y1[1:val, 1] <- y1[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:m
y1[1:val, ] <- y1[1:val, ] + tem
}
if(outliers == 4) {
y1[1:val, m] <- pm
}
if(outliers == 5) {
y1[1:val, 1] <- pm
}
y1 <- mu1 + y1
y2 <- mu2 + y2
y3 <- mu3 + y3
#clean y1 has mean mu1 = delta1(1,...,1)^T,
#y2 has mean delta2(1,..,1)^T and y3 has mean delta3 (1,...,1)^T
#get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T
for(j in 1:B){
tem <- sample(indx1,n1,replace=T)
medy <- apply(y1[tem,],2,median)
mny <- apply(y1[tem,],2,mean)
tmny <- apply(y1[tem,],2,tmn)
tem <- sample(indx2,n2,replace=T)
medy2 <- apply(y2[tem,],2,median)
mny2 <- apply(y2[tem,],2,mean)
tmny2 <- apply(y2[tem,],2,tmn)
tem <- sample(indx3,n3,replace=T)
medy3 <- apply(y3[tem,],2,median)
mny3 <- apply(y3[tem,],2,mean)
tmny3 <- apply(y3[tem,],2,tmn)
medmus[j,] <- c(medy-medy3,medy2-medy3)
mnmus[j,] <- c(mny-mny3,mny2-mny3)
tmnmus[j,] <- c(tmny-tmny3,tmny2-tmny3)
}
outmed<-predreg(medmus)
medcv <- medcv + outmed$inr
outmn <- predreg(mnmus)
mncv <- mncv + outmn$inr
outtmn <- predreg(tmnmus)
tmncv <- tmncv + outtmn$inr
cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim)^2
#Get the classical test coverage
yall<-rbind(y1,y2,y3)
yall<-as.matrix(yall)
gp <- c(rep(1, n1),rep(2, n2),rep(3, n3))
grp<-factor(gp)
out<-manova(yall~grp)
pval <- summary(out,test="Hotelling-Lawley")$stats[1,6]
#pvalue for Hotelling-Lawley's test
if(pval < 0.05){
crej <- crej + 1
}
#large sample Manova type test based on the sample mean
pval <- mantyp(y=yall,p=3,group=grp)$pval
if(pval < 0.05) mantcov <- mantcov + 1
}
medcv <- 1 - medcv/nruns #prop of times Ho is rejected
mncv <- 1 - mncv/nruns
tmncv <- 1 - tmncv/nruns
mncut <- apply(cutoffs,2,mean)
ccv <- crej/nruns
mantcov <- mantcov/nruns
list(chisqcut = chisqcut, mncut=mncut, medcv = medcv,
mncv = mncv,tmncv=tmncv,ccv=ccv,mantcov=mantcov)
}
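#Illustrative comparison (added sketch, inert unless run by hand; assumes
#library(MASS), library(Matrix), and the slpack functions are sourced):
if(FALSE){
#level: all rejection proportions should be near 0.05
out <- manbtsim4(n1=100,n2=100,n3=100,m=2,B=200,nruns=100,
delta1=0,delta2=0,delta3=0)
#power: rejection proportions should increase with delta1
out <- manbtsim4(n1=100,n2=100,n3=100,m=2,B=200,nruns=100,
delta1=0.5,delta2=0,delta3=0)
}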
mantyp<-function(y,p,group,alph=0.05){
#Does a one way MANOVA type test based on large sample theory.
#needs library(Matrix)
#p is the number of groups and 1 < p < 10
#group = i if y is from the ith group
#n = sum(ni)
group <- as.factor(group)
y <- as.matrix(y)
n<-dim(y)[1]
m<-dim(y)[2]
Sp <- var(y[group==p,])
Tp <- apply(y[group==p,],2,mean)
ni<-1:p
q<-p-1
num<- m*q
ni[p] <- sum(group==p)
for(i in 1:q){
ni[i] <- sum(group==i)
if(i==1) {S1 = var(y[group==1,])
T1 <- apply(y[group==1,],2,mean)}
if(i==2) {S2 = var(y[group==2,])
T2 <- apply(y[group==2,],2,mean)}
if(i==3) {S3 = var(y[group==3,])
T3 <- apply(y[group==3,],2,mean)}
if(i==4) {S4 = var(y[group==4,])
T4 <- apply(y[group==4,],2,mean)}
if(i==5) {S5 = var(y[group==5,])
T5 <- apply(y[group==5,],2,mean)}
if(i==6) {S6 = var(y[group==6,])
T6 <- apply(y[group==6,],2,mean)}
if(i==7) {S7 = var(y[group==7,])
T7 <- apply(y[group==7,],2,mean)}
if(i==8) {S8 = var(y[group==8,])
T8 <- apply(y[group==8,],2,mean)}
}
dn <- min(ni)
A<-matrix(1,nrow=q,ncol=q)
B <- Sp/ni[p]
ssigw <- kronecker(A,B)
#still need to get the block diagonal of Sigw/n
if(p==2) BD <- S1/ni[1]
if(p==3) BD <- bdiag(S1/ni[1],S2/ni[2])
if(p==4) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3])
if(p==5) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4])
if(p==6) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5])
if(p==7) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5],S6/ni[6])
if(p==8) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5],S6/ni[6],
S7/ni[7])
if(p==9) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5],S6/ni[6],
S7/ni[7],S8/ni[8])
ssigw <- ssigw + BD
#ssigw = sigw/n, n = sum(ni)
n<-sum(ni)
ssigw <- as.matrix(ssigw)
sigw <- n*ssigw
if(p==2) w <- T1 - Tp
if(p==3) w <- c(T1-Tp,T2-Tp)
if(p==4) w <- c(T1-Tp,T2-Tp,T3-Tp)
if(p==5) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp)
if(p==6) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp)
if(p==7) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp,T6-Tp)
if(p==8) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp,T6-Tp,T7-Tp)
if(p==9) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp,T6-Tp,T7-Tp,T8-Tp)
t0 <- n* t(w) %*% solve(sigw) %*% w
pval <- 1-pf((t0/num),df1=num,df2=dn) #approx 1 - pchisq(t0,num)
cut <- num*qf(1-alph,df1=num,df2=dn) #reject Ho if t0 > cut
list(t0=t0,cut=cut,pval=pval)
}
mbamv<-function(x, y, nsamp = 7)
{# This function is for simple linear regression. The
# highlighted boxes get weight 1. Click on the right
# mouse button to advance the plot, and in R, highlight "stop."
# Only uses 50% trimming.
x <- as.matrix(x)
n <- dim(x)[1]
q <- dim(x)[2]
covv <- diag(q)
centers <- sample(n, nsamp)
for(i in 1:nsamp) {
md2 <- mahalanobis(x, center = x[centers[i], ], covv)
med <- median(md2)
plot(x, y)
points(x[md2 < med], y[md2 < med], pch = 15)
abline(lsfit(x[md2 < med],y[md2 < med]))
identify(x, y)
}
}
mbamv2<-function(x, Y, nsamp = 7) {
# This function is for multiple linear regression. The
# highlighted boxes get weight 1. Click on the right
# mouse button to advance the plot, and in R, highlight "stop."
# Only uses 50% trimming.
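#Illustrative call (added sketch): left click cases to identify them,
#then right click and choose Stop to advance each of the nsamp plots.
# mbamv2(x,Y,nsamp=7)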
x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) par(mfrow=c(2,1)) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) if(q ==1){out <- lsfit(x[md2 < med],Y[md2 < med])} else{out <- lsfit(x[md2 < med,],Y[md2 < med])} FIT <- out$coef[1] + x%*%out$coef[-1] RES <- Y - FIT plot(FIT,Y) points(FIT[md2 < med], Y[md2 < med], pch = 15) abline(0,1) identify(FIT, Y) plot(FIT,RES) points(FIT[md2 < med], RES[md2 < med], pch = 15) abline(0,0) identify(FIT, RES) } par(mfrow=c(1,1)) } mbareg<-function(x, y, nsamp = 7) {#gets the mbareg fit with 7 centers, med resid crit, 7 ball sizes x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] # q + 1 is number of predictors including intercept vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20 ), q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3), q + 3 + floor(n/2)) covv <- diag(q) centers <- sample(n, nsamp) temp <- lsfit(x, y) mbaf <- temp$coef critf <- median(temp$residuals^2) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) smd2 <- sort(md2) for(j in 1:7) { temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[ vals[j]]]) #Use OLS on rows with md2 <= cutoff = smd2[vals[j]] res <- y - temp$coef[1] - x %*% temp$coef[-1] crit <- median(res^2) if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } mldsim <-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20, outliers = 0, pm = 10, loc = 0.5){ # Need p > 1. # This R function compares the MBA, FCH, RFCH, RMVN, scaled DGK, classical, OGK, # FASTMCD, CMVE and biased MB estimators. # R needs library(robustbase), calls covrob, covdgk # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a point mass on major axis (0,...,0,pm)' # 2 for outliers a point mass on minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # 6 for outliers a tight cluster at major axis (0,...,0,pm)' # Also finds n*var(T[p]) which approx p for the classical estimator # and n var(C(p,p)) for MLD estimator (T,C). 
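#Illustrative call (added sketch, assumes library(robustbase) and the
#slpack functions covrob and covdgk are sourced): the counts such as
#rmvnct record how often an estimator separates outliers from clean cases.
# mldsim(n=100,p=4,gam=0.4,runs=20,outliers=3,pm=10)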
A <- sqrt(diag(1:p)) qchi <- qchisq(0.5, p) cs <- 1:runs cm <- cs cloc <- 0 * (1:p) csig <- 0 * A mbas <- 1:runs mbam <- cs mbaloc <- cloc mbasig <- csig fchs <- mbas fchm <- cs fchloc <- cloc fchsig <- csig rfchs <- mbas rfchm <- cs rfchloc <- cloc rfchsig <- csig rmvns <- mbas rmvnm <- cs rmvnloc <- cloc rmvnsig <- csig dgks <- mbas dgkm <- cs dgkloc <- cloc dgksig <- csig ogks <- mbas ogkm <- cs ogkloc <- cloc ogksig <- csig fmcds <- mbas fmcdm <- cs fmcdloc <- cloc fmcdsig <- csig cmveloc <- cloc cmvesig <- csig mbs <- mbas mbm <- cs mbsig <- csig mbloc <- cloc mbact <- 0 fchct <- 0 rfchct <- 0 rmvnct <- 0 dgkct <- 0 cct <- 0 ogkct <- 0 fmcdct <- 0 cmvect <- 0 mbct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- 0 x[1:val,p] <- pm } if(outliers == 2) { x[1:val, ] <- 0 x[1:val,1] <- pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p ] <- pm } if(outliers == 5) { x[1:val, 1 ] <- pm } if(outliers == 6) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } out <- covrob(x, csteps = steps, locc = loc) mbaloc <- mbaloc + out$center mbasig <- mbasig + out$cov mbam[i] <- out$center[p] mbas[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1 rd2 <- mahalanobis(x, out$mnm, out$covm) const <- median(rd2)/qchi covm <- const * out$covm mbloc <- mbloc + out$mnm mbsig <- mbsig + covm mbm[i] <- out$mnm[p] mbs[i] <- covm[p,p] #scaling does not effect outlier count if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 fchloc <- fchloc + out$mnf fchsig <- fchsig + out$covf fchm[i] <- out$mnf[p] fchs[i] <- out$covf[p,p] rd2 <- mahalanobis(x, out$mnf, out$covf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1 rfchloc <- rfchloc + out$rmnf rfchsig <- rfchsig + out$rcovf rfchm[i] <- out$rmnf[p] rfchs[i] <- out$rcovf[p,p] rd2 <- mahalanobis(x, out$rmnf, out$rcovf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1 rmvnloc <- rmvnloc + out$rmnmvn rmvnsig <- rmvnsig + out$rcovmvn rmvnm[i] <- out$rmnmvn[p] rmvns[i] <- out$rcovmvn[p,p] rd2 <- mahalanobis(x, out$rmnmvn, out$rcovmvn) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1 cmveloc <- cmveloc + out$mncmv cmvesig <- cmvesig + out$covcmv rd2 <- mahalanobis(x, out$mncmv, out$covcmv) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1 out <- covdgk(x, csteps = 10) dgkloc <- dgkloc + out$center dgksig <- dgksig + out$cov dgkm[i] <- out$center[p] dgks[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1 center <- apply(x,2,mean) cloc <- cloc + center ccov <- var(x) csig <- csig + ccov cm[i] <- center[p] cs[i] <- ccov[p,p] rd2 <- mahalanobis(x, center, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 out <- covOGK(x,sigmamu = scaleTau2) ogkloc <- ogkloc + out$center ogksig <- ogksig + out$cov ogkm[i] <- out$center[p] ogks[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) ogkct <- ogkct + 1 out <- covMcd(x) #for Det-MCD, use #out <- covMcd(x,nsamp="deterministic") #Use out <- covmb2(x) for the covmb2 estimator. 
fmcdloc <- fmcdloc + out$center fmcdsig <- fmcdsig + out$cov fmcdm[i] <- out$center[p] fmcds[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1 } mbasv <- n*var(mbas) mbam <- n*var(mbam) mbaloc <- mbaloc/runs mbasig <- mbasig/runs fchsv <- n*var(fchs) fchm <- n*var(fchm) fchloc <- fchloc/runs fchsig <- fchsig/runs rfchsv <- n*var(rfchs) rfchm <- n*var(rfchm) rfchloc <- rfchloc/runs rfchsig <- rfchsig/runs rmvnsv <- n*var(rmvns) rmvnm <- n*var(rmvnm) rmvnloc <- rmvnloc/runs rmvnsig <- rmvnsig/runs dgksv <- n*var(dgks) dgkm <- n*var(dgkm) dgkloc <- dgkloc/runs dgksig <- dgksig/runs ogksv <- n*var(ogks) ogkm <- n*var(ogkm) ogkloc <- ogkloc/runs ogksig <- ogksig/runs csv <- n*var(cs) cm <- n*var(cm) cloc <- cloc/runs csig <- csig/runs fmcdsv <- n*var(fmcds) fmcdm <- n*var(fmcdm) fmcdloc <- fmcdloc/runs fmcdsig <- fmcdsig/runs cmveloc <- cmveloc/runs cmvesig <- cmvesig/runs mbsv <- n*var(mbs) mbm <- n*var(mbm) mbloc <- mbloc/runs mbsig <- mbsig/runs list(mbaloc = mbaloc, fchloc = fchloc, rfchloc = rfchloc, rmvnloc = rmvnloc,dgkloc = dgkloc, cloc = cloc, ogkloc = ogkloc, fmcdloc = fmcdloc, cmveloc = cmveloc, mbloc = mbloc, mbasig = mbasig, fchsig = fchsig, rfchsig = rfchsig, rmvnsig = rmvnsig, dgksig = dgksig, csig = csig, ogksig = ogksig, fmcdsig = fmcdsig, cmvesig = cmvesig, mbsig=mbsig, mbasv = mbasv, fchsv=fchsv, rfchsv = rfchsv, rmvnsv=rmvnsv, dgksv=dgksv, ogksv=ogksv, csv=csv, fmcdsv=fmcdsv, mbsv=mbsv, mbam=mbam, fchm=fchm, rfchm=rfchm, rmvnm=rmvnm, dgkm=dgkm, ogkm=ogkm, cm=cm, fmcdm=fmcdm, mbm=mbm, cct = cct, dgkct=dgkct, mbact=mbact, fchct = fchct, rfchct = rfchct, rmvnct=rmvnct, ogkct=ogkct, fmcdct=fmcdct,cmvect=cmvect,mbct=mbct) } mldsim6<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 100, outliers = 0, pm = 10, kk=5, osteps = 0){ # This R function compares the # FCH, RFCH, CMVE, RCMVE, RMVN, COVMB2, and MB estimators. 
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# Calls cmve, covfch, covrmvn, covmb2
A <- sqrt(diag(1:p))
fchct <- 0
rfchct <- 0
cmvect <- 0
rcmvect <- 0
rmvnct <- 0
mbct <- 0
covmb2ct <- 0
val <- floor(gam * n)
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
out <- covfch(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
rd2 <- mahalanobis(x, out$rmnf, out$rcovf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1
out <- cmve(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1
rd2 <- mahalanobis(x, out$rmnf, out$rcovf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rcmvect <- rcmvect + 1
rd2 <- mahalanobis(x, out$mnm, out$covm)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
out <- covrmvn(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1
out <- covmb2(x, k=kk, msteps=osteps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1
}
list(fchct = fchct, rfchct = rfchct, cmvect = cmvect, rcmvect = rcmvect,
rmvnct = rmvnct, covmb2ct = covmb2ct, mbct = mbct)
}
mldsim7<-function(n = 100, p = 2, gam = 0.4, runs = 100, outliers = 0,
pm = 10, kk=5, osteps = 0){
# This R function examines the COVMB2 estimator.
#The function mldsim6 uses the weighted covariance matrix out$cov.
#The function mldsim7 uses squared Euclidean distances based on Ip,
#or the squared Mahalanobis distances using diag(out$cov).
# Counts number of times all outlier distances > clean distances.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# Calls covmb2
A <- sqrt(diag(1:p))
covv <- diag(p) #identity matrix Ip
covmb2ct <- 0
diagct <- 0
val <- floor(gam * n)
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
out <- covmb2(x, k=kk, msteps=osteps)
rd2 <- mahalanobis(x, out$center, covv)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1
rd2 <- mahalanobis(x, out$center, diag(diag(out$cov)))
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) diagct <- diagct + 1
}
list(covmb2ct = covmb2ct, diagct = diagct)
}
MLRplot<-function(x, Y){
# Response plot and residual plot.
# On a workstation, activate a graphics device
# with the command "X11()" or "motif()."
# R needs the command "library(MASS)" if a robust estimator replaces lsfit.
# Advance the view with the right mouse button.
x <- as.matrix(x)
out <- lsfit(x, Y)
cook <- ls.diag(out)$cooks
n <- dim(x)[1]
p <- dim(x)[2] + 1
tem <- cook > min(0.5, (2 * p)/n)
bhat <- out$coef
FIT <- Y - out$res
cmar <- par("mar")
par(mfrow = c(2, 1))
par(mar=c(4.0,4.0,2.0,0.5))
plot(FIT, Y)
abline(0, 1)
points(FIT[tem], Y[tem], pch = 15)
title("Response Plot")
identify(FIT, Y)
RES <- Y - FIT
plot(FIT, RES)
points(FIT[tem], RES[tem], pch = 15)
title("Residual Plot")
identify(FIT, RES)
par(mfrow = c(1, 1))
par(mar=cmar)
}
mlrplot2 <- function(x, Y)
{# Makes the response plot and residual plot for two mbareg estimators.
# On a workstation, activate a graphics device
# with the command "X11()" or "motif()."
# R needs the command "library(MASS)" if a robust estimator replaces lsfit.
# Advance the view with the right mouse button, and in R, highlight "stop."
x <- as.matrix(x)
out <- mbareg(x, Y)
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
par(mfrow = c(2, 2))
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("MBA Response Plot")
RES <- Y - FIT
plot(FIT, RES)
identify(FIT, RES)
title("MBA Residual Plot")
#
out <- mbalata(x, Y)
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("MBALATA Response Plot")
RES <- Y - FIT
plot(FIT, RES)
identify(FIT, RES)
title("MBALATA Residual Plot")
par(mfrow=c(1,1))
}
mlrplot4<-function(x, Y){
# This function is for R. Use mlrplot2 for R and Splus.
# Makes the response plot and residual plot with large
# Cook's distances and Pena's distances highlighted.
# Squares have large Cook's distances
# and crosses have large Pena's distances.
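#Illustrative call (added sketch): squares flag large Cook's distances,
#crosses flag large Pena's distances; click points to identify them,
#then right click Stop.
# mlrplot4(x,Y)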
x <- as.matrix(x) out <- lsfit(x, Y) cook <- ls.diag(out)$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 #get Pena's distances s one <- 1 + 0 * 1:n w <- cbind(one, x) cp <- t(w) %*% w h <- w %*% solve(cp) %*% t(w) s <- 0 * 1:n for(i in 1:n) { for(j in 1:n) { s[i] <- s[i] + (cook[j] * h[i, j]^2)/(h[i, i] * h[j, j]) } } tem <- cook > min(0.5, (2 * p)/n) medd <- median(s) madd <- mad(s, constant = 1) tem2 <- abs(s - medd) > 4.5 * madd bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) plot(FIT, Y) abline(0, 1) points(FIT[tem], Y[tem], pch = 0) points(FIT[tem2], Y[tem2], pch = 3) identify(FIT, Y) title("Response Plot") RES <- Y - FIT plot(FIT, RES) points(FIT[tem], RES[tem], pch = 0) points(FIT[tem2], RES[tem2], pch = 3) identify(FIT, RES) title("Residual Plot") par(mfrow = c(1, 1)) par(mar=cmar) } MLRsim<-function(n = 100, q = 7, nruns = 4, eps = 0.1, shift = 9, type = 1){ #Right click Stop for each plot. #Generates response and residual plots for MLR #for a few iid error distributions: # if type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients b <- 0 * 1:q + 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set MLRplot(x,y) #get the response and residual plots }} mltreg<-function(x, y, indices = c(1,2)){ ##Need p > 1, m > 1. # Does multivariate linear regression. # Advance the plot by highlighting Stop with the right mouse button. # Want n > mp and n > 10p for Hotelling Lawley pvalues. # The indices are the variables to be left out of the # reduced model for the MANOVA partial F test. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] r <- length(indices) #Get L for the MANOVA partial F test L <- matrix(0,nrow=r,ncol=p) for(i in 1:r) L[i,indices[i]] <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept for(i in 1:m){ out <- lsfit(x,y[,i]) res[,i] <- out$res fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } for(i in 1:m){ MLRplot(x,y[,i]) } Covhat <- (n-1)/(n-p) * var(res) #Get pvalues for testing whether jth predictor is #needed in the model given the other predictors are in the model. one <- 1 + 0*1:n w <- cbind(one,x) pvals <- 0*1:p Fj <- pvals J <- t(w)%*%w J <- solve(J) Covinv <- solve(Covhat) dendf <- n - m*p if( n <= m*p) dendf <- 1 for(j in 1:p){ T <- t(Bhat[j,])%*%Covinv%*%Bhat[j,]/J[j,j] pvals[j] <- 1 - pf((T/m),m,dendf) Fj[j] <- T/m } #Get pval for MANOVA F test for whether the #nontrivial predictors are needed in the model. D <- as.matrix(Bhat[-1,]) if(p == 2) D <- t(D) tem <- as.matrix(J[-1,-1]) teminv <- solve(tem) Weinv <- Covinv/(n-p) H <- t(D)%*%teminv%*%D C <- Weinv%*%H eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values eig <- as.double(eig) #as.real(eig) u <- sum(eig) MANOVAF <- (n-p)*u/((p-1)*m) pval <- 1 - pf(MANOVAF,(p-1)*m,dendf) MANOVA <- cbind(MANOVAF, pval) #got MANOVA F test summaries #Get pval for MANOVA partial F test for whether the #predictors given by indices are needed in the model. 
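#Added note: below, u is the Hotelling Lawley trace, the sum of the
#eigenvalues of Weinv %*% H, and partialF = (n-p) u/(r m) is compared
#to an F(r m, dendf) distribution.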
tem <- L%*%J%*%t(L)
teminv <- solve(tem)
H <- t(L%*%Bhat)%*%teminv%*%(L%*%Bhat)
C <- Weinv%*%H
eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
eig <- as.double(eig) # as.real(eig)
u <- sum(eig)
partialF <- (n-p)*u/(r*m)
Pval <- 1 - pf(partialF,r*m,dendf)
partial <- cbind(partialF, Pval)
##got partial F test summaries
Ftable <- cbind(Fj,pvals)
list(fit = fit, res = res, Covhat = Covhat, Bhat = Bhat, partial=partial,
Ftable=Ftable, MANOVA=MANOVA)
}
mregddsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1,
eps = 0.25, psi = 0.1, dd = 7, mnull = F, alph=0.1){
## Need p > 1, m > 1.
# Simulates DD plots of residuals for the
# multivariate linear regression model.
# The identity line and lines corresponding to the
# 100(1-alph)% prediction regions are added.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp + 10, n > 10m and n > 10 p
# multiply E by A where etype = 1 for MVN Nm(0,I),
# etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) psi^2, while the off
# diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations
# are (2 psi + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
q <- p - 1
A <- matrix(psi,nrow=m,ncol=m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m){
for(j in 1:(m-1)){
B[p-j+1,j:m] <- 0
}
}
if(p > m){
for(j in 1:m){
B[p-j+1,j:m] <- 0
}
}
if(p < m){
for(j in 1:(p-1)){
B[p-j+1,j:m] <- 0
}
}
if (mnull == T) B[-1,] <- 0
Bhat <- B
one <- 1 + 0*1:n
for(i in 1:nruns) {
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
w <- cbind(one,x)
y <- w %*% B
#make error matrix E
E <- matrix(rnorm(n * m), nrow = n, ncol = m)
if(etype == 2) {
zu <- runif(n)
E[zu < eps, ] <- E[zu < eps, ] * 5
}
if(etype == 3) {
zu <- sqrt(rchisq(n, dd)/dd)
E <- E/zu
}
if(etype == 4) E <- exp(E) - exp(0.5)
#want mean 0 error vectors
E <- E %*% A
# got error matrix E
y <- y + E
for(j in 1:m){
out <- lsfit(x,y[,j])
res[,j] <- out$res
fit[,j] <- y[,j] - res[,j] #fit = response minus residuals
Bhat[,j] <- out$coef
}
##make DD plot of the errors
ddplot4(res, alpha=alph)
}
Covhat <- (n-1)/(n-p) * var(res)
list(Bhat = Bhat, B=B, Covhat = Covhat)
}
mregsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1,
eps = 0.25, psi = 0.1, dd = 7, mnull = T){
## Need p > 1, m > 1.
# Simulates multivariate linear regression model and gets
# pvalues for tests for whether the p-1 nontrivial
# predictors are needed in the model and
# pvalues for tests for whether Xi is needed in the model.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp and n > 10 p
# multiply E by A where etype = 1 for MVN Nm(0,I),
# etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue.
# The diagonal elements
# of the covariance matrix are 1 + (m-1) psi^2, while the off
# diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations
# are (2 psi + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# Illustrates that Hotelling Lawley statistic = last statistic/(n-p).
q <- p - 1
A <- matrix(psi,nrow=m,ncol=m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m){
for(j in 1:(m-1)){
B[p-j+1,j:m] <- 0
}
}
if(p > m){
for(j in 1:m){
B[p-j+1,j:m] <- 0
}
}
if(p < m){
for(j in 1:(p-1)){
B[p-j+1,j:m] <- 0
}
}
if (mnull == T) B[-1,] <- 0
Bhat <- B
one <- 1 + 0*1:n
fcov <- 0*1:p
wilkcov <- fcov
pilcov <- fcov
hotlawcov <- fcov
roycov <- fcov
rden <- n - p - m + 1
rcut <- qf(0.95,m,rden)
dendf <- n - m*p
wcv <- 0
pcv <- 0
hlcv<-0
rcv<-0
fcv<-0
ndf <- (p-1)*m
mancv <- cbind(wcv,pcv,hlcv,rcv,fcv)
#L <- cbind(fcov[-1],diag(p-1)) #want n > mp
fcut <- qf(0.95, m, dendf)
mcut <- qf(0.95,m*(p-1),dendf)
h <- max(p-1,m)
mrcut <- qf(0.95,h,(n-h-1))
for(i in 1:nruns) {
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
w <- cbind(one,x)
y <- w %*% B
#make error matrix E
E <- matrix(rnorm(n * m), nrow = n, ncol = m)
if(etype == 2) {
zu <- runif(n)
E[zu < eps, ] <- E[zu < eps, ] * 5
}
if(etype == 3) {
zu <- sqrt(rchisq(n, dd)/dd)
E <- E/zu
}
if(etype == 4) E <- exp(E) - exp(0.5)
#want mean 0 error vectors
E <- E %*% A
# got error matrix E
y <- y + E
for(j in 1:m){
out <- lsfit(x,y[,j])
res[,j] <- out$res
fit[,j] <- y[,j] - res[,j]
Bhat[,j] <- out$coef
}
Covhat <- (n-1)/(n-p) * var(res)
# w is the data matrix with a column of ones
J <- t(w)%*%w
J <- solve(J)
We <- Covhat*(n-p)
Covinv <- solve(Covhat)
Weinv <- Covinv/(n-p)
#get the test statistics for whether Xj is needed in the model
for(j in 1:p){
T <- t(Bhat[j,])%*%Covinv%*%Bhat[j,]/J[j,j]
if(T/m > fcut) fcov[j] <- fcov[j] + 1
H <- Bhat[j,]%*%t(Bhat[j,])/J[j,j]
C <- Weinv%*%H
eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
eig <- as.double(eig) #as.real(eig)
lmax <- eig[1]
u <- sum(eig)
v <- sum(eig/(eig+1))
wilk <- prod(1/(eig+1))
if(-(n-p-1-0.5*m)*log(wilk)/m > fcut) wilkcov[j] <- wilkcov[j] + 1
if((n-p)*v/m > fcut) pilcov[j] <- pilcov[j] + 1
if((n-p)*u/m > fcut) hotlawcov[j] <- hotlawcov[j] + 1
if(rden*lmax/m > rcut) roycov[j] <- roycov[j] + 1
}
#get the MANOVA F test statistics for whether nontrivial
#predictors are needed in the model
D <- as.matrix(Bhat[-1,])
if(p == 2) D <- t(D)
vecD <- as.matrix(D[,1])
for(j in 2:m){
vecD <- rbind(vecD,as.matrix(D[,j]))}
tem <- as.matrix(J[-1,-1])
teminv <- solve(tem)
T <- t(vecD)%*%kronecker(Covinv,teminv)%*%vecD
if(T/ndf > mcut) mancv[5] <- mancv[5] + 1
H <- t(D)%*%teminv%*%D
C <- Weinv%*%H
eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
eig <- as.double(eig)
lmax <- eig[1]
u <- sum(eig)
v <- sum(eig/(eig+1))
wilk <- prod(1/(eig+1))
if(-(n-0.5*p-0.5*m-2)*log(wilk)/ndf > mcut) mancv[1] <- mancv[1] + 1
if((n-p)*v/ndf > mcut) mancv[2] <- mancv[2] + 1
if((n-p)*u/ndf > mcut) mancv[3] <- mancv[3] + 1
if((n-h-1)*lmax/h > mrcut) mancv[4] <- mancv[4] + 1
}
wilkcov <- wilkcov/nruns
pilcov <- pilcov/nruns
hotlawcov <- hotlawcov/nruns
roycov <- roycov/nruns
fcov <- fcov/nruns
mancv <- mancv/nruns
list(Bhat = Bhat, B=B, Covhat = Covhat, wilkcov=wilkcov, pilcov=pilcov,
hotlawcov=hotlawcov, roycov=roycov, fcov=fcov, mancv=mancv)
}
pcisim<- function(nruns = 1000, n1 = 10, n2 = 10, mu1 = 0, mu2 = 0,
var1 = 1, var2 = 1, dist = 1, alpha = 0.05)
{ #gets the pooled, modified pooled, and Welch 100
(1-alpha)% CIs for mu1 - mu2 #defaults are alpha = .05, lengths make the most sense if n1 = n2 #dist = 1 for x ~ N(mu1, var1), y ~ N(mu2, var2) #dist = 2 for x ~ EXP with mean mu1 variance var1, # y ~ EXP with mean mu2 variance var2 #dist = 3 for x ~ N(mu1, var1), y EXP with mean mu2 variance var2 diff <- mu1 - mu2 up <- 1 - alpha/2 pdf <- n1 + n2 - 2 rhohat <- n1/(n1 + n2) pcov <- 0 mpcov <- 0 wcov <- 0 plow <- 1:nruns pup <- plow mplow <- plow mpup <- plow wlow <- plow wup <- plow tpcut <- qt(up, pdf) mtpcut <- qt(up, n1 + n2 - 4) for(i in 1:nruns) { if(dist == 1) { x <- mu1 + sqrt(var1) * rnorm(n1) y <- mu2 + sqrt(var2) * rnorm(n2) } if(dist == 2) { x <- mu1 - sqrt(var1) + sqrt(var1) * rexp(n1) y <- mu2 - sqrt(var2) + sqrt(var2) * rexp(n2) } if(dist == 3) { x <- mu1 + sqrt(var1) * rnorm(n1) y <- mu2 - sqrt(var2) + sqrt(var2) * rexp(n2) } xbar <- mean(x) s1sq <- var(x) ybar <- mean(y) s2sq <- var(y) xmybar <- xbar - ybar sp <- (n1 - 1) * s1sq + (n2 - 1) * s2sq sp <- sqrt(sp/pdf) val <- sp * sqrt(1/n1 + 1/n2) #get pooled CI tem <- tpcut * val plow[i] <- xmybar - tem pup[i] <- xmybar + tem if(plow[i] < diff && pup[i] > diff) pcov <- pcov + 1 #get modified pooled CI thetahat <- s2sq/s1sq tauhatsq <- (1 - rhohat + rhohat * thetahat)/(rhohat + (1 - rhohat) * thetahat) tauhat <- sqrt(tauhatsq) tem <- mtpcut * val * tauhat mplow[i] <- xmybar - tem mpup[i] <- xmybar + tem if(mplow[i] < diff && mpup[i] > diff) mpcov <- mpcov + 1 #get Welch CI t1 <- s1sq/n1 t2 <- s2sq/n2 do <- (t1 + t2)^2/(t1^2/(n1 - 1) + t2^2/(n2 - 1)) d <- max(1, floor(do)) wtcut <- qt(up, d) val <- sqrt(s1sq/n1 + s2sq/n2) tem <- wtcut * val wlow[i] <- xmybar - tem wup[i] <- xmybar + tem if(wlow[i] < diff && wup[i] > diff) wcov <- wcov + 1 } pcov <- pcov/nruns plen <- sqrt(n1) * mean(pup - plow) mpcov <- mpcov/nruns mplen <- sqrt(n1) * mean(mpup - mplow) wcov <- wcov/nruns wlen <- sqrt(n1) * mean(wup - wlow) #print(mean(x)) #print(var(x)) #print(mean(y)) #print(var(y)) list(pcov = pcov, plen = plen, mpcov = mpcov, mplen = mplen, wcov = wcov, wlen = wlen ) } pifclean<-function(k, gam) { p <- floor(log(3/k)/log(1 - gam)) list(p = p) } pifold<-function(x,y,k=5,alph = 0.05){ #Does k-fold CV using PI coverage and average length. #A y case in the jth fold is in its PI iff the ``residual" of the case #is in the PI for the residuals of the cases not in the jth fold. #Use for k < < n: very inefficient for leave one out CV = n-fold CV. x<-as.matrix(x) n<-nrow(x) cov <- 0 len <- 0 d <- ncol(x) + 1 folds<-rand(n,k)$groups #folds<-sample(1:k,n,replace=TRUE) #could be quicker for(j in 1:k){ tem<-lsfit(x[folds!=j,],y[folds!=j]) res <-tem$resid #want PI for the residuals of cases not in the jth fold temp <- dpi(yf=0,yfhat=0,d=d,resid=res,alph=alph) yfhat <- tem$coef[1] + as.matrix(x[folds==j,]) %*% tem$coef[-1] fres <- y[folds==j] - yfhat cov <- cov + sum((temp$low <= fres) * (temp$up >= fres)) len <- len + sum(folds==j)*(temp$up - temp$low) } cov <- cov/n alen <- len/n list(cov=cov,alen=alen) } poiscisim<- function(n = 100, nruns = 500, theta = 1, alpha = 0.05) { # Simulates the classical, modified and ``exact" CIs for theta. 
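#From the code below: the classical CI is xbar +/- z_{1-alpha/2} sqrt(xbar/n),
#the modified CI shifts sum(x) by -0.5 and +0.5 before adding the normal
#quantile term, and the ``exact" Grosh CI inverts chi-square quantiles.
#Hypothetical usage sketch (illustrative values, not from the source):
#poiscisim(n = 100, nruns = 500, theta = 1, alpha = 0.05)
#coverages near 0.95 and small scaled lengths sqrt(n)*mean(up - low) are good.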
ccov <- 0 mcov <- 0 ecov <- 0 cut <- 1 - alpha/2 cut2 <- alpha/2 clow <- 1:nruns cup <- clow mlow <- clow mup <- clow elow <- clow eup <- clow zup <- qchisq(1 - alpha, 2)/(2 * n) val <- qnorm(cut) valsq <- val^2 for(i in 1:nruns) { x <- rpois(n, theta) # get classical CI xbar <- mean(x) se <- sqrt(xbar/n) tem <- val * se clow[i] <- xbar - tem cup[i] <- xbar + tem if(clow[i] < theta && cup[i] > theta) ccov <- ccov + 1 #get modified CI xsum <- sum(x) tem <- xsum - 0.5 + 0.5 * valsq - val * sqrt(xsum - 0.5 + valsq/4) mlow[i] <- tem/n tem <- xsum + 0.5 + 0.5 * valsq + val * sqrt(xsum + 0.5 + valsq/4) mup[i] <- tem/n if(mlow[i] < theta && mup[i] > theta) mcov <- mcov + 1 #get Grosh exact CI w <- as.integer(xsum) if(w == 0) { elow[i] <- 0 eup[i] <- zup } else { elow[i] <- qchisq(cut2, 2 * w)/(2 * n) eup[i] <- qchisq(cut, (2 * w + 2))/(2 * n) } if(elow[i] < theta && eup[i] > theta) ecov <- ecov + 1 } ccov <- ccov/nruns clen <- sqrt(n) * mean(cup - clow) mcov <- mcov/nruns mlen <- sqrt(n) * mean(mup - mlow) ecov <- ecov/nruns elen <- sqrt(n) * mean(eup - elow) list(ccov = ccov, clen = clen, mcov = mcov, mlen = mlen, ecov = ecov, elen = elen) } PRboot<-function(x,y,B = 1000){ #needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes #bootstraps the Poisson regression full model x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] tdata <- as.data.frame(cbind(x,y)) out <- glm(y~., family=poisson, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) betas[i,] <- temp$coef } list(bhat=bhat,betas=betas) } predreg<-function(x, alpha = 0.05){ # Makes a prediction region for the rows of x. # If p = 1, the shorth interval should work better. #Also computes the distance for the 0 vector. x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] zero <- 0*(1:p) up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) # MD is the classical distance MD <- sqrt(md2) #get nonparametric prediction region boundary cuplim <- sqrt(quantile(md2, up)) D0 <- sqrt(mahalanobis(zero, center, cov)) list(cuplim = cuplim, D0=D0, MD=MD, center=center, cov=cov) } predrgn <-function(x, xf, alpha = 0.05){ # Makes a prediction region for xf when cases are rows of x. # If p = 1, the shorth interval should work better. x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] inr <- 0 up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) # md2 is the classical squared Mahalanobis distance #get nonparametric prediction region boundary cuplim <- quantile(md2, up) #inr <- 1 if xf is in the prediction region xfd <- mahalanobis(xf, center, cov) if(xfd <= cuplim) inr <- 1 list(md2=md2, center=center,cov=cov, cuplim = cuplim, xfd=xfd, inr=inr) } predsim<-function(n = 100, p = 4, nruns = 100, xtype = 1, dd = 1, eps = 0.25, alpha = 0.1) {# MAY NOT WORK IF p = 1. # Gets coverages of nonparametric, semiparametric and # parametric MVN prediction regions.
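#The nonparametric region covers a new case z when (z - xbar)^T S^{-1} (z - xbar)
#<= quantile(md2, up), as in predreg above; the semiparametric region replaces
#(xbar, S) by the covrmvn estimator and the parametric MVN region uses the
#qchisq(up, p) cutoff. Hypothetical usage sketch (default arguments shown):
#predsim(n = 100, p = 4, nruns = 100, xtype = 1, alpha = 0.1)
#ncvr, scvr, mcvr should be near 1 - alpha; voln, vols, volm compare volumes.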
# Multiply x by A where xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 (with delta = eps) for delta Np(0,I) + (1-delta) Np(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # mahalanobis gives squared Maha distances # set.seed(974) ccvr <- 0 scvr <- 0 rcvr <- 0 volc <- 1:nruns vols <- volc volr <- volc #up <- 1 - alpha up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha A <- sqrt(diag(1:p)) m <- n + 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(m * p), nrow = m, ncol = p) if(xtype == 2) { zu <- runif(m) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(m) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(m) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(m, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(m, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(m, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(m, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(m) x[zu < eps, ] <- x[zu < eps, ] * 5 } x <- x %*% A xx <- x[ - m, ] center <- apply(xx, 2, mean) cov <- var(xx) md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) if(mahalanobis(t(x[m, ]), center, cov) <= hsq) ccvr <- ccvr + 1 volc[i] <- sqrt(hsq)^p * prod(diag(chol(cov))) out <- covrmvn(xx) center <- out$center cov <- out$cov md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) dsq <- mahalanobis(t(x[m, ]), center, cov) if(dsq <= hsq) scvr <- scvr + 1 sqrtdet <- prod(diag(chol(cov))) vols[i] <- sqrt(hsq)^p * sqrtdet hsq <- qchisq(up, p) if(dsq <= hsq) rcvr <- rcvr + 1 volr[i] <- sqrt(hsq)^p * sqrtdet } ccvr <- ccvr/nruns scvr <- scvr/nruns rcvr <- rcvr/nruns #get a measure of efficiency wrt vols, so eff(vols) = 1 vols <- mean(vols) volc <- mean(volc)/vols volr <- mean(volr)/vols vols <- 1 list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln = volc, vols = vols, volm = volr, up = up) } pregbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, B=1000, int=1, a = 1, alpha = 0.05){ ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for Poisson regression (full model). # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so there are zeroes in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. 
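#The tests below use confreg: the prediction region method covers H_0 when the
#distance D0 of the hypothesized value from the bootstrap sample center is at
#most cuplim, the hybrid method compares br0 (the distance from the statistic)
#to cuplim, and the Bickel and Ren method compares br0 to brlim.
#Hypothetical usage sketch (small nruns and B for speed; illustrative only):
#pregbootsim(n = 100, p = 4, k = 1, nruns = 10, B = 200)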
#set.seed(974) ##need p>2 and want n >= 5p q <- p-1 pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta<-c(int,b) one <- as.vector(0*1:(k+1) + 1) one[1] <- int for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) tdata <- as.data.frame(cbind(x,y)) #make a PR data set out <- glm(y~., family=poisson, data=tdata) bhat<-out$coef ESP <- predict(out,newdata = tdata) betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ ydat <- rpois(n,lambda=exp(ESP)) tdat <- as.data.frame(cbind(x,ydat)) temp<-glm(ydat~., family=poisson, data=tdat) betas[i,] <- temp$coef } for (j in 1:p){ tem <- shorth3(betas[,j],alpha=alpha) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are (int,1,...,1) gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} prpiplot2<- function(ESP, y, d=2, B = 1000, alpha=0.05) {# Makes response plot for Poisson regression with PIs added. #0 = data, x = PI #tdata <- as.data.frame(cbind(x,y)) #out <- glm(y~., family=poisson, data=tdata) #x<-as.matrix(x); d<-dim(x)[2]+1 #d=no. of predictors #ESP <- predict(out) Y <- as.vector(y) n <- length(y) low<-1:n up<-low cov<-0 plot(ESP, Y) abline(mean(Y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, Y), type = "s") for(i in 1:n){ ydat <- rpois(B,lambda=exp(ESP[i])) tem <- mshpi(yf=y[i],ydat=ydat,n=n,d=d,alph=alpha) low[i] <- tem$low up[i] <- tem$up cov<-cov+tem$inr } points(ESP,low,pch=4) points(ESP,up,pch=4) title("Response Plot") cov<-cov/n list(cov=cov) } prpisim2<-function(n = 100, p = 4, k = 1, nruns = 100, psi = 0.0, J = 5, B=1000, int=1, a = 1, alpha = 0.05){ #Needs library(leaps), library(MASS), library(mgcv), and library(glmnet). #Calls shpi and mshpi. Adds backwards elimination to prpisim. #The GAM PR PI is only computed if p = 4. #Use 1 <= k <= p-1, where k is the number of nonnoise variables. #Simulates the Olive et al. (2018) PI for Poisson regression. 
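#In each run the fitted model gives ESP at xf, B values ydat ~ Pois(exp(ESP))
#are generated, and shpi or mshpi forms a shorth type PI for yf from ydat; the
#picov outputs are the fraction of runs with yf in its PI, and the pimnlen
#outputs are the mean PI lengths.
#Hypothetical usage sketch (slow for large nruns or B; illustrative only):
#prpisim2(n = 100, p = 4, k = 1, nruns = 100, B = 1000)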
#PIs for full model, lasso, relaxed lasso, backward elimination # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want exp(int+3a) reasonable # with int + 3a <=10, int > 0 and int -3a > -4. set.seed(974) ##need p>2 fullpilen <- 1:nruns fullpicov <- 0 gampilen <- 1:nruns gampicov <- 0 ohfspilen <- 1:nruns ohfspicov <- 0 laspilen <- 1:nruns laspicov <- 0 RLpilen <- 1:nruns RLpicov <- 0 vspilen <- 1:nruns vspicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2)) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) nc <- ceiling(n/J)-1 nc <- min(nc,q) nc <- max(nc,1) #the maximum number of variables to use for forward selection zz<-1:nc vmax <- min(p,as.integer(n/5))#max no. of variables to use for lasso dd <- 1:nruns ddbe <- dd dRL <- dd #relaxed lasso and lasso have the same d for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- val* x %*% A xf <- val* rnorm(q) %*% A SP <- int + x%*%b #SP_i ~ N(int,a^2) y <- rpois(n,lambda=exp(SP)) yf <- rpois(1,lambda=exp(int + xf%*%b)) tdata <- as.data.frame(cbind(x,y)) yn <- c(y,yf) xn <- rbind(x,xf) tdat <- as.data.frame(cbind(xn,yn)) #make a PR data set #get full model PR PI if(n >= 5*p){ out <- glm(y~., family=poisson, data=tdata) ESP <- predict(out,newdata = tdat[n+1,]) ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) #tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) fullpilen[i] <- tem$up - tem$low fullpicov <- fullpicov + tem$inr } #get backward elimination PR PI, could use mshpi if(n >= 5*p){ varnames <- names(out$coef)[-1] outbe <- step(out,trace=0) vinnames <- names(outbe$coef)[-1] vin <- varnames %in% vinnames ddbe[i]<-length(outbe$coef) pp<-ddbe[i] ESP <- outbe$coef[1] + xf[vin] %*% outbe$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) #tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) vspilen[i] <- tem$up - tem$low vspicov <- vspicov + tem$inr } #get full model GAM PR PI if p = 4, GAM needs a formula if(n >= 5*p && p == 4){ x1 <- x[,1];x2 <- x[,2];x3 <- x[,3] out <- gam(y ~ s(x1) + s(x2) + s(x3),family=poisson) ESP <- predict.gam(out,newdata=data.frame(x1=xf[1],x2=xf[2],x3=xf[3])) ydat <- rpois(B,lambda=exp(ESP)) tem <- shpi(yf=yf,ydat=ydat,alph=alpha) #tem <- mshpi(yf=yf,ydat=ydat,n,d=p,alph=alpha) gampilen[i] <- tem$up - tem$low gampicov <- gampicov + tem$inr } #get lasso GLM PI out<-cv.glmnet(x,y,family="poisson") lam <- out$lambda.min ESP <- predict(out,s=lam,newx=xf) #now get d lcoef <- predict(out,type="coefficients",s=lam) lcoef<-as.vector(lcoef)[-1] vin <- vars[lcoef!=0] pp <- length(vin)+1 #d = pp ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) laspilen[i] <- tem$up - tem$low laspicov <- laspicov + tem$inr #get relaxed lasso GLM PI xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) dRL[i]<- pp #want these to be near but >= k+1 ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- 
rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) RLpilen[i] <- tem$up - tem$low RLpicov <- RLpicov + tem$inr #get Olive and Hawkins forward selection PI if(n >= 5*p){ temp<-regsubsets(x,y,nvmax=nc,method="forward") out<-summary(temp) mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]] xsub <- x[,vin] sub <- glm(y~., family=poisson, data=data.frame(cbind(xsub,y))) dd[i]<-length(sub$coef)#want these to be near but >= k+1 pp<-dd[i] ESP <- sub$coef[1] + xf[vin] %*% sub$coef[-1] ydat <- rpois(B,lambda=exp(ESP)) tem <- mshpi(yf=yf,ydat=ydat,n,d=pp,alph=alpha) ohfspilen[i] <- tem$up - tem$low ohfspicov <- ohfspicov + tem$inr } } fullpimnlen <- mean(fullpilen) fullpicov <- fullpicov/nruns gampimnlen <- mean(gampilen) gampicov <- gampicov/nruns laspimnlen <- mean(laspilen) laspicov <- laspicov/nruns RLpimnlen <- mean(RLpilen) RLpicov <- RLpicov/nruns ohfspimnlen <- mean(ohfspilen) ohfspicov <- ohfspicov/nruns vspimnlen <- mean(vspilen) vspicov <- vspicov/nruns mndd <- mean(dd) mndRL <- mean(dRL) mnddbe <- mean(ddbe) #relaxed lasso and lasso have the same d list(mndRL=mndRL,mndd=mndd,mnddbe=mnddbe,int=int,b=b,fullpicov=fullpicov, fullpimnlen=fullpimnlen,gampicov=gampicov,gampimnlen=gampimnlen, laspicov=laspicov, laspimnlen=laspimnlen, RLpicov=RLpicov, RLpimnlen=RLpimnlen,ohfspicov=ohfspicov,ohfspimnlen=ohfspimnlen, vspicov=vspicov,vspimnlen=vspimnlen)} prplot<- function(x, y) {# Makes response plot, OD plot, weighted forward response # and residual plots for Poisson regression. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # #If q is changed, change the formula in the glm statement. #q <- 5 # change formula to x[,1]+ ... + x[,q] with q #out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson) out <- glm(y~., family=poisson, data=data.frame(cbind(x,y))) x<-as.matrix(x) ESP <- predict(out) par(mfrow = c(2, 2)) Y <- y plot(ESP, Y) abline(mean(Y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, Y), type = "s") title("a) Response Plot") Vhat <- (Y - Ehat)^2 plot(Ehat, Vhat) abline(0, 1) abline(0, 4) #lines(lowess(Ehat, Vhat)) #abline(lsfit(Ehat, Vhat)$coef) title("b) OD Plot") Z <- Y Z[Y < 1] <- Z[Y < 1] + 0.5 WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1]) WFIT <- sqrt(Z) * log(Z) - WRES plot(WFIT, sqrt(Z) * log(Z)) abline(0, 1) #abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef) title("c) WFRP") plot(WFIT, WRES) title("d) Wtd Residual Plot") par(mfrow = c(1, 1)) } prplot2 <- function(ESP, x, y) {# Makes response plot and OD plot for Poisson regression. #Can be used for a Poisson GAM or GLM. # If t<-1:13 # y<-c(12,14,33,50,67,74,123,141,165,204,253,246,240) # If out <- glm(y~t+I(t^2),family=poisson) # use ESP <- predict(out). # If outgam <- gam(y~s(t),poisson) # use ESP <- predict.gam(outgam). # Workstation: need to activate a graphics # device with command "X11()" or "motif()."
#WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1])#for a GLM x<-as.matrix(x) par(mfrow = c(2, 2)) Y <- y plot(ESP, Y) abline(mean(Y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, Y), type = "s") title("a) Response Plot") Vhat <- (Y - Ehat)^2 plot(Ehat, Vhat) abline(0, 1) abline(0, 4) #lines(lowess(Ehat, Vhat)) #abline(lsfit(Ehat, Vhat)$coef) title("b) OD Plot") Z <- Y Z[Y < 1] <- Z[Y < 1] + 0.5 WRES <- sqrt(Z) * (log(Z) - ESP) WFIT <- sqrt(Z) * log(Z) - WRES plot(WFIT, sqrt(Z) * log(Z)) abline(0, 1) #abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef) title("c) WFRP") plot(WFIT, WRES) title("d) Wtd Residual Plot") par(mfrow = c(1, 1)) } raysim<- function(n = 100, mu = 1, sigma = 1, iter = 100, runs = 100, alpha = 0.05) { #For simulation of Rayleigh MLEs. # Note that mum and sigm are the initial estimators. muhat <- 1:runs sighat <- muhat mum <- muhat sigm <- muhat muo <- muhat muo2 <- muo sigo <- muhat sigo2 <- sigo mlow <- 1:runs mup <- mlow slow <- mlow sup <- mlow mcov <- 0 scov <- 0 lcut <- qnorm(1 - alpha/2) for(i in 1:runs) { w <- 2 * sigma^2 * rexp(n) y <- sqrt(w) + mu shat <- sqrt(var(y)/0.429204) mhat <- mean(y) - 1.253314 * shat mum[i] <- min(mhat, (2 * min(y) - mhat)) sigm[i] <- sqrt(sum((y - mum[i])^2)/(2 * n)) muold <- mum[i] sigold <- sigm[i] #use Newton's method to find the MLE for(j in 1:iter) { g1 <- - sum((y - muold)^(-1)) + sum(y - muold)/sigold^2 g2 <- (-2 * n)/sigold + sum((y - muold)^2)/sigold^3 d11 <- - sum((y - muold)^(-2)) - n/sigold^2 d12 <- (-2 * sum((y - muold)))/sigold^3 d21 <- d12 d22 <- (2 * n)/sigold^2 - (3 * sum((y - muold)^2))/sigold^4 D <- rbind(c(d11, d12), c(d21, d22)) told <- cbind(muold, sigold) g <- cbind(g1, g2) tnew <- t(told) - solve(D) %*% t(g) muold <- tnew[1] sigold <- tnew[2] } muhat[i] <- tnew[1] #or try muhat[i] <- min(tnew[1], (2 * min(y) - tnew[1])) sighat[i] <- tnew[2] #or try sighat[i] <- sqrt(sum((y - muhat[i])^2)/(2 * n)) muo[i] <- told[1] muo2[i] <- tnew[1] - told[1] sigo[i] <- told[2] sigo2[i] <- tnew[2] - told[2] #get CIs for mu and sigma d11 <- - sum((y - muhat[i])^(-2)) - n/sighat[i]^2 d12 <- (-2 * sum((y - muhat[i])))/sighat[i]^3 d21 <- d12 d22 <- (2 * n)/sighat[i]^2 - (3 * sum((y - muhat[i])^2))/sighat[i]^4 D <- rbind(c(d11, d12), c(d21, d22)) C <- solve(D) tem <- lcut * sqrt( - C[1, 1]) mlow[i] <- muhat[i] - tem mup[i] <- muhat[i] + tem if(mlow[i] < mu && mup[i] > mu) mcov <- mcov + 1 tem <- lcut * sqrt( - C[2, 2]) slow[i] <- sighat[i] - tem sup[i] <- sighat[i] + tem if(slow[i] < sigma && sup[i] > sigma) scov <- scov + 1 } mummn <- mean(mum) sdm <- sqrt(n * var(mum)) sigmmn <- mean(sigm) sds <- sqrt(n * var(sigm)) muhatmn <- mean(muhat) sdmuhat <- sqrt(n * var(muhat)) sighatmn <- mean(sighat) sdsighat <- sqrt(n * var(sighat)) muconv <- max(abs(muhat - muo)) muconv2 <- max(abs(muo2)) sigconv <- max(abs(sighat - sigo)) sigconv2 <- max(abs(sigo2)) mcov <- mcov/runs smlen <- sqrt(n) * mean(mup - mlow) scov <- scov/runs sslen <- sqrt(n) * mean(sup - slow) list(muconv = muconv, muconv2 = muconv2, sigconv = sigconv, sigconv2 = sigconv2, mummn = mummn, sdm = sdm, sigmmn = sigmmn, sds = sds, muhatmn = muhatmn, sdmuhat = sdmuhat, sighatmn = sighatmn, sdsighat = sdsighat, mcov = mcov, smlen = smlen, scov = scov, sslen = sslen) } regboot<-function(x,y, B = 1000){ #bootstrap with residuals for MLR #asymptotically the same as MSE (X'X)^(-1), so use to compare vselboot #out <- regboot(belx,bely) #ddplot4(out$betas) #this does not work for vselboot: too many 0s #mplot(out$betas) #cov(out$betas)
full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } list(betas=betas) } regbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Simulates residual bootstrap for regression. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} regbootsim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){ #Simulates residual bootstrap for regression (full model). #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
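#Each bootstrap sample is yb = fit + sample(res, n, replace = T), refit by
#least squares, so out$betas holds the residual bootstrap of the OLS estimator
#(see regboot above); cicov[p+1] is the coverage of the prediction region test.
#Hypothetical usage sketch (default arguments; illustrative only):
#regbootsim2(n = 100, p = 4, k = 1, nruns = 100, BB = 1000)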
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y,B=BB) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} regbootsim3<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){ ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates residual bootstrap for regression (full model). #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y,B=BB) #residual bootstrap bhat <- lsfit(x,y)$coef #bhat_OLS for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} rmregboot<-function(x,y, B = 1000){ #bootstrap with residuals for MLR using rmreg3 #out <- rmregboot(belx,bely) #ddplot4(out$betas); ddplot5(out$betas) #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- rmreg3(x,yb)$Bhat } list(betas=betas) } rmregbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Very Slow. #Simulates residual bootstrap for rmreg3 with m = 1. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
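#rmregboot refits each bootstrap sample with the robust estimator rmreg3
#instead of lsfit, which is why this simulation is much slower than regbootsim.
#Hypothetical usage sketch (keep nruns small; illustrative only):
#rmregbootsim(n = 100, p = 4, nruns = 10)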
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones, p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-rmregboot(x,y) #residual bootstrap using rmreg3 for (j in 1:p){ tem <- shorth2(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} rmreg2<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multivariate linear regression based on RMVN. # Here x contains the nontrivial predictors. # Advance the plot by highlighting Stop with the right mouse button. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= 
up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the multivariate regression after ellipsoidal trimming for(i in 1:m){ out <- lsfit(xw,yw[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } for(i in 1:m){ plot(fit[,i],y[,i]) abline(0, 1) title("Response Plot") identify(fit[,i],y[,i]) plot(fit[,i], res[,i]) title("Residual Plot") identify(fit[,i], res[,i]) } par(mfrow = c(1, 1)) par(mar=cmar) list(fit = fit, res = res, Bhat = Bhat) } rmreg3<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multiple linear regression based on RMVN. # Here x contains the nontrivial predictors. # This is rmreg2 except plots are not made. # Used by hbreg. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the multivariate regression after ellipsoidal trimming for(i in 1:m){ out <- lsfit(xw,yw[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } list(fit = fit, res = res, Bhat = Bhat) } rowboot<-function(x,y, B = 1000){ #rowwise nonparametric bootstrap #use if (y^T, x^T) are iid from some distribution #out <- rowboot(belx,bely) #ddplot4(out$betas) #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 indx <- 1:n betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ tem <- sample(indx,n,replace=T) betas[i,] <- lsfit(x[tem,],y[tem])$coef } list(betas=betas,full=full) } Rsqboot<-function(x, y, B = 1000){ #bootstraps the coefficient of determination R^2 #the residual bootstrap is used n <- length(y) x <- as.matrix(x) Rsqs <- 1:B full <- lsfit(x,y) res <- 
full$resid fit <- y - res for(i in 1:B){ yb <- fit + sample(res,n,replace=T) ybfit <- yb - lsfit(x,yb)$resid Rsqs[i] <- (cor(yb,ybfit))^2 } list(Rsqs=Rsqs) } Rsqbootsim<-function(n = 100, p = 4, BB=1000, nruns = 100, type = 1, psi = 0.0, dd=3, eps = 0.1, cc=0.0, shift = 9.0, alph = 0.05){ #Simulates bootstrap for R^2. # Use 0 <= psi < 1. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients # want p > 2 so pop R^2 formula is correct # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(p-3)psi^2]/[1 + (p-2)psi^2], i not = j, p > 2 # when the correlation exists. #Makes the shorth CI [Ln,Un], the lower CI [0,U] and the upper CI [L,1] #Only the lower CI should work well if b = 0. cicov <- 0 avelen <- 0 ucicov <- 0 uavelen <- 0 lcicov <- 0 lavelen <- 0 q <- p-1 b <- 0 * 1:q + cc #b = 0 means R^2 -> 0 as n -> oo #Y = 1 + cc x2 + ... + cc xp + e = a + b^T u + e, b = (cc, ..., cc)^T rho <- (2*psi + (p-3)*psi^2)/(1 + (p-2)*psi^2) sigLsq <- cc^2*(p-1 + 2*rho*p*(p-1)/2) sigesq <- 1 if(type == 2) sigesq <- dd/(dd-2) #var of t_dd errors, = 3 for dd = 3 if(type == 3) sigesq <- 1 if(type == 4) sigesq <- 1/3 if(type == 5) sigesq <- 1 - eps + eps*(1+shift)^2 #10.9 is the default poprsq <- sigLsq/(sigesq + sigLsq) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = dd) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } rsqs <-Rsqboot(x, y, B= BB)$Rsqs #residual bootstrap tem <- shorthLU(rsqs,alpha=alph) if(poprsq >= tem$shorth[1] && poprsq <= tem$shorth[2]) cicov <- cicov + 1 avelen <- avelen + tem$shorth[2] - tem$shorth[1] if(poprsq <= tem$right) lcicov <- lcicov + 1 lavelen <- lavelen + tem$right if(poprsq >= tem$left) ucicov <- ucicov + 1 uavelen <- uavelen + 1 - tem$left } cicov <- cicov/nruns avelen <- avelen/nruns lcicov <- lcicov/nruns lavelen <- lavelen/nruns ucicov <- ucicov/nruns uavelen <- uavelen/nruns list(rho=rho, sigesq=sigesq, sigLsq = sigLsq, poprsq = poprsq, cicov=cicov, avelen=avelen,lcicov=lcicov,lavelen=lavelen, ucicov=ucicov,uavelen=uavelen)} shorth<-function(y, alpha = 0.05){ # computes the shorth(c) interval [Ln,Un] for c = ceiling[n(1-alpha)]. n <- length(y) cc <- ceiling(n * (1 - alpha)) sy <- sort(y) rup <- sy[cc] rlow <- sy[1] olen <- rup - rlow if(cc < n){ for(j in (cc + 1):n){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sy[j] rlow <- sy[j - cc + 1] } } } list(Ln = rlow, Un = rup) } shorth3<-function(y, alpha = 0.05){ # computes the shorth(c) interval [Ln,Un] for c = cc.
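#(Below, cc = ceiling(n*(1 - alpha + 1.12*sqrt(alpha/n))), so the interval is
#slightly longer than the raw shorth to bring the coverage closer to nominal.)
#Hypothetical usage sketch: shorth3(rnorm(1000), alpha = 0.05)$shorth should
#approximate the shortest 95% interval for the N(0,1) data.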
#shorth lists Ln and Un using Frey's correction n <- length(y) cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n))) cc <- min(n,cc) sy <- sort(y) rup <- sy[cc] rlow <- sy[1] olen <- rup - rlow if(cc < n){ for(j in (cc + 1):n){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sy[j] rlow <- sy[j - cc + 1] } } } Ln <- rlow; Un <- rup list(shorth=c(Ln, Un)) } shorthLU<-function(y, alpha = 0.05){ # computes the shorth(c) interval [Ln,Un] for c = cc. #shorth lists Ln and Un using Frey's correction #also gets left endpoint and right endpoints for one sided lower and upper CIs #[left,infty) and (-infty,right]. #So left is a lower bound and right an upper bound for the parameter theta. n <- length(y) cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n))) cc <- min(n,cc) sy <- sort(y) rup <- sy[cc] rlow <- sy[1] olen <- rup - rlow if(cc < n){ for(j in (cc + 1):n){ zlen <- sy[j] - sy[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sy[j] rlow <- sy[j - cc + 1] } } } Ln <- rlow; Un <- rup left <- sy[n-cc+1] right <- sy[cc] list(shorth=c(Ln, Un),left=left,right=right) } tplot2<-function(x, y, type = 1){ #library(glmnet), library(pls), library(leaps) # For type = 1 want n >= Jp with J >= 5 and preferably J >= 10. # Use the rightmost mouse button to advance # the plot, and in R, highlight ``stop." # Values of y should be positive. # Assume the MLR model contains a constant, but # x is the design matrix without the vector of ones. # This function makes transformation plots from the MLR # of y^L on x where L = -1, -0.5, -1/3, 0, 1/3, 0.5, or 1 # except log(Y) is used if L = 0. #type = 1 full model OLS, type = 2, elastic net, type = 3 lasso #type = 4 ridge regression, type = 5 PLS, type = 6 PCR #type = 7 forward selection using EBIC if n<10p and min Cp if n >=10p. #elastic net, lasso, ridge regression, PLS and PCR use 10 fold CV # If plot is linear for L, use y^L (or log(Y) for L = 0) instead # of Y. This is a graphical method for a response transform. 
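#Hypothetical usage sketch for tplot2 (simulated data; illustrative only):
#x <- matrix(rnorm(300), ncol = 3); y <- exp(1 + x %*% c(1,1,1) + rnorm(100))
#tplot2(x, y, type = 1) #the LOG(Z) panel should track the identity line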
x<- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 nc <- ceiling(n/5)-1 nc <- min(nc,q) nc <- max(nc,1) vars <- as.vector(1:(p-1)) lad <- c(-1, -1/2, -1/3, 0, 1/3, 1/2, 1) xl <- c("1/Z", "1/sqrt(Z)", "Z**(-1/3)", "LOG(Z)", "Z**(1/3)", "sqrt(Z)", "Z") for(i in 1:length(lad)) { if(lad[i] == 0) ytem <- log(y) else ytem <- y^lad[i] if(type ==1) #full model OLS fit <- ytem - lsfit(x,ytem)$resid if(type == 2) #elastic net with CV fit <- enet2(x,ytem)$yhat if(type==3){out <- cv.glmnet(x,ytem,alpha=1) #lasso lam <- out$lambda.min fit <- predict(out,s=lam,newx=x)} if(type==4){out <- cv.glmnet(x,ytem,alpha=0) #ridge regression lam <- out$lambda.min fit <- predict(out,s=lam,newx=x)} if(type==5){w <- as.data.frame(cbind(ytem,x)) out<-plsr(ytem~.,data=w,scale=T,validation="CV") tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <-max(which.min(cvmse)-1,1) fit<-ytem-out$residuals[,,nc]} if(type==6){w <- as.data.frame(cbind(ytem,x)) out<-pcr(ytem~.,data=w,scale=T,validation="CV") tem<-MSEP(out) cvmse<-tem$val[,,1:(out$ncomp+1)][1,] nc <-max(which.min(cvmse)-1,1) fit<-ytem-out$residuals[,,nc]} if(type==7){temp<-regsubsets(x,ytem,nvmax=nc,method="forward") out<-summary(temp) if(n < 10*p) { xx <- 1:min(length(out$bic),p-1)+1 ebic <- out$bic+2*(-lgamma(xx+1)-lgamma(p-xx+1)) minebic <- out$which[ebic==min(ebic),] #do not need the constant in vin vin <- vars[minebic[-1]] } else { mincp <- out$which[out$cp==min(out$cp),] #do not need the constant in vin vin <- vars[mincp[-1]]} sub <- lsfit(x[,vin],ytem) fit <- ytem - sub$resid } plot(fit, ytem, xlab = "TZHAT", ylab = xl[i]) abline(0, 1) if(type==1) title("Full OLS") if(type==2) title("Elastic Net") if(type==3) title("Lasso") if(type==4) title("Ridge Regression") if(type==5) title("PLS") if(type==6) title("PCR") if(type==7) title("Forward Selection") identify(fit, ytem) } } tvreg<-function(x, Y, ii = 1, type = 4){ # Trimmed views (TV) regression for 0, 10, ..., 90 percent # trimming. Use ii = 10 for 0, ii = 9 for 10 then 0, ... # ii = 1 for 90 then 80 ... then 0 percent trimming. # Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(MASS)." # Advance the view with the right mouse button and # in R, highlight "stop." x <- as.matrix(x) if(type == 1) out <- cov.mcd(x) if(type == 2) out <- covmba(x) if (type == 3) out <- covfch(x) if(type > 3) out <- covrmvn(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef FIT <- x %*% bhat[-1] + bhat[1] plot(FIT, Y) abline(0, 1) title(labs[i]) identify(FIT, Y) print(bhat) } } tvreg2<-function(X, Y, M = 0, type = 4) {# Trimmed views regression for M percent trimming. # Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(MASS)." X <- as.matrix(X) if(type == 1) out <- cov.mcd(X) if(type == 2) out <- covmba(X) if(type == 3) out <- covfch(X) if(type > 3) out <- covrmvn(X) center <- out$center cov <- out$cov rd2 <- mahalanobis(X, center, cov) tem <- (100 - M)/100 val <- quantile(rd2, tem) bhat <- lsfit(X[rd2 <= val, ], Y[rd2 <= val])$coef FIT <- X %*% bhat[-1] + bhat[1] plot(FIT, Y) abline(0, 1) identify(FIT, Y) list(coef = bhat) } varcisim<-function(n = 100, nruns = 1000, alpha = 0.05, type = 1) {#Simulates 100(1-alpha)% CI for the variance sigsq.
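#Below, tauhat estimates E(Y - mu)^4/sigma^4 - 1, the parameter in the
#asymptotic variance of S^2, and the (n - const) and (n + const) numerator
#shifts appear to be calibrated by tauhat to keep the coverage near nominal.
#Hypothetical usage sketch: varcisim(n = 100, nruns = 1000, type = 3) #lognormal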
vcov <- 0 vlow <- 1:nruns vup <- vlow cut <- qt(1 - alpha/2, df = n - 1) sigsqln <- exp(1) * (exp(1) - 1) tav <- vcov for(i in 1:nruns) { if(type == 1) { y <- rnorm(n) sigsq <- 1 } if(type == 2) { y <- rexp(n) sigsq <- 1 } if(type == 3) { y <- exp(rnorm(n)) sigsq <- sigsqln } if(type == 4) { y <- runif(n) sigsq <- 1/12 } if(type == 5) { y <- rbinom(n, 1, 0.5) sigsq <- 0.25 } if(type == 6) { y <- rpois(n, 1) sigsq <- 1 } ssq <- var(y) tem <- (y - mean(y))^4 tem <- mean(tem) tauhat <- tem/(ssq^2) - 1 if(tauhat < 0) tauhat <- 0 val <- sqrt(n * tauhat) * cut if(tauhat < 2) { numl <- (n - 2) * ssq numu <- (n + 1) * ssq } if(tauhat >= 2 && tauhat < 4) { numl <- (n - 5) * ssq numu <- (n + 5) * ssq } if(tauhat >= 4 && tauhat < 5) { numl <- (n - 20) * ssq numu <- (n + 25) * ssq } if(tauhat >= 5 && tauhat < 6) { numl <- (n - 20) * ssq numu <- (n + 50) * ssq } if(tauhat >= 6 && tauhat < 9) { numl <- (n - 20) * ssq numu <- (n + 55) * ssq } if(tauhat >= 9) { numl <- (n - 25) * ssq numu <- (n + 55) * ssq } if(n - val < 0) { vup[i] <- 2 * max(ssq, sqrt(ssq)) } else vup[i] <- numu/(n - val) vlow[i] <- numl/(n + val) if(vlow[i] < sigsq && vup[i] > sigsq) vcov <- vcov + 1 tav[i] <- tauhat } vcov <- vcov/nruns vlen <- sqrt(n) * mean(vup - vlow) lows <- sum(vlow > sigsq)/nruns ups <- sum(vup < sigsq)/nruns tauave <- mean(tav) #could list zz <- sum(tav > 6)/nruns list(lows = lows, ups = ups, tauave = tauave, vcov = vcov, vlen = vlen) } vsbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Needs library(leaps). For all subsets variable selection so very slow. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for variable selection using all subsets variable #selection. So need p small. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-vselboot(x,y) #bootstrap the minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim2<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Needs library(leaps). No psi. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. 
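#(A variable never selected gives a constant zero column in betas, so var(betas)
#can be singular and the mahalanobis step in predreg can fail; this appears to
#be the failure mode warned about above.)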
#Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim3<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). Has psi and uses the prediction region method. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so there are zeroes in the model, k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y,B=BB) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim4<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi = 0.0, BB=1000, alph = 0.05){ #Needs library(leaps). Has psi and used 3 confidence regions. #Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model k is the number of nonnoise variables #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y,B=BB) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- out$bhatimin0[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- out$bhatimin0[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbrbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, m = 10, B=1000, int=0, a = 5/3, alpha = 0.05){ ##Needs library(MASS). ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates parametric bootstrap for binomial regression with backward #elimination. # constant = 1 so there are p = q+1 coefficients #1 <= k < p-1 so zeroes are in the model, k is the number of nonnoise variables #need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. SP~N(int,a^2). Want # with int + 3a <=5, int -3a >= -5. # If m=1, a = 4/3 may work better. 
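#Each bootstrap response is ydat ~ Binomial(m, exp(ESP)/(1 + exp(ESP))) from
#the full model ESP, and step() backward elimination is rerun on each resample
#to bootstrap the backward elimination estimator bhatImin.
#Hypothetical usage sketch (small nruns and B for speed; illustrative only):
#vsbrbootsim(n = 100, p = 4, k = 1, nruns = 10, m = 10, B = 200)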
vsbrbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, m = 10,
  B=1000, int=0, a = 5/3, alpha = 0.05){
##Needs library(MASS).
##Gets CIs and does tests with the pred reg, hybrid, and Bickel and Ren methods.
#Simulates the parametric bootstrap for binomial regression with backward
#elimination.
# constant = 1 so there are p = q+1 coefficients
#1 <= k < p-1 so zeroes are in the model; k is the number of nonnoise variables
#need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists. SP ~ N(int,a^2). Want SP in a reasonable range,
# with int + 3a <= 5 and int - 3a >= -5.
# If m=1, a = 4/3 may work better.
#set.seed(974)  ##need p > 2 and want n >= 5p
q <- p-1
pp1 <- p+1; pp2 <- p+2; pp3 <- p+3; pp4 <- p+4; pp5 <- p+5; pp6 <- p+6
cicov <- 0*(1:pp6)
avelen <- 0*(1:pp6)
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2))
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1  #b[1:0] acts like b[1:1] = b[1]
mv <- 0*1:n + m
beta <- c(int,b)
one <- as.vector(0*1:(k+1) + 1)
one[1] <- int
zero <- 0 * 1:p
dd <- 1:nruns
ddboot <- 1:B
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- val * x %*% A
  SP <- int + x %*% b  #SP_i ~ N(int,a^2)
  y <- rbinom(n,size=m,prob=(exp(SP)/(1+exp(SP))))
  ny <- mv - y  #ny[i] = mv[i] - y[i] = no. of "failures"
  tdata <- as.data.frame(cbind(x,y))  #make a BR data set
  out <- glm(cbind(y,ny)~., family=binomial, data=tdata)
  ESP <- predict(out,newdata = tdata)
  varnames <- names(out$coef)
  outbe <- step(out,trace=0)  #backward elimination
  dd[i] <- length(outbe$coef)
  vinnames <- names(outbe$coef)
  vin <- varnames %in% vinnames
  bhat <- zero
  bhat[vin] <- outbe$coef  #bhatImin
  betas <- matrix(0,nrow=B,ncol=p)
  for(ii in 1:B){  #ii: do not clobber the outer loop index i
    ydat <- rbinom(n,size=m,prob=(exp(ESP)/(1+exp(ESP))))
    nydat <- mv - ydat
    tdat <- as.data.frame(cbind(x,ydat))
    temp <- glm(cbind(ydat,nydat)~., family=binomial, data=tdat)
    outbe <- step(temp,trace=0)  #backward elimination
    ddboot[ii] <- length(outbe$coef)
    vinnames <- names(outbe$coef)
    vin <- varnames %in% vinnames
    bhatimin <- zero
    bhatimin[vin] <- outbe$coef
    betas[ii,] <- bhatimin
  }
  for (j in 1:p){
    tem <- shorth3(betas[,j],alpha=alpha)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
      cicov[j] <- cicov[j] + 1
    avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
  }
  #test whether the last p-k-1 values of beta are 0
  gg <- p - k - 1
  tstat <- bhat[(k+2):p]
  tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha)
  if(tem$D0 <= tem$cuplim)  #pred. reg. method
    cicov[pp1] <- cicov[pp1] + 1
  avelen[pp1] <- avelen[pp1] + tem$cuplim
  if(tem$br0 <= tem$cuplim)  #hybrid method
    cicov[pp2] <- cicov[pp2] + 1
  avelen[pp2] <- avelen[pp1]  #same cutoff so same length
  if(tem$br0 <= tem$brlim)  #Bickel and Ren method
    cicov[pp3] <- cicov[pp3] + 1
  avelen[pp3] <- avelen[pp3] + tem$brlim
  #test whether the first k+1 values of beta are (int,1,...,1)
  gg <- k + 1
  tstat <- bhat[1:(k+1)]
  tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha)
  D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
  if(D0 <= tem$cuplim)  #pred. reg. method
    cicov[pp4] <- cicov[pp4] + 1
  avelen[pp4] <- avelen[pp4] + tem$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
  if(D1 <= tem$cuplim)  #hybrid method
    cicov[pp5] <- cicov[pp5] + 1
  avelen[pp5] <- avelen[pp4]  #same cutoff so same length
  if(D1 <= tem$brlim)  #Bickel and Ren method
    cicov[pp6] <- cicov[pp6] + 1
  avelen[pp6] <- avelen[pp6] + tem$brlim
}
cicov <- cicov/nruns
avelen <- avelen/nruns
mndd <- mean(dd)
mnddboot <- mean(ddboot)  #for the last bootstrap sample
list(mndd=mndd,mnddboot=mnddboot,cicov=cicov,avelen=avelen,beta=beta,k=k)}
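#Example (an illustrative sketch): assuming library(MASS) and the helpers
#shorth3 and confreg are available, a binomial regression run with m = 10
#trials per case is
# out <- vsbrbootsim(n=100, p=4, k=1, nruns=100, m=10, B=1000, int=0, a=5/3)
# out$mndd  #average number of coefficients kept by backward elimination
#Since step() is called B+1 times per run, this simulation can be slow.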
vscisim<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9,
  type = 1, psi = 0.0, BB=1000, alph = 0.05){
#Needs library(leaps). Has psi and uses 3 confidence regions for the CIs.
#Gets CIs for beta_i with the shorth, pred reg, and Bickel and Ren methods.
#Simulates the bootstrap for forward selection variable selection.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
# constant = 1 so there are p = q+1 coefficients
#1 <= k <= p-1, k is the number of nonnoise variables
#Need k < p-1 so there are some zeroes in the model.
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists.
q <- p-1
b <- 0 * 1:q
b[1:k] <- 1  #b[1:0] acts like b[1:1] = b[1]
beta <- c(1,b)
scicov <- 0*(1:p)
savelen <- scicov
prcicov <- scicov
pravelen <- scicov
brcicov <- scicov
bravelen <- scicov
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
one <- as.vector(0*1:(k+1) + 1)
up <- min((1 - alph/2), (1 - alph + 10*alph/BB))
if(alph > 0.1) up <- min((1 - alph + 0.05), (1 - alph + 1/BB))
qB <- up
if(qB < 1 - alph + 0.001) up <- 1 - alph
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n) }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3) }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1 }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1) }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err }
  #make an MLR data set
  out <- fselboot(x,y,B=BB)  #bootstrap the forward sel minimum Cp model
  for (j in 1:p){
    #get shorth CIs
    tem <- shorth3(out$betas[,j],alpha=alph)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
      scicov[j] <- scicov[j] + 1
    savelen[j] <- savelen[j] + tem$shorth[2] - tem$shorth[1]
    #get prediction region CIs
    tem <- betaci(t=out$betas[,j],bhat=out$bhatimin0[j],up=up)
    if(beta[j] >= tem$prci[1] && beta[j] <= tem$prci[2])
      prcicov[j] <- prcicov[j] + 1
    pravelen[j] <- pravelen[j] + tem$prcilen
    #get Bickel and Ren CIs
    if(beta[j] >= tem$brci[1] && beta[j] <= tem$brci[2])
      brcicov[j] <- brcicov[j] + 1
    bravelen[j] <- bravelen[j] + tem$brcilen
  }
}
scicov <- scicov/nruns
savelen <- savelen/nruns
prcicov <- prcicov/nruns
pravelen <- pravelen/nruns
brcicov <- brcicov/nruns
bravelen <- bravelen/nruns
list(scicov=scicov,savelen=savelen,prcicov=prcicov,pravelen=pravelen,
  brcicov=brcicov,bravelen=bravelen,beta=beta,k=k)}

vselboot<-function(x,y,B = 1000){
#needs library(leaps)
#need n and p small, 2 < p < 30
#all subsets minimum Cp regression
#Does not make sense to do variable selection if there
#is only one nontrivial predictor.
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
vars <- as.vector(1:(p-1))
#get the full model
full <- lsfit(x,y)
res <- full$resid
fit <- y - res
#get the minimum Cp submodel
out <- leaps(x,y)
mincp <- out$which[out$Cp==min(out$Cp),]  #select the row of the which matrix
vin <- vars[mincp]
sub <- lsfit(x[,vin],y)
betas <- matrix(0,nrow=B,ncol=p)
#bootstrap the minimum Cp submodel
for(i in 1:B){
  yb <- fit + sample(res,n,replace=T)
  out <- leaps(x,y=yb)
  mincp <- out$which[out$Cp==min(out$Cp),]
  vin <- vars[mincp]
  indx <- c(1,1+vin)
  betas[i,indx] <- lsfit(x[,vin],yb)$coef
}
list(full=full,sub=sub,betas=betas) }
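#Example for vscisim (an illustrative sketch): assuming library(leaps) and
#the helpers fselboot, shorth3, and betaci are available,
# out <- vscisim(n=100, p=4, k=1, nruns=100, type=1, BB=1000)
#compares the shorth (scicov), prediction region (prcicov), and Bickel and
#Ren (brcicov) CI coverages for each beta_j.
#
#Example for vselboot (an illustrative sketch): with library(leaps),
# x <- matrix(rnorm(300), nrow=100, ncol=3)
# y <- 1 + x[,1] + rnorm(100)
# out <- vselboot(x, y, B=1000)
# apply(out$betas, 2, mean)  #bootstrap means of the min Cp coefficients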
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
zero <- 0 * 1:p
bhat <- zero
tdata <- as.data.frame(cbind(x,y))
if(bin==T) mv <- 0*1:n + 1
ny <- mv - y
out <- glm(cbind(y,ny)~., family=binomial, data=tdata)
ESP <- predict(out,newdata = tdata)
varnames <- names(out$coef)
outbe <- step(out,trace=0)  #backward elimination
vinnames <- names(outbe$coef)
vin <- varnames %in% vinnames
bhat[vin] <- outbe$coef  #bhatImin
betas <- matrix(0,nrow=B,ncol=p)
for(i in 1:B){
  ydat <- rbinom(n,size=mv,prob=(exp(ESP)/(1+exp(ESP))))  #size=mv: m_i trials
                                                          #(all 1 when bin=T)
  nydat <- mv - ydat
  tdat <- as.data.frame(cbind(x,ydat))
  temp <- glm(cbind(ydat,nydat)~., family=binomial, data=tdat)
  outbe <- step(temp,trace=0)  #backward elimination
  vinnames <- names(outbe$coef)
  vin <- varnames %in% vinnames
  bhatimin <- zero
  bhatimin[vin] <- outbe$coef
  betas[i,] <- bhatimin
}
list(bhatimin0=bhat,betas=betas) }

vsPRboot<-function(x,y,B = 1000){
#needs library(MASS), n > 5p, p > 2, want B >= 50p, takes a few minutes
#bootstraps the Poisson regression backward elimination model
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
zero <- 0 * 1:p
tdata <- as.data.frame(cbind(x,y))
out <- glm(y~., family=poisson, data=tdata)
ESP <- predict(out,newdata = tdata)
varnames <- names(out$coef)
outbe <- step(out,trace=0)  #backward elimination
vinnames <- names(outbe$coef)
vin <- varnames %in% vinnames
bhat <- zero
bhat[vin] <- outbe$coef  #bhatImin
betas <- matrix(0,nrow=B,ncol=p)
for(i in 1:B){
  ydat <- rpois(n,lambda=exp(ESP))
  tdat <- as.data.frame(cbind(x,ydat))
  temp <- glm(ydat~., family=poisson, data=tdat)
  outbe <- step(temp,trace=0)  #backward elimination
  vinnames <- names(outbe$coef)
  vin <- varnames %in% vinnames
  bhatimin <- zero
  bhatimin[vin] <- outbe$coef
  betas[i,] <- bhatimin
}
list(bhatimin0=bhat,betas=betas) }
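#Examples (illustrative sketches): both functions call step() B+1 times, so
#B = 1000 can take minutes. For binary logistic regression (bin=T),
# x <- matrix(rnorm(300), nrow=100, ncol=3)
# y <- rbinom(100, size=1, prob=exp(1+x[,1])/(1+exp(1+x[,1])))
# out <- vsLRboot(x, y, B=1000)
#For Poisson regression,
# y <- rpois(100, lambda=exp(1+x[,1]))
# out <- vsPRboot(x, y, B=1000)
#Both return bhatimin0, the selected-model coefficients padded with zeroes,
#and the B x p matrix betas of bootstrap coefficient vectors.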
vsprbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, psi=0.0, B=1000,
  int=1, a = 1, alpha = 0.05){
##Needs library(MASS).
##Gets CIs and does tests with the pred reg, hybrid, and Bickel and Ren methods.
#Simulates the parametric bootstrap for Poisson regression with backward
#elimination.
# constant = 1 so there are p = q+1 coefficients
#1 <= k < p-1 so zeroes are in the model; k is the number of nonnoise variables
#need p > 1, beta = (int, 1, ..., 1, 0, ..., 0) with int, k ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists. SP ~ N(int,a^2). Want exp(int+3a) reasonable,
# with int + 3a <= 10, int > 0 and int - 3a > -4.
q <- p-1
pp1 <- p+1; pp2 <- p+2; pp3 <- p+3; pp4 <- p+4; pp5 <- p+5; pp6 <- p+6
cicov <- 0*(1:pp6)
avelen <- 0*(1:pp6)
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
val <- a/sqrt(k*(1 + (q-1)*psi^2) + k*(k-1)*(2*psi + (q-2)*psi^2))
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1  #b[1:0] acts like b[1:1] = b[1]
beta <- c(int,b)
one <- as.vector(0*1:(k+1) + 1)
one[1] <- int
zero <- 0 * 1:p
dd <- 1:nruns
ddboot <- 1:B
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- val * x %*% A
  SP <- int + x %*% b  #SP_i ~ N(int,a^2)
  y <- rpois(n,lambda=exp(SP))
  tdata <- as.data.frame(cbind(x,y))  #make a PR data set
  out <- glm(y~., family=poisson, data=tdata)
  ESP <- predict(out,newdata = tdata)
  varnames <- names(out$coef)
  outbe <- step(out,trace=0)  #backward elimination
  dd[i] <- length(outbe$coef)
  vinnames <- names(outbe$coef)
  vin <- varnames %in% vinnames
  bhat <- zero
  bhat[vin] <- outbe$coef  #bhatImin
  betas <- matrix(0,nrow=B,ncol=p)
  for(ii in 1:B){  #ii: do not clobber the outer loop index i
    ydat <- rpois(n,lambda=exp(ESP))
    tdat <- as.data.frame(cbind(x,ydat))
    temp <- glm(ydat~., family=poisson, data=tdat)
    outbe <- step(temp,trace=0)  #backward elimination
    ddboot[ii] <- length(outbe$coef)
    vinnames <- names(outbe$coef)
    vin <- varnames %in% vinnames
    bhatimin <- zero
    bhatimin[vin] <- outbe$coef
    betas[ii,] <- bhatimin
  }
  for (j in 1:p){
    tem <- shorth3(betas[,j],alpha=alpha)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
      cicov[j] <- cicov[j] + 1
    avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
  }
  #test whether the last p-k-1 values of beta are 0
  gg <- p - k - 1
  tstat <- bhat[(k+2):p]
  tem <- confreg(betas[,(k+2):p],g=gg,that=tstat,alpha=alpha)
  if(tem$D0 <= tem$cuplim)  #pred. reg. method
    cicov[pp1] <- cicov[pp1] + 1
  avelen[pp1] <- avelen[pp1] + tem$cuplim
  if(tem$br0 <= tem$cuplim)  #hybrid method
    cicov[pp2] <- cicov[pp2] + 1
  avelen[pp2] <- avelen[pp1]  #same cutoff so same length
  if(tem$br0 <= tem$brlim)  #Bickel and Ren method
    cicov[pp3] <- cicov[pp3] + 1
  avelen[pp3] <- avelen[pp3] + tem$brlim
  #test whether the first k+1 values of beta are (int,1,...,1)
  gg <- k + 1
  tstat <- bhat[1:(k+1)]
  tem <- confreg(betas[,1:(k+1)],g=gg,that=tstat,alpha=alpha)
  D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
  if(D0 <= tem$cuplim)  #pred. reg. method
    cicov[pp4] <- cicov[pp4] + 1
  avelen[pp4] <- avelen[pp4] + tem$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
  if(D1 <= tem$cuplim)  #hybrid method
    cicov[pp5] <- cicov[pp5] + 1
  avelen[pp5] <- avelen[pp4]  #same cutoff so same length
  if(D1 <= tem$brlim)  #Bickel and Ren method
    cicov[pp6] <- cicov[pp6] + 1
  avelen[pp6] <- avelen[pp6] + tem$brlim
}
cicov <- cicov/nruns
avelen <- avelen/nruns
mndd <- mean(dd)
mnddboot <- mean(ddboot)  #for the last bootstrap sample
list(mndd=mndd,mnddboot=mnddboot,cicov=cicov,avelen=avelen,beta=beta,k=k)}
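#Example (an illustrative sketch): assuming library(MASS) and the helpers
#shorth3 and confreg are available,
# out <- vsprbootsim(n=100, p=4, k=1, nruns=100, B=1000, int=1, a=1)
#Here SP ~ N(1,1), so the Poisson means exp(SP) stay in a reasonable range.
# out$cicov; out$avelen; out$mndd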
wcisim<-function(n = 100, phi = 1, lam = 1, iter = 100, runs = 100,
  alpha = 0.05){
#Simulates Weibull CIs for phi and lambda.
#Note that phir and lamr are the robust initial estimators.
phihat <- 1:runs
lamhat <- phihat
lamr <- phihat
phir <- phihat
phio <- phihat
lamo <- phihat
plow <- 1:runs
pup <- plow
llow <- plow
lup <- plow
pcov <- 0
lcov <- 0
lcut <- qnorm(1 - alpha/2)
pcut <- sqrt(0.608/n) * qnorm(1 - alpha/2)
for(i in 1:runs) {
  weib <- (lam * rexp(n))^(1/phi)
  lw <- log(weib)
  tem <- mad(lw, constant = 1)
  phir[i] <- 0.767049/tem
  ahat <- median(lw) - log(log(2))/phir[i]
  lamr[i] <- exp(ahat * phir[i])
  #get the MLEs
  y <- weib
  suml <- sum(log(y))
  pold <- phir[i]
  lold <- lamr[i]
  for(j in 1:iter) {
    lnew <- mean(y^pold)
    lvo <- lold
    lold <- lnew
    den <- (1/lold) * sum(y^pold * log(y)) - suml
    pnew <- n/den
    pvo <- pold
    pold <- pnew
  }
  phihat[i] <- pnew
  lamhat[i] <- lnew
  phio[i] <- pvo
  lamo[i] <- lvo
  #get CI for phi
  plow[i] <- phihat[i] * (1 - pcut)
  pup[i] <- phihat[i] * (1 + pcut)
  if(plow[i] < phi && pup[i] > phi) pcov <- pcov + 1
  #get CI for lambda
  logl <- log(lamhat[i])
  tem <- 1 + 0.4635 * logl + 0.5824 * (logl)^2
  val <- lcut * lamhat[i] * sqrt((1.109 * tem)/n)
  llow[i] <- lamhat[i] - val
  lup[i] <- lamhat[i] + val
  if(llow[i] < lam && lup[i] > lam) lcov <- lcov + 1
}
mnphir <- mean(phir)
sdphir <- sqrt(var(phir))
mnlamr <- mean(lamr)
sdlamr <- sqrt(var(lamr))
pconv <- max(abs(phihat - phio))
lconv <- max(abs(lamhat - lamo))
mnphihat <- mean(phihat)
sdphihat <- sqrt(n * var(phihat))
mnlamhat <- mean(lamhat)
sdlamhat <- sqrt(n * var(lamhat))
pcov <- pcov/runs
lcov <- lcov/runs
phiasd <- phi * sqrt(0.608)
lamasd <- sqrt(1.109 * lam^2 * (1 + 0.4635 * log(lam) + 0.5282 * (log(lam))^2))
list(pconv = pconv, lconv = lconv, mnphir = mnphir, sdphir = sdphir,
  mnlamr = mnlamr, sdlamr = sdlamr, mnphihat = mnphihat, sdphihat = sdphihat,
  phiasd = phiasd, mnlamhat = mnlamhat, sdlamhat = sdlamhat, lamasd = lamasd,
  pcov = pcov, lcov = lcov) }

wildboot<-function(x,y,B=1000,slices=7){
#wild, nonparametric, and parametric bootstraps for WLS
#want n/slices > 13
n <- length(y)
x <- as.matrix(x)
p <- dim(x)[2]+1
full <- lsfit(x,y)
res <- full$resid
fit <- y - res  #ESP
indx <- sort.list(fit)
vars <- 1:n
val <- as.integer(n/slices)
for(i in 1:slices){
  lo <- (i-1)*val + 1
  hi <- i*val
  if(i == slices) hi <- n
  vars[indx[lo:hi]] <- var(res[indx[lo:hi]])
}
sds <- sqrt(n*vars/(n-p))
sres <- res * sqrt(n/(n-p))
betas <- matrix(0,nrow=B,ncol=p)
wbetas <- betas
pbetas <- betas
indx <- 1:n
for(i in 1:B){
  tem <- sample(indx,n,replace=T)
  betas[i,] <- lsfit(x[tem,],y[tem])$coef  #nonparametric bootstrap
  eps <- 2*rbinom(n,size=1,0.5) - 1  #wild bootstrap
  yb <- fit + eps*sres
  wbetas[i,] <- lsfit(x,yb)$coef
  yb <- fit + rnorm(n,mean=0,sd=sds)
  pbetas[i,] <- lsfit(x,yb)$coef  #parametric bootstrap
}
list(betas=betas,wbetas=wbetas,pbetas=pbetas,full=full) }
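#Example for wcisim (an illustrative sketch):
# out <- wcisim(n=100, phi=1, lam=1, iter=100, runs=100)
# out$pcov; out$lcov    #observed CI coverages, nominal 0.95
# out$pconv; out$lconv  #near 0 if the MLE iterations have converged
#
#Example for wildboot (an illustrative sketch):
# x <- matrix(rnorm(300), nrow=100, ncol=3)
# y <- 1 + x[,1] + abs(x[,1])*rnorm(100)  #heteroscedastic errors
# out <- wildboot(x, y, B=1000, slices=7)
#out$betas, out$wbetas, and out$pbetas hold the nonparametric, wild, and
#parametric bootstrap coefficient vectors.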
wlsbootsim<-function(n = 100, p = 4, k=1, nruns = 100, nslices=7, eps = 0.1,
  shift = 9, etype = 1, wtype=1, psi = 0.0, BB=200, alph = 0.05){
#calls confreg, shorth3, wildboot
#Simulates the WLS nonparametric, wild, and parametric bootstraps.
# Use 0 <= psi < 1. Want n/nslices > 13.
#Uses five iid error distributions:
# etype = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
# errors.
#wtype = 1 for iid (OLS) errors, 2 if err = abs(SP - 5) * e,
#3 if err = sqrt(1 + x_2^2) * e,
#4 if err = exp[1 + log(|x_2|) + ... + log(|x_p|)] * e,
#5 if err = [1 + log(|x_2|) + ... + log(|x_p|)] * e,
#6 if err = exp[(log(|x_2|) + ... + log(|x_p|))/q] * e,
#7 if err = [(log(|x_2|) + ... + log(|x_p|))/q] * e
# constant = 1 so there are p = q+1 coefficients
#1 <= k <= p-2, k is the number of nonnoise variables
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (p-3) psi^2]/[1 + (p-2) psi^2], i not = j, p > 2,
# when the correlation exists.
q <- p-1
b <- 0 * 1:q
dd <- b + 1
dq <- dd/q
b[1:k] <- 1  #b[1:0] acts like b[1:1] = b[1]
beta <- c(1,b)
pp1 <- p+1; pp2 <- p+2; pp3 <- p+3; pp4 <- p+4; pp5 <- p+5; pp6 <- p+6
cicov <- 0*(1:pp6)
avelen <- 0*(1:pp6)
wcicov <- cicov
wavelen <- avelen
pcicov <- cicov
pavelen <- avelen
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
one <- as.vector(0*1:(k+1) + 1)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  SP <- 1 + x %*% b
  if(etype == 1) err <- rnorm(n)
  if(etype == 2) err <- rt(n, df = 3)
  if(etype == 3) err <- rexp(n) - 1
  if(etype == 4) err <- runif(n, min = -1, max = 1)
  if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
  if(wtype == 2) err <- abs(SP - 5) * err
  if(wtype == 3) err <- sqrt(1 + x[,1]^2) * err
  if(wtype == 4) err <- exp(1 + log(abs(x)) %*% dd) * err
  if(wtype == 5) err <- (1 + log(abs(x)) %*% dd) * err
  if(wtype == 6) err <- exp(log(abs(x)) %*% dq) * err
  if(wtype == 7) err <- (log(abs(x)) %*% dq) * err
  #make an MLR data set
  y <- SP + err
  out <- wildboot(x,y,B=BB,slices=nslices)  #nonparametric, wild, and
                                            #parametric bootstraps
  for (j in 1:p){
    tem <- shorth3(out$betas[,j],alpha=alph)
    temw <- shorth3(out$wbetas[,j],alpha=alph)
    temp <- shorth3(out$pbetas[,j],alpha=alph)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
      cicov[j] <- cicov[j] + 1
    avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
    if(beta[j] >= temw$shorth[1] && beta[j] <= temw$shorth[2])
      wcicov[j] <- wcicov[j] + 1
    wavelen[j] <- wavelen[j] + temw$shorth[2] - temw$shorth[1]
    if(beta[j] >= temp$shorth[1] && beta[j] <= temp$shorth[2])
      pcicov[j] <- pcicov[j] + 1
    pavelen[j] <- pavelen[j] + temp$shorth[2] - temp$shorth[1]
  }
  #test whether the last p-k-1 values of beta are 0
  gg <- p - k - 1
  tstat <- out$full$coef[(k+2):p]
  tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph)
  if(tem$D0 <= tem$cuplim)  #pred. reg. method
    cicov[pp1] <- cicov[pp1] + 1
  avelen[pp1] <- avelen[pp1] + tem$cuplim
  if(tem$br0 <= tem$cuplim)  #hybrid method
    cicov[pp2] <- cicov[pp2] + 1
  avelen[pp2] <- avelen[pp1]  #same cutoff so same length
  if(tem$br0 <= tem$brlim)  #Bickel and Ren method
    cicov[pp3] <- cicov[pp3] + 1
  avelen[pp3] <- avelen[pp3] + tem$brlim
  #
  temw <- confreg(out$wbetas[,(k+2):p],g=gg,that=tstat,alpha=alph)
  if(temw$D0 <= temw$cuplim)  #pred. reg. method
    wcicov[pp1] <- wcicov[pp1] + 1
  wavelen[pp1] <- wavelen[pp1] + temw$cuplim
  if(temw$br0 <= temw$cuplim)  #hybrid method
    wcicov[pp2] <- wcicov[pp2] + 1
  wavelen[pp2] <- wavelen[pp1]  #same cutoff so same length
  if(temw$br0 <= temw$brlim)  #Bickel and Ren method
    wcicov[pp3] <- wcicov[pp3] + 1
  wavelen[pp3] <- wavelen[pp3] + temw$brlim
  #
  temp <- confreg(out$pbetas[,(k+2):p],g=gg,that=tstat,alpha=alph)
  if(temp$D0 <= temp$cuplim)  #pred. reg. method
    pcicov[pp1] <- pcicov[pp1] + 1
  pavelen[pp1] <- pavelen[pp1] + temp$cuplim
  if(temp$br0 <= temp$cuplim)  #hybrid method
    pcicov[pp2] <- pcicov[pp2] + 1
  pavelen[pp2] <- pavelen[pp1]  #same cutoff so same length
  if(temp$br0 <= temp$brlim)  #Bickel and Ren method
    pcicov[pp3] <- pcicov[pp3] + 1
  pavelen[pp3] <- pavelen[pp3] + temp$brlim
  #test whether the first k+1 values of beta are 1
  gg <- k + 1
  tstat <- out$full$coef[1:(k+1)]
  tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph)
  D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
  if(D0 <= tem$cuplim)  #pred. reg. method
    cicov[pp4] <- cicov[pp4] + 1
  avelen[pp4] <- avelen[pp4] + tem$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
  if(D1 <= tem$cuplim)  #hybrid method
    cicov[pp5] <- cicov[pp5] + 1
  avelen[pp5] <- avelen[pp4]  #same cutoff so same length
  if(D1 <= tem$brlim)  #Bickel and Ren method
    cicov[pp6] <- cicov[pp6] + 1
  avelen[pp6] <- avelen[pp6] + tem$brlim
  #
  temw <- confreg(out$wbetas[,1:(k+1)],g=gg,that=tstat,alpha=alph)
  D0 <- sqrt(mahalanobis(one, temw$center, temw$cov))
  if(D0 <= temw$cuplim)  #pred. reg. method
    wcicov[pp4] <- wcicov[pp4] + 1
  wavelen[pp4] <- wavelen[pp4] + temw$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, temw$cov))
  if(D1 <= temw$cuplim)  #hybrid method
    wcicov[pp5] <- wcicov[pp5] + 1
  wavelen[pp5] <- wavelen[pp4]  #same cutoff so same length
  if(D1 <= temw$brlim)  #Bickel and Ren method
    wcicov[pp6] <- wcicov[pp6] + 1
  wavelen[pp6] <- wavelen[pp6] + temw$brlim
  #
  temp <- confreg(out$pbetas[,1:(k+1)],g=gg,that=tstat,alpha=alph)
  D0 <- sqrt(mahalanobis(one, temp$center, temp$cov))
  if(D0 <= temp$cuplim)  #pred. reg. method
    pcicov[pp4] <- pcicov[pp4] + 1
  pavelen[pp4] <- pavelen[pp4] + temp$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, temp$cov))
  if(D1 <= temp$cuplim)  #hybrid method
    pcicov[pp5] <- pcicov[pp5] + 1
  pavelen[pp5] <- pavelen[pp4]  #same cutoff so same length
  if(D1 <= temp$brlim)  #Bickel and Ren method
    pcicov[pp6] <- pcicov[pp6] + 1
  pavelen[pp6] <- pavelen[pp6] + temp$brlim
  #FIT <- y-out$full$res
  #plot(FIT,y)
  #abline(0,1)
}
#FIT <- y-out$full$res
#plot(FIT,y)
#abline(0,1)
cicov <- cicov/nruns
avelen <- avelen/nruns
wcicov <- wcicov/nruns
wavelen <- wavelen/nruns
pcicov <- pcicov/nruns
pavelen <- pavelen/nruns
list(cicov=cicov,avelen=avelen,wcicov=wcicov,wavelen=wavelen,
  pcicov=pcicov,pavelen=pavelen,beta=beta,k=k) }
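#Example (an illustrative sketch): assuming the helpers confreg and shorth3
#are available,
# out <- wlsbootsim(n=100, p=4, k=1, nruns=100, etype=1, wtype=3, BB=200)
#compares the nonparametric (cicov), wild (wcicov), and parametric (pcicov)
#bootstrap coverages under wtype heteroscedasticity.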