## The command
## source("http://parker.ad.siu.edu/Olive/lregpack.txt")
## is an easy way to get these functions into R.
## 70 functions

anovasim <- function(n1 = 20, n2 = 20, n3 = 20, n4 = 20, m1 = 0, m2 = 0,
 m3 = 0, m4 = 0, sd1 = 1, sd2 = 1, sd3 = 1, sd4 = 1, nruns = 100)
{# Simulates ANOVA F, Welch F, modified Welch F, modified F and rank F statistic
# for 1 way ANOVA.
# tests Ho: mu1 = mu2 = mu3 = mu4
p <- 4
n <- n1 + n2 + n3 + n4
y <- 0 * 1:n
c1 <- 0 * 1:n
c2 <- c1
c3 <- c1
c4 <- c1
c1[1:n1] <- 1
c2[(n1 + 1):(n1 + n2)] <- 1
c3[(n1 + n2 + 1):(n1 + n2 + n3)] <- 1
c4[(n1 + n2 + n3 + 1):n] <- 1
x <- cbind(c1, c2, c3, c4)
y1 <- 1:n1
y2 <- 1:n2
y3 <- 1:n3
y4 <- 1:n4
beta <- c(m1, m2, m3, m4)
bols <- matrix(0, nrow = nruns, ncol = p)
faov <- 0 * 1:nruns
rf <- faov
mct <- 0
wct <- 0
mwct <- 0
for(i in 1:nruns) {
  #make data
  y1 <- rnorm(n1, mean = m1, sd = sd1)
  y2 <- rnorm(n2, mean = m2, sd = sd2)
  y3 <- rnorm(n3, mean = m3, sd = sd3)
  y4 <- rnorm(n4, mean = m4, sd = sd4)
  y <- c(y1, y2, y3, y4)
  out <- lsfit(x, y, intercept = F)
  bols[i, ] <- out$coef
  mn1 <- bols[i, 1]
  mn2 <- bols[i, 2]
  mn3 <- bols[i, 3]
  mn4 <- bols[i, 4]
  s1sq <- var(y1)
  s2sq <- var(y2)
  s3sq <- var(y3)
  s4sq <- var(y4)
  #get ANOVA F statistic
  res <- out$resid
  mse <- t(res) %*% res/(n - p)
  sse <- mse * (n - p)
  mny <- mean(y)
  sstr <- n1 * (mn1 - mny)^2 + n2 * (mn2 - mny)^2 + n3 * (mn3 - mny)^2 +
   n4 * (mn4 - mny)^2
  fnum <- sstr/(p - 1)
  faov[i] <- fnum/mse
  #get modified F statistic
  num <- (p - 1) * fnum
  den <- (1 - n1/n) * s1sq + (1 - n2/n) * s2sq + (1 - n3/n) * s3sq +
   (1 - n4/n) * s4sq
  modf <- num/den
  c1 <- ((1 - n1/n) * s1sq)/den
  c2 <- ((1 - n2/n) * s2sq)/den
  c3 <- ((1 - n3/n) * s3sq)/den
  c4 <- ((1 - n4/n) * s4sq)/den
  finv <- c1^2/(n1 - 1) + c2^2/(n2 - 1) + c3^2/(n3 - 1) + c4^2/(n4 - 1)
  md <- ceiling(1/finv)
  if(modf > qf(0.95, p - 1, md)) mct <- mct + 1
  #get Welch F statistic
  w1 <- n1/s1sq
  w2 <- n2/s2sq
  w3 <- n3/s3sq
  w4 <- n4/s4sq
  u <- w1 + w2 + w3 + w4
  xdd <- (w1 * mn1)/u + (w2 * mn2)/u + (w3 * mn3)/u + (w4 * mn4)/u
  num <- (w1 * (mn1 - xdd)^2 + w2 * (mn2 - xdd)^2 + w3 * (mn3 - xdd)^2 +
   w4 * (mn4 - xdd)^2)/(p - 1)
  tem <- (1 - w1/u)^2/(n1 - 1) + (1 - w2/u)^2/(n2 - 1) +
   (1 - w3/u)^2/(n3 - 1) + (1 - w4/u)^2/(n4 - 1)
  den <- 1 + (2 * (p - 2) * tem)/(p^2 - 1)
  wf <- num/den
  finv <- (3 * tem)/(p^2 - 1)
  wd <- ceiling(1/finv)
  if(wf > qf(0.95, p - 1, wd)) wct <- wct + 1
  #get modified Welch F statistic
  dfnum <- (1/w1 + 1/w2 + 1/w3 + 1/w4)^2
  dfden <- (1/w1)^2/(n1 - 1) + (1/w2)^2/(n2 - 1) + (1/w3)^2/(n3 - 1) +
   (1/w4)^2/(n4 - 1)
  wd <- ceiling(dfnum/dfden)
  if(wf > qf(0.95, p - 1, wd)) mwct <- mwct + 1
  #get rank F statistic
  y <- rank(y)
  out <- lsfit(x, y, intercept = F)
  brols <- out$coef
  mn1 <- brols[1]
  mn2 <- brols[2]
  mn3 <- brols[3]
  mn4 <- brols[4]
  res <- out$resid
  mse <- t(res) %*% res/(n - p)
  sse <- mse * (n - p)
  mny <- mean(y)
  sstr <- n1 * (mn1 - mny)^2 + n2 * (mn2 - mny)^2 + n3 * (mn3 - mny)^2 +
   n4 * (mn4 - mny)^2
  fnum <- sstr/(p - 1)
  rf[i] <- fnum/mse
}
mnbols <- apply(bols, 2, mean)
ssdbols <- sqrt(n) * sqrt(apply(bols, 2, var))
faovcov <- sum(faov > qf(0.95, p - 1, n - p))/nruns
modfcov <- mct/nruns
mwfcov <- mwct/nruns
wfcov <- wct/nruns
rfcov <- sum(rf > qf(0.95, p - 1, n - p))/nruns
list(mnbols = mnbols, ssdbols = ssdbols, faovcov = faovcov, modfcov = modfcov,
 wfcov = wfcov, mwfcov = mwfcov, rfcov = rfcov)
}
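## Example (added; not in the original file, argument values are arbitrary):
## estimate the sizes of the five tests under unequal variances. Commented
## out so this file can still be source()d without side effects.
# out <- anovasim(n1 = 10, n2 = 20, n3 = 30, n4 = 40, sd4 = 3, nruns = 1000)
# out$faovcov; out$modfcov; out$wfcov; out$mwfcov; out$rfcov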
aovplots <- function(Y, FIT, RES)
{# Makes a response plot and residual plot.
cmar <- par("mar")
par(mfrow = c(2, 1))
par(mar = c(4.0, 4.0, 2.0, 0.5))
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("Response Plot")
plot(FIT, RES)
identify(FIT, RES)
title("Residual Plot")
par(mfrow = c(1, 1))
par(mar = cmar)
}

aovtplt <- function(x, y)
{# Need x to have the factor levels needed for one way Anova.
# This function plots y^L vs one way Anova fitted values
# from the one way Anova of y^L on x where
# L = -1, -0.5, 0, 0.5 or 1 except log(Y) is used if L = 0.
# If plot is linear for L, use y^L (or log(Y) for L = 0) instead
# of Y. This is a graphical method for a response transform.
lam <- c(-1, -1/2, 0, 1/2, 1)
xl <- c("1/Z", "1/sqrt(Z)", "LOG(Z)", "sqrt(Z)", "Z")
par(mfrow = c(2, 3))
x <- factor(x)
for(i in 1:length(lam)) {
  if(lam[i] == 0)
    ytem <- log(y)
  else ytem <- y^lam[i]
  aovfit <- ytem - aov(ytem ~ x)$resid
  plot(aovfit, ytem, xlab = "TZHAT", ylab = xl[i])
  abline(0, 1)
  #identify(aovfit, ytem)
}
par(mfrow = c(1, 1))
}

covfch <- function(x, csteps = 5)
{# gets the FCH and RFCH estimators and the MB attractor
# works for p = 1
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  if(p > 1) {
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1) {
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion
lcut <- medd2
## get the start
if(p > 1) {
  mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
  mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  if(p > 1) {
    mns <- apply(x[md2 <= medd2, ], 2, mean)
  }
  if(p == 1) {
    mns <- mean(zx[md2 <= medd2])
  }
  covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
  ##crit = square root of det(cov)
  critd <- prod(diag(chol(covd)))
  critm <- prod(diag(chol(covm)))
  if(critd < critm) {
    covf <- covd
    mnf <- mnd
  }
}
## get FCH estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) {
  rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1) {
  rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1) {
  rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1) {
  rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm = mnm, covm = covm)
}
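## Example (added; illustrative only): FCH and RFCH estimates for clean MVN
## data; both centers should be near 0.
# x <- matrix(rnorm(300), ncol = 3)
# out <- covfch(x)
# out$center; out$rmnf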
covrmvn <- function(x, csteps = 5, locc = 0.5)
{# Needs number of predictors p > 1.
# This robust MLD estimator is tailored to estimate the covariance matrix
# of the bulk of the data when the bulk of the data is MVN and the outliers
# are "not too bad." The FCH and MB estimators are also produced.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5)
  lcut <- quantile(md2, locc)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val2 <- mahalanobis(t(mnd), med, covv)
if(val2 < lcut) {
  ##crit = square root of det(cov)
  critd <- prod(diag(chol(covd)))
  critm <- prod(diag(chol(covm)))
  if(critd < critm) {
    covf <- covd
    mnf <- mnd
  }
}
## get the FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/qchi
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the cov estimator
## (rmnmvn,rcovmvn) tailored for MVN data
rd2 <- mahalanobis(x, mnf, covf)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d1 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
list(center = rmnmvn, cov = rcovmvn, mnf = mnf, covf = covf, mnm = mnm, covm = covm)
}
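## Example (added; illustrative only): covrmvn on MVN data with a cluster of
## outliers; the robust center should stay near 0.
# x <- matrix(rnorm(300), ncol = 3); x[1:10, ] <- x[1:10, ] + 10
# covrmvn(x)$center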
ddcomp <- function(x, steps = 5)
{# Need p > 1.
# Makes 4 DD plots using the DGK, FCH, FMCD and MB estimators.
# Click left mouse button to identify points.
# Click right mouse button to end the function.
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
# R users need to type "library(MASS)" or "library(lqs)."
# Calls covfch.
p <- dim(x)[2]
par(mfrow = c(2, 2))
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
#DGK start
md2 <- mahalanobis(x, center, cov)
medd2 <- median(md2)
## get the start
mns <- center
covs <- cov
## concentrate
for(i in 1:steps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDdgk <- const * rd
plot(MD, RDdgk)
abline(0, 1)
identify(MD, RDdgk)
title("DGK DD Plot")
#FCH
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
rd <- sqrt(rd2)
RDfch <- rd
plot(MD, RDfch)
abline(0, 1)
identify(MD, RDfch)
title("FCH DD Plot")
#FMCD
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDf <- const * rd
plot(MD, RDf)
abline(0, 1)
identify(MD, RDf)
title("FMCD DD Plot")
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:steps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDmb <- const * rd
plot(MD, RDmb)
abline(0, 1)
identify(MD, RDmb)
title("Med Ball DD Plot")
par(mfrow = c(1, 1))
}

ddplot4 <- function(x, alpha = 0.1)
{# Makes a DD plot with covrmvn used for the RDi.
# Calls covrmvn.
# Need p > 1.
# Semiparametric prediction regions are added.
# Click left mouse button to identify points.
# Click right mouse button to end the function.
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
p <- dim(x)[2]
n <- dim(x)[1]
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1)
  up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001)
  up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
out <- covrmvn(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
RD <- sqrt(rd2)
plot(MD, RD)
abline(0, 1)
#get nonparametric prediction region boundary
cuplim <- sqrt(quantile(md2, up))
a <- min(RD)
b <- max(RD)
lines(c(cuplim, cuplim), c(b, a))
#get semiparametric prediction region boundary
ruplim <- sqrt(quantile(rd2, up))
a <- min(MD)
b <- max(MD)
lines(c(a, b), c(ruplim, ruplim))
#get parametric MVN prediction region boundary
mvnlim <- sqrt(qchisq(up, p))
b <- min(b, mvnlim)
lines(c(a, b), c(mvnlim, mvnlim))
identify(MD, RD)
list(cuplim = cuplim, ruplim = ruplim, mvnlim = mvnlim)
}

diagplot <- function(x, Y)
{# Scatterplot matrix of OLS diagnostics.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
n <- length(Y)
rmat <- matrix(nrow = n, ncol = 7)
out <- lsfit(x, Y)
tem <- ls.diag(out)
rmat[, 1] <- tem$cooks
rmat[, 2] <- tem$hat
rmat[, 3] <- tem$std.res
rmat[, 4] <- tem$stud.res
rmat[, 5] <- tem$dfits
rmat[, 6] <- Y - out$resid
rmat[, 7] <- Y
pairs(rmat, labels = c("Cook's CD", "leverages", "stand resid", "stud resid",
 "DFFITS", "YHAT", "Y"))
}

edtplt <- function(x, y, int = F)
{# Need x to be an appropriate design matrix for the experimental
# design fit by OLS. This function plots y^L vs OLS fit where
# L = -1, -0.5, 0, 0.5 or 1 except log(Y) is used if L = 0.
# If plot is linear for L, use y^L (or log(Y) for L = 0) instead
# of Y. This is a graphical method for a response transform.
lam <- c(-1, -1/2, 0, 1/2, 1)
xl <- c("1/Z", "1/sqrt(Z)", "LOG(Z)", "sqrt(Z)", "Z")
par(mfrow = c(2, 3))
for(i in 1:length(lam)) {
  if(lam[i] == 0)
    ytem <- log(y)
  else ytem <- y^lam[i]
  olsfit <- ytem - lsfit(x, ytem, intercept = int)$resid
  plot(olsfit, ytem, xlab = "TZHAT", ylab = xl[i])
  abline(0, 1)
  #identify(olsfit, ytem)
}
par(mfrow = c(1, 1))
}
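## Example (added; illustrative only): DD plot with prediction region
## boundaries for MVN data; the points should track the identity line.
# x <- matrix(rnorm(400), ncol = 4)
# ddplot4(x, alpha = 0.1)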
ffplot2 <- function(x, y, nsamps = 7)
{# For Unix, use X11() to turn on the graphics device before
# using this function. For R, first type library(MASS).
# Makes an FF plot with several resistant estimators.
# Needs hbreg, mbareg, and mbalata.
n <- length(y)
x <- as.matrix(x)
rmat <- matrix(nrow = n, ncol = 7)
lsfit <- y - lsfit(x, y)$residuals
print("got OLS")
almsfit <- y - lmsreg(x, y)$resid
print("got ALMS")
altsfit <- y - ltsreg(x, y)$residuals
print("got ALTS")
mbacoef <- mbareg(x, y, nsamp = nsamps)$coef
MBAfit <- mbacoef[1] + x %*% mbacoef[-1]
print("got MBA")
bbcoef <- hbreg(x, y)$bbcoef
BBfit <- bbcoef[1] + x %*% bbcoef[-1]
print("got BB")
mbalcoef <- mbalata(x, y, nsamp = nsamps)$coef
MBALfit <- mbalcoef[1] + x %*% mbalcoef[-1]
print("got MBALATA")
rmat[, 1] <- y
rmat[, 2] <- lsfit
rmat[, 3] <- almsfit
rmat[, 4] <- altsfit
rmat[, 5] <- MBAfit
rmat[, 6] <- BBfit
rmat[, 7] <- MBALfit
pairs(rmat, labels = c("Y", "OLS Fit", "ALMS Fit", "ALTS Fit", "MBA Fit",
 "BB Fit", "MBALATA"))
}

fselboot <- function(x, y, B = 1000)
{#needs library(leaps), n > 5p, p > 2
#bootstrap min Cp model forward selection regression
#Does not make sense to do variable selection if there
#is only one nontrivial predictor.
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
vmax <- min(p, as.integer(n/5))
vars <- as.vector(1:(p-1))
#get the full model
full <- lsfit(x, y)
res <- full$resid
fit <- y - res
#get the minimum Cp submodel
tem <- regsubsets(x, y, nvmax = vmax, method = "forward")
out <- summary(tem)
mincp <- out$which[out$cp == min(out$cp)]
#do not need the constant in vin
vin <- vars[mincp[-1]]
sub <- lsfit(x[, vin], y)
betas <- matrix(0, nrow = B, ncol = p)
#bootstrap the minimum Cp submodel
for(i in 1:B) {
  yb <- fit + sample(res, n, replace = T)
  tem <- regsubsets(x, y = yb, method = "forward")
  out <- summary(tem)
  mincp <- out$which[out$cp == min(out$cp)]
  vin <- vars[mincp[-1]]
  indx <- c(1, 1 + vin)
  betas[i, indx] <- lsfit(x[, vin], yb)$coef
}
list(full = full, sub = sub, betas = betas)
}

fwlssim <- function(n = 100, q = 4, sd = 1, nruns = 1, type = 1)
{# Simulates FWLS data Y = SP + |SP - j| e, with SP = alpha + beta' x.
# So w = 1/(SP-j)^2 and what = 1/(ESP - j)^2
# type = 1 or 3 for wls, type = 2 or 4 for fwls
# There are q = p-1 nontrivial predictors as well as a constant
# Need p > 1 since x is n by q.
beta <- 2 + 0 * 1:q
par(mfrow = c(2, 2))
for(i in 1:nruns) {
  #make data
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  if(q == 1) x <- as.matrix(x)
  u <- x
  if(type == 1) {
    sp <- 25 + x %*% beta
    Y <- sp + abs(sp - 5) * sd * rnorm(n)
    ols <- lsfit(x, Y)
    olsesp <- Y - ols$resid
    what <- 1/(sp - 5)^2
  }
  if(type == 2) {
    sp <- 25 + x %*% beta
    Y <- sp + abs(sp - 5) * sd * rnorm(n)
    ols <- lsfit(x, Y)
    olsesp <- Y - ols$resid
    what <- 1/(olsesp - 5)^2
  }
  if(type == 3) {
    sp <- 25 + x %*% beta
    Y <- sp + abs(sp - 25) * sd * rnorm(n)
    ols <- lsfit(x, Y)
    olsesp <- Y - ols$resid
    what <- 1/(sp - 25)^2
  }
  if(type == 4) {
    sp <- 25 + x %*% beta
    Y <- sp + abs(sp - 25) * sd * rnorm(n)
    ols <- lsfit(x, Y)
    olsesp <- Y - ols$resid
    what <- 1/(olsesp - 25)^2
  }
  uu <- sqrt(what)
  Z <- uu * Y
  for(j in 1:q) {
    u[, j] <- uu * x[, j]
  }
  u <- cbind(uu, u)
  wls <- lsfit(u, Z, intercept = F)
  FIT <- Y - ols$resid
  plot(FIT, Y)
  abline(0, 1)
  title("a) OLS Response Plot")
  RESID <- ols$resid
  plot(FIT, RESID)
  title("b) OLS Residual Plot")
  ZFIT <- Z - wls$resid
  plot(ZFIT, Z)
  abline(0, 1)
  title("c) FWLS Response Plot")
  ZRESID <- wls$resid
  plot(ZFIT, ZRESID)
  title("d) FWLS Residual Plot")
}
}
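## Example (added; illustrative only): bootstrap the minimum Cp forward
## selection model; needs library(leaps). betas holds B bootstrap
## coefficient vectors, one row per resample.
# library(leaps)
# x <- matrix(rnorm(400), ncol = 4); y <- x[, 1] + rnorm(100)
# out <- fselboot(x, y, B = 200)
# apply(out$betas, 2, mean)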
ganova <- function(x, y)
{# Let x have the factor levels 1,2, ..., k needed for one way Anova
# and y the corresponding response values.
# This function does graphical anova.
Treatments <- c("A","B","C","D","E","F","G","H","I","J","K")
xx <- as.integer(x)
k <- max(xx)
n <- length(y)
v10 <- 10 + 0*1:n
v20 <- 20 + 0*1:k
gmn <- mean(y)
mn <- 1:k
smn <- mn
for(i in 1:k) {
  mn[i] <- mean(y[xx == i])
}
mn <- mn - gmn
smn <- sqrt((n-k)/(k-1))*mn
w <- as.factor(x)
residuals <- aov(y ~ w)$resid
graphicalanova <- c(v10, v20)
Residuals <- c(residuals, smn)
plot(Residuals, graphicalanova)
title("Scaled Treatment Deviations")
Treatments <- Treatments[1:k]
list(smn = smn, Treatments = Treatments)
}

ganova2 <- function(x, blocks, y)
{# Let x and blocks have levels 1,2, ..., k and y the corresponding
# response values. This function does graphical anova for a
# completely randomized block design.
Treatments <- c("A","B","C","D","E","F","G","H","I","J","K")
xx <- as.integer(x)
blk <- as.integer(blocks)
k <- max(xx)
b <- max(blk)
n <- length(y)
v10 <- 10 + 0*1:n
v20 <- 20 + 0*1:k
v30 <- 30 + 0*1:b
gmn <- mean(y)
mn <- 1:k
smn <- mn
bmn <- 1:b
sbmn <- bmn
for(i in 1:k) {
  mn[i] <- mean(y[xx == i])
}
for(i in 1:b) {
  bmn[i] <- mean(y[blk == i])
}
mn <- mn - gmn
smn <- sqrt(b-1)*mn
bmn <- bmn - gmn
sbmn <- sqrt(k-1)*bmn
w <- as.factor(x)
block <- as.factor(blocks)
residuals <- aov(y ~ block + w)$resid
Treatmentdevs <- c(v10, v20, v30)
Residuals <- c(residuals, smn, sbmn)
plot(Residuals, Treatmentdevs)
title("Scaled Block Deviations")
Treatments <- Treatments[1:k]
Blocks <- 1:b
list(sbmn = sbmn, Blocks = Blocks, smn = smn, Treatments = Treatments)
}
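## Example (added; illustrative only): graphical anova for a balanced one
## way design with k = 3 levels; the scaled treatment deviations should
## stand out from the residuals when means differ.
# x <- rep(1:3, each = 10)
# y <- rnorm(30) + rep(c(0, 1, 2), each = 10)
# ganova(x, y)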
hbreg <- function(x, y, csteps = 10, a = 1.4, rreg = T, type = 1)
{# Gets the hbreg estimator with 10 concentration steps applied to
# the HB attractor BB. Uses LTA criterion to screen the 3 attractors.
# For R, type "library(MASS)" before using this function.
# If rreg = F, arobcoef = olscoef; if T, arobcoef uses
# mbareg if type = 1, ltsreg if type = 2, mbalata if type = 3,
# rmreg2 if type = 4.
# Calls mbareg, mbalata, rmreg3 which is rmreg2 without plots.
med <- median(y)
madd <- mad(y, constant = 1)
x <- as.matrix(x)
n <- dim(x)[1]
nc <- dim(x)[2] + 1
#nc is number of predictors including intercept
temp <- lsfit(x, y)
hbf <- temp$coef
absres <- abs(temp$residuals)
indx <- (absres <= median(absres))
critf <- sum(absres[indx])
#mbareg or lmsreg may work better than ltsreg
if(rreg == T) {
  if(type == 1) {
    tem <- mbareg(x, y)
    temres <- y - tem$coef[1] - x %*% tem$coef[-1]
    absres <- abs(temres)
  }
  if(type == 2) {
    tem <- ltsreg(x, y)
    absres <- abs(tem$residuals)
  }
  if(type == 3) {
    tem <- mbalata(x, y)
    temres <- y - tem$coef[1] - x %*% tem$coef[-1]
    absres <- abs(temres)
  }
  if(type == 4) {
    tem <- rmreg3(x, y)
    temres <- y - tem$Bhat[1] - x %*% tem$Bhat[-1]
    absres <- abs(temres)
  }
  indx <- (absres <= median(absres))
  crit <- a*sum(absres[indx])
  if(crit < critf) {
    critf <- crit
    hbf <- tem$coef
  }
}
#get y's closest to median y
indx <- (y >= (med - madd) & y <= (med + madd))
#get the HB attractor bk = BB
bk <- lsfit(x[indx, ], y[indx])$coef
res <- y - (x %*% bk[2:nc] + bk[1])
ressq <- res^2
m <- median(ressq)
for(i in 1:csteps) {
  indx <- (ressq <= m)
  bk <- lsfit(x[indx, ], y[indx])$coef
  res <- y - (x %*% bk[2:nc] + bk[1])
  ressq <- res^2
  m <- median(ressq)
}
bb <- bk
res <- y - (x %*% bb[2:nc] + bb[1])
absres <- abs(res)
indx <- (absres <= median(absres))
crit <- a*sum(absres[indx])
if(crit < critf)
  hbf <- bb
if(rreg == T && type != 4)
  arobcoef <- tem$coef
else if(rreg == T && type == 4)
  arobcoef <- as.vector(tem$Bhat)
else arobcoef <- temp$coef
list(coef = hbf, olscoef = temp$coef, arobcoef = arobcoef, bbcoef = bb)
}

locpi2 <- function(Y, alpha = 0.05)
{# Makes the PI (L_n,U_n) for the location model.
# Can be used as a bootstrap confidence interval if Y contains
# n = B bootstrap values of a statistic,
# or a Bayesian highest density interval if Y contains n values
# from a posterior pdf.
# Uses residuals from the sample mean ybar.
sy <- as.vector(Y)
n <- length(sy)
cc <- ceiling(n * (1 - alpha))
corfac <- (1 + 15/n) * sqrt((n+1)/(n - 1))
yfhat <- mean(sy)
#get the PI
sy <- sort(sy)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n) {
  for(j in (cc + 1):n) {
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
UPI <- (1-corfac) * yfhat + corfac * rup
LPI <- (1-corfac) * yfhat + corfac * rlow
list(LOCPI = c(LPI, UPI))
}

lrdata <- function(n = 200, type = 3)
{# Generates data for logistic regression.
# If X|y=1 ~ N(mu_1,I) and X|y=0 ~ N(0,I) then beta = mu_1 and
# alpha = -0.5 ||mu_1||^2.
# If q is changed, change the formula in the glm statement.
q <- 5
y <- 0 * 1:n
y[(n/2 + 1):n] <- y[(n/2 + 1):n] + 1
beta <- 0 * 1:q
if(type == 1) {
  beta[1] <- 1
  alpha <- -0.5
}
if(type == 2) {
  beta <- beta + 1
  alpha <- -q/2
}
if(type == 3) {
  beta[1:3] <- 1
  alpha <- -1.5
}
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
if(type == 1) {
  x[(n/2 + 1):n, 1] <- x[(n/2 + 1):n, 1] + 1
}
if(type == 2) {
  x[(n/2 + 1):n, ] <- x[(n/2 + 1):n, ] + 1
}
if(type == 3) {
  x[(n/2 + 1):n, 1:3] <- x[(n/2 + 1):n, 1:3] + 1
}
#X|y=0 ~ N(0, I) and X|y=1 ~ N(beta, I)
# change formula to x[,1]+ ... + x[,q] with q
out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = binomial)
list(alpha = alpha, beta = beta, lrcoef = out$coef, x = x, y = y)
}
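## Example (added; illustrative only): a 95% interval from locpi2 for a
## sample from the location model; could also be applied to B bootstrap
## values of a statistic.
# locpi2(rnorm(100, mean = 5), alpha = 0.05)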
lrdata2 <- function(n = 200, m = 10, type = 1)
{# Generates data for logistic regression.
# If q is changed, change the formula in the glm statement.
q <- 5
y <- 1:n
z <- y
mv <- m + 0 * y
beta <- 0 * 1:q
if(type == 1) {
  beta <- beta + 2
  alpha <- 0
}
if(type == 2) {
  beta[1] <- 2
  alpha <- 0
}
if(type == 3) {
  beta[1:3] <- 2
  alpha <- 0
}
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x %*% beta
pv <- exp(SP)/(1 + exp(SP))
for(i in 1:n)
  y[i] <- rbinom(1, size = m, prob = pv[i])
z <- y/m
# change formula to x[,1]+ ... + x[,q] with q
out <- glm(z ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5],
 family = binomial, weights = mv)
list(x = x, y = y, mv = mv, alpha = alpha, beta = beta, lrcoef = out$coef)
}

lressp <- function(x, y, slices = 10)
{# Makes the ESSP for logistic regression.
# If X|y=1 ~ N(mu_1,I) and X|y=0 ~ N(0,I) then
# beta = mu_1 and alpha = -0.5 ||mu_1||^2.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
# R needs command "library(lqs)."
# Advance the view with the right mouse button.
# In R, highlight "stop."
#
#ESP <- x %*% out$coef[-1] + out$coef[1]
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson)
out <- glm(y ~ ., family = binomial, data = data.frame(cbind(x, y)))
ESP <- predict(out)
Y <- y
plot(ESP, Y)
abline(mean(y), 0)
fit <- y
fit <- exp(ESP)/(1 + exp(ESP))
# lines(sort(ESP),sort(fit))
indx <- sort.list(ESP)
lines(ESP[indx], fit[indx])
fit2 <- fit
n <- length(y)
val <- as.integer(n/slices)
for(i in 1:(slices - 1)) {
  fit2[((i-1)*val + 1):(i*val)] <- mean(y[indx[((i-1)*val + 1):(i*val)]])
}
fit2[((slices-1)*val + 1):n] <- mean(y[indx[((slices-1)*val + 1):n]])
#fit2 is already sorted in order corresponding to indx
lines(ESP[indx], fit2)
#list(fit2=fit2,n=n,slices=slices,val=val)
}

lrplot <- function(x, y, mv, slices = 10, ff = 0.97, step = T)
{# Makes response and OD plots for binomial logistic regression.
# mv = (m1, ..., mn) where yi is bin(mi, p(SP))
# If step = T use step function, else use lowess.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
####lrplot4 is better
# If q is changed, change the formula in the glm statement.
q <- 5
# change formula to x[,1]+ ... + x[,q] with q
n <- length(y)
Z <- y/mv
out <- glm(Z ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5],
 family = binomial, weights = mv)
#out<-glm(Z~., family=binomial, weights=mv, data=data.frame(cbind(x,Z)))
#ESP <- x %*% out$coef[-1] + out$coef[1]
ESP <- predict(out)
par(mfrow = c(1, 2), pty = "s")
plot(ESP, Z)
abline(weighted.mean(Z, mv), 0)
fit <- y
fit <- exp(ESP)/(1 + exp(ESP))
indx <- sort.list(ESP)
lines(ESP[indx], fit[indx])
if(step == T) {
  fit2 <- fit
  val <- as.integer(n/slices)
  for(i in 1:(slices - 1)) {
    fit2[((i - 1) * val + 1):(i * val)] <-
     weighted.mean(Z[indx[((i - 1) * val + 1):(i * val)]],
      mv[indx[((i - 1) * val + 1):(i * val)]])
  }
  fit2[((slices - 1) * val + 1):n] <-
   weighted.mean(Z[indx[((slices - 1) * val + 1):n]],
    mv[indx[((slices - 1) * val + 1):n]])
  # fit2 is already sorted in order corresponding to indx
  lines(ESP[indx], fit2)
}
else lines(lowess(ESP, Z, f = ff), type = "s")
title("a) ESSP")
#get OD plot
val <- exp(ESP)/(1 + exp(ESP))
Ehat <- mv * val
Vmodhat <- Ehat * (1 - val)
Vhat <- (y - Ehat)^2
plot(Vmodhat, Vhat)
abline(0, 1)
abline(0, 4)
abline(lsfit(Vmodhat, Vhat)$coef)
title("b) OD Plot")
par(mfrow = c(1, 1))
}
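## Example (added; illustrative only): generate logistic regression data,
## then check the fit with the ESSP; the step function should track the
## fitted logistic curve.
# out <- lrdata(n = 200, type = 3)
# lressp(out$x, out$y)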
lrplot2 <- function(x, y, slices = 10, ff = 0.99, step = T)
{# Makes the ESSP for binary logistic regression.
# If step = T use step function, else use lowess.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
#
#ESP <- x %*% out$coef[-1] + out$coef[1]
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson)
out <- glm(y ~ ., family = binomial, data = data.frame(cbind(x, y)))
ESP <- predict(out)
Y <- y
plot(ESP, Y)
fit <- y
fit <- exp(ESP)/(1 + exp(ESP))
# lines(sort(ESP),sort(fit))
indx <- sort.list(ESP)
lines(ESP[indx], fit[indx])
if(step == T) {
  fit2 <- fit
  n <- length(y)
  val <- as.integer(n/slices)
  for(i in 1:(slices - 1)) {
    fit2[((i - 1) * val + 1):(i * val)] <-
     mean(y[indx[((i - 1) * val + 1):(i * val)]])
  }
  fit2[((slices - 1) * val + 1):n] <- mean(y[indx[((slices - 1) * val + 1):n]])
  # fit2 is already sorted in order corresponding to indx
  lines(ESP[indx], fit2)
}
else lines(lowess(ESP, Y, f = ff), type = "s")
title("Response Plot")
}

lrplot4 <- function(x, y, mv, slices = 10, ff = 0.97, step = T)
{# Makes response plot and OD plot for binomial logistic regression.
# mv = (m1, ..., mn) where yi is bin(mi, p(SP))
# If step = T use step function, else use lowess.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
#
#ESP <- x %*% out$coef[-1] + out$coef[1]
#
n <- length(y)
tdata <- as.data.frame(cbind(x, y))
ny <- mv - y
#ny[i] = mv[i]-y[i] = no. of ``failures"
out <- glm(cbind(y, ny) ~ ., family = binomial, data = tdata)
ESP <- predict(out)
par(mfrow = c(1, 2), pty = "s")
Z <- y/mv
plot(ESP, Z)
abline(weighted.mean(Z, mv), 0)
fit <- y
fit <- exp(ESP)/(1 + exp(ESP))
indx <- sort.list(ESP)
lines(ESP[indx], fit[indx])
if(step == T) {
  fit2 <- fit
  val <- as.integer(n/slices)
  for(i in 1:(slices - 1)) {
    fit2[((i - 1) * val + 1):(i * val)] <-
     weighted.mean(Z[indx[((i - 1) * val + 1):(i * val)]],
      mv[indx[((i - 1) * val + 1):(i * val)]])
  }
  fit2[((slices - 1) * val + 1):n] <-
   weighted.mean(Z[indx[((slices - 1) * val + 1):n]],
    mv[indx[((slices - 1) * val + 1):n]])
  # fit2 is already sorted in order corresponding to indx
  lines(ESP[indx], fit2)
}
else lines(lowess(ESP, Z, f = ff), type = "s")
title("a) Response Plot")
#get OD plot
val <- exp(ESP)/(1 + exp(ESP))
Ehat <- mv * val
Vmodhat <- Ehat * (1 - val)
Vhat <- (y - Ehat)^2
plot(Vmodhat, Vhat)
abline(0, 1)
abline(0, 4)
abline(lsfit(Vmodhat, Vhat)$coef)
title("b) OD Plot")
}
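## Example (added; illustrative only): binomial data with mi = 10 trials per
## case, then the response and OD plots.
# out <- lrdata2(n = 100, m = 10, type = 1)
# lrplot4(out$x, out$y, out$mv)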
maha <- function(x)
{# Generates the classical mahalanobis distances. Need p > 1 or x a matrix.
center <- apply(x, 2, mean)
cov <- var(x)
return(sqrt(mahalanobis(x, center, cov)))
}

#called by hbreg
mbalata <- function(x, y, k = 6, nsamp = 7)
{#gets the median ball fit with 7 centers and 7 ball sizes, using the LATA criterion
x <- as.matrix(x)
n <- dim(x)[1]
q <- dim(x)[2]
# q + 1 is number of predictors including intercept
vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20),
 q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3),
 q + 3 + floor(n/2))
covv <- diag(q)
centers <- sample(n, nsamp)
temp <- lsfit(x, y)
mbaf <- temp$coef
## get LATA criterion
res <- temp$residuals
crit <- k^2*median(res^2)
cn <- sum(res^2 <= crit)
absres <- sort(abs(res))
critf <- sum(absres[1:cn])
##
for(i in 1:nsamp) {
  md2 <- mahalanobis(x, center = x[centers[i], ], covv)
  smd2 <- sort(md2)
  for(j in 1:7) {
    temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[vals[j]]])
    #Use OLS on rows with md2 <= cutoff = smd2[vals[j]]
    res <- y - temp$coef[1] - x %*% temp$coef[-1]
    ## get LATA criterion
    crit <- k^2*median(res^2)
    cn <- sum(res^2 <= crit)
    absres <- sort(abs(res))
    crit <- sum(absres[1:cn])
    ##
    if(crit < critf) {
      critf <- crit
      mbaf <- temp$coef
    }
  }
}
list(coef = mbaf, critf = critf)
}

#called by hbreg
mbareg <- function(x, y, nsamp = 7)
{#gets the mbareg fit with 7 centers, med resid crit, 7 ball sizes
x <- as.matrix(x)
n <- dim(x)[1]
q <- dim(x)[2]
# q + 1 is number of predictors including intercept
vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20),
 q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3),
 q + 3 + floor(n/2))
covv <- diag(q)
centers <- sample(n, nsamp)
temp <- lsfit(x, y)
mbaf <- temp$coef
critf <- median(temp$residuals^2)
for(i in 1:nsamp) {
  md2 <- mahalanobis(x, center = x[centers[i], ], covv)
  smd2 <- sort(md2)
  for(j in 1:7) {
    temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[vals[j]]])
    #Use OLS on rows with md2 <= cutoff = smd2[vals[j]]
    res <- y - temp$coef[1] - x %*% temp$coef[-1]
    crit <- median(res^2)
    if(crit < critf) {
      critf <- crit
      mbaf <- temp$coef
    }
  }
}
list(coef = mbaf, critf = critf)
}

medout <- function(x)
{# finds squared Euclidean distances from the coordinatewise median
x <- as.matrix(x)
p <- dim(x)[2]
covv <- diag(p)
med <- apply(x, 2, median)
d2 <- mahalanobis(x, center = med, covv)
plot(d2)
list(d2 = d2)
}

MLRplot <- function(x, Y)
{# Response plot and residual plot.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
# R needs command "library(lqs)" if a robust estimator replaces lsfit.
# Advance the view with the right mouse button.
x <- as.matrix(x)
out <- lsfit(x, Y)
cook <- ls.diag(out)$cooks
n <- dim(x)[1]
p <- dim(x)[2] + 1
tem <- cook > min(0.5, (2 * p)/n)
bhat <- out$coef
FIT <- Y - out$res
cmar <- par("mar")
par(mfrow = c(2, 1))
par(mar = c(4.0, 4.0, 2.0, 0.5))
plot(FIT, Y)
abline(0, 1)
points(FIT[tem], Y[tem], pch = 15)
title("Response Plot")
identify(FIT, Y)
RES <- Y - FIT
plot(FIT, RES)
points(FIT[tem], RES[tem], pch = 15)
title("Residual Plot")
identify(FIT, RES)
par(mfrow = c(1, 1))
par(mar = cmar)
}
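## Example (added; illustrative only): response and residual plots for a
## clean MLR data set; right click (Stop in R) to advance past identify().
# x <- matrix(rnorm(300), ncol = 3)
# y <- 1 + x %*% c(1, 1, 1) + rnorm(100)
# MLRplot(x, y)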
mlrplot2 <- function(x, Y)
{# Makes the response plot and residual plot for two mbareg estimators.
# Workstations need to activate a graphics
# device with command "X11()" or "motif()."
# Advance the view with the right mouse button, and in R, highlight "stop."
# Calls mbareg, mbalata.
x <- as.matrix(x)
out <- mbareg(x, Y)
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
par(mfrow = c(2, 2))
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("MBA Response Plot")
RES <- Y - FIT
plot(FIT, RES)
identify(FIT, RES)
title("MBA Residual Plot")
#
out <- mbalata(x, Y)
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("MBALATA Response Plot")
RES <- Y - FIT
plot(FIT, RES)
identify(FIT, RES)
title("MBALATA Residual Plot")
par(mfrow = c(1, 1))
}

mlrplot4 <- function(x, Y)
{# This function is for R.
# Makes response plot and residual plot with large
# Cook's distances and Pena's distances highlighted.
# Squares have large Cook's distances
# and crosses have large Pena's distances.
x <- as.matrix(x)
out <- lsfit(x, Y)
cook <- ls.diag(out)$cooks
n <- dim(x)[1]
p <- dim(x)[2] + 1
#get Pena's distances s
one <- 1 + 0 * 1:n
w <- cbind(one, x)
cp <- t(w) %*% w
h <- w %*% solve(cp) %*% t(w)
s <- 0 * 1:n
for(i in 1:n) {
  for(j in 1:n) {
    s[i] <- s[i] + (cook[j] * h[i, j]^2)/(h[i, i] * h[j, j])
  }
}
tem <- cook > min(0.5, (2 * p)/n)
medd <- median(s)
madd <- mad(s, constant = 1)
tem2 <- abs(s - medd) > 4.5 * madd
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
cmar <- par("mar")
par(mfrow = c(2, 1))
par(mar = c(4.0, 4.0, 2.0, 0.5))
plot(FIT, Y)
abline(0, 1)
points(FIT[tem], Y[tem], pch = 0)
points(FIT[tem2], Y[tem2], pch = 3)
identify(FIT, Y)
title("Response Plot")
RES <- Y - FIT
plot(FIT, RES)
points(FIT[tem], RES[tem], pch = 0)
points(FIT[tem2], RES[tem2], pch = 3)
identify(FIT, RES)
title("Residual Plot")
par(mfrow = c(1, 1))
par(mar = cmar)
}

MLRsim <- function(n = 100, q = 7, nruns = 4, eps = 0.1, shift = 9, type = 1)
{#Right click Stop for each plot.
#Generates response and residual plots for MLR
#for a few iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
# constant = 1 so there are p = q+1 coefficients
# Calls MLRplot.
b <- 0 * 1:q + 1
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
  }
  #make an MLR data set
  MLRplot(x, y)
  #get the response and residual plots
}
}
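## Example (added; illustrative only): four response and residual plots
## with t3 errors; the plotted points should still track the identity line.
# MLRsim(n = 100, q = 7, nruns = 4, type = 2)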
mltreg <- function(x, y, indices = c(1, 2))
{##Need p > 1, m > 1.
# Does multivariate linear regression.
# Advance the plot by highlighting Stop with the right mouse button.
# Want n > mp and n > 10p for Hotelling Lawley pvalues.
# The indices are the variables to be left out of the
# reduced model for the MANOVA partial F test.
# Calls MLRplot.
x <- as.matrix(x)
y <- as.matrix(y)
n <- dim(x)[1]
q <- dim(x)[2]
p <- q + 1
m <- dim(y)[2]
r <- length(indices)
#Get L for the MANOVA partial F test
L <- matrix(0, nrow = r, ncol = p)
for(i in 1:r)
  L[i, indices[i]] <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
Bhat <- matrix(nrow = p, ncol = m, 0)
# q + 1 = p is number of predictors including intercept
for(i in 1:m) {
  out <- lsfit(x, y[, i])
  res[, i] <- out$res
  fit[, i] <- y[, i] - res[, i]
  Bhat[, i] <- out$coef
}
for(i in 1:m) {
  MLRplot(x, y[, i])
}
Covhat <- (n-1)/(n-p) * var(res)
#Get pvalues for testing whether the jth predictor is
#needed in the model given that the other predictors are in the model.
one <- 1 + 0*1:n
w <- cbind(one, x)
pvals <- 0*1:p
Fj <- pvals
J <- t(w) %*% w
J <- solve(J)
Covinv <- solve(Covhat)
dendf <- n - m*p
if(n <= m*p) dendf <- 1
for(j in 1:p) {
  T <- t(Bhat[j, ]) %*% Covinv %*% Bhat[j, ]/J[j, j]
  pvals[j] <- 1 - pf((T/m), m, dendf)
  Fj[j] <- T/m
}
#Get pval for the MANOVA F test for whether the
#nontrivial predictors are needed in the model.
D <- as.matrix(Bhat[-1, ])
if(p == 2) D <- t(D)
tem <- as.matrix(J[-1, -1])
teminv <- solve(tem)
Weinv <- Covinv/(n-p)
H <- t(D) %*% teminv %*% D
C <- Weinv %*% H
eig <- eigen(C, symmetric = FALSE, only.values = TRUE)$values
eig <- as.double(eig)
u <- sum(eig)
MANOVAF <- (n-p)*u/((p-1)*m)
pval <- 1 - pf(MANOVAF, (p-1)*m, dendf)
MANOVA <- cbind(MANOVAF, pval)
#got MANOVA F test summaries
#Get pval for the MANOVA partial F test for whether the
#predictors given by indices are needed in the model.
tem <- L %*% J %*% t(L)
teminv <- solve(tem)
H <- t(L %*% Bhat) %*% teminv %*% (L %*% Bhat)
C <- Weinv %*% H
eig <- eigen(C, symmetric = FALSE, only.values = TRUE)$values
eig <- as.double(eig)
u <- sum(eig)
partialF <- (n-p)*u/(r*m)
Pval <- 1 - pf(partialF, r*m, dendf)
partial <- cbind(partialF, Pval)
##got partial F test summaries
Ftable <- cbind(Fj, pvals)
list(fit = fit, res = res, Covhat = Covhat, Bhat = Bhat, partial = partial,
 Ftable = Ftable, MANOVA = MANOVA)
}

modIboot <- function(x, y, B = 1000)
{#needs library(leaps), n > 5p, p > 2
#bootstrap the I_I model for forward selection regression
#Does not make sense to do variable selection if there
#is only one nontrivial predictor.
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
vmax <- min(p, as.integer(n/5))
vars <- as.vector(1:(p-1))
#get the full model
full <- lsfit(x, y)
res <- full$resid
fit <- y - res
#get the I_I submodel
tem <- regsubsets(x, y, nvmax = vmax, method = "forward")
out <- summary(tem)
num <- 1:length(out$cp)
tnum <- num[out$cp <= min(out$cp) + 1]
Icp <- out$cp[min(tnum)]
modI <- out$which[out$cp == Icp]
#do not need the constant in vin
vin <- vars[modI[-1]]
sub <- lsfit(x[, vin], y)
betas <- matrix(0, nrow = B, ncol = p)
#bootstrap the I_I submodel
for(i in 1:B) {
  yb <- fit + sample(res, n, replace = T)
  tem <- regsubsets(x, y = yb, method = "forward")
  out <- summary(tem)
  num <- 1:length(out$cp)
  tnum <- num[out$cp <= min(out$cp) + 1]
  Icp <- out$cp[min(tnum)]
  modI <- out$which[out$cp == Icp]
  vin <- vars[modI[-1]]
  indx <- c(1, 1 + vin)
  betas[i, indx] <- lsfit(x[, vin], yb)$coef
}
list(full = full, sub = sub, betas = betas)
}
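## Example (added; illustrative only): multivariate linear regression with
## m = 2 responses; the partial F test deletes predictors 2 and 3.
# x <- matrix(rnorm(300), ncol = 3)
# y <- cbind(x %*% c(1, 1, 1) + rnorm(100), x %*% c(1, 0, 0) + rnorm(100))
# out <- mltreg(x, y, indices = c(2, 3))
# out$partial; out$MANOVA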
modIpisim <- function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
 psi = 0.0, type = 1, alpha = 0.05)
{#Needs library(leaps).
#Simulates PIs for forward selection variable selection for model I_I.
# 1 <= k <= p-1, k is the number of nonnoise variables
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists.
set.seed(974)
corfac <- (1 + 15/n) * sqrt((n + 2*p)/(n - p))
if(alpha > 0.1) {
  qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)
}
if(alpha <= 0.1) {
  qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)
}
pn <- qn
if(pn < 1 - alpha + 0.001)
  qn <- 1 - alpha
alphan <- 1 - qn
pilen <- 1:nruns
ps <- pilen
opicov <- 0
q <- p - 1
vmax <- min(p, as.integer(n/5))
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi, nrow = q, ncol = q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1
#b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  xf <- rnorm(q) %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  }
  #make an MLR data set
  #find the forward sel model I_I model
  tem <- regsubsets(x, y, nvmax = vmax, method = "forward")
  out <- summary(tem)
  num <- 1:length(out$cp)
  tnum <- num[out$cp <= min(out$cp) + 1]
  Icp <- out$cp[min(tnum)]
  modI <- out$which[out$cp == Icp, ]
  #do not need the constant in vin
  vin <- vars[modI[-1]]
  sub <- lsfit(x[, vin], y)
  ps[i] <- length(sub$coef)
  yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
  fres <- sub$resid
  #get asymptotically optimal PI
  sres <- sort(fres)
  cc <- ceiling(n * (1 - alphan))
  rup <- sres[cc]
  rlow <- sres[1]
  olen <- rup - rlow
  if(cc < n) {
    for(j in (cc + 1):n) {
      zlen <- sres[j] - sres[j - cc + 1]
      if(zlen < olen) {
        olen <- zlen
        rup <- sres[j]
        rlow <- sres[j - cc + 1]
      }
    }
  }
  up <- yfhat + corfac*rup
  low <- yfhat + corfac*rlow
  pilen[i] <- up - low
  if(low < yf && up > yf)
    opicov <- opicov + 1
}
psmn <- mean(ps) - k
#0 if subset is selecting optimal subset
pimnlen <- mean(pilen)
opicov <- opicov/nruns
list(psmn = psmn, opicov = opicov, pimnlen = pimnlen)
}

mplot <- function(x)
{# Makes a DD plot only using the MDi; the RDi are not used.
#A line with slope near 1 suggests x may be approx MVN.
#right click Stop to get cursor
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
md <- sqrt(md2)
rd <- md
const <- sqrt(qchisq(0.5, p))/median(rd)
rd <- const * rd
plot(md, rd)
abline(0, 1)
identify(md, rd)
}
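## Example (added; illustrative only): PI coverage for the forward selection
## I_I model; needs library(leaps). opicov should be near 0.95.
# library(leaps)
# modIpisim(n = 100, p = 4, k = 1, nruns = 100, type = 1)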
mpredsim <- function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25,
 rho = 0.1, dd = 7, mnull = F, alpha = 0.1)
{# Simulates prediction regions for multivariate linear regression.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp and n > 10 p
# multiply E by A where etype = 1 for MVN Nq(0,I),
# etype = 2 for (1 - eps) Nq(0,I) + eps Nq(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) rho^2, while the off
# diagonal elements are 2 rho + (m-2) rho^2. Hence the correlations
# are (2 rho + (m-2) rho^2)/(1 + (m-1) rho^2). As rho gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# set.seed(974)
# Calls covrmvn.
ccvr <- 0
scvr <- 0
rcvr <- 0
volc <- 1:nruns
vols <- volc
volr <- volc
#up <- 1 - alpha
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*m/n))
if(alpha > 0.1)
  up <- min((1 - alpha + 0.05), (1 - alpha + m/n))
qn <- up
if(qn < 1 - alpha + 0.001)
  up <- 1 - alpha
np1 <- n + 1
q <- p - 1
A <- matrix(rho, nrow = m, ncol = m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m) {
  for(j in 1:(m-1)) {
    B[p-j+1, j:m] <- 0
  }
}
if(p > m) {
  for(j in 1:m) {
    B[p-j+1, j:m] <- 0
  }
}
if(p < m) {
  for(j in 1:(p-1)) {
    B[p-j+1, j:m] <- 0
  }
}
if(mnull == T)
  B[-1, ] <- 0
Bhat <- B
one <- 1 + 0*1:np1
onen <- one[-1]
for(i in 1:nruns) {
  x <- matrix(rnorm(np1 * q), nrow = np1, ncol = q)
  w <- cbind(one, x)
  y <- w %*% B
  #make error matrix E
  E <- matrix(rnorm(np1 * m), nrow = np1, ncol = m)
  if(etype == 2) {
    zu <- runif(np1)
    E[zu < eps, ] <- E[zu < eps, ] * 5
  }
  if(etype == 3) {
    zu <- sqrt(rchisq(np1, dd)/dd)
    E <- E/zu
  }
  if(etype == 4)
    E <- exp(E) - exp(0.5)
  #want mean 0 error vectors
  E <- E %*% A
  # got error matrix E
  y <- y + E
  yf <- y[np1, ]
  #want xf to be a column vector with a 1
  xf <- rbind(1, as.matrix(x[np1, ]))
  y <- y[-np1, ]
  x <- x[-np1, ]
  for(j in 1:m) {
    out <- lsfit(x, y[, j])
    res[, j] <- out$res
    fit[, j] <- y[, j] - res[, j]
    Bhat[, j] <- out$coef
  }
  ##Note that the ith row of xx is t(hat(E(yf)) + hat(eps)_i)
  ##for i = 1,...,n.
  ##Get prediction regions based on xx which has m columns,
  ##and see if yf is in the prediction regions.
  zz <- t(t(Bhat) %*% xf)
  xx <- res + onen %*% zz
  center <- apply(xx, 2, mean)
  cov <- var(xx)
  md2 <- mahalanobis(xx, center, cov)
  hsq <- quantile(md2, up)
  if(mahalanobis(t(yf), center, cov) <= hsq)
    ccvr <- ccvr + 1
  volc[i] <- sqrt(hsq)^m * prod(diag(chol(cov)))
  out <- covrmvn(xx)
  center <- out$center
  cov <- out$cov
  md2 <- mahalanobis(xx, center, cov)
  hsq <- quantile(md2, up)
  dsq <- mahalanobis(t(yf), center, cov)
  if(dsq <= hsq)
    scvr <- scvr + 1
  sqrtdet <- prod(diag(chol(cov)))
  vols[i] <- sqrt(hsq)^m * sqrtdet
  hsq <- qchisq(up, m)
  if(dsq <= hsq)
    rcvr <- rcvr + 1
  volr[i] <- sqrt(hsq)^m * sqrtdet
}
ccvr <- ccvr/nruns
scvr <- scvr/nruns
rcvr <- rcvr/nruns
#get a measure of efficiency wrt vols, so eff(vols) = 1
vols <- mean(vols)
volc <- mean(volc)/vols
volr <- mean(volr)/vols
vols <- 1
list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln = volc, vols = vols,
 volm = volr, up = up)
}
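## Example (added; illustrative only): coverages of the nonparametric,
## semiparametric and parametric MVN prediction regions for MVN errors;
## ncvr, scvr and mcvr should be near 0.9 with alpha = 0.1.
# mpredsim(n = 100, m = 2, p = 4, nruns = 100, etype = 1)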
mregddsim <- function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25,
 rho = 0.1, dd = 7, mnull = F, alph = 0.1)
{## Need p > 1, m > 1.
# Simulates DD plots of residuals for the
# multivariate linear regression model.
# The identity line and lines corresponding to the
# 100(1-alph)% prediction regions are added.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp + 10, n > 10m and n > 10 p
# multiply E by A where etype = 1 for MVN Nq(0,I),
# etype = 2 for (1 - eps) Nq(0,I) + eps Nq(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) rho^2, while the off
# diagonal elements are 2 rho + (m-2) rho^2. Hence the correlations
# are (2 rho + (m-2) rho^2)/(1 + (m-1) rho^2). As rho gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
q <- p - 1
A <- matrix(rho, nrow = m, ncol = m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m) {
  for(j in 1:(m-1)) {
    B[p-j+1, j:m] <- 0
  }
}
if(p > m) {
  for(j in 1:m) {
    B[p-j+1, j:m] <- 0
  }
}
if(p < m) {
  for(j in 1:(p-1)) {
    B[p-j+1, j:m] <- 0
  }
}
if(mnull == T)
  B[-1, ] <- 0
Bhat <- B
one <- 1 + 0*1:n
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  w <- cbind(one, x)
  y <- w %*% B
  #make error matrix E
  E <- matrix(rnorm(n * m), nrow = n, ncol = m)
  if(etype == 2) {
    zu <- runif(n)
    E[zu < eps, ] <- E[zu < eps, ] * 5
  }
  if(etype == 3) {
    zu <- sqrt(rchisq(n, dd)/dd)
    E <- E/zu
  }
  if(etype == 4)
    E <- exp(E) - exp(0.5)
  #want mean 0 error vectors
  E <- E %*% A
  # got error matrix E
  y <- y + E
  for(j in 1:m) {
    out <- lsfit(x, y[, j])
    res[, j] <- out$res
    fit[, j] <- y[, j] - res[, j]
    Bhat[, j] <- out$coef
  }
  ##make DD plot of the residuals
  ddplot4(res, alpha = alph)
}
Covhat <- (n-1)/(n-p) * var(res)
list(Bhat = Bhat, B = B, Covhat = Covhat)
}
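## Example (added; illustrative only): DD plots of residuals for lognormal
## errors; the plotted points typically do not track the identity line.
# mregddsim(n = 100, m = 2, p = 4, nruns = 3, etype = 4)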
mregsim <- function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25,
 rho = 0.1, dd = 7, mnull = T)
{## Need p > 1, m > 1.
# Simulates multivariate linear regression model and gets
# pvalues for tests for whether the p-1 nontrivial
# predictors are needed in the model and
# pvalues for tests for whether Xi is needed in the model.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp and n > 10 p
# multiply E by A where etype = 1 for MVN Nq(0,I),
# etype = 2 for (1 - eps) Nq(0,I) + eps Nq(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) rho^2, while the off
# diagonal elements are 2 rho + (m-2) rho^2. Hence the correlations
# are (2 rho + (m-2) rho^2)/(1 + (m-1) rho^2). As rho gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# Illustrates that Hotelling Lawley statistic = last statistic/(n-p).
q <- p - 1
A <- matrix(rho, nrow = m, ncol = m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m) {
  for(j in 1:(m-1)) {
    B[p-j+1, j:m] <- 0
  }
}
if(p > m) {
  for(j in 1:m) {
    B[p-j+1, j:m] <- 0
  }
}
if(p < m) {
  for(j in 1:(p-1)) {
    B[p-j+1, j:m] <- 0
  }
}
if(mnull == T)
  B[-1, ] <- 0
Bhat <- B
one <- 1 + 0*1:n
fcov <- 0*1:p
wilkcov <- fcov
pilcov <- fcov
hotlawcov <- fcov
roycov <- fcov
rden <- n - p - m + 1
rcut <- qf(0.95, m, rden)
dendf <- n - m*p
wcv <- 0
pcv <- 0
hlcv <- 0
rcv <- 0
fcv <- 0
ndf <- (p-1)*m
mancv <- cbind(wcv, pcv, hlcv, rcv, fcv)
#L <- cbind(fcov[-1], diag(p-1))
#want n > mp
fcut <- qf(0.95, m, dendf)
mcut <- qf(0.95, m*(p-1), dendf)
h <- max(p-1, m)
mrcut <- qf(0.95, h, (n-h-1))
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  w <- cbind(one, x)
  y <- w %*% B
  #make error matrix E
  E <- matrix(rnorm(n * m), nrow = n, ncol = m)
  if(etype == 2) {
    zu <- runif(n)
    E[zu < eps, ] <- E[zu < eps, ] * 5
  }
  if(etype == 3) {
    zu <- sqrt(rchisq(n, dd)/dd)
    E <- E/zu
  }
  if(etype == 4)
    E <- exp(E) - exp(0.5)
  #want mean 0 error vectors
  E <- E %*% A
  # got error matrix E
  y <- y + E
  for(j in 1:m) {
    out <- lsfit(x, y[, j])
    res[, j] <- out$res
    fit[, j] <- y[, j] - res[, j]
    Bhat[, j] <- out$coef
  }
  Covhat <- (n-1)/(n-p) * var(res)
  # w is the data matrix with a column of ones
  J <- t(w) %*% w
  J <- solve(J)
  We <- Covhat*(n-p)
  Covinv <- solve(Covhat)
  Weinv <- Covinv/(n-p)
  #get the test statistics for whether Xj is needed in the model
  for(j in 1:p) {
    T <- t(Bhat[j, ]) %*% Covinv %*% Bhat[j, ]/J[j, j]
    if(T/m > fcut) fcov[j] <- fcov[j] + 1
    H <- Bhat[j, ] %*% t(Bhat[j, ])/J[j, j]
    C <- Weinv %*% H
    eig <- eigen(C, symmetric = FALSE, only.values = TRUE)$values
    eig <- as.double(eig)
    lmax <- eig[1]
    u <- sum(eig)
    v <- sum(eig/(eig + 1))
    wilk <- prod(1/(eig + 1))
    if(-(n - p - 0.5*m) * log(wilk)/m > fcut) wilkcov[j] <- wilkcov[j] + 1
    if((n-p)*v/m > fcut) pilcov[j] <- pilcov[j] + 1
    if((n-p)*u/m > fcut) hotlawcov[j] <- hotlawcov[j] + 1
    if(rden*lmax/m > rcut) roycov[j] <- roycov[j] + 1
  }
  #get the MANOVA F test statistics for whether nontrivial
  #predictors are needed in the model
  D <- as.matrix(Bhat[-1, ])
  if(p == 2) D <- t(D)
  vecD <- as.matrix(D[, 1])
  for(j in 2:m) {
    vecD <- rbind(vecD, as.matrix(D[, j]))
  }
  tem <- as.matrix(J[-1, -1])
  teminv <- solve(tem)
  T <- t(vecD) %*% kronecker(Covinv, teminv) %*% vecD
  if(T/ndf > mcut) mancv[5] <- mancv[5] + 1
  H <- t(D) %*% teminv %*% D
  C <- Weinv %*% H
  eig <- eigen(C, symmetric = FALSE, only.values = TRUE)$values
  eig <- as.double(eig)
  lmax <- eig[1]
  u <- sum(eig)
  v <- sum(eig/(eig + 1))
  wilk <- prod(1/(eig + 1))
  if(-(n - 0.5*p - 0.5*m - 1) * log(wilk)/ndf > mcut) mancv[1] <- mancv[1] + 1
  if((n-p)*v/ndf > mcut) mancv[2] <- mancv[2] + 1
  if((n-p)*u/ndf > mcut) mancv[3] <- mancv[3] + 1
  if((n-h-1)*lmax/h > mrcut) mancv[4] <- mancv[4] + 1
}
wilkcov <- wilkcov/nruns
pilcov <- pilcov/nruns
hotlawcov <- hotlawcov/nruns
roycov <- roycov/nruns
fcov <- fcov/nruns
mancv <- mancv/nruns
list(Bhat = Bhat, B = B, Covhat = Covhat, wilkcov = wilkcov, pilcov = pilcov,
 hotlawcov = hotlawcov, roycov = roycov, fcov = fcov, mancv = mancv)
}

oddata <- function(n = 100, q = 5, k = 1)
{# Generates overdispersion (negative binomial) data
# for Poisson regression.
y <- 1:n
beta <- 0 * 1:q
beta[1:3] <- 1
alpha <- -2.5
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
x <- 0.5 * x + 1
SP <- alpha + x %*% beta
for(i in 1:n) {
  pr <- k/(exp(SP[i]) + k)
  y[i] <- rnbinom(1, size = k, pr)
}
list(x = x, y = y)
}
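## Example (added; illustrative only): observed sizes of the MANOVA type
## tests under the null; the entries of mancv should be near 0.05.
# mregsim(n = 100, m = 2, p = 4, nruns = 100, mnull = T)$mancv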
pb12 <- function(resp, k = 11, MSE = NA)
{# Makes the effects and QQ plot for the PB(12) design.
# Need the response resp given in standard order.
# MSE should be given if k = 11.
# MSe is used for the MS for the E effect.
# Assumes number of replicates m = 1.
# Could define c1-c11 as factors and use
# out2<-aov(resp~c1+c2+c3+c4+c5+c6+c7+c8+c9+c10+c11)
c1 <- c(1,1,-1,1,1,1,-1,-1,-1,1,-1,-1)
c2 <- c(c1[11], c1[1:10], -1)
c3 <- c(c2[11], c2[1:10], -1)
c4 <- c(c3[11], c3[1:10], -1)
c5 <- c(c4[11], c4[1:10], -1)
c6 <- c(c5[11], c5[1:10], -1)
c7 <- c(c6[11], c6[1:10], -1)
c8 <- c(c7[11], c7[1:10], -1)
c9 <- c(c8[11], c8[1:10], -1)
c10 <- c(c9[11], c9[1:10], -1)
c11 <- c(c10[11], c10[1:10], -1)
Aeff <- as.double(t(c1) %*% resp/6)
Beff <- as.double(t(c2) %*% resp/6)
Ceff <- as.double(t(c3) %*% resp/6)
Deff <- as.double(t(c4) %*% resp/6)
Eeff <- as.double(t(c5) %*% resp/6)
Feff <- as.double(t(c6) %*% resp/6)
Geff <- as.double(t(c7) %*% resp/6)
Heff <- as.double(t(c8) %*% resp/6)
Jeff <- as.double(t(c9) %*% resp/6)
Keff <- as.double(t(c10) %*% resp/6)
Leff <- as.double(t(c11) %*% resp/6)
MSA <- 3*(Aeff)^2
MSB <- 3*(Beff)^2
MSC <- 3*(Ceff)^2
MSD <- 3*(Deff)^2
MSe <- 3*(Eeff)^2
MSF <- 3*(Feff)^2
MSG <- 3*(Geff)^2
MSH <- 3*(Heff)^2
MSJ <- 3*(Jeff)^2
MSK <- 3*(Keff)^2
MSL <- 3*(Leff)^2
effects <- c(Aeff, Beff, Ceff, Deff, Eeff, Feff, Geff, Heff, Jeff, Keff, Leff)
library(MASS)
nquantiles <- qnorm(ppoints(11))
qqplot(nquantiles, effects)
abline(lmsreg(nquantiles, sort(effects))$coef)
title("Normal QQ plot for PB12 Design")
x <- cbind(c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11)
out <- lsfit(x[, 1:k], resp)
ls.print(out)
if(k < 11)
  MSE <- sum(out$resid^2)/(11 - k)
SEeff <- sqrt(MSE/3)
list(Aeff=Aeff, Beff=Beff, Ceff=Ceff, Deff=Deff, Eeff=Eeff, Feff=Feff,
 Geff=Geff, Heff=Heff, Jeff=Jeff, Keff=Keff, Leff=Leff, MSA=MSA, MSB=MSB,
 MSC=MSC, MSD=MSD, MSe=MSe, MSF=MSF, MSG=MSG, MSH=MSH, MSJ=MSJ, MSK=MSK,
 MSL=MSL, MSE=MSE, SEeff=SEeff)
}
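## Example (added; illustrative only): effects and QQ plot for a PB(12)
## design with a simulated response in standard order and k = 5 factors.
# pb12(resp = rnorm(12), k = 5)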
pcisim <- function(nruns = 1000, n1 = 10, n2 = 10, mu1 = 0, mu2 = 0, var1 = 1,
 var2 = 1, dist = 1, alpha = 0.05)
{#gets the pooled, modified pooled, and Welch 100 (1-alpha)% CIs for mu1 - mu2
#defaults are alpha = .05, lengths make the most sense if n1 = n2
#dist = 1 for x ~ N(mu1, var1), y ~ N(mu2, var2)
#dist = 2 for x ~ EXP with mean mu1 variance var1,
# y ~ EXP with mean mu2 variance var2
#dist = 3 for x ~ N(mu1, var1), y ~ EXP with mean mu2 variance var2
diff <- mu1 - mu2
up <- 1 - alpha/2
pdf <- n1 + n2 - 2
rhohat <- n1/(n1 + n2)
pcov <- 0
mpcov <- 0
wcov <- 0
plow <- 1:nruns
pup <- plow
mplow <- plow
mpup <- plow
wlow <- plow
wup <- plow
tpcut <- qt(up, pdf)
mtpcut <- qt(up, n1 + n2 - 4)
for(i in 1:nruns) {
  if(dist == 1) {
    x <- mu1 + sqrt(var1) * rnorm(n1)
    y <- mu2 + sqrt(var2) * rnorm(n2)
  }
  if(dist == 2) {
    x <- mu1 - sqrt(var1) + sqrt(var1) * rexp(n1)
    y <- mu2 - sqrt(var2) + sqrt(var2) * rexp(n2)
  }
  if(dist == 3) {
    x <- mu1 + sqrt(var1) * rnorm(n1)
    y <- mu2 - sqrt(var2) + sqrt(var2) * rexp(n2)
  }
  xbar <- mean(x)
  s1sq <- var(x)
  ybar <- mean(y)
  s2sq <- var(y)
  xmybar <- xbar - ybar
  sp <- (n1 - 1) * s1sq + (n2 - 1) * s2sq
  sp <- sqrt(sp/pdf)
  val <- sp * sqrt(1/n1 + 1/n2)
  #get pooled CI
  tem <- tpcut * val
  plow[i] <- xmybar - tem
  pup[i] <- xmybar + tem
  if(plow[i] < diff && pup[i] > diff) pcov <- pcov + 1
  #get modified pooled CI
  thetahat <- s2sq/s1sq
  tauhatsq <- (1 - rhohat + rhohat * thetahat)/(rhohat + (1 - rhohat) * thetahat)
  tauhat <- sqrt(tauhatsq)
  tem <- mtpcut * val * tauhat
  mplow[i] <- xmybar - tem
  mpup[i] <- xmybar + tem
  if(mplow[i] < diff && mpup[i] > diff) mpcov <- mpcov + 1
  #get Welch CI
  t1 <- s1sq/n1
  t2 <- s2sq/n2
  do <- (t1 + t2)^2/(t1^2/(n1 - 1) + t2^2/(n2 - 1))
  d <- max(1, floor(do))
  wtcut <- qt(up, d)
  val <- sqrt(s1sq/n1 + s2sq/n2)
  tem <- wtcut * val
  wlow[i] <- xmybar - tem
  wup[i] <- xmybar + tem
  if(wlow[i] < diff && wup[i] > diff) wcov <- wcov + 1
}
pcov <- pcov/nruns
plen <- sqrt(n1) * mean(pup - plow)
mpcov <- mpcov/nruns
mplen <- sqrt(n1) * mean(mpup - mplow)
wcov <- wcov/nruns
wlen <- sqrt(n1) * mean(wup - wlow)
#print(mean(x))
#print(var(x))
#print(mean(y))
#print(var(y))
list(pcov = pcov, plen = plen, mpcov = mpcov, mplen = mplen, wcov = wcov,
 wlen = wlen)
}

pcrpisim <- function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
 type = 1, alpha = 0.05)
{#Needs library(pls).
#Simulates PIs for principal components regression with 10 fold CV.
# 1 <= k <= p-1, k is the number of nonnoise variables
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes
set.seed(974)
corfac <- (1 + 15/n) * sqrt((n + 2*p)/(n - p))
if(alpha > 0.1) {
  qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)
}
if(alpha <= 0.1) {
  qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)
}
pn <- qn
if(pn < 1 - alpha + 0.001)
  qn <- 1 - alpha
alphan <- 1 - qn
ncvec <- 1:nruns
pilen <- 1:nruns
opicov <- 0
q <- p - 1
b <- 0 * 1:q
b[1:k] <- 1
#b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  xf <- rnorm(q)
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  }
  #make an MLR data set
  #find the PCR 10 fold CV estimator
  z <- as.data.frame(cbind(y, x))
  zz <- rbind(z, c(0, xf))
  out <- pcr(V1~., data = z, scale = T, validation = "CV")
  #If y is used instead of V1, predict does not work,
  #and nc tends to equal p, which should be impossible.
  tem <- MSEP(out)
  cvmse <- tem$val[, , 1:out$ncomp][1, ]
  nc <- which.min(cvmse)
  #using predict is rather difficult
  #if xf is used, predict does not work
  yfhat <- predict(out, zz[(n+1), -1], ncomp = nc)[1, 1, 1]
  ncvec[i] <- nc
  #get the number of components to use
  #if nc = p-1 = q, same as OLS
  res <- out$residuals[, , nc]
  #get asymptotically optimal PI
  sres <- sort(res)
  cc <- ceiling(n * (1 - alphan))
  rup <- sres[cc]
  rlow <- sres[1]
  olen <- rup - rlow
  if(cc < n) {
    for(j in (cc + 1):n) {
      zlen <- sres[j] - sres[j - cc + 1]
      if(zlen < olen) {
        olen <- zlen
        rup <- sres[j]
        rlow <- sres[j - cc + 1]
      }
    }
  }
  up <- yfhat + corfac*rup
  low <- yfhat + corfac*rlow
  pilen[i] <- up - low
  if(low < yf && up > yf)
    opicov <- opicov + 1
}
pimnlen <- mean(pilen)
opicov <- opicov/nruns
qminncmn <- q - mean(ncvec)
#If qminncmn is 0 then PCR is equivalent to OLS.
list(qminncmn, opicov = opicov, pimnlen = pimnlen)
}
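## Example (added; illustrative only): CI coverages for mu1 - mu2 with
## unequal variances and unequal ni; wcov (Welch) should be near 0.95,
## while the pooled CI may over- or undercover.
# pcisim(nruns = 1000, n1 = 10, n2 = 20, var1 = 1, var2 = 4, dist = 1)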
x <- as.matrix(x) p <- dim(x)[2] + 1 n <- length(y) up <- 1:n low <- up out <- lsfit(x, y) tem <- ls.diag(out) lev <- tem$hat res <- out$residuals FIT <- y - res Y <- y corfac <- (1 + 15/n)*sqrt(n/(n - p)) val2 <- quantile(res, c(alpha/2, 1 - alpha/2)) #get lower and upper PI limits for each case for(i in 1:n) { val <- sqrt(1 + lev[i]) val3 <- as.single(corfac * val2[1] * val) val4 <- as.single(corfac * val2[2] * val) up[i] <- FIT[i] + val4 low[i] <- FIT[i] + val3 } zy <- c(min(low), Y, max(up)) zx <- c(min(FIT), FIT, max(FIT)) plot(zx, zy, type = "n") points(FIT, Y) abline(0, 1) points(FIT, up, pch = 17) points(FIT, low, pch = 17) } pisim<-function(n = 100, q = 7, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, type = 1){ # compares new and classical PIs for multiple linear regression # if type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors # constant = 1 so there are p = q+1 coefficients b <- 0 * 1:q + 1 cpicov <- 0 npicov <- 0 acpicov <- 0 opicov <- 0 val3 <- 1:nruns val4 <- val3 val5 <- val3 pilen <- matrix(0, nrow = nruns, ncol = 4) coef <- matrix(0, nrow = nruns, ncol = q + 1) corfac <- (1 + 15/n) * sqrt(n/(n - q - 1)) corfac2 <- sqrt(n/(n - q - 1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) xf <- rnorm(q) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 xf <- rnorm(q) yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) xf <- rnorm(q) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1, sd = 1 + rbinom(1, 1, eps ) * shift) } out <- lsfit(x, y) fres <- out$resid coef[i, ] <- out$coef yfhat <- out$coef[1] + xf %*% out$coef[-1] w <- cbind(1, x) xtxinv <- solve(t(w) %*% w) xf <- c(1, xf) hf <- xf %*% xtxinv hf <- hf %*% xf val <- sqrt(1 + hf) #get classical PI mse <- sum(fres^2)/(n - q - 1) val2 <- qt(1 - alpha/2, n - q - 1) * sqrt(mse) * val up <- yfhat + val2 low <- yfhat - val2 pilen[i, 1] <- up - low if(low < yf && up > yf) cpicov <- cpicov + 1 #get semiparametric PI val2 <- quantile(fres, c(alpha/2, 1 - alpha/2)) val3[i] <- as.single(corfac * val2[1] * val) val4[i] <- as.single(corfac * val2[2] * val) up <- yfhat + val4[i] low <- yfhat + val3[i] pilen[i, 2] <- up - low if(low < yf && up > yf) npicov <- npicov + 1 # asymptotically conservative PI val6 <- corfac2 * max(abs(val2)) val5[i] <- val6 * val up <- yfhat + val5[i] low <- yfhat - val5[i] pilen[i, 3] <- up - low if(low < yf && up > yf) acpicov <- acpicov + 1 # asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alpha)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac * val * rup low <- yfhat + corfac * val * rlow pilen[i, 4] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- apply(pilen, 2, mean) mnbhat <- apply(coef, 2, mean) lcut <- mean(val3) hcut <- mean(val4) accut <- mean(val5) cpicov <- cpicov/nruns npicov <- npicov/nruns acpicov <- acpicov/nruns opicov <- opicov/nruns list(mnbhat = mnbhat, pimenlen = pimnlen, cpicov = cpicov, spicov = npicov, acpicov 
= acpicov, opicov = opicov, lcut = lcut, hcut = hcut, accut = accut) } pisim4<-function(n = 100, q = 7, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, type = 1, mod = 1){ #this function is for SPLUS # compares asymptotically optimal PIs for multiple linear regression # if type = 1 for N(0,1) errors, 2 for t3 errors, #3 for exp(1) - 1 errors #4 for uniform(-1,1) errors, #5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors #constant = 1 so there are p = q+1 coefficients #mod = 1 for least squares #mod = 2 for l1fit #mod = 3 for rrfit set.seed(974) p <- q + 1 b <- 0 * 1:q + 1 npicov <- 0 opicov <- 0 val3 <- 1:nruns val4 <- val3 pilen <- matrix(0, nrow = nruns, ncol = 2) coef <- matrix(0, nrow = nruns, ncol = q + 1) corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} else {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) xf <- rnorm(q) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 xf <- rnorm(q) yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) xf <- rnorm(q) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1, sd = 1 + rbinom(1, 1, eps ) * shift) } if(mod == 1) out <- lsfit(x, y) if(mod == 2) out <- l1fit(x, y) if(mod == 3) out <- rreg(x, y) fres <- out$resid coef[i, ] <- out$coef yfhat <- out$coef[1] + xf %*% out$coef[-1] #get semiparametric PI val2 <- quantile(fres, c(alphan/2, 1 - alphan/2)) val3[i] <- val2[1] val4[i] <- val2[2] up <- yfhat + corfac*val4[i] low <- yfhat + corfac*val3[i] pilen[i, 1] <- up - low if(low < yf && up > yf) npicov <- npicov + 1 # asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i, 2] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- apply(pilen, 2, mean) mnbhat <- apply(coef, 2, mean) lcut <- mean(val3) hcut <- mean(val4) npicov <- npicov/nruns opicov <- opicov/nruns list(mnbhat = mnbhat, pimenlen = pimnlen, spicov = npicov, opicov = opicov, lcut = lcut, hcut = hcut) } plspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, alpha = 0.05){ #Needs library(pls). #Simulates PIs for partial least squares with 10 fold CV. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. 
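# (Type 5 is a scale mixture: each error is N(0,1) with probability
# 1 - eps and N(0,(1+shift)^2) with probability eps, generated below via
# err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift).)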
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes set.seed(974) corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) xf <- rnorm(q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PLS 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-plsr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:out$ncomp][1,] nc <-which.min(cvmse) #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PLS is equivalent to OLS. list(qminncmn, opicov=opicov, pimenlen = pimnlen)} poisontplt<-function(){ # Need the poison data frame from lregdata. # This function plots y^L vs two way Anova fitted values # from the two way Anova of y^L on x where # L = -1, -0.5, 0, 0.5 or 1 except log(Y) is used if L = 0. # If plot is linear for L, use y^L (or log(Y) for L = 0) instead # of Y. This is a graphical method for a response transform. lam <- c(-1, -1/2, 0, 1/2, 1) xl <- c("1/Z", "1/sqrt(Z)", "LOG(Z)", "sqrt(Z)", "Z") par(mfrow=c(2,3)) attach(poison) for(i in 1:length(lam)) { if(lam[i] == 0) ytem <- log(poison$stime) else ytem <- poison$stime^lam[i] aovfit <- ytem - aov(ytem~ptype*treat,poison)$resid plot(aovfit, ytem, xlab = "TZHAT", ylab = xl[i]) abline(0, 1) #identify(aovfit, ytem) } par(mfrow=c(1,1)) detach(poison) ##using aov(ytem~.^2,poison) would fit ytem on ##ptype, treat, stime, the 3 two way interactions ##and the three way interaction } prdata <- function(n = 100, q = 5) {# Generates data for Poisson regression. y <- 0 * 1:n beta <- 0 * 1:q beta[1:3] <- 1 alpha <- -2.5 x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- 0.5*x + 1 SP <- alpha + x%*%beta y <- rpois(n,lambda=exp(SP)) # change formula to x[,1]+ ... 
# + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] +
# x[, 4] + x[,5], family = poisson)
#list(alpha = alpha, beta = beta, prcoef = out$coef,x=x,y=y)
list(x=x,y=y)
}
predreg<-function(x, alpha = 0.05){
# Makes a prediction region for the rows of x.
# If p = 1, the shorth interval should work better.
#Also computes the distance for the 0 vector.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
zero <- 0*(1:p)
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical distance
MD <- sqrt(md2)
#get nonparametric prediction region boundary
cuplim <- sqrt(quantile(md2, up))
D0 <- sqrt(mahalanobis(zero, center, cov))
list(cuplim = cuplim, D0=D0, MD=MD, center=center, cov=cov)
}
pressp <- function(x,y)
{# Makes the response plot for Poisson regression.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
#
#ESP <- x %*% out$coef[-1] + out$coef[1]
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson)
out <- glm(y~., family=poisson, data=data.frame(cbind(x,y)))
ESP <- predict(out)
Y <- y
plot(ESP,Y)
abline(mean(y),0)
fit <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx],fit[indx])
lines(lowess(ESP,y),type="s")
}
prplot<- function(x, y)
{# Makes response plot, OD plot, weighted forward response
# and residual plots for Poisson regression.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
#
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson)
out <- glm(y~., family=poisson, data=data.frame(cbind(x,y)))
x<-as.matrix(x)
ESP <- predict(out)
par(mfrow = c(2, 2))
Y <- y
plot(ESP, Y)
abline(mean(Y), 0)
Ehat <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx], Ehat[indx])
lines(lowess(ESP, Y), type = "s")
title("a) Response Plot")
Vhat <- (Y - Ehat)^2
plot(Ehat, Vhat)
abline(0, 1)
abline(0, 4)
#lines(lowess(Ehat, Vhat))
#abline(lsfit(Ehat, Vhat)$coef)
title("b) OD Plot")
Z <- Y
Z[Y < 1] <- Z[Y < 1] + 0.5
WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1])
WFIT <- sqrt(Z) * log(Z) - WRES
plot(WFIT, sqrt(Z) * log(Z))
abline(0, 1)
#abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef)
title("c) WFRP")
plot(WFIT, WRES)
title("d) Wtd Residual Plot")
par(mfrow = c(1, 1))
}
prplot2 <- function(ESP, x, y)
{# Makes response plot and OD plot for Poisson regression.
#Can be used for a Poisson GAM or GLM.
# If t<-1:13
# y<-c(12,14,33,50,67,74,123,141,165,204,253,246,240)
# If out <- glm(y~t+I(t^2),family=poisson)
# use ESP <- predict(out).
# If outgam <- gam(y~s(t),poisson)
# use ESP <- predict.gam(outgam).
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
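# In the OD plot (panel b), Vhat = (Y - Ehat)^2 is plotted versus Ehat
# with reference lines through the origin of slope 1 and slope 4; since
# a Poisson variable has variance equal to its mean, the points should
# scatter about the slope-1 line if the Poisson model holds, while points
# that mostly lie above the slope-4 line suggest overdispersion.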
#WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1]) #for a GLM
x<-as.matrix(x)
par(mfrow = c(2, 2))
Y <- y
plot(ESP, Y)
abline(mean(Y), 0)
Ehat <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx], Ehat[indx])
lines(lowess(ESP, Y), type = "s")
title("a) Response Plot")
Vhat <- (Y - Ehat)^2
plot(Ehat, Vhat)
abline(0, 1)
abline(0, 4)
#lines(lowess(Ehat, Vhat))
#abline(lsfit(Ehat, Vhat)$coef)
title("b) OD Plot")
Z <- Y
Z[Y < 1] <- Z[Y < 1] + 0.5
WRES <- sqrt(Z) * (log(Z) - ESP)
WFIT <- sqrt(Z) * log(Z) - WRES
plot(WFIT, sqrt(Z) * log(Z))
abline(0, 1)
#abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef)
title("c) WFRP")
plot(WFIT, WRES)
title("d) Wtd Residual Plot")
par(mfrow = c(1, 1))
}
prsim<-function(n = 100, nruns = 1, type = 1, k = 1)
{# Runs prplot nruns times on simulated data.
# Type = 1 for Poisson regression data,
# Type = 2 for negative binomial regression data
# Calls prdata, oddata, prplot.
q <- 5
for(i in 1:nruns) {
if(type == 1) out <- prdata(n, q)
else out <- oddata(n, q, k)
x <- out$x
y <- out$y
prplot(x, y)
#identify(WFIT, WRES)
}
}
rand <- function(n = 10, k = 3)
{#randomize n units into k treatment groups of nearly equal size
#groups[i] gives the group for the ith unit
groups <- 1:n
gsize <- as.integer(n/k)
z <- sample(n)
for(i in 1:k) {
lo <- (i - 1) * gsize + 1
hi <- i * gsize
if(i == k) hi <- n
groups[z[lo:hi]] <- i
}
list(perm = z, groups = groups)
}
rand1way <- function(y,group,B=1000){
#computes the randomization F test for the 1 way Anova design
#based on Ernst (2009)
#y and group should have the same length
if(length(y) != length(group))
stop("Response and factor vectors must be the same length")
if(!is.factor(group)) group <- as.factor(group)
Fstat <- NULL
z <- anova(lm(y ~ group))
Fstat[1] <- z$F[1]
Fpval <- z$P[1]
Fdf <- z$Df
n <- length(y)
m <- max(B,floor(n*log(n)))
for(i in 2:m){
Fstat[i] <- anova(lm(sample(y) ~ group))$F[1]
}
randpval <- sum(Fstat>=Fstat[1])/m
list(rdist=Fstat,Fpval=Fpval,randpval=randpval)
}
rcplot <-function(x, Y)
{# Makes an RC plot.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
# Advance the view with the right mouse button, and in R, highlight "stop."
x <- as.matrix(x)
out <- lsfit(x, Y)
zz <- ls.diag(out) #needed below for the Cook's distances
CD <- zz$cooks
n <- dim(x)[1]
p <- dim(x)[2] + 1
RES <- out$resid
#tem <- zz$std.dev
#tem <- p * tem * (n - p)^2
#sortr <- sort(RES)
#quad <- (n * (sortr^2))/tem
plot(RES, CD)
#lines(sortr, quad)
identify(RES,CD)
title("RC Plot")
}
regboot<-function(x,y, B = 1000){
#residual bootstrap for MLR
#the bootstrap covariance matrix is asymptotically MSE (X'X)^(-1),
#so use this function to compare with vselboot
#out <- regboot(belx,bely)
#ddplot4(out$betas) #this does not work for vselboot: too many 0s
#mplot(out$betas)
#cov(out$betas)
full <- lsfit(x,y)
res <- full$resid
fit <- y - res
n <- length(y)
x <- as.matrix(x)
p <- dim(x)[2]+1
betas <- matrix(0,nrow=B,ncol=p)
for(i in 1:B){
yb <- fit + sample(res,n,replace=T)
betas[i,] <- lsfit(x,yb)$coef
}
list(betas=betas)
}
regbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9,
type = 1, alph = 0.05){
#Simulates residual bootstrap for regression.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} #called by hbreg rmreg3<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multiple linear regression based on RMVN. # Here x contains the nontrivial predictors. # This is rmreg2 except plots are not made. # Used by hbreg. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the 
# multivariate regression after ellipsoidal trimming
for(i in 1:m){
out <- lsfit(xw,yw[,i])
res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1]
fit[,i] <- y[,i] - res[,i]
Bhat[,i] <- out$coef
}
list(fit = fit, res = res, Bhat = Bhat)
}
rowboot<-function(x,y, B = 1000){
#rowwise nonparametric bootstrap
#use if (y^T, x^T) are iid from some distribution
#out <- rowboot(belx,bely)
#ddplot4(out$betas)
#mplot(out$betas)
#cov(out$betas)
full <- lsfit(x,y)
n <- length(y)
x <- as.matrix(x)
p <- dim(x)[2]+1
indx <- 1:n
betas <- matrix(0,nrow=B,ncol=p)
for(i in 1:B){
tem <- sample(indx,n,replace=T)
betas[i,] <- lsfit(x[tem,],y[tem])$coef
}
list(betas=betas,full=full)
}
shorth2<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = ceiling[n(1-alpha)].
n <- length(y)
cc <- ceiling(n * (1 - alpha))
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n){
zlen <- sy[j] - sy[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sy[j]
rlow <- sy[j - cc + 1]
}
}
}
Ln <- rlow; Un <- rup
list(shorth=c(Ln, Un))
}
shorth3<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = cc.
#shorth lists Ln and Un using Frey's correction
n <- length(y)
cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n)))
cc <- min(n,cc)
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n){
zlen <- sy[j] - sy[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sy[j]
rlow <- sy[j - cc + 1]
}
}
}
Ln <- rlow; Un <- rup
list(shorth=c(Ln, Un))
}
tplot<-function(x, y){
# Use the rightmost mouse button to advance
# the plot, and in R, highlight "stop."
# Values of y should be positive.
# Assume the MLR model contains a constant, but
# x is the design matrix without the vector of ones.
# This function makes transformation plots from the MLR
# of y^L on x where L = -1, -0.5, -1/3, 0, 1/3, 0.5, or 1
# except log(Y) is used if L = 0.
# If plot is linear for L, use y^L (or log(Y) for L = 0) instead
# of Y. This is a graphical method for a response transform.
lam <- c(-1, -1/2, -1/3, 0, 1/3, 1/2, 1)
xl <- c("1/Z", "1/sqrt(Z)", "Z**(-1/3)", "LOG(Z)", "Z**(1/3)", "sqrt(Z)", "Z")
for(i in 1:length(lam)) {
if(lam[i] == 0) ytem <- log(y)
else ytem <- y^lam[i]
fit <- ytem - lsfit(x,ytem)$resid
plot(fit, ytem, xlab = "TZHAT", ylab = xl[i])
abline(0, 1)
identify(fit, ytem)
}
}
tvreg<-function(x, Y, ii = 1, type = 4){
# Trimmed views (TV) regression for 0, 10, ..., 90 percent
# trimming. Use ii = 10 for 0, ii = 9 for 10 then 0, ...
# ii = 1 for 90 then 80 ... then 0 percent trimming.
# Workstation: activate a graphics device
# with commands "X11()" or "motif()."
# R needs command "library(MASS)."
# Advance the view with the right mouse button and
# in R, highlight "stop."
x <- as.matrix(x)
if(type == 1) out <- cov.mcd(x)
if(type == 2) out <- covmba(x)
if(type == 3) out <- covfch(x)
if(type > 3) out <- covrmvn(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%")
tem <- seq(0.1, 1, 0.1)
for(i in ii:10) {
val <- quantile(rd2, tem[i])
bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
FIT <- x %*% bhat[-1] + bhat[1]
plot(FIT, Y)
abline(0, 1)
title(labs[i])
identify(FIT, Y)
print(bhat)
}
}
tvreg2<-function(X, Y, M = 0, type = 4)
{# Trimmed views regression for M percent trimming.
# Workstation: activate a graphics device
# with commands "X11()" or "motif()."
# R needs command "library(MASS)."
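# Usage sketch, assuming a predictor matrix X and response Y are loaded:
#   tvreg2(X, Y, M = 50)
# fits OLS to the half of the cases with the smallest robust Mahalanobis
# distances and plots Y versus the resulting fit; M = 0 uses all cases.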
X <- as.matrix(X) if(type == 1) out <- cov.mcd(X) if(type == 2) out <- covmba(X) if(type == 3) out <- covfch(X) if(type > 3) out <- covrmvn(X) center <- out$center cov <- out$cov rd2 <- mahalanobis(X, center, cov) tem <- (100 - M)/100 val <- quantile(rd2, tem) bhat <- lsfit(X[rd2 <= val, ], Y[rd2 <= val])$coef FIT <- X %*% bhat[-1] + bhat[1] plot(FIT, Y) abline(0, 1) identify(FIT, Y) list(coef = bhat) } twocub<-function(means,m=1,MSE=NA){ # Makes the effects and QQ plot for the 2^3 design. # Need the means of a 2^3 design replicated m times given # in standard order. MSE should be given if m > 1. # The commands out <- aov(y~A*B*C) and summary(out) # can be used to find the MSE. ca <- c(-1,1,-1,1,-1,1,-1,1) cb <- c(-1,-1,1,1,-1,-1,1,1) cc <- c(-1,-1,-1,-1,1,1,1,1) cab <- c(1,-1,-1,1,1,-1,-1,1) cac <- c(1,-1,1,-1,-1,1,-1,1) cbc <- c(1,1,-1,-1,-1,-1,1,1) cabc <- c(-1,1,1,-1,1,-1,-1,1) Aeff <- as.double(t(ca)%*%means/4) Beff <- as.double(t(cb)%*%means/4) Ceff <- as.double(t(cc)%*%means/4) ABeff <- as.double(t(cab)%*%means/4) ACeff <- as.double(t(cac)%*%means/4) BCeff <- as.double(t(cbc)%*%means/4) ABCeff <- as.double(t(cabc)%*%means/4) MSA <- 2*m*(Aeff)^2 MSB <- 2*m*(Beff)^2 MSC <- 2*m*(Ceff)^2 MSAB <- 2*m*(ABeff)^2 MSAC <- 2*m*(ACeff)^2 MSBC <- 2*m*(BCeff)^2 MSABC <- 2*m*(ABCeff)^2 SEeff <- NA if(m > 1) SEeff <- sqrt(MSE/(2*m)) effects <- c(Aeff,Beff,Ceff,ABeff,ACeff,BCeff,ABCeff) library(MASS) nquantiles <- qnorm(ppoints(7)) qqplot(nquantiles,effects) abline(lmsreg(nquantiles,sort(effects))$coef) title("Normal QQ plot") list(Aeff=Aeff,Beff=Beff,Ceff=Ceff,ABeff=ABeff,ACeff=ACeff, BCeff=BCeff,ABCeff=ABCeff,MSA=MSA,MSB=MSB,MSC=MSC,MSAB=MSAB, MSAC=MSAC,MSABC=MSABC,MSE=MSE,SEeff=SEeff) } twofourth<-function(means,m=1,MSE=NA){ # Makes the effects and QQ plot for the 2^4 design. # Need the means of a 2^4 design replicated m times given # in standard order. MSE should be given if m > 1. # The commands out <- aov(y~A*B*C*D) and summary(out) # can be used to find the MSE. 
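# Usage sketch with hypothetical data, where the 16 cell means are given
# in standard order:
#   ybar16 <- rnorm(16)     #replace with the observed cell means
#   twofourth(ybar16)       #m = 1: QQ, response, and residual plots
#   twofourth(ybar16, m = 2, MSE = 4.2)  #replicated case, MSE from aov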
ca <- rep(c(-1,1,-1,1,-1,1,-1,1),2)
cb <- rep(c(-1,-1,1,1,-1,-1,1,1),2)
cc <- rep(c(-1,-1,-1,-1,1,1,1,1),2)
cd <- c(-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1)
cab <- rep(c(1,-1,-1,1,1,-1,-1,1),2)
cac <- rep(c(1,-1,1,-1,-1,1,-1,1),2)
cad <- ca*cd
cbc <- rep(c(1,1,-1,-1,-1,-1,1,1),2)
cbd <- cb*cd
ccd <- cc*cd
cabc <- rep(c(-1,1,1,-1,1,-1,-1,1),2)
cabd <- cab*cd
cacd <- cac*cd
cbcd <- cbc*cd
cabcd <- cabc*cd
Aeff <- as.double(t(ca)%*%means/8)
Beff <- as.double(t(cb)%*%means/8)
Ceff <- as.double(t(cc)%*%means/8)
Deff <- as.double(t(cd)%*%means/8)
ABeff <- as.double(t(cab)%*%means/8)
ACeff <- as.double(t(cac)%*%means/8)
ADeff <- as.double(t(cad)%*%means/8)
BCeff <- as.double(t(cbc)%*%means/8)
BDeff <- as.double(t(cbd)%*%means/8)
CDeff <- as.double(t(ccd)%*%means/8)
ABCeff <- as.double(t(cabc)%*%means/8)
ABDeff <- as.double(t(cabd)%*%means/8)
ACDeff <- as.double(t(cacd)%*%means/8)
BCDeff <- as.double(t(cbcd)%*%means/8)
ABCDeff <- as.double(t(cabcd)%*%means/8)
MSA <- 4*m*(Aeff)^2
MSB <- 4*m*(Beff)^2
MSC <- 4*m*(Ceff)^2
MSD <- 4*m*(Deff)^2
MSAB <- 4*m*(ABeff)^2
MSAC <- 4*m*(ACeff)^2
MSAD <- 4*m*(ADeff)^2
MSBC <- 4*m*(BCeff)^2
MSBD <- 4*m*(BDeff)^2
MSCD <- 4*m*(CDeff)^2
MSABC <- 4*m*(ABCeff)^2
MSABD <- 4*m*(ABDeff)^2
MSACD <- 4*m*(ACDeff)^2
MSBCD <- 4*m*(BCDeff)^2
MSABCD <- 4*m*(ABCDeff)^2
SEeff <- NA
if(m > 1) SEeff <- sqrt(MSE/(4*m))
effects <- c(Aeff,Beff,Ceff,Deff,ABeff,ACeff,ADeff,BCeff,
BDeff,CDeff,ABCeff,ABDeff,ACDeff,BCDeff,ABCDeff)
if(m == 1) par(mfrow = c(2,2))
library(MASS)
nquantiles <- qnorm(ppoints(15))
qqplot(nquantiles,effects)
abline(lmsreg(nquantiles,sort(effects))$coef)
title("Normal QQ plot")
if(m == 1){
#ca <- as.factor(ca)
#cb <- as.factor(cb)
#cc <- as.factor(cc)
#cd <- as.factor(cd)
#using factors gives the wrong output
out <- aov(means~(ca+cb+cc+cd)^2)
print(summary.lm(out))
Fit <- means - out$resid
Resid <- out$resid
Y <- means
plot(Fit,Y)
abline(0,1)
title("Response Plot")
plot(Fit,Resid)
abline(0,0)
title("Residual Plot")
par(mfrow = c(1,1))
}
list(Aeff=Aeff,Beff=Beff,Ceff=Ceff,Deff=Deff,ABeff=ABeff,ACeff=ACeff,
ADeff=ADeff,BCeff=BCeff,BDeff=BDeff,CDeff=CDeff,ABCeff=ABCeff,ABDeff=ABDeff,
ACDeff=ACDeff,BCDeff=BCDeff,ABCDeff=ABCDeff,MSA=MSA,MSB=MSB,MSC=MSC,MSD=MSD,
MSAB=MSAB,MSAC=MSAC,MSAD=MSAD,MSBC=MSBC,MSBD=MSBD,MSCD=MSCD,MSABC=MSABC,
MSABD=MSABD,MSACD=MSACD,MSBCD=MSBCD,MSABCD=MSABCD,MSE=MSE,SEeff=SEeff)
}
vsbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9,
type = 1, alph = 0.05){
#needs library(leaps)
#PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS.
#Simulates bootstrap for variable selection using all subsets variable
#selection. So need p small.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-vselboot(x,y) #bootstrap the minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vsbootsim2<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #needs library(leaps) #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #Simulates bootstrap for forward selection variable selection. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-fselboot(x,y) #bootstrap the forward sel minimum Cp model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} vselboot<-function(x,y,B = 1000){ #needs library(leaps) #need n and p small, 2 < p < 30 #allsubsets minimum Cp regression #does not make sense to do variable selection if there #is only one nontrivial predictor x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel out<-leaps(x,y) mincp <- out$which[out$Cp==min(out$Cp)] vin <- vars[mincp] sub <- lsfit(x[,vin],y) betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) out<-leaps(x,y=yb) mincp <- out$which[out$Cp==min(out$Cp)] vin <- vars[mincp] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,betas=betas) } vspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi = 0.0, type = 1, alpha = 0.05){ #Needs library(leaps). #Simulates PIs for forward selection variable selection. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
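# An illustrative check of this formula, with hypothetical values
# psi = 0.5 and q = 10:
#   psi <- 0.5; q <- 10
#   A <- matrix(psi, q, q); diag(A) <- 1
#   S <- A %*% A    #covariance of x %*% A when the rows of x are N(0, I_q)
#   S[1,2]/S[1,1]   #equals (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)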
set.seed(974)
corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) )
if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)}
if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)}
pn <- qn
if(pn < 1 - alpha + 0.001) qn <- 1 - alpha
alphan <- 1 - qn
pilen <- 1:nruns
ps <- pilen
opicov <- 0
q <- p-1
vmax <- min(p,as.integer(n/5))
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1
#b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
for(i in 1:nruns) {
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
x <- x %*% A
xf <- rnorm(q) %*% A
if(type == 1) {
y <- 1 + x %*% b + rnorm(n)
yf <- 1 + xf %*% b + rnorm(1)
}
if(type == 2) {
y <- 1 + x %*% b + rt(n, df = 3)
yf <- 1 + xf %*% b + rt(1, df = 3)
}
if(type == 3) {
y <- 1 + x %*% b + rexp(n) - 1
yf <- 1 + xf %*% b + rexp(1) - 1
}
if(type == 4) {
y <- 1 + x %*% b + runif(n, min = -1, max = 1)
yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
}
if(type == 5) {
err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
y <- 1 + x %*% b + err
ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
yf <- 1 + xf %*% b + ef
}
#make an MLR data set
#find the forward sel minimum Cp model
tem<-regsubsets(x,y,nvmax=vmax,method="forward")
out<-summary(tem)
mincp <- out$which[out$cp==min(out$cp),]
#do not need the constant in vin
vin <- vars[mincp[-1]]
sub <- lsfit(x[,vin],y)
ps[i]<-length(sub$coef)
yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
fres <- sub$resid
#get asymptotically optimal PI
sres <- sort(fres)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n) {
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
up <- yfhat + corfac*rup
low <- yfhat + corfac*rlow
pilen[i] <- up - low
if(low < yf && up > yf) opicov <- opicov + 1
}
psmn <- mean(ps)-k
#psmn is near 1 if forward selection tends to pick the optimal submodel
#(the k nontrivial predictors plus the intercept)
pimnlen <- mean(pilen)
opicov <- opicov/nruns
list(psmn=psmn,opicov=opicov, pimenlen = pimnlen)}
wlsplot<- function(x, y, w)
{# Make OLS response and residual plots,
# then transform data with weights and make OLS plots
# based on weights to check that the weights are reasonable.
# use X11() to turn on the graphics device before using this function
ols <- lsfit(x, y)
Y <- y
Z <- sqrt(w) * y
u <- sqrt(w) * x
uu <- sqrt(w)
u <- cbind(uu, u)
wls <- lsfit(u, Z, intercept = F)
par(mfrow = c(2, 2))
FIT <- Y - ols$resid
plot(FIT, Y)
abline(0, 1)
title("a) OLS Response Plot")
RESID <- ols$resid
plot(FIT, RESID)
title("b) OLS Residual Plot")
ZFIT <- Z - wls$resid
plot(ZFIT, Z)
abline(0, 1)
title("c) WLS Response Plot")
ZRESID <- wls$resid
plot(ZFIT, ZRESID)
title("d) WLS Residual Plot")
#reset the layout before returning so the coefficient list is the value
par(mfrow = c(1, 1))
list(bols = ols$coef, bwls = wls$coef)
}
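##Illustrative usage sketch for wlsplot with simulated heteroscedastic
##data (all names below are hypothetical):
#x <- matrix(rnorm(200), ncol = 2)
#y <- 1 + x %*% c(1, 1) + (1 + abs(x[, 1])) * rnorm(100)
#w <- 1/(1 + abs(x[, 1]))^2   #weights = 1/(error variance) for each case
#wlsplot(x, y, w)
##if the weights are reasonable, the WLS plots c) and d) should show
##roughly constant spread about the lines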