## The command
## source("http://parker.ad.siu.edu/Olive/rpack.txt")
## is an easy way to get these functions into R.

AERplot<-function(yhat, y, res, d = 1, alph = 0.1)
{# Makes a response plot where cases within their PI are not plotted.
# Need alph around 0.3 if n is small. Use alph = 0.01 if n > 100000.
# If alph = 1, the usual response plot of yhat versus y is made.
# The value d is a crude estimate of the model complexity or degrees of freedom.
# Could use d = p if n >> p where n is the sample size and p = no. of predictors.
# For forward selection, lasso, PCR, and PLS, d = number of nonzero
# estimated beta coefficients for the model Y = x^T beta + e.
n <- length(y)
val <- 8*n/9
if(alph == 1){
plot(yhat, y)
}
else{
ymin <- min(y); ymax <- max(y)
yhmin <- min(yhat); yhmax <- max(yhat)
plot(c(yhmin, yhmax), c(ymin, ymax), type = "n", ylab = "Y", xlab = "YHAT")
#get PI for the residuals
if(d < val) corfac <- (1 + 15/n) * sqrt((n + 2*d)/(n - d))
else corfac <- 5*(1 + 15/n)
if(alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + d/n)}
if(alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)}
pn <- qn
if(pn < 1 - alph + 0.001) qn <- 1 - alph
alphan <- 1 - qn
sres <- sort(res)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
#Plot points corresponding to Ys not in their PIs
resup <- corfac*rup
reslow <- corfac*rlow
points(yhat[y > yhat + resup], y[y > yhat + resup])
points(yhat[y < yhat + reslow], y[y < yhat + reslow])
abline(0, 1)
list(respi = c(reslow, resup))
}
}

AERplot2<-function(yhat, y, res, d = 1, alph = 0.1)
{# Makes a response plot with the pointwise PI limits added as lines.
# The value d is a crude estimate of the model complexity or degrees of freedom.
# Could use d = p if n >> p where n is the sample size and p = no. of predictors.
# For forward selection, lasso, PCR, and PLS, d = number of nonzero
# estimated beta coefficients for the model Y = x^T beta + e.
n <- length(y)
val <- 8*n/9
plot(yhat, y)
#get PI for the residuals
if(d < val) corfac <- (1 + 15/n) * sqrt((n + 2*d)/(n - d))
else corfac <- 5*(1 + 15/n)
if(alph > 0.1) {qn <- min(1 - alph + 0.05, 1 - alph + d/n)}
if(alph <= 0.1) {qn <- min(1 - alph/2, 1 - alph + 10*alph*d/n)}
pn <- qn
if(pn < 1 - alph + 0.001) qn <- 1 - alph
alphan <- 1 - qn
sres <- sort(res)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
#Plot pointwise PIs
resup <- corfac*rup
reslow <- corfac*rlow
abline(0, 1)
abline(resup, 1)
abline(reslow, 1)
list(respi = c(reslow, resup))
}
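## Example sketch (not part of the original rpack.txt): AERplot applied to
## an OLS fit, with hypothetical regression data x, y.
# out <- lsfit(x, y)
# yhat <- y - out$resid
# AERplot(yhat, y, res = out$resid, d = ncol(as.matrix(x)) + 1)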
aovtplt<-function(x, y)
{# Need x to have the factor levels needed for one way Anova.
# This function plots y^L vs one way Anova fitted values
# from the one way Anova of y^L on x where
# L = -1, -0.5, 0, 0.5 or 1 except log(Y) is used if L = 0.
# If the plot is linear for L, use y^L (or log(Y) for L = 0) instead
# of Y. This is a graphical method for a response transformation.
lam <- c(-1, -1/2, 0, 1/2, 1)
xl <- c("1/Z", "1/sqrt(Z)", "LOG(Z)", "sqrt(Z)", "Z")
par(mfrow = c(2, 3))
x <- factor(x)
for(i in 1:length(lam)) {
if(lam[i] == 0) ytem <- log(y)
else ytem <- y^lam[i]
aovfit <- ytem - aov(ytem~x)$resid
plot(aovfit, ytem, xlab = "TZHAT", ylab = xl[i])
abline(0, 1)
#identify(aovfit, ytem)
}
par(mfrow = c(1, 1))
}

ardata<-function(n = 200)
{#simulates AR(p=2) data
y <- 1:(2 * n)
y[1] <- sum(rnorm(3))
y[2] <- y[1] + sum(rnorm(2))
for(i in 3:(2 * n)) {
y[i] <- y[i - 1] - 0.89 * y[i - 2] + rnorm(1)
}
list(arts = y[(n + 1):(2 * n)])
}

atmci<-function(x, k1 = 6, k2 = 6, alpha = 0.05)
{#gets a robust 100 (1-alpha)% CI
#defaults are alpha = .05 and k = 6 for the 2 stage asy. trimmed mean
n <- length(x)
up <- 1 - alpha/2
madd <- mad(x, constant = 1)
med <- median(x)
LM <- sum(x < (med - k1 * madd))
nmUM <- sum(x > (med + k2 * madd))
# ll (hh) is the percentage to be trimmed to the left (right)
ll <- ceiling((100 * LM)/n)
hh <- ceiling((100 * (nmUM))/n)
ln <- floor((ll * n)/100)
un <- floor((n * (100 - hh))/100)
low <- ln + 1
d <- sort(x)
val1 <- d[low]
val2 <- d[un]
tstmn <- mean(x[(x >= val1) & (x <= val2)])
#have obtained the two stage asymmetrically trimmed mean
if(ln > 0) {
d[1:ln] <- d[(low)]
}
if(un < n) {
d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
rlo <- tstmn - rval
rhi <- tstmn + rval
#got low and high endpoints of the robust CI
list(int = c(rlo, rhi), tstmn = tstmn, swv = swv)
}

attract<-function(X, Y, k = 5)
{# Works in Splus but not in R.
# For simple linear regression: plots k elemental starts and
# their domains of attraction. Calls conc2.
l1coef <- l1fit(X, Y)$coef
X <- as.matrix(X)
nr <- dim(X)[1]
nc <- dim(X)[2] + 1
J <- 1:nc
dom <- matrix(nrow = k, ncol = nc)
par(mfrow = c(1, 2))
plot(X, Y)
title("a) 5 Elemental Starts")
for(i in 1:k) {
## get J
J <- sample(nr, nc)
## get bJ, the elemental fit
if(abs(X[J[1]] - X[J[2]]) < 1/100000000) {
slope <- 0
}
else {
slope <- (Y[J[1]] - Y[J[2]])/(X[J[1]] - X[J[2]])
}
int <- Y[J[1]] - slope * X[J[1]]
fit <- c(int, slope)
yhat <- X %*% fit[2:nc] + fit[1]
lines(X, yhat)
## get the domain of attraction for LTA concentration
dom[i, ] <- conc2(X, Y, start = fit)$coef
}
plot(X, Y)
for(i in 1:k) {
fit <- dom[i, ]
yhat <- X %*% fit[2:nc] + fit[1]
lines(X, yhat)
}
title("b) The Corresponding Attractors")
}

bg2ci<-function(x, alpha = 0.05)
{#gets the BG SE with the middle n^0.8 cases for the sample median and
#the corresponding robust 100 (1-alpha)% CI. This is optimal
#for estimating the SE but is not resistant.
n <- length(x)
up <- 1 - alpha/2
med <- median(x)
ln <- max(1, floor(n/2) - ceiling(0.5 * n^0.8))
un <- n - ln
rdf <- un - ln - 1
cut <- qt(up, rdf)
d <- sort(x)
se2 <- (d[un] - d[ln])/(2 * n^0.3)
rval <- cut * se2
rlo2 <- med - rval
rhi2 <- med + rval
#got low and high endpoints of the robust CI
list(int = c(rlo2, rhi2), med = med, se2 = se2)
}
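## Example sketch (not part of the original rpack.txt): robust CIs for the
## center of a contaminated sample.
# set.seed(1)
# z <- c(rnorm(95), rnorm(5, mean = 100))
# atmci(z)$int    # two stage asymmetrically trimmed mean CI
# bg2ci(z)$int    # CI for the median using the BG SE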
binrplot <- function(x, y)
{# Calls the "sir" function (in rpack but obtained from STATLIB).
# The binary response plot allows visualization of binary data.
# Workstations need to activate a graphics
# device with the command "X11()" or "motif()."
#
# x needs at least two columns or the call to sir fails
#
q <- dim(x)[2]
h <- q + 7
out <- sir(x, y, h)
SIRESP <- x %*% out$edr[, 1]
V <- x %*% out$edr[, 2]
plot(SIRESP, V, type = "n")
points(SIRESP[y == 1], V[y == 1], pch = 3)
points(SIRESP[y == 0], V[y == 0], pch = 1)
}

cav<-function(alpha = 0.01, k = 5)
{#gets n(asy var) for the alpha trimmed mean
#and T_(A,n)(k) if the errors are Cauchy(0,1)
z <- tan(pi * (alpha - 0.5))
val <- (z - atan(z))/((1 - 2 * alpha) * atan(z))
ntmav <- val + (2 * alpha * (tan(pi * (alpha - 0.5)))^2)/(1 - 2 * alpha)^2
zj <- k
alphaj <- 0.5 + atan( - k)/pi
alphaj <- ceiling(100 * alphaj)/100
zj <- tan(pi * (alphaj - 0.5))
val <- (zj - atan(zj))/((1 - 2 * alphaj) * atan(zj))
natmav <- val + (2 * alphaj * (tan(pi * (alphaj - 0.5)))^2)/(1 - 2 * alphaj)^2
list(ntmav = ntmav, natmav = natmav)
}

cci<-function(x, alpha = 0.05)
{#gets the classical 100 (1-alpha)% CI
#default is alpha = .05
n <- length(x)
up <- 1 - alpha/2
mn <- mean(x)
v <- var(x)
se <- sqrt(v/n)
val <- qt(up, n - 1) * se
lo <- mn - val
hi <- mn + val
list(int = c(lo, hi), mean = mn, se = se)
}

cgci<-function(x, alpha = 0.05, ks = 3.5)
{#gets T_S,n with a coarse grid
# and the corresponding robust 100 (1-alpha)% CI
n <- length(x)
up <- 1 - alpha/2
med <- median(x)
madd <- mad(x, constant = 1)
d <- sort(x)
##get the robust T_S,n CI
lo <- sum(x < (med - ks * madd))
hi <- sum(x > (med + ks * madd))
tp <- max(hi, lo)/n
if(tp == 0) tp <- 0
if(tp > 0 && tp <= 0.01) tp <- 0.01
if(tp > 0.01 && tp <= 0.1) tp <- 0.1
if(tp > 0.1 && tp <= 0.25) tp <- 0.25
if(tp > 0.25 && tp <= 0.4) tp <- 0.4
if(tp > 0.4) tp <- 0.49
tstmn <- mean(x, trim = tp)
#have obtained the two stage trimmed mean
ln <- floor(n * tp)
un <- n - ln
if(ln > 0) {
d[1:ln] <- d[(ln + 1)]
d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
tslo <- tstmn - rval
tshi <- tstmn + rval
##got low and high endpoints of the robust T_S,n CI
list(int = c(tslo, tshi), tp = tp)
}
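## Example sketch (not part of the original rpack.txt): the classical CI is
## inflated by outliers while the coarse grid CI resists them.
# z <- c(rnorm(95), rnorm(5, mean = 100))
# cci(z)$int     # classical CI
# cgci(z)$int    # robust T_S,n CI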
cisim<-function(n, runs = 500, type = 1, eps = 0.25, shift = 100, df = 1,
kaa = 6, kss = 3.5)
{# simulates the classical and robust CIs for the median
# type = 1: normal data, type = 2: contaminated normal data
# type = 3: t(df) data, type = 4: double exponential
# type = 5: exponential
if(type == 1) x <- matrix(rnorm(n * runs), nrow = runs, ncol = n)
if(type == 2) {
x <- matrix(rnorm(n * runs), nrow = runs, ncol = n)
x <- x + shift * matrix(rbinom(n * runs, 1, eps), nrow = runs, ncol = n)
}
if(type == 3) x <- matrix(rt(n * runs, df = df), nrow = runs, ncol = n)
if(type == 4) {
x <- matrix(rexp(n * runs), nrow = runs, ncol = n)
w <- matrix(rbinom(n * runs, 1, 0.5), nrow = runs, ncol = n)
w <- 2 * w - 1
x <- w * x
}
if(type == 5) x <- matrix(rexp(n * runs), nrow = runs, ncol = n)
tcov <- 0
tlow <- 1:runs
tup <- tlow
tacov <- 0
talow <- tup
taup <- tup
tscov <- 0
tslow <- tup
tsup <- tup
mcov <- 0
mlow <- tup
mup <- tup
vcov <- 0
vlow <- tup
vup <- tup
trcov <- 0
trlow <- tup
trup <- tup
mexp <- log(2)
for(i in 1:runs) {
out <- robci(x[i, ], ka = kaa, ks = kss)
tlow[i] <- out$tint[1]
tup[i] <- out$tint[2]
talow[i] <- out$taint[1]
taup[i] <- out$taint[2]
tslow[i] <- out$tsint[1]
tsup[i] <- out$tsint[2]
mlow[i] <- out$mint[1]
mup[i] <- out$mint[2]
vlow[i] <- out$vint[1]
vup[i] <- out$vint[2]
trlow[i] <- out$trint[1]
trup[i] <- out$trint[2]
if(type == 5) {
if(tlow[i] < 1 && tup[i] > 1) tcov <- tcov + 1
if(talow[i] < 0.89155 && taup[i] > 0.89155) tacov <- tacov + 1
if(tslow[i] < 0.83071 && tsup[i] > 0.83071) tscov <- tscov + 1
if(mlow[i] < mexp && mup[i] > mexp) mcov <- mcov + 1
if(vlow[i] < mexp && vup[i] > mexp) vcov <- vcov + 1
if(trlow[i] < 0.73838 && trup[i] > 0.73838) trcov <- trcov + 1
}
else {
if(tlow[i] < 0 && tup[i] > 0) tcov <- tcov + 1
if(talow[i] < 0 && taup[i] > 0) tacov <- tacov + 1
if(tslow[i] < 0 && tsup[i] > 0) tscov <- tscov + 1
if(mlow[i] < 0 && mup[i] > 0) mcov <- mcov + 1
if(vlow[i] < 0 && vup[i] > 0) vcov <- vcov + 1
if(trlow[i] < 0 && trup[i] > 0) trcov <- trcov + 1
}
}
tcov <- tcov/runs
tlen <- sqrt(n) * mean(tup - tlow)
tacov <- tacov/runs
talen <- sqrt(n) * mean(taup - talow)
tscov <- tscov/runs
tslen <- sqrt(n) * mean(tsup - tslow)
mcov <- mcov/runs
mlen <- sqrt(n) * mean(mup - mlow)
vcov <- vcov/runs
vlen <- sqrt(n) * mean(vup - vlow)
trcov <- trcov/runs
trlen <- sqrt(n) * mean(trup - trlow)
list(tcov = tcov, tlen = tlen, tacov = tacov, talen = talen, tscov = tscov,
tslen = tslen, mcov = mcov, mlen = mlen, vcov = vcov, vlen = vlen,
trcov = trcov, trlen = trlen)
}

cltv<-function(gam = 0.5)
{# Gets the asy var for lts(h) and lta(h) at the Cauchy C(0,1)
# distribution where h/n -> gam.
k <- tan((pi * gam)/2)
num <- 2 * k - pi * gam
den <- pi * (gam - (2 * k)/(pi * (1 + k^2)))^2
ltsv <- num/den
num <- gam
den <- 4 * (1/pi - 1/(pi * (1 + k^2)))^2
ltav <- num/den
list(ltsv = ltsv, ltav = ltav)
}

cmba2<-function(x, csteps = 5, ii = 1)
{# gets the covmba estimator using 98, 95, 90, 80, 70, 60 and 50% trimming
#needs p > 1
n <- dim(x)[1]
p <- dim(x)[2]
mds <- matrix(nrow = n, ncol = 8, 0)
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
cmd <- sqrt(mahalanobis(x, mns, covs))
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
mds[, 8] <- sqrt(mahalanobis(x, mns, covs))
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the resistant estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
smd2 <- sort(md2)
val <- p + 3
tem <- 1:7
tem[1] <- smd2[val + floor(0.02 * n)]
tem[2] <- smd2[val + floor(0.05 * n)]
tem[3] <- smd2[val + floor(0.1 * n)]
tem[4] <- smd2[val + floor(0.2 * n)]
tem[5] <- smd2[val + floor(0.3 * n)]
tem[6] <- smd2[val + floor(0.4 * n)]
tem[7] <- median(md2)
medd2 <- tem[7]
for(j in ii:7) {
## get the start
val2 <- tem[j]
mns <- apply(x[md2 <= val2, ], 2, mean)
covs <- var(x[md2 <= val2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
mds[, j] <- sqrt(mahalanobis(x, mns, covs))
plot(cmd, mds[, j])
identify(cmd, mds[, j])
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
}
pairs(mds)
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
list(center = mnb, cov = covb, mds = mds)
}
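## Example sketch for the cisim simulation above (not part of the original
## rpack.txt); cisim also needs the robci function from rpack.
# cisim(n = 50, runs = 100, type = 2, eps = 0.1)   # coverages and scaled lengths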
cmve<-function(x, csteps = 5)
{# gets the cmve, rcmve and mb estimators
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
medd3 <- medd2
## get the start
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get the CMVE attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < medd3) {
##crit = [med(D)]^p * square root of det(cov)
rd2 <- mahalanobis(x, mnd, covd)
critd <- (sqrt(median(rd2)))^p * prod(diag(chol(covd)))
rd2 <- mahalanobis(x, mnm, covm)
critm <- (sqrt(median(rd2)))^p * prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get the CMVE estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above CMVE estimator (mnf,covf) to get the
##RCMVE estimator (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1){
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm = mnm,
covm = covm)
}

conc2<-function(x, y, start = l1fit(x, y)$coef)
{#Finds the L1 attractor of the start.
# Works in Splus but not in R.
x <- as.matrix(x)
nc <- dim(x)[2] + 1
res <- y - (x %*% start[2:nc] + start[1])
ares <- abs(res)
cov <- ceiling(length(y)/2)
m <- sort(ares, partial = cov)[cov]
old <- sum(ares[ares <= m])
new <- old - 1
ct <- 0
while(new < old) {
ct <- ct + 1
start <- l1fit(x[ares <= m, ], y[ares <= m])$coef
res <- y - (x %*% start[2:nc] + start[1])
ares <- abs(res)
m <- sort(ares, partial = cov)[cov]
new <- sum(ares[ares <= m])
#print(old)
if(new < old) {
old <- new
new <- new - 1
}
}
list(coef = start, ct = ct)
}
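## Example sketch (not part of the original rpack.txt) for cmve above:
# x <- matrix(rnorm(300), ncol = 3)
# out <- cmve(x)
# out$center; out$cov     # CMVE estimates
# out$rmnf; out$rcovf     # reweighted RCMVE estimates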
concmv<-function(n = 100, csteps = 5, gam = 0.4, outliers = T, start = 3)
{#Shows how concentration works when p = 2.
#right click csteps times
# Use start = 1 for DGK, start = 2 for MBA sphere = FCH sphere = MB estimator,
# start = 3 for MBA MAD
p <- 2
#A <- cbind(c(1, 0.9), c(0.9, 1))
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
#A <- diag(sqrt(1:p))
#if(outliers == T) {
#val <- floor(gam * n)
#tem <- 10 + 0 * 1:p
#x[1:val, ] <- x[1:val, ] + tem
#}
#x <- x %*% A
A <- cbind(c(1, 0.4), c(0.4, 1))
B <- cbind(c(0.5, 0), c(0, 0.5))
if(outliers == T) {
val <- floor(gam * n)
x[(val + 1):n, ] <- x[(val + 1):n, ] %*% A
x[1:val, ] <- x[1:val, ] %*% B
x[1:val, 1] <- x[1:val, 1] + 0
x[1:val, 2] <- x[1:val, 2] + 6
}
else {
x <- x %*% A
}
if(start == 1) {
covs <- var(x)
mns <- apply(x, 2, mean)
}
if(start == 2) {
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
if(start > 2) {
tem <- apply(x, 2, mad)^2
covv <- diag(tem)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
plot(x[, 1], x[, 2])
points(x[md2 <= medd2, 1], x[md2 <= medd2, 2], pch = 15)
identify(x[, 1], x[, 2])
}
}
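## The concentration idea that concmv visualizes, and that the cov*
## estimators below reuse, can be written as a small stand-alone helper.
## This sketch is not part of the original rpack.txt: given a start
## (mns, covs), each step keeps the half set of cases with the smallest
## Mahalanobis distances and recomputes the classical mean and covariance
## on that half set.
# concstep <- function(x, mns, covs, csteps = 5) {
# for(i in 1:csteps) {
# md2 <- mahalanobis(x, mns, covs)
# keep <- md2 <= median(md2)
# mns <- apply(x[keep, ], 2, mean)
# covs <- var(x[keep, ])
# }
# list(center = mns, cov = covs)
# }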
concsim<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20)
{# type "library(MASS)" or "library(lqs)."
# Need n > 2p, p > 1. This function is used to determine when the DD
# plot separates outliers from non-outliers for various starts.
A <- sqrt(diag(1:p))
mbact <- 0
fmcdct <- 0
mbct <- 0
madct <- 0
dgkct <- 0
fchct <- 0
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
val <- floor(gam * n)
tem <- 10 + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
x <- x %*% A
#MBA
out <- covmba(x, csteps = steps)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1
#DGK
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(j in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(j in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
#MAD start
tem <- apply(x, 2, mad)^2
covv <- diag(tem)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(j in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) madct <- madct + 1
#FMCD
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1
#FCH
out <- covfch(x, csteps = steps)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
}
list(mbact = mbact, fmcdct = fmcdct, dgkct = dgkct, mbct = mbct,
madct = madct, fchct = fchct)
}

confreg<-function(x, g = 4, that = 1:4, alpha = 0.05)
{# Makes confidence regions for theta from rows of x = Ti* from a bootstrap.
# Use for testing H_0: theta = 0 versus H_1: theta != 0.
# The prediction region method, hybrid, and Bickel and Ren regions are used.
# If g = 1, the shorth interval should work better.
# Also computes the distance for the 0 vector.
# Need g = dim(x)[2] and T = that, the g by 1 estimator of theta.
# Often that = A betahat(I_min,0).
# Note that center = Tbar* = bagging estimator and cov = S*_T.
x <- as.matrix(x)
that <- as.vector(that)
n <- dim(x)[1]
zero <- 0*(1:g)
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*g/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + g/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical distance
MD <- sqrt(md2)
#get the prediction region method cutoff
cuplim <- sqrt(quantile(md2, up))
D0 <- sqrt(mahalanobis(zero, center, cov))
#get the hybrid region statistic = Bickel and Ren statistic
br0 <- sqrt(mahalanobis(zero, that, cov))
#get the Bickel and Ren cutoff and test statistic
br2 <- mahalanobis(x, that, cov)
brlim <- sqrt(quantile(br2, up))
list(cuplim = cuplim, brlim = brlim, br0 = br0, D0 = D0, MD = MD,
center = center, cov = cov)
}

corrsim<-function(n = 100, p = 3, eps = 0.4, nruns = 100, type = 5)
{#Need n > 2p. For R, type "library(MASS)" before using this
#function. This function generates nruns n by p matrices x.
# The output is the nruns sample correlations between the MDi and RDi.
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, cmve for type = 6,
# rcmve for type = 7, rmvn for type = 8.
# mahalanobis gives squared Maha distances
corrs <- 1:nruns
for(i in 1:nruns) {
wt <- 0 * (1:n)
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
#The following 3 commands make x elliptically contoured.
#zu <- runif(n)
#x[zu < eps,] <- x[zu < eps,]*5
#x <- x^2
# To make the marginals of x lognormal, use
#x <- exp(x)
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1) {
out <- covmba(x)
}
if(type == 2) {
out <- rmba(x)
}
if(type == 3) {
out <- cov.mcd(x)
}
if(type == 4) {
out <- covfch(x)
}
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) {
out <- cmve(x)
}
if(type == 7) {
out <- cmve(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 8) {
out <- covrmvn(x)
}
if(type != 5 && type != 7){
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# need square roots for the usual distances
md <- sqrt(md2)
rd <- sqrt(rd2)
const <- sqrt(qchisq(0.5, p))/median(rd)
rd <- const * rd
# wt[rd < sqrt(qchisq(0.975, p))] <- 1
# corrs[i] <- cor(md[wt > 0], rd[wt > 0])
corrs[i] <- cor(md, rd)
}
cmean <- mean(corrs)
cmin <- min(corrs)
clt95 <- sum(corrs < 0.95)
clt80 <- sum(corrs < 0.8)
list(cmean = cmean, cmin = cmin, clt95 = clt95, clt80 = clt80, corrs = corrs)
}
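## Example sketch (not part of the original rpack.txt) for confreg above,
## using a stand-in bootstrap matrix betas with one estimate T_i* per row.
# B <- 1000; g <- 4
# betas <- matrix(rnorm(B * g), ncol = g)
# out <- confreg(betas, g = g, that = apply(betas, 2, mean))
# out$D0 > out$cuplim    # TRUE suggests rejecting H_0: theta = 0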
corrsim2<-function(n = 100, q = 4, nruns = 100, xtype = 1, type = 1, dd = 1,
eps = 0.25)
{# Need n > 2q. R users need to type library(MASS).
# MAY NOT WORK IF q = 1
# Multiply x by A where xtype = 1 for MVN Nq(0,I),
# 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Nq(0,I) + delta Nq(0, 25 I),
# 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd,
# 10 for lognormal.
# The output is the nruns sample correlations between the MDi and RDi.
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, cmve for type = 6,
# rcmve for type = 7, rmvn for type = 8.
# mahalanobis gives squared Maha distances
set.seed(974)
corrs <- 1:nruns
k <- q/2
A <- sqrt(diag(1:q))
for(i in 1:nruns) {
#make data
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
if(xtype == 2) {
zu <- runif(n)
x[zu < 0.4, ] <- x[zu < 0.4, ] * 5
}
if(xtype == 3) {
zu <- runif(n)
x[zu < 0.6, ] <- x[zu < 0.6, ] * 5
}
if(xtype == 4) {
zu <- runif(n)
x[zu < 0.1, ] <- x[zu < 0.1, ] * 5
}
if(xtype == 5) {
zu <- runif(n)
x[zu < eps, ] <- x[zu < eps, ] * 5
}
if(xtype == 6) {
zu <- sqrt(rchisq(n, 3)/3)
x <- x/zu
}
if(xtype == 7) {
zu <- sqrt(rchisq(n, 5)/5)
x <- x/zu
}
if(xtype == 8) {
zu <- sqrt(rchisq(n, 19)/19)
x <- x/zu
}
if(xtype == 9) {
zu <- sqrt(rchisq(n, dd)/dd)
x <- x/zu
}
if(xtype == 10) x <- exp(x)
x <- x %*% A
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1) {
out <- covmba(x)
}
if(type == 2) {
out <- rmba(x)
}
if(type == 3) {
out <- cov.mcd(x)
}
if(type == 4) {
out <- covfch(x)
}
if(type < 5) {
center <- out$center
cov <- out$cov
}
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) {
out <- cmve(x)
}
if(type == 7) {
out <- cmve(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 8) {
out <- covrmvn(x)
}
if(type != 5 && type != 7){
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# need square roots for the usual distances
md <- sqrt(md2)
rd <- sqrt(rd2)
const <- sqrt(qchisq(0.5, q))/median(rd)
rd <- const * rd
corrs[i] <- cor(md, rd)
}
cmean <- mean(corrs)
cmin <- min(corrs)
clt95 <- sum(corrs < 0.95)
clt80 <- sum(corrs < 0.8)
list(cmean = cmean, cmin = cmin, clt95 = clt95, clt80 = clt80, corrs = corrs)
}

covdgk<-function(x, csteps = 10)
{#computes the scaled DGK multivariate estimator, need p > 1
p <- dim(x)[2]
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
##scale for consistency at MVN
rd2 <- mahalanobis(x, mns, covs)
const <- median(rd2)/(qchisq(0.5, p))
covs <- const * covs
list(center = mns, cov = covs)
}
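## Example sketch (not part of the original rpack.txt):
# x <- matrix(rnorm(500), ncol = 5)
# covdgk(x)$cov    # roughly the identity matrix for this MVN data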
covesim<-function(n = 10, p = 2, csteps = 5, gam = 0.4, runs = 20,
outliers = 0, pm = 10)
{# This R function compares various ways to robustly estimate the
# covariance matrix. The estimators used are ccov: the classical
# estimator applied to the clean cases, RFCH and RMVN.
# For RMVN want diago approx c(1, 2, ..., p).
# For RFCH want diagfch approx c(1, 2, ..., p).
# Need p > 1.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at the major axis (0,...,0,pm)',
# 2 for outliers a tight cluster at the minor axis (pm,0,...,0)',
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p)),
# 4 for outliers X[i,p] = pm,
# 5 for outliers X[i,1] = pm
A <- sqrt(diag(1:p))
ccov <- 0 * A
rfche <- ccov
rmvne <- ccov
covv <- diag(p)
val <- floor(gam * n)
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
qtem <- 1:runs
qtem2 <- qtem
diagdiff <- 1:p
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
ccov <- ccov + var(x[ - (1:val), ])
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(j in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
medd3 <- medd2
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(j in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get the FCH attractor
covf <- covm
mnf <- mnm
val2 <- mahalanobis(t(mnd), med, covv)
if(val2 < medd3) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get the FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/qchi
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
rmnf <- apply(x[rd2 <= up, ], 2, mean)
rcovf <- var(x[rd2 <= up, ])
##use the following 3 commands for the covrmvn estimator
rd3 <- rd2
rmnmvn <- rmnf
rcovmvn <- rcovf
##
rd2 <- mahalanobis(x, rmnf, rcovf)
##rd4 is used for covrmvn
rd4 <- rd2
const <- median(rd2)/qchi
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
rmnf <- apply(x[rd2 <= up, ], 2, mean)
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/qchi
rcovf <- const * rcovf
rfche <- rfche + rcovf
##reweight the FCH estimator (mnf,covf) to get the covrmvn estimator
## (rmnmvn,rcovmvn) tailored for MVN data
d1 <- sum(rd3 <= up)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
qtem[i] <- qchi2
const <- median(rd4)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
qtem2[i] <- qchi2
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
rmvne <- rmvne + rcovmvn
}
ccov <- ccov/runs
rfche <- rfche/runs
rmvne <- rmvne/runs
diago <- diag(rmvne)
diagdiff <- diag(ccov) - diag(rmvne)
abssumrmvn <- sum(abs(diagdiff))
diagfch <- diag(rfche)
diagdifch <- diag(ccov) - diag(rfche)
abssumfch <- sum(abs(diagdifch))
list(ccov = ccov, rfche = rfche, rmvne = rmvne, diago = diago,
diagdiff = diagdiff, abssumrmvn = abssumrmvn, diagfch = diagfch,
diagdifch = diagdifch, abssumfch = abssumfch)
}
covfch<-function(x, csteps = 5)
{# gets the FCH and RFCH estimators and the MB attractor
# works for p = 1
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion
lcut <- medd2
## get the start
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get the FCH attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get the FCH estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) {
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm = mnm,
covm = covm)
}
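## Example sketch (not part of the original rpack.txt): FCH and RFCH on
## data with a cluster of outliers.
# x <- matrix(rnorm(400), ncol = 4)
# x[1:40, ] <- x[1:40, ] + 10
# out <- covfch(x)
# out$center    # FCH location estimate, resistant to the outliers
# out$rmnf      # RFCH location estimate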
covfch2<-function(x, csteps = 5, locc = 0.5)
{# Gets the FCH and RFCH estimators and the MB attractor.
# Allows the location criterion cutoff to be varied.
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2, locc)
## get the start
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get the FCH attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get the FCH estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) {
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm = mnm,
covm = covm)
}

covmb<-function(x, steps = 5, scale = F)
{# Computes the median ball estimator.
# Needs p > 1. If scale = T, the plotted points will roughly
# scatter about the identity line if the data is MVN
# and spherical about mu.
p <- dim(x)[2]
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the half set start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
if(scale == T){
rd2 <- mahalanobis(x, mns, covs)
const <- median(rd2)/(qchisq(0.5, p))
covs <- const * covs
}
list(center = mns, cov = covs)
}
covmb2<-function(x, m = 0, k = 5, msteps = 9)
{# Computes the covmb2 estimator with concentration type steps. Needs p > 1.
# Best if p > n. Look at out<-medout(x) to determine how many cases
# m are clean. Use m >= n/2.
# Estimate m if m = 0: use k >= 0, so at least half of the cases are used,
# and do the msteps concentration type steps to get a weighted median.
# The concentration type steps help the most when the outlier proportion
# is high. Try covmb2(x,msteps=0) and covmb2(x,msteps=9).
# Using msteps > 0 does slow down the function some.
p <- dim(x)[2]
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
#Get squared Euclidean distances from the coordinatewise median.
md2 <- mahalanobis(x, center = med, covv)
if(m == 0){
if(msteps > 0){
#do concentration type steps
for(i in 1:msteps){
medd <- median(md2)
medw <- apply(x[md2 <= medd, ], 2, median)
md2 <- mahalanobis(x, center = medw, covv)
}
}
md <- sqrt(md2)
mcut <- median(md) + k*mad(md, constant = 1)
mns <- apply(x[md <= mcut, ], 2, mean)
covs <- var(x[md <= mcut, ])
}
else{
#Use the m cases with the smallest distances.
mcut <- sort(md2)[m]
mns <- apply(x[md2 <= mcut, ], 2, mean)
covs <- var(x[md2 <= mcut, ])
}
list(center = mns, cov = covs)
}

covmba<-function(x, csteps = 5)
{# gets the MBA estimator, works for p = 1
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1){
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1){
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the resistant estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
if(p > 1){
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1){
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1){
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1){
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
list(center = mnb, cov = covb, mbL = mns, mbc = covs)
}

covmba2<-function(x, csteps = 5)
{# gets the MBA estimator; use covmba2 instead of covmba if p > 1
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the resistant estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
list(center = mnb, cov = covb)
}
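## Example sketch (not part of the original rpack.txt): covmb2 above can
## be used when p > n.
# x <- matrix(rnorm(20 * 100), nrow = 20, ncol = 100)    # n = 20, p = 100
# out <- covmb2(x, msteps = 9)
# out$center[1:5]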
covrmb<-function(x, csteps = 5)
{# Need p > 1. Produces the median ball and reweighted median ball (RMB)
# estimators. RMB is tailored to estimate the covariance matrix
# of the bulk of the data if the bulk of the data is MVN
# and the outliers are "not too bad."
# This function is like covrmvn, except only the MB attractor
# is used.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the half subset start
mnm <- apply(x[md2 <= medd2, ], 2, mean)
covm <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mnm, covm)
medd2 <- median(md2)
mnm <- apply(x[md2 <= medd2, ], 2, mean)
covm <- var(x[md2 <= medd2, ])
}
## scale the MB estimator to behave well for MVN data
rd2 <- mahalanobis(x, mnm, covm)
const <- median(rd2)/qchi
covm <- const * covm
## reweight the scaled MB estimator (mnm,covm) twice
## to get the covrmb estimator (mnrmb,covrmb) tailored for MVN data
rd2 <- mahalanobis(x, mnm, covm)
mnrmb <- apply(x[rd2 <= up, ], 2, mean)
covrmb <- var(x[rd2 <= up, ])
d1 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, mnrmb, covrmb)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
covrmb <- const * covrmb
## reweight again
rd2 <- mahalanobis(x, mnrmb, covrmb)
mnrmb <- apply(x[rd2 <= up, ], 2, mean)
covrmb <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, mnrmb, covrmb)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
covrmb <- const * covrmb
list(center = mnrmb, cov = covrmb, mnm = mnm, covm = covm)
}

covrmvn<-function(x, csteps = 5, locc = 0.5)
{# Needs the number of predictors p > 1.
# This robust MLD estimator is tailored to estimate the covariance matrix
# of the bulk of the data when the bulk of the data is MVN and the outliers
# are "not too bad." The FCH and MB estimators are also produced.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2, locc)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get the FCH attractor
covf <- covm
mnf <- mnm
val2 <- mahalanobis(t(mnd), med, covv)
if(val2 < lcut) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get the FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/qchi
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the cov estimator
## (rmnmvn,rcovmvn) tailored for MVN data
rd2 <- mahalanobis(x, mnf, covf)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d1 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
list(center = rmnmvn, cov = rcovmvn, mnf = mnf, covf = covf, mnm = mnm,
covm = covm)
}
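## Example sketch (not part of the original rpack.txt):
# x <- matrix(rnorm(600), ncol = 3) %*% sqrt(diag(1:3))
# covrmvn(x)$cov    # roughly diag(1,2,3) for this MVN data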
covrob<-function(x, csteps = 5, locc = 0.5)
{# gets the MBA, FCH, RFCH, RMVN and CMVE estimators
# and the MB attractor
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2, locc)
## get the start
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean)
}
if(p == 1) {
mns <- mean(zx[md2 <= medd2])
}
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
##get the MBA attractor
covb <- covm
mnb <- mnm
if(critd < critm) {
covb <- covd
mnb <- mnd
}
##get the FCH and CMVE attractors
covf <- covm
mnf <- mnm
covcmv <- covm
mncmv <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
if(critd < critm) {
covf <- covd
mnf <- mnd
}
rd2 <- mahalanobis(x, mnd, covd)
critdm <- (sqrt(median(rd2)))^p * critd
rd2 <- mahalanobis(x, mnm, covm)
critmm <- (sqrt(median(rd2)))^p * critm
if(critdm < critmm) {
covcmv <- covd
mncmv <- mnd
}
}
##scale for better performance at MVN
## get the MBA estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/chisqm
covb <- const * covb
## get the FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
## get the 1st step of the RFCH estimator
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) {
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
##use the following 3 commands for the RMVN estimator
rd3 <- rd2
rmnmvn <- rmnf
rcovmvn <- rcovf
## use rd2 for RFCH and rd4 for RMVN
rd2 <- mahalanobis(x, rmnf, rcovf)
rd4 <- rd2
##use the following command for the RFCH estimator
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1) {
rmnf <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnf <- mean(zx[rd2 <= up])
}
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## get the RMVN estimator
d1 <- sum(rd3 <= up)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd4)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
if(p > 1){
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
}
if(p == 1){
rmnmvn <- mean(zx[rd2 <= up])
}
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## get the CMVE estimator
rd2 <- mahalanobis(x, mncmv, covcmv)
const <- median(rd2)/(qchisq(0.5, p))
covcmv <- const * covcmv
list(center = mnb, cov = covb, mnf = mnf, covf = covf, rmnf = rmnf,
rcovf = rcovf, rmnmvn = rmnmvn, rcovmvn = rcovmvn, mncmv = mncmv,
covcmv = covcmv, mnm = mnm, covm = covm)
}
covsim2<-function(n = 100, p = 2, gam = 0.4, runs = 20)
{# Needs p > 1. This function is used to determine when the DD
# plot separates outliers from non-outliers.
# Rather large n and p can be used since covfch is used.
A <- sqrt(diag(1:p))
ct <- 0
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
val <- floor(gam * n)
tem <- 10 + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
x <- x %*% A
out <- covfch(x)
#try covmba(x), rmba(x), cov.mcd(x), covOGK(x, sigmamu = scaleTau2)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) ct <- ct + 1
}
list(ct = ct)
}

ctrviews<-function(x, Y, ii = 1)
{# Uses classical distances instead of robust distances.
# Trimmed views for 90, 80, ..., 0 percent trimming.
# Allows visualization of m and crude estimation of c beta in models
# of the form y = m(x^T beta) + e.
# Advance the view with the right mouse button,
# and in R, highlight "stop."
# Workstations: activate a graphics
# device with the command "X11()" or "motif()."
x <- as.matrix(x)
center <- apply(x, 2, mean)
cov <- var(x)
rd2 <- mahalanobis(x, center, cov)
labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%",
"0%")
tem <- seq(0.1, 1, 0.1)
for(i in ii:10) {
val <- quantile(rd2, tem[i])
bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
ESP <- x %*% bhat[-1]
plot(ESP, Y)
title(labs[i])
identify(ESP, Y)
print(bhat)
}
}

ddcomp<-function(x, steps = 5)
{# Need p > 1.
# Makes 4 DD plots using the DGK, FCH, FMCD and MB estimators.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function.
# On Unix systems turn on a graphics device, eg enter
# the command "X11()" or "motif()" before using.
# R users need to type "library(MASS)" or "library(lqs)."
p <- dim(x)[2]
par(mfrow = c(2, 2))
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
#DGK start
md2 <- mahalanobis(x, center, cov)
medd2 <- median(md2)
## get the start
mns <- center
covs <- cov
## concentrate
for(i in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDdgk <- const * rd
plot(MD, RDdgk)
abline(0, 1)
identify(MD, RDdgk)
title("DGK DD Plot")
#FCH
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
rd <- sqrt(rd2)
RDfch <- rd
plot(MD, RDfch)
abline(0, 1)
identify(MD, RDfch)
title("FCH DD Plot")
#FMCD
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDf <- const * rd
plot(MD, RDf)
abline(0, 1)
identify(MD, RDf)
title("FMCD DD Plot")
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDmb <- const * rd
plot(MD, RDmb)
abline(0, 1)
identify(MD, RDmb)
title("Med Ball DD Plot")
par(mfrow = c(1, 1))
}
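## Example sketch (not part of the original rpack.txt); ddcomp is
## interactive, so right click (and in R highlight "stop") on each plot.
# library(MASS)
# x <- matrix(rnorm(300), ncol = 3)
# x[1:30, ] <- x[1:30, ] + 10
# ddcomp(x)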
ddcomp2<-function(x, steps = 5)
{# Need p > 1.
# Makes 4 DD plots using the MBA, FCH, RFCH and MB estimators.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function, and in R, highlight "stop."
# On Unix systems turn on a graphics device, eg enter
# the command "X11()" or "motif()" before using.
p <- dim(x)[2]
par(mfrow = c(2, 2))
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
#MBA
out <- covmba(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
RDm <- sqrt(rd2)
plot(MD, RDm)
abline(0, 1)
identify(MD, RDm)
title("MBA DD Plot")
#FCH
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
RDf <- sqrt(rd2)
plot(MD, RDf)
abline(0, 1)
identify(MD, RDf)
title("FCH DD Plot")
#RFCH
center <- out$rmnf
cov <- out$rcovf
rd2 <- mahalanobis(x, center, cov)
RDr <- sqrt(rd2)
plot(MD, RDr)
abline(0, 1)
identify(MD, RDr)
title("RFCH DD Plot")
#MB
center <- out$mnm
cov <- out$covm
rd2 <- mahalanobis(x, center, cov)
RDmb <- sqrt(rd2)
plot(MD, RDmb)
identify(MD, RDmb)
title("MB DD Plot")
par(mfrow = c(1, 1))
}

ddmv<-function(n = 100, p = 2, steps = 5, gam = 0.4, outtype = 2, est = 1)
{# Need p > 1.
# This function is used to determine when the DD
# plot separates outliers from non-outliers for various starts.
# Workstations need to activate a graphics
# device with the command "X11()" or "motif()."
# Advance the view with the right mouse button, and in R, highlight "stop."
## est = 1 for DGK, 2 for median ball, 3 for MAD
A <- sqrt(diag(1:p))
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
val <- floor(gam * n)
tem <- 10 + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
#if outtype = 1, outliers are Np(10 1, Ip), nonoutliers Np(0,Ip)
if(outtype == 2) x <- x %*% A
## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
## get the start
if(est == 1) {
#DGK classical start
covs <- var(x)
mns <- apply(x, 2, mean)
}
if(est == 2) {
#Median Ball high breakdown start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
if(est == 3) {
#MAD high breakdown start
tem <- apply(x, 2, mad)^2
covv <- diag(tem)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
## concentrate and plot, highlighting the outliers
MD <- sqrt(mahalanobis(x, mns, covs))
for(i in 1:steps) {
md <- sqrt(mahalanobis(x, mns, covs))
medd <- median(md)
mns <- apply(x[md <= medd, ], 2, mean)
covs <- var(x[md <= medd, ])
rd <- sqrt(mahalanobis(x, mns, covs))
plot(MD, rd)
points(MD[1:val], rd[1:val], pch = 15)
identify(MD, rd)
}
}
ddplot<-function(x, type = 5)
{# Makes a DD plot. Needs p > 1.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function,
# and in R, highlight "stop".
# R users need to type "library(MASS)" or "library(lqs)."
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, cmve for type = 6,
# rcmve for type = 7, rmvn for type = 8.
# On Unix systems turn on a graphics device, eg enter
# the command "X11()" or "motif()" before using.
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1){
out <- covmba(x)
}
if(type == 2){
out <- rmba(x)
}
if(type >= 3){
out <- cov.mcd(x)
}
if(type == 4) {
out <- covfch(x)
}
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) {
out <- cmve(x)
}
if(type == 7) {
out <- cmve(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 8) {
out <- covrmvn(x)
}
if(type != 5 && type != 7){
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RD <- const * rd
plot(MD, RD)
abline(0, 1)
identify(MD, RD)
# list(MD = MD, RD = RD)
}

ddplot2<-function(x, type = 2)
{# Need p > 1. Makes a DD plot with the RDi depending on type,
# as well as a MB DD plot.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function,
# and in R, highlight "stop".
# R users need to type "library(MASS)" or "library(lqs)."
# The plot uses covfch for the RDi if type = 1,
# rfch if type = 2 and cov.mcd if type = 3.
# On Unix systems turn on a graphics device, eg enter
# the command "X11()" or "motif()" before using.
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
outm <- covfch(x)
if(type == 1){
center <- outm$center
cov <- outm$cov
}
if(type == 2){
center <- outm$rmnf
cov <- outm$rcovf
}
if(type >= 3){
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RD <- const * rd
par(mfrow = c(1, 2))
plot(MD, RD)
abline(0, 1)
identify(MD, RD)
#if statement causes a bug
# if(type > 1)
rd2 <- mahalanobis(x, outm$mnm, outm$covm)
rd <- sqrt(rd2)
plot(MD, rd)
title("MB DD Plot")
identify(MD, rd)
# list(MD = MD, RD = RD)
par(mfrow = c(1, 1))
}
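## Example sketch (not part of the original rpack.txt) for ddplot above:
## for MVN data the plotted points should scatter about the identity line.
# x <- matrix(rnorm(300), ncol = 3)
# ddplot(x)    # type = 5 uses RFCH for the robust distances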
ddplot4<-function(x, alpha = 0.1)
{# Makes a DD plot with covrmvn used for the RDi.
# Need p > 1.
# Semiparametric prediction regions are added.
# Click the left mouse button to identify points.
# Click the right mouse button to end the function.
# On Unix systems turn on a graphics device, eg enter
# the command "X11()" or "motif()" before using.
p <- dim(x)[2]
n <- dim(x)[1]
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
out <- covrmvn(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
RD <- sqrt(rd2)
plot(MD, RD)
abline(0, 1)
#get the nonparametric prediction region boundary
cuplim <- sqrt(quantile(md2, up))
a <- min(RD)
b <- max(RD)
lines(c(cuplim, cuplim), c(b, a))
#get the semiparametric prediction region boundary
ruplim <- sqrt(quantile(rd2, up))
a <- min(MD)
b <- max(MD)
lines(c(a, b), c(ruplim, ruplim))
#get the parametric MVN prediction region boundary
mvnlim <- sqrt(qchisq(up, p))
b <- min(b, mvnlim)
lines(c(a, b), c(mvnlim, mvnlim))
identify(MD, RD)
list(cuplim = cuplim, ruplim = ruplim, mvnlim = mvnlim)
}

ddplot5<-function(x, mm = 0, kk = 5, steps = 9)
{# Plots Euclidean distances from the coordinatewise median
#vs. those from the covmb2 location estimator with 9 concentration type steps.
#Good plot for outlier detection even if p > n.
#Needs p > 1.
x <- as.matrix(x)
p <- dim(x)[2]
covv <- diag(p)
med <- apply(x, 2, median)
RDMED <- sqrt(mahalanobis(x, center = med, covv))
RDCOVMB2 <- sqrt(mahalanobis(x, center = covmb2(x, m = mm, k = kk,
msteps = steps)$center, covv))
plot(RDMED, RDCOVMB2)
#list(RDMED = RDMED, RDCOVMB2 = RDCOVMB2)
}

ddsim<-function(n = 100, p = 3, eps = 0.4, type = 5, ident = F)
{# Need p > 1. If ident = T, right click stop 10 times to see the plots.
# R: type "library(MASS)" or "library(lqs)."
# Rapidly plots 10 DD plots in a row.
# Unix: type "X11()" or "motif()" to
# turn on a graphics device.
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, rmvn for type = 6.
med <- 1:10
for(i in 1:10) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
## For elliptically contoured data, use:
#zu <- runif(n)
#x[zu < eps,] <- x[zu < eps,]*5
#x <- x^2
##For lognormal marginals, add:
#x <- exp(x)
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1) {
out <- covmba(x)
}
if(type == 2) {
out <- rmba(x)
}
if(type == 3) {
out <- cov.mcd(x)
}
if(type == 4) {
out <- covfch(x)
}
if(type < 5) {
center <- out$center
cov <- out$cov
}
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) {
out <- covrmvn(x)
}
if(type != 5) {
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
md <- sqrt(md2)
rd <- sqrt(rd2)
#Scale the RDi so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
rd <- const * rd
plot(md, rd)
abline(0, 1)
med[i] <- median(md)
#Use ident = T to slow down the plots.
if(ident == T) identify(md, rd)
}
list(med = med)
}
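## Example sketch (not part of the original rpack.txt) for ddplot5 above,
## which works even if p > n.
# x <- matrix(rnorm(20 * 100), nrow = 20)
# x[1:5, ] <- x[1:5, ] + 10
# ddplot5(x)    # the outliers appear in the upper right corner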
ddsim3<-function(n = 100, p = 4, xtype = 1, dd = 1, eps = 0.25, alpha = 0.1)
{# MAY NOT WORK IF p = 1. Calls ddplot4.
# Makes DD plots of the data used in the function "predsim". Gets coverages
# of the semiparametric and robust parametric prediction regions.
# Multiply x by A where xtype = 1 for MVN Np(0,I),
# 2, 3, 4 and 10 (with delta = eps) for (1-delta) Np(0,I) + delta Np(0, 25 I),
# 5 for lognormal,
# 6, 7, 8 and 9 for multivariate t_dd
# mahalanobis gives squared Maha distances
up <- min((1 - alpha/2), (1 - alpha + (50 * alpha)/n))
A <- sqrt(diag(1:p))
m <- n + 1
#make data
x <- matrix(rnorm(m * p), nrow = m, ncol = p)
if(xtype == 2) {
zu <- runif(m)
x[zu < 0.4, ] <- x[zu < 0.4, ] * 5
}
if(xtype == 3) {
zu <- runif(m)
x[zu < 0.6, ] <- x[zu < 0.6, ] * 5
}
if(xtype == 4) {
zu <- runif(m)
x[zu < 0.1, ] <- x[zu < 0.1, ] * 5
}
if(xtype == 5) x <- exp(x)
if(xtype == 6) {
zu <- sqrt(rchisq(m, 3)/3)
x <- x/zu
}
if(xtype == 7) {
zu <- sqrt(rchisq(m, 5)/5)
x <- x/zu
}
if(xtype == 8) {
zu <- sqrt(rchisq(m, 19)/19)
x <- x/zu
}
if(xtype == 9) {
zu <- sqrt(rchisq(m, dd)/dd)
x <- x/zu
}
if(xtype == 10) {
zu <- runif(m)
x[zu < eps, ] <- x[zu < eps, ] * 5
}
x <- x %*% A
xx <- x[ - m, ]
ddplot4(xx)
}

deav<-function(alpha = 0.01, k = 5)
{#gets n(asy var) for the alpha trimmed mean
#and T_(A,n)(k) if the errors are DE(0,1)
z <- - log(2 * alpha)
num <- 2 - (2 + 2 * z + z^2) * exp( - z)
den <- (1 - exp( - z)) * (1 - 2 * alpha)
val1 <- num/den
num <- 2 * alpha * z^2
den <- (1 - 2 * alpha)^2
ntmav <- val1 + num/den
zj <- k * log(2)
alphaj <- 0.5 * exp( - zj)
alphaj <- ceiling(100 * alphaj)/100
zj <- - log(2 * alphaj)
num <- 2 - (2 + 2 * zj + zj^2) * exp( - zj)
den <- (1 - exp( - zj)) * (1 - 2 * alphaj)
val1 <- num/den
num <- 2 * alphaj * zj^2
den <- (1 - 2 * alphaj)^2
natmav <- val1 + num/den
list(ntmav = ntmav, natmav = natmav)
}

deltv<-function(gam = 0.5)
{# Gets the asy var for lts(h) and lta(h) at the standard double
# exponential distribution where h/n -> gam.
k <- -1 * log(1 - gam)
num <- 2 - (2 + 2 * k + k^2) * exp( - k)
den <- (gam - k * exp( - k))^2
ltsv <- num/den
ltav <- 1/gam
list(ltsv = ltsv, ltav = ltav)
}

diagplot<-function(x, Y)
{# Scatterplot matrix of the OLS diagnostics.
# Workstations need to activate a graphics
# device with the command "X11()" or "motif()."
n <- length(Y)
rmat <- matrix(nrow = n, ncol = 7)
out <- lsfit(x, Y)
tem <- ls.diag(out)
rmat[, 1] <- tem$cooks
rmat[, 2] <- tem$hat
rmat[, 3] <- tem$std.res
rmat[, 4] <- tem$stud.res
rmat[, 5] <- tem$dfits
rmat[, 6] <- Y - out$resid
rmat[, 7] <- Y
pairs(rmat, labels = c("Cook's CD", "leverages", "stand resid",
"stud resid", "DFFITS", "YHAT", "Y"))
}
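## Example sketch (not part of the original rpack.txt):
# x <- matrix(rnorm(200), ncol = 2)
# y <- x %*% c(1, 2) + rnorm(100)
# diagplot(x, y)    # scatterplot matrix of the OLS diagnostics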
n <- length(Y) rmat <- matrix(nrow = n, ncol = 7) out <- lsfit(x, Y) tem <- ls.diag(out) rmat[, 1] <- tem$cooks rmat[, 2] <- tem$hat rmat[, 3] <- tem$std.res rmat[, 4] <- tem$stud.res rmat[, 5] <- tem$dfits rmat[, 6] <- Y - out$resid rmat[, 7] <- Y pairs(rmat, labels = c("Cook's CD", "leverages", "stand resid", "stud resid", "DFFITS", "YHAT", "Y")) } drsim5<-function(n=100,q=4,nruns=100,h=2,tr=10,xtype=1,mtype=1,eps=0.4,dd=5) {# MAY NOT WORK IF q = 1 # Needs dr library so only works in R. Uses FCH. # Finds ave(esp), scaled sd, ave scaled ols se, ave scaled chen and li se for tvreg(M) estimator. # CHANGE A IF q IS NOT EVEN # Change beta, q, and A to change Ho. # Use tr = 1 for 90%, tr = 2 for 80%, ..., tr = 10 for 0% trimming # xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 for eps Nq(0,I) + (1-eps) Nq(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd, set.seed(974) k <- q/2 # test Ho beta(q/2 + 1) = ... = beta(q) = 0 beta <- 0 * 1:q beta[1:q/2] <- 1 bols <- matrix(0, nrow = nruns, ncol = (q + 1)) bsir <- matrix(0, nrow = nruns, ncol = q) bwsir <- bsir sseols <- matrix(0, nrow = nruns, ncol = q) ssecl <- sseols zols <- 0 * 1:nruns fols <- zols zsir <- zols zwsir <- zsir labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") A2 <- diag(q/2) A1 <- 0 * A2 A <- cbind(A1, A2) tem <- seq(0.1, 1, 0.1) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(xtype == 2){ zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8){ zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } sp <- 1 + x %*% beta if(mtype == 1) y <- sp + rnorm(n) if(mtype == 2) y <- sp^2 + rnorm(n) if(mtype == 3) y <- exp(sp) + rnorm(n) if(mtype == 4) y <- (sp)^3 + rnorm(n) if(mtype == 5) y <- sin(sp)/sp + 0.01*rnorm(n) if(mtype == 6) y <- sp + sin(sp) + 0.1*rnorm(n) if(mtype == 7) y <- sqrt(abs(sp)) + 0.1*rnorm(n) #do trimming out <- covfch(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, tem[tr]) #get untrimmed data xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get ESP from untrimmed data out <- lsfit(xm, ym) bols[i, ] <- out$coef resm <- out$resid nm <- length(ym) msem <- t(resm) %*% resm/(nm - q - 1) cxm <- var(xm) #get OLS SE which is usually incorrect cxminv <- solve(cxm) sseols[i, ] <- sqrt(n/nm) * sqrt(msem * diag(cxminv)) #get OLS chisquare test statistic bhat <- bols[i, -1] covAb <- A %*% cxminv %*% t(A) covinv <- solve(covAb) cent <- t(A) %*% covinv %*% A num <- t(bhat) %*% cent %*% bhat zols[i] <- (nm * num)/msem #get OLS partial F statistic fols[i] <- ((nm - 1) * zols[i])/(k * nm) #get Chen and Li scaled SE mnxm <- apply(xm, 2, mean) mid <- matrix(0, nrow = q, ncol = q) for(j in 1:nm) mid <- mid + resm[j]^2 * (xm[j, ] - mnxm) %*% t(xm[j, ] - mnxm) mid <- mid/nm cmcl <- cxminv %*% mid %*% cxminv ssecl[i, ] <- sqrt(n/nm) * sqrt(diag(cmcl)) #get Statlib SIR scaled SE and test statistics out <- sir(xm, ym, h) bsir[i, ] <- out$edr[, 1] bhat <- bsir[i, ] lam <- out$evalues[1] val <- (1 - lam)/lam num <- t(bhat) %*% cent %*% bhat zsir[i] <- (nm * num)/val #get Weisberg SIR scaled SE and test statistics 
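# The dr() call below (from the dr library) fits SIR with h slices to the
# trimmed data; zwsir scales the quadratic form in the first eigenvector by
# (1 - lam)/lam, where lam is the largest eigenvalue, matching the Statlib
# sir test statistic computed above.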
out <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h) bwsir[i,] <- out$evectors[,1] bhat <- bwsir[i, ] lam <- out$evalues[1] val <- (1 - lam)/lam num <- t(bhat) %*% cent %*% bhat zwsir[i] <- (nm * num)/val } print(labs[tr]) print(beta) mnbsir <- apply(bsir, 2, mean) mnwbsir <- apply(bwsir,2,mean) mnbols <- apply(bols, 2, mean) ssdbols <- sqrt(n) * sqrt(apply(bols, 2, var)) mnsseols <- apply(sseols, 2, mean) mnssecl <- apply(ssecl, 2, mean) zolslev <- sum(zols > qchisq(0.95, k))/nruns folslev <- sum(fols > qf(0.95, k, nm - q - 1))/nruns zsirlev <- sum(zsir > qchisq(0.95, k))/nruns zwsirlev <- sum(zwsir > qchisq(0.95, k))/nruns list(mnbsir = mnbsir, mnwbsir= mnwbsir, mnbols = mnbols, ssdbols = ssdbols, mnsseols = mnsseols, mnssecl = mnssecl, zolslev = zolslev, folslev = folslev, zsirlev = zsirlev, zwsirlev=zwsirlev) } drsim6<-function(n = 100,q = 4,nruns = 100,xtype = 1,mtype = 1,eps = 0.4,dd = 5) {# MAY NOT WORK IF q = 1 # Needs dr library so only works in R. Uses FCH. # Finds ave(esp), and rejection proportions for 0% and adaptive # trimming OLS estimators. # CHANGE A IF q IS NOT EVEN # xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 for eps Nq(0,I) + (1-eps) Nq(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d, with d = 3, 5, 19 or dd set.seed(974) k <- q/2 beta <- 0 * 1:q beta[1:q/2] <- 1 esp <- matrix(0, nrow = n, ncol = 10) bols <- matrix(0, nrow = nruns, ncol = (q + 1)) baols <- bols fols <- 0 * 1:nruns best <- fols atrp <- best faols <- fols faolslev <- 0 labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") A2 <- diag(q/2) A1 <- 0 * A2 A <- cbind(A1, A2) tem <- seq(0.1, 1, 0.1) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } sp <- 1 + x %*% beta if(mtype == 1) y <- sp + rnorm(n) if(mtype == 2) y <- sp^2 + rnorm(n) if(mtype == 3) y <- exp(sp) + rnorm(n) if(mtype == 4) y <- (sp)^3 + rnorm(n) if(mtype == 5) y <- sin(sp)/sp + 0.01 * rnorm(n) if(mtype == 6) y <- sp + sin(sp) + 0.1 * rnorm(n) if(mtype == 7) y <- sqrt(abs(sp)) + 0.1 * rnorm(n) #get OLS for 0% trimming xm <- x ym <- y #get ESP out <- lsfit(xm, ym) bols[i, ] <- out$coef resm <- out$resid nm <- length(ym) msem <- t(resm) %*% resm/(nm - q - 1) cxm <- var(xm) cxminv <- solve(cxm) #get OLS chisquare test statistic bhat <- bols[i, -1] covAb <- A %*% cxminv %*% t(A) covinv <- solve(covAb) cent <- t(A) %*% covinv %*% A num <- t(bhat) %*% cent %*% bhat zols <- (nm * num)/msem #get OLS partial F statistic fols[i] <- ((nm - 1) * zols)/(k * nm) #get covfch estimator out <- covfch(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) #get best trimming proportion btr <- 10 esp[, 10] <- x %*% bhat bcorr <- abs(cor(esp[, 10], sp)) for(j in 1:9) { val <- quantile(rd2, tem[j]) #get (xm,ym) xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get ESP from (xm,ym) out <- lsfit(xm, ym)$coef[-1] esp[, j] <- x %*% out tcorr <- abs(cor(esp[, j], sp)) if(tcorr > bcorr) btr <- j } best[i] <- btr #get adaptive trimming proportion atr <- btr if(btr < 10) { 
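# Adaptive trimming: starting from the best trimming proportion btr, move
# toward 0% trimming (j = 10) whenever the ESP for trimming proportion j
# stays highly correlated (|cor| > 0.95) with the ESP for btr.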
for(j in (btr + 1):10) { tval <- abs(cor(esp[, j], esp[, btr])) if(tval > 0.95) atr <- j } } atrp[i] <- atr val <- quantile(rd2, tem[atr]) #get data from adaptive trimming xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get ESP from adaptive trimming out <- lsfit(xm, ym) baols[i, ] <- out$coef resm <- out$resid nm <- length(ym) msem <- t(resm) %*% resm/(nm - q - 1) cxm <- var(xm) cxminv <- solve(cxm) #get OLS chisquare test statistic bhat <- baols[i, -1] covAb <- A %*% cxminv %*% t(A) covinv <- solve(covAb) cent <- t(A) %*% covinv %*% A num <- t(bhat) %*% cent %*% bhat zols <- (nm * num)/msem #get OLS partial F statistic faols[i] <- ((nm - 1) * zols)/(k * nm) #check whether adaptive trimming rejects Ho if(faols[i] > qf(0.95, q/2, nm - q - 1)) faolslev <- faolslev + 1 } print(labs[btr]) print(beta) mnbols <- apply(bols, 2, mean) ssdbols <- sqrt(n) * sqrt(apply(bols, 2, var)) folslev <- sum(fols > qf(0.95, q/2, n - q - 1))/nruns mnbaols <- apply(baols, 2, mean) ssdbaols <- sqrt(n) * sqrt(apply(baols, 2, var)) faolslev <- faolslev/nruns list(mnbols = mnbols, mnbaols = mnbaols, ssdbols = ssdbols, ssdbaols = ssdbaols, folslev = folslev, faolslev = faolslev, best = best, atrp = atrp) } drsim7<-function(n=100,p = 4,h=4,steps = 5,gam=0.49,nruns=100,drtype=1,outliers=F) {# Need to modify dr formulas if p is not equal to 4. # This R function generates outlier data. Uses FCH. # Compares SIR, SAVE, PHD and OLS for 0 and 50% trimming # Use drtype = 1 for sir, 2 for save, 3 for phd, 4 for ols # Needs library dr. set.seed(974) beta1 <- 0 * 1:p beta1[1] <- 1 bwdr1 <- matrix(0, nrow = nruns, ncol = p) b50 <- bwdr1 mbaloc <- 0 * (1:p) mbasig <- 0 * diag(1:p) for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) sp1 <- 1 + x %*% beta1 y <- (sp1)^3 + rnorm(n) ## code below: x outliers have mean +/- (10, 10, ..., 10)^T ## and y outliers have mean 0 if(outliers == T) { val <- floor(gam * n) val2 <- as.integer(val/2) tem <- 10 + 0 * 1:p x[1:val2, ] <- x[1:val2, ] + tem x[(val2+1):val,] <- x[(val2+1):val,] - tem y[1:val] <- rnorm(val) } out <- covfch(x, csteps = steps) mbaloc <- mbaloc + out$center mbasig <- mbasig + out$cov #do trimming center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, 0.5) #get untrimmed data xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get Weisberg SIR, SAVE or PHD if(drtype==1){ out <- dr(y~x[,1]+x[,2]+x[,3]+x[,4],nslices=h,method="sir") outm <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h,method="sir") } if(drtype==2){ out <- dr(y~x[,1]+x[,2]+x[,3]+x[,4],nslices=h,method="save") outm <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h,method="save") } if(drtype==3){ out <- dr(y~x[,1]+x[,2]+x[,3]+x[,4],nslices=h,method="phd") outm <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h,method="phd") } if(drtype==4){ out <- lsfit(x,y) outm <- lsfit(xm,ym) } if(drtype == 4){ bwdr1[i,] <- out$coef[-1] b50[i,] <- outm$coef[-1] } else{ bwdr1[i,] <- out$evectors[,1] b50[i,] <- outm$evectors[,1] } } mbaloc <- mbaloc/nruns mbasig <- mbasig/nruns mnwbdr1 <- apply(bwdr1,2,mean) mnb50 <- apply(b50,2,mean) list(mnwbdr1=mnwbdr1, mnb50 = mnb50, mbaloc = mbaloc, mbasig = mbasig) } ellipse<-function(x, center = apply(x, 2, mean), cov = var(x), alph = 0.95) {# Makes a covering interval. The x should have 2 columns. 
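# A minimal usage sketch for ellipse with bivariate data (illustrative values):
# x <- matrix(rnorm(200), ncol = 2)
# ellipse(x, alph = 0.9)   #data overlaid with the boundary of a roughly
#                          #90% covering region for MVN data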
mu1 <- center[1] mu2 <- center[2] w <- solve(cov) w11 <- w[1, 1] w12 <- w[1, 2] w22 <- w[2, 2] tem <- x[, 2] - mu2 y2 <- seq(min(tem), max(tem), length = 100) xc <- qchisq(alph, 2) el <- matrix(0, 2, 2) ind <- 0 for(i in 1:100) { j1 <- (y2[i] * w12)^2 j2 <- w11 * ((y2[i])^2 * w22 - xc) # print(i) # print(j1 - j2) if((j1 - j2) >= 0) { ind <- ind + 2 tem <- (y2[i] * w12)^2 tem <- tem - w11 * ((y2[i])^2 * w22 - xc) tem <- sqrt(tem) term <- ( - y2[i] * w12 + tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) term <- ( - y2[i] * w12 - tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) } } el <- el[3:ind, ] nn <- dim(x)[1] if((ind - 2) > nn) { tem <- sample((ind - 2), nn) el <- el[tem, ] } xt <- cbind(x[, 1], el[, 1]) yt <- cbind(x[, 2], el[, 2]) matplot(xt, yt) } enet<-function(x,y,am=10){ #Gets elastic net output using 10-fold CV using a grid of alpha values #0, 1/am, 2/am, ..., am/am. #Needs library(glmnet). #Use am = 10, 20, 50, or 100. Larger am values take longer. x<-as.matrix(x) out<-cv.glmnet(x,y,alpha=0) lam <- out$lambda.min #value of lambda that minimizes 10 fold CV cvmin <- min(out$cvm) alph = 0 for(i in 1:am){ tem <- cv.glmnet(x,y,alpha=i/am) mcv <- min(tem$cvm) if(mcv < cvmin){ out <- tem lam <- out$lambda.min cvmin <- mcv alph <- i/am} } yhat <- predict(out,s=lam,newx=x) res <- y - yhat plot(yhat,y) abline(0,1) list(out=out, yhat=yhat, res=res, lam = lam, alph = alph) } enet2<-function(x,y,am=10){ #Gets elastic net output using 10-fold CV using a grid of alpha values #0, 1/am, 2/am, ..., am/am. Does not make the response plot made by enet. #Needs library(glmnet). #Use am = 10, 20, 50, or 100. Larger am values take longer. x<-as.matrix(x) out<-cv.glmnet(x,y,alpha=0) lam <- out$lambda.min #value of lambda that minimizes 10 fold CV cvmin <- min(out$cvm) alph = 0 for(i in 1:am){ tem <- cv.glmnet(x,y,alpha=i/am) mcv <- min(tem$cvm) if(mcv < cvmin){ out <- tem lam <- out$lambda.min cvmin <- mcv alph <- i/am} } yhat <- predict(out,s=lam,newx=x) res <- y - yhat list(out=out, yhat=yhat, res=res, lam = lam, alph = alph) } essp<- function(x, Y, M = 50, type = 3) {# Trimmed view or ESSP for M percent # trimming. Allows visualization of g # and crude estimation of c beta in models # of the form y = g(x^T beta,e). # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(MASS)" or "library(lqs)." # Click on the right mouse button to finish, and # in R, highlight "stop." 
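# A minimal usage sketch for essp with a single index model (illustrative
# values; the default type = 3 needs the rpack function covfch):
# x <- matrix(rnorm(300), ncol = 3)
# Y <- (x %*% c(1, 1, 1))^3 + rnorm(100)
# bhat <- essp(x, Y, M = 50)   #right mouse button (then "stop" in R) to exit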
x <- as.matrix(x) tval <- M/100 if(type == 1) out <- cov.mcd(x) if(type == 2) out <- covmba(x) if(type == 3) out <- covfch(x) if(type == 4) out <- covrmvn(x) if (type <= 4){ center <- out$center cov <- out$cov} #type = 5 for rfch estimator if (type == 5){ out <- covfch(x) center <- out$rmnf cov <- out$rcovf} rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, (1 - tval)) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% bhat[-1] + bhat[1] plot(ESP, Y) identify(ESP, Y) return(bhat[-1]) } ffL<-function(x, y) {# for unix, use X11() to turn on the graphics device before using this function # this function makes a FF lambda plot where the competing models are Y^L n <- length(y) rmat <- matrix(nrow = n, ncol = 5) rmat[, 1] <- y - lsfit(x, y)$resid ytem <- (y^(0.5) - 1)/0.5 rmat[, 2] <- ytem - lsfit(x, ytem)$resid rmat[, 3] <- log(y) - lsfit(x, log(y))$resid ytem <- (y^(-0.5) - 1)/-0.5 rmat[, 4] <- ytem - lsfit(x, ytem)$resid ytem <- (y^(-1) - 1)/-1 rmat[, 5] <- ytem - lsfit(x, ytem)$resid pairs(rmat, labels = c("YHAT", "YHAT^(0.5)", "YHAT^(0)", "YHAT^(-0.5)", "YHAT^(-1)")) min(cor(rmat)) } fflynx<-function() {# R users need to type data(lynx) Y <- log10(lynx) FAR2 <- 1:114 FAR11 <- 1:114 FAR12 <- 1:114 SETAR272 <- 1:114 SETAR252 <- 1:114 for(i in 3:114){ FAR2[i ] <- 1.05 + 1.41*Y[i-1] -0.77*Y[i-2]} for(i in 12:114){ FAR11[i ] <- 1.13*Y[i-1] -0.51*Y[i-2] + .23*Y[i-3] -0.29*Y[i-4] + .14*Y[i-5] -0.14*Y[i-6] + 0.08*Y[i-7] -0.04*Y[i-8] + .13*Y[i-9] + 0.19*Y[i-10] - .31*Y[i-11] } for(i in 13:114){ FAR12[i ] <- 1.123 + 1.084*Y[i-1] -0.477*Y[i-2] + .265*Y[i-3] -0.218*Y[i-4] + .180*Y[i-9] - .224*Y[i-12] } for(i in 13:114){ if( Y[i-2] <= 3.116){ SETAR272[i ] <- 0.546 + 1.032*Y[i-1] -0.173*Y[i-2] + .171*Y[i-3] -0.431*Y[i-4] + .332*Y[i-5] - .284*Y[i-6] + .210*Y[i-7]} else {SETAR272[i ] <- 2.632 + 1.492*Y[i-1] -1.324*Y[i-2]} } for(i in 13:114){ if( Y[i-2] <= 3.05){ SETAR252[i ] <- 0.768 + 1.064*Y[i-1] -0.200*Y[i-2] + .164*Y[i-3] -0.428*Y[i-4] + .181*Y[i-5] } else {SETAR252[i ] <- 2.254 + 1.474*Y[i-1] -1.202*Y[i-2]} } x <- cbind(Y,FAR2,FAR11,FAR12,SETAR272,SETAR252) x <- x[13:114,] print(cor(x)) pairs(x) } ffplot<-function(x, y, nsamps = 7){ # For Splus since R does not have l1fit. # For Unix, use X11() to turn on the graphics device before # using this function. # Makes an FF plot with several resistant estimators. # Needs mbareg and hbreg. n <- length(y) rmat <- matrix(nrow = n, ncol = 7) lsfit <- y - lsfit(x, y)$residuals print("got OLS") l1fit <- y - l1fit(x, y)$residuals print("got L1") almsfit <- y - lmsreg(x, y)$resid print("got ALMS") altsfit <- y - ltsreg(x, y)$residuals print("got ALTS") mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBAfit <- mbacoef[1] + x %*% mbacoef[-1] print("got MBA") bbcoef <- hbreg(x, y)$bbcoef BBfit <- bbcoef[1] + x %*% bbcoef[-1] print("got BB") rmat[, 1] <- y rmat[, 2] <- lsfit rmat[, 3] <- l1fit rmat[, 4] <- almsfit rmat[, 5] <- altsfit rmat[, 6] <- MBAfit rmat[, 7] <- BBfit pairs(rmat, labels = c("Y", "OLS Fit", "L1 Fit", "ALMS Fit", "ALTS Fit", "MBA Fit", "BB Fit")) } ffplot2<-function(x, y, nsamps = 7){ # For Unix, use X11() to turn on the graphics device before # using this function. For R, first type library(MASS). # Makes an FF plot with several resistant estimators. # Needs hbreg, mbareg, mbalata, and rmreg3. 
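# A minimal usage sketch for ffplot2 (assumes library(MASS) and the rpack
# functions hbreg, mbareg, mbalata, and rmreg3 are loaded; illustrative values):
# x <- matrix(rnorm(300), ncol = 3)
# y <- x %*% c(1, 1, 1) + rnorm(100)
# ffplot2(x, y)   #scatterplot matrix comparing the fitted values to Y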
n <- length(y) x <- as.matrix(x) rmat <- matrix(nrow = n, ncol = 8) lsfit <- y - lsfit(x, y)$residuals print("got OLS") almsfit <- y - lmsreg(x, y)$resid print("got ALMS") altsfit <- y - ltsreg(x, y)$residuals print("got ALTS") mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBAfit <- mbacoef[1] + x %*% mbacoef[-1] print("got MBA") bbcoef <- hbreg(x, y)$bbcoef BBfit <- bbcoef[1] + x %*% bbcoef[-1] print("got BB") mbalcoef <- mbalata(x, y, nsamp = nsamps)$coef MBALfit <- mbalcoef[1] + x %*% mbalcoef[-1] print("got MBALATA") RMREG2 <- y - rmreg3(x,y)$res rmat[, 1] <- y rmat[, 2] <- lsfit rmat[, 3] <- almsfit rmat[, 4] <- altsfit rmat[, 5] <- MBAfit rmat[, 6] <- BBfit rmat[, 7] <- MBALfit rmat[, 8] <- RMREG2 pairs(rmat, labels = c("Y", "OLS Fit", "ALMS Fit", "ALTS Fit", "MBA Fit", "BB Fit", "MBALATA", "RMREG2")) } fselboot<-function(x,y,B = 1000){ #needs library(leaps), n > 5p, p > 2 #bootstrap min Cp model forward selection regression #Does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) bhatimin0 <- 0*1:p indx <- c(1,1+vin) bhatimin0[indx] <- sub$coef betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] vin <- vars[mincp[-1]] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,bhatimin0=bhatimin0,betas=betas) } fselboot2<-function(x,y,B=1000,c=0.01,aug=F){ #needs library(leaps), n > 5p, p > 2 #bootstrap min Cp model forward selection regression #using bhatVS and bhatMIX #If aug not=F, adds cB full model bootstrap samples. #Then S_T^* will be better conditioned. #Does not make sense to do variable selection if there #is only one nontrivial predictor. 
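# A minimal usage sketch for fselboot2 (needs library(leaps); illustrative call):
# out <- fselboot2(x, y, B = 1000, aug = T)
# out$betas   #bootstrap matrix for the variable selection estimator bhatVS
# out$btmix   #bootstrap matrix for bhatMIX
# shorth3 from this collection can then give a CI from each column.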
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
vmax <- min(p,as.integer(n/5))
vars <- as.vector(1:(p-1))
d <- ceiling(c*B)
bpd <- B + d
bp1 <- B + 1
#get the full model
full <- lsfit(x,y)
res <- full$resid
fit <- y - res
#get the minimum Cp submodel
tem<-regsubsets(x,y,nvmax=vmax,method="forward")
out<-summary(tem)
mincp <- out$which[out$cp==min(out$cp)]
#do not need the constant in vin
vin <- vars[mincp[-1]]
sub <- lsfit(x[,vin],y)
bhatimin0 <- 0*1:p
indx <- c(1,1+vin)
bhatimin0[indx] <- sub$coef
betas <- matrix(0,nrow=bpd,ncol=p)
btmix <- betas
#bootstrap the minimum Cp submodel
for(i in 1:B){
yb <- fit + sample(res,n,replace=T)
tem<-regsubsets(x,y=yb,method="forward")
out<-summary(tem)
mincp <- out$which[out$cp==min(out$cp)]
vin <- vars[mincp[-1]]
indx <- c(1,1+vin)
betas[i,indx] <- lsfit(x[,vin],yb)$coef
yb <- fit + sample(res,n,replace=T)
btmix[i,indx] <- lsfit(x[,vin],yb)$coef
}
if(aug == F) {betas <- betas[1:B,]; btmix <- btmix[1:B,]} else {
for(i in bp1:bpd){
yb <- fit + sample(res,n,replace=T)
betas[i,] <- lsfit(x,yb)$coef
}
btmix[bp1:bpd,] <- betas[bp1:bpd,]
}
list(full=full,sub=sub,bhatimin0=bhatimin0,betas=betas,btmix=btmix)
}
fysim<-function(runs = 20)
{# 20 FY response plots for simulated AR(2) time series data
fycorr <- 1:runs
for(i in 1:runs){
Y <- ardata()$arts
out <- ar.yw(Y)
Yts <- Y[10:200]
FIT <- Yts - out$resid[10:200]
plot(FIT,Yts)
abline(0,1)
fycorr[i] <- cor(FIT,Yts)
}
list(fycorr=fycorr)
}
gamper<-function(h, k=500)
{#estimates the amount of contamination FLTS can tolerate
n <- 10000
c <- 5000
gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/h))) * 100
print(gam0)
}
gamper2<-function(p, k = 500)
{#estimates the amount of contamination fmcd can tolerate
n <- 10000
c <- 5000
h <- p + 1
gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/h))) * 100
print(gam0)
}
getB<-function(x, m=0, k= 5, msteps = 0){
# Gets the covmb2 subset B and the index of cases indx.
# Best if p > n.
# You can estimate the number of clean cases m > n/2 with the plot: out<-medout(x)
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
index <- 1:n
covv <- diag(p)
med <- apply(x, 2, median)
#Get squared Euclidean distances from the coordinatewise median.
md2 <- mahalanobis(x, center = med, covv)
if(m==0){
if(msteps > 0){#do concentration type steps
for(i in 1:msteps){
medd <- median(md2)
medw <- apply(x[md2<=medd,], 2, median)
md2 <- mahalanobis(x, center = medw, covv)
}
}
md <- sqrt(md2)
mcut <- median(md) + k*mad(md,constant=1)
} else{ #Use m cases with the smallest distances.
md <- sqrt(md2)
mcut <- sort(md)[m]
}
B <- x[md <= mcut,]
indx <- index[md <= mcut]
list(B = B, indx=indx)
}
getu<-function(x, csteps = 5, locc = 0.5){
# Gets the RMVN subset U and the index of cases indx.
# Needs number of predictors > 1.
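# A minimal usage sketch for getu: the classical estimator applied to the
# RMVN subset U uses only the cases kept by the reweighted estimator:
# out <- getu(x)
# apply(out$U, 2, mean); var(out$U)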
x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] index <- 1:n up <- qchisq(0.975, p) qchi <- qchisq(0.5, p) ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc)## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(x, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) to get the cov estimator ## (rmnmvn,rcovmvn) tailored for MVN data rd2 <- mahalanobis(x, mnf, covf) rmnmvn <- apply(x[rd2 <= up, ], 2, mean) rcovmvn <- var(x[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(x, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, p) rcovmvn <- const * rcovmvn ## get U rd2 <- mahalanobis(x, rmnmvn, rcovmvn) U <- x[rd2 <= up, ] indx <- index[rd2 <= up] list(U = U, indx=indx) } getubig<-function(x, group){ # Gets Ubig for discriminant analysis, binary regression, or one way MANOVA. #Let x contains data, group is vector with group[i] = j #if ith row is a case from the jth group, j = 1, ..., g. #Need each group size nh > 2(p+1). Would like nh > 20p. g <- max(group) x <- as.matrix(x) indx <- 1:2 tindx <- 1:dim(x)[1] #get the cases used in the RMVN set from each group for(i in 1:g){ xi <- as.matrix(x[group==i,]) tind <- tindx[group==i] tem <- getu(xi) tind <- tind[tem$indx] indx <- c(indx,tind) } indx <- indx[-c(1,2)] Ubig <- x[indx,] grp <- group[indx] list(Ubig=Ubig,indx=indx,grp=grp) } getuc<-function(x, group){ # Gets Uc for discriminant analysis or one way MANOVA # assuming the g groups or populations only differ in mean. #So Yij - muj are iid where j = 1, ..., g. BIG ASSUMPTION! #Subtracts the coordinatewise median from each group. #Then gets the RMVN set from the 0 median data. #Then gets the corresponding x data. #Let x contains data, group is vector with group[i] = j #if ith row is a case from the jth group, j = 1, ..., g. #Need each group size nj > 2(p+1). Would like nj > 20p. g <- max(group) x <- as.matrix(x) Uz <- x[1:2,] #Subtract the group coordinatewise median from data in each group. #Combine the groups into one data set Uz of nearly iid data #if the medians approximate the means. for(i in 1:g){ xi <- as.matrix(x[group==i,]) med <- apply(xi, 2, median) #one <- 1 + 0*1:dim(xi)[1];xc <- xi - one%*%t(med) xc <- sweep(xi,2,med) Uz <- rbind(Uz,xc) } Uz <- Uz[-c(1,2),] out <- getu(Uz) indx <- out$indx Uc <- x[indx,] grp <- group[indx] list(Uc=Uc,indx=indx,grp=grp) } hbplot<-function(x, Y, aa = 1.4, rr = T, typ = 1) {# Makes the response plot for hbreg estimator and attractors. #Click on right mouse button to advance plot, and in R, highlight #"stop." 
# For R, type "library(MASS)" before using this function. # The hbreg estimator uses MBA if typ = 1, ltsreg if typ = 2, # mbalata if typ = 3. x <- as.matrix(x) out <- hbreg(x, Y, a = aa, rreg = rr, type = typ) bhat <- out$coef HBFIT <- bhat[1] + x %*% bhat[-1] bhat <- out$olscoef OLSFIT <- bhat[1] + x %*% bhat[-1] bhat <- out$arobcoef AROBFIT <- bhat[1] + x %*% bhat[-1] bhat <- out$bbcoef BBFIT <- bhat[1] + x %*% bhat[-1] par(mfrow = c(2, 2)) plot(HBFIT, Y) abline(0, 1) identify(HBFIT, Y) plot(OLSFIT, Y) abline(0, 1) identify(OLSFIT, Y) plot(AROBFIT, Y) abline(0, 1) identify(AROBFIT, Y) plot(BBFIT, Y) abline(0, 1) identify(BBFIT, Y) par(mfrow = c(1, 1)) } hbreg<-function(x, y, csteps = 10, a = 1.4, rreg = T, type = 1) {# Gets the hbreg estimator with 10 concentration steps applied to # the HB attractor BB. Uses LTA criterion to screen the 3 attractors. # For R, type "library(MASS)" before using this function. # If rreg = F, arobcoef = olscoef, if T, arobcoef uses # mbareg if type = 1, ltsreg if type = 2, mbalata if type = 3 # rmreg2 if type =4. med <- median(y) madd <- mad(y, constant = 1) x <- as.matrix(x) n <- dim(x)[1] nc <- dim(x)[2] + 1 #nc is number of predictors including intercept temp <- lsfit(x, y) hbf <- temp$coef absres <- abs(temp$residuals) indx <- (absres <= median(absres)) critf <- sum(absres[indx]) #mbareg or lmsreg may work better than ltsreg if(rreg == T){ if(type == 1){ tem <- mbareg(x, y) temres <- y - tem$coef[1] - x %*% tem$coef[-1] absres <- abs(temres)} if(type == 2){ tem <- ltsreg(x, y) absres <- abs(tem$residuals)} if(type == 3){ tem <- mbalata(x,y) temres <- y - tem$coef[1] - x %*% tem$coef[-1] absres <- abs(temres)} if(type == 4){ tem <- rmreg3(x,y) temres <- y - tem$Bhat[1] - x %*% tem$Bhat[-1] absres <- abs(temres)} indx <- (absres <= median(absres)) crit <- a*sum(absres[indx]) if(crit < critf) { critf <- crit hbf <- tem$coef} } #get y's closest to median y indx <- (y >= (med - madd) & y <= (med + madd)) #get the HB attractor bk = BB bk <- lsfit(x[indx, ], y[indx])$coef res <- y - (x %*% bk[2:nc] + bk[1]) ressq <- res^2 m <- median(ressq) for(i in 1:csteps) { indx <- (ressq <= m) bk <- lsfit(x[indx, ], y[indx])$coef res <- y - (x %*% bk[2:nc] + bk[1]) ressq <- res^2 m <- median(ressq) } bb <- bk res <- y - (x %*% bb[2:nc] + bb[1]) absres <- abs(res) indx <- (absres <= median(absres)) crit <- a*sum(absres[indx]) if(crit < critf) hbf <- bb if(rreg == T && type != 4) arobcoef <- tem$coef else if (rreg == T && type == 4) arobcoef <- as.vector(tem$Bhat) else arobcoef <- temp$coef list(coef = hbf, olscoef = temp$coef, arobcoef = arobcoef, bbcoef = bb) } hbregsim<-function(n = 100, p = 4, csteps = 10, nruns = 10, aa = 1.4, rr = T, typ = 1, sige = 1, sigx = 1, etype = 1){ # Simulates MLR model and gets fits from hbreg. # For R, type "library(MASS)" before using this function. # For MLR, want mnbhat = (1, ..., 1), sd(i) = 1\sqrt(n). # If rr = F, arobcoef = olscoef, if T, arobcoef uses # mbareg if typ = 1, ltsreg if typ = 2, mbalata if typ = 3, rmreg3 if typ = 4. 
# errors are normal if etype = 1, shifted exponential for etype=2, shifted lognormal if etype=3 # q <- p - 1 beta <- 1 + 0 * 1:q # beta <- 1:q hbhat <- matrix(0, nrow = nruns, ncol = p) olshat <- hbhat arobhat <- hbhat bbhat <- hbhat for(i in 1:nruns) { x <- matrix(rnorm(n * q, sd = sige), nrow = n, ncol = q) if (etype == 1) err <- rnorm(n, sd = sigx) if (etype == 2) err <- sige*(rexp(n) - 1) if (etype == 3) err <- sige*(exp(rnorm(n)) - exp(0.5)) y <- 1 + x %*% beta + err out <- hbreg(x, y, csteps = csteps, a = aa, rreg = rr, type = typ) hbhat[i, ] <- out$coef olshat[i, ] <- out$olscoef arobhat[i, ] <- out$arobcoef bbhat[i, ] <- out$bbcoef } mnhbhat <- apply(hbhat, 2, mean) sdhbhat <- sqrt(apply(hbhat, 2, var)) mnolshat <- apply(olshat, 2, mean) sdolshat <- sqrt(apply(olshat, 2, var)) mnarobhat <- apply(arobhat, 2, mean) sdarobhat <- sqrt(apply(arobhat, 2, var)) mnbbhat <- apply(bbhat, 2, mean) sdbbhat <- sqrt(apply(bbhat, 2, var)) list(mnhbhat = mnhbhat, sdhbhat = sdhbhat, mnolshat = mnolshat, sdolshat = sdolshat, mnarobhat = mnarobhat, sdarobhat = sdarobhat, mnbbhat = mnbbhat, sdbbhat = sdbbhat) } lassoboot2<-function(x,y,B = 1000,regtype=1,c=0.01,aug=F){ #needs library(glmnet), n > 5p, p > 2 #bootstrap lasso or ridge regression #If aug not=F, adds cB full model bootstrap samples. #Does not make sense to do variable selection if there #is only one nontrivial predictor. #Change regtype = 0 for ridge regression x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] d <- ceiling(c*B) bpd <- B + d bp1 <- B + 1 #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res betas <- matrix(0,nrow=bpd,ncol=p) #bootstrap the lasso or ridge regression model for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-cv.glmnet(x,y=yb,alpha=regtype) betas[i,] <- as.vector(predict(tem,type="coefficients",s=tem$lambda.min)) } if(aug == F) {betas <- betas[1:B,]} else { for(i in bp1:bpd){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } } list(betas=betas) } lassobootsim5<-function(n = 100, p = 4, k=1, nruns = 100, eps = 0.1, shift = 9, cc=0.01, augm=F, type = 1, psi = 0.0, regtype=1, BB=1000, alph = 0.05){ #needs library(glmnet) #PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS. #If augm neq F, adds the OLS full model bootstrap samples (lambda = 0) #so S*_T is better conditioned. #Takes about a minute per run so really slow. #Simulates bootstrap for lasso or RR (regtype=0) #with 10-fold cross validation. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, k is the number of nonnoise variables #Need k < p-1 so there are some zeroes in the model. #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
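# A minimal usage sketch (slow; needs library(glmnet) and the rpack functions
# lassoboot2, shorth3, and predreg; small nruns and BB keep the run time down):
# lassobootsim5(n = 100, p = 4, k = 1, nruns = 5, BB = 200, regtype = 1)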
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-lassoboot2(x,y,B=BB,regtype=regtype,c=cc,aug=augm) #bootstrap 10 fold CV lasso or RR model for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} lmsviews<-function(x, Y, ii = 1) {# Trimmed views using lmsreg for 90, 80, ... 0 percent # trimming. Allows visualization of m # and crudely estimation of c beta in models # of the form y = m(x^T beta) + e. # Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(MASS)" or "library(lqs)." # Advance the view with the right mouse button and # in R, highight "stop." x <- as.matrix(x) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) b <- lmsreg(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% b[-1] + b[1] plot(ESP, Y) title(labs[i]) identify(ESP, Y) print(b) } } locsim<-function(n = 100, runs = 100, type = 1, eps = 0.25, shift = 100, df = 1){ # calls tmn,rstmn,ratmn # simulates classical and robust location point estimators # tmn=1% trimmed mean, rstmn= T_S,n and ratmn=T_A,n both with with k1=k2=6 # type = 1: normal data, type = 2: contaminated normal data # type = 3: t(df) data, type = 4: double exponential if(type == 1) x <- matrix(rnorm(n * runs), nrow = runs, ncol = n) if(type == 2) { x <- matrix(rnorm(n * runs), nrow = runs, ncol = n) x <- x + shift * matrix(rbinom(n * runs, 1, eps), nrow = runs, ncol = n) } if(type == 3) x <- matrix(rt(n * runs, df = df), nrow = runs, ncol = n) if(type == 4) { x <- matrix(rexp(n * runs), nrow = runs, ncol = n) w <- matrix(rbinom(n * runs, 1, 0.5), nrow = runs, ncol = n) w <- 2 * w - 1 x <- w * x } xbar <- apply(x, 1, mean) med <- apply(x, 1, median) trmn <- apply(x, 1, tmn) stmn <- apply(x, 1, rstmn) atmn <- apply(x, 1, ratmn) vars <- c(n * var(xbar), n * var(med), n * var( trmn), n * var(stmn), n * var(atmn)) means <- c(mean(xbar), mean(med), mean(trmn), mean(stmn), mean(atmn)) tem <- xbar - atmn diff <- sum(tem < 0) + sum(tem > 0) #xbar = xbar, med = med, trmn = trmn, stmn = stmn, atmn = atmn, print("mean,median,trmn,rstmn,ratmn") list(means = means, vars = vars, diff = diff) } lpisim<-function(n = 100, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, tdf = 3, type = 1){ # simulates 
# the asymptotically optimal PI [L_n, U_n] for the location model
# if type = 1 for N(0,1), 2 for t with tdf degrees of freedom, 3 for exp(1),
# 4 for uniform(-1,1), 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
opicov <- 0
opilen <- 1:nruns
q <- n+1
cc <- ceiling(n * (1 - alpha))
corfac <- (1 + 15/n) * sqrt((n+1)/(n - 1))
for(i in 1:nruns) {
if(type == 1) w <- rnorm(q)
if(type == 2) w <- rt(q, df = tdf)
if(type == 3) w <- rexp(q)
if(type == 4) w <- runif(q, min = -1, max = 1)
if(type == 5) w <- rnorm(q, sd = 1 + rbinom(q, 1, eps) * shift)
y <- w[-q]
yf <- w[q]
yfhat <- median(y)
#got y and yf
#get asymptotically optimal PI
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
for(j in (cc + 1):n){
zlen <- sy[j] - sy[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sy[j]
rlow <- sy[j - cc + 1]
}
}
}
up <- (1-corfac) * yfhat + corfac * rup
low <- (1-corfac) * yfhat + corfac * rlow
opilen[i] <- up - low
if(low <= yf && up >= yf) opicov <- opicov + 1
}
pimnlen <- mean(opilen)
opicov <- opicov/nruns
list(pimnlen = pimnlen, opicov = opicov)
}
lrdata <- function(n = 200, type = 3)
{# Generates data for logistic regression.
# If X|y=1 ~ N(mu_1,I) and X|Y=0 ~ N(0,I) then beta = mu_1 and alpha = -0.5 ||mu_1||^2.
#
# If q is changed, change the formula in the glm statement.
q <- 5
y <- 0 * 1:n
y[(n/2 + 1):n] <- y[(n/2 + 1):n] + 1
beta <- 0 * 1:q
if(type == 1) {
beta[1] <- 1
alpha <- -0.5
}
if(type == 2) {
beta <- beta + 1
alpha <- -q/2
}
if(type == 3) {
beta[1:3] <- 1
alpha <- -1.5
}
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
if(type == 1) {
x[(n/2 + 1):n, 1] <- x[(n/2 + 1):n, 1] + 1
}
if(type == 2) {
x[(n/2 + 1):n, ] <- x[(n/2 + 1):n, ] + 1
}
if(type == 3) {
x[(n/2 + 1):n, 1:3] <- x[(n/2 + 1):n, 1:3] + 1
}
#X|y=0 ~ N(0, I) and X|y=1 ~ N(beta,I)
# change formula to x[,1]+ ... + x[,q] with q
out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[,5], family = binomial)
list(alpha = alpha, beta = beta, lrcoef = out$coef,x=x,y=y)
}
lrdata2<-function(n = 200, p = 5, m = 10, type = 1)
{# Generates data for binomial logistic regression.
q <- p - 1
y <- 1:n
z <- y
mv <- m + 0 * y
beta <- 0 * 1:q
if(type == 1) {
beta <- beta + 2
alpha <- 0
}
if(type == 2) {
beta[1] <- 2
alpha <- 0
}
if(type == 3) {
beta[1:3] <- 2
alpha <- 0
}
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x %*% beta
pv <- exp(SP)/(1 + exp(SP))
for(i in 1:n) y[i] <- rbinom(1, size = m, prob = pv[i])
Z <- y/m
out<-glm(Z~., family=binomial,weights=mv,data=data.frame(cbind(x,Z)))
list(x = x, y = y, mv = mv, alpha = alpha, beta = beta, lrcoef = out$coef)
}
lressp<-function(x, y, slices = 10){
# Makes the ESSP for logistic regression.
# If X|y=1 ~ N(mu_1,I) and X|Y=0 ~ N(0,I) then
# beta = mu_1 and alpha = -0.5 ||mu_1||^2.
# Workstation users need to activate a graphics
# device with command "X11()" or "motif()."
# R needs command "library(lqs)."
# Advance the view with the right mouse button.
# In R, highlight "stop."
#
#ESP <- x %*% out$coef[-1] + out$coef[1]
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ...
+ x[,q] with q #out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson) out <- glm(y~., family=binomial, data=data.frame(cbind(x,y))) ESP <- predict(out) Y <- y plot(ESP, Y) abline(mean(y), 0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) fit2 <- fit n <- length(y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i-1)*val+1):(i * val)] <- mean(y[indx[((i-1)*val+1):(i*val)]]) } fit2[((slices-1)*val+1):n] <- mean(y[indx[((slices-1)*val+1):n]]) #fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2) #list(fit2=fit2,n=n,slices=slices,val=val) } lrplot<-function(x, y, mv, slices = 10, ff = 0.97, step = T) {# Makes response and OD plots for binomial logistic regression. # mv = (m1, ..., mn) where yi is bin(mi,p(SP)) # If step = T use step function, else use lowess. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # # If q is changed, change the formula in the glm statement. q <- 5 # change formula to x[,1]+ ... + x[,q] with q n <- length(y) Z <- y/mv out <- glm(Z ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = binomial, weights = mv) #out<-glm(Z~., family=binomial,weights=mv,data=data.frame(cbind(x,Z))) #ESP <- x %*% out$coef[-1] + out$coef[1] ESP <- predict(out) par(mfrow = c(1, 2),pty="s") plot(ESP, Z) abline(weighted.mean(Z, mv), 0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T) { fit2 <- fit val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- weighted.mean(Z[ indx[((i - 1) * val + 1):(i * val)]], mv[indx[(( i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- weighted.mean(Z[indx[(( slices - 1) * val + 1):n]], mv[indx[((slices - 1) * val + 1):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2) } else lines(lowess(ESP, Z, f = ff), type = "s") title("a) ESSP") #get OD plot val <- exp(ESP)/(1 + exp(ESP)) Ehat <- mv * val Vmodhat <- Ehat * (1 - val) Vhat <- (y - Ehat)^2 plot(Vmodhat, Vhat) abline(0, 1) abline(0, 4) abline(lsfit(Vmodhat, Vhat)$coef) title("b) OD Plot") par(mfrow=c(1,1)) } lrplot2<-function(x, y, slices = 10, ff = 0.99, step = T) {# Makes the ESSP for binary logistic regression. # If step = T use step function, else use lowess. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # #ESP <- x %*% out$coef[-1] + out$coef[1] #If q is changed, change the formula in the glm statement. #q <- 5 # change formula to x[,1]+ ... + x[,q] with q #out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson) out <- glm(y~., family=binomial, data=data.frame(cbind(x,y))) ESP <- predict(out) Y <- y plot(ESP, Y) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T){ fit2 <- fit n <- length(y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(y[indx[((slices - 1) * val + 1 ):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2)} else lines(lowess(ESP, Y, f = ff), type = "s") title("Response Plot") } lsviews<-function(x, Y, ii = 10, type = 1) {# This function is the same as tvreg except that the untrimmed # cases are highlighted. It compares the LS fits for 0, 10, # ..., 90 percent trimming. 
# Use ii = 10 for 0, ii = 9 for 10 then 0, ..., # ii = 1 for 90 then 80 ..., then 0 percent trimming. # Used to visualize g if y = g(beta^T x,e). # Workstation: activate a graphics # device with command "X11()" or "motif()." # R needs command "library(MASS)." # Advance the view with the right mouse button. # In R, highlight ``stop." x <- as.matrix(x) if(type == 1) out <- cov.mcd(x) else out <- covmba(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- bhat[1] + x %*% bhat[-1] plot(ESP, Y) points(ESP[rd2 <= val], Y[rd2 <= val], pch = 15, cex = 1.4) abline(0, 1) title(labs[i]) identify(ESP, Y) print(bhat) } } maha<-function(x) {# Generates the classical mahalanobis distances. Need p > 1 or x a matrix. center <- apply(x, 2, mean) cov <- var(x) return(sqrt(mahalanobis(x, center, cov))) } mbalata<-function(x, y, k=6, nsamp = 7) {#gets the median ball fit with 7 centers, med resid crit, 7 ball sizes x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] # q + 1 is number of predictors including intercept vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20), q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3), q + 3 + floor(n/2)) covv <- diag(q) centers <- sample(n, nsamp) temp <- lsfit(x, y) mbaf <- temp$coef ## get LATA criterion res <- temp$residuals crit <- k^2*median(res^2) cn <- sum(res^2 <= crit) absres <- sort(abs(res)) critf <- sum(absres[1:cn]) ## for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) smd2 <- sort(md2) for(j in 1:7) { temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[vals[j]]]) #Use OLS on rows with md2 <= cutoff = smd2[vals[j]] res <- y - temp$coef[1] - x %*% temp$coef[-1] ## get LATA criterion crit <- k^2*median(res^2) cn <- sum(res^2 <= crit) absres <- sort(abs(res)) crit <- sum(absres[1:cn]) ## if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } mbamv<-function(x, y, nsamp = 7) {# This function is for simple linear regression. The # highlighted boxes get weight 1. Click on right # mouse button to advance plot, and in R, highlight "stop." Only uses 50% trimming. x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) plot(x, y) points(x[md2 < med], y[md2 < med], pch = 15) abline(lsfit(x[md2 < med],y[md2 < med])) identify(x, y) } } mbamv2<-function(x, Y, nsamp = 7) { # This function is for multiple linear regression. The # highlighted boxes get weight 1. Click on right # mouse button to advance plot, and in R, highlight "stop." Only uses 50% trimming. 
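# A minimal usage sketch for mbamv2: for each of the nsamp random centers, OLS
# is applied to the half set of cases closest to the center, and the response
# and residual plots are shown with that half set highlighted:
# mbamv2(x, Y)   #right mouse button (then "stop" in R) advances the plots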
x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) par(mfrow=c(2,1)) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) if(q ==1){out <- lsfit(x[md2 < med],Y[md2 < med])} else{out <- lsfit(x[md2 < med,],Y[md2 < med])} FIT <- out$coef[1] + x%*%out$coef[-1] RES <- Y - FIT plot(FIT,Y) points(FIT[md2 < med], Y[md2 < med], pch = 15) abline(0,1) identify(FIT, Y) plot(FIT,RES) points(FIT[md2 < med], RES[md2 < med], pch = 15) abline(0,0) identify(FIT, RES) } par(mfrow=c(1,1)) } mbareg<-function(x, y, nsamp = 7) {#gets the mbareg fit with 7 centers, med resid crit, 7 ball sizes x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] # q + 1 is number of predictors including intercept vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20 ), q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3), q + 3 + floor(n/2)) covv <- diag(q) centers <- sample(n, nsamp) temp <- lsfit(x, y) mbaf <- temp$coef critf <- median(temp$residuals^2) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) smd2 <- sort(md2) for(j in 1:7) { temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[ vals[j]]]) #Use OLS on rows with md2 <= cutoff = smd2[vals[j]] res <- y - temp$coef[1] - x %*% temp$coef[-1] crit <- median(res^2) if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } mbsim <-function(n = 100, p = 2, csteps = 5, gam = 0.4, runs = 20, outliers = 0, pm = 10, cscale=F){ # Need p > 1. # This R function simulates the biased MB estimator. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a point mass on major axis (0,...,0,pm)' # 2 for outliers a point mass on minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # 6 for outliers a tight cluster at major axis (0,...,0,pm)' #set.seed(974) A <- sqrt(diag(1:p)) cloc <- 0 * (1:p) csig <- 0 * A mbloc <- cloc mbsig <- csig cct <- 0 mbct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- 0 x[1:val,p] <- pm } if(outliers == 2) { x[1:val, ] <- 0 x[1:val,1] <- pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p ] <- pm } if(outliers == 5) { x[1:val, 1 ] <- pm } if(outliers == 6) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } out <- covmb(x, steps = csteps, scale = cscale) mbloc <- mbloc + out$center mbsig <- mbsig + out$cov rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 cmn <- apply(x,2,mean) cloc <- cloc + cmn ccov <- var(x) csig <- csig + ccov rd2 <- mahalanobis(x, cmn, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 } cloc <- cloc/runs csig <- csig/runs mbloc <- mbloc/runs mbsig <- mbsig/runs list(cloc = cloc, mbloc = mbloc, csig = csig, mbsig=mbsig, cct = cct, mbct=mbct) } med2ci<-function(x, cc = 4, alpha = 0.05) {#gets ~ 50% trimmed mean se for sample median and the corresponding robust 100 (1-alpha)% CI #defaults are alpha = .05, cc = 5 may be better than the default up <- 1 - alpha/2 n <- length(x) med <- median(x) ln <- floor(n/2) - ceiling(sqrt(n/cc)) un <- n - ln low <- ln + 1 d <- sort(x) if(ln > 0) { d[1:ln] <- d[(low)] d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - low rval 
<- qt(up, rdf) * sqrt(swv/n) rlo <- med - rval rhi <- med + rval list(int = c(rlo, rhi), med = med, swv = swv) } medci<-function(x, alpha = 0.05) {#Gets Bloch and Gastwirth SE for sample median and the #corresponding resistant 100(1-alpha)% CI. The default is alpha = .05. n <- length(x) up <- 1 - alpha/2 med <- median(x) ln <- floor(n/2) - ceiling(sqrt(n/4)) un <- n - ln d <- sort(x) rdf <- un - ln - 1 cut <- qt(up, rdf) sebg <- 0.5 * (d[un] - d[ln + 1]) rval <- cut * sebg rlo <- med - rval rhi <- med + rval list(int = c(rlo, rhi), med = med, sebg = sebg) } medout<-function(x) {# finds squared Euclidean distances from the coordinatewise median x<-as.matrix(x) p <- dim(x)[2] covv <- diag(p) med <- apply(x, 2, median) d2 <- mahalanobis(x, center = med, covv) plot(d2) list(d2 = d2) } mldsim <-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20, outliers = 0, pm = 10, loc = 0.5){ # Need p > 1. # This R function compares the MBA, FCH, RFCH, RMVN, scaled DGK, classical, OGK, # FASTMCD, CMVE and biased MB estimators. # R needs library(robustbase), calls covrob, covdgk # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a point mass on major axis (0,...,0,pm)' # 2 for outliers a point mass on minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # 6 for outliers a tight cluster at major axis (0,...,0,pm)' # Also finds n*var(T[p]) which approx p for the classical estimator # and n var(C(p,p)) for MLD estimator (T,C). A <- sqrt(diag(1:p)) qchi <- qchisq(0.5, p) cs <- 1:runs cm <- cs cloc <- 0 * (1:p) csig <- 0 * A mbas <- 1:runs mbam <- cs mbaloc <- cloc mbasig <- csig fchs <- mbas fchm <- cs fchloc <- cloc fchsig <- csig rfchs <- mbas rfchm <- cs rfchloc <- cloc rfchsig <- csig rmvns <- mbas rmvnm <- cs rmvnloc <- cloc rmvnsig <- csig dgks <- mbas dgkm <- cs dgkloc <- cloc dgksig <- csig ogks <- mbas ogkm <- cs ogkloc <- cloc ogksig <- csig fmcds <- mbas fmcdm <- cs fmcdloc <- cloc fmcdsig <- csig cmveloc <- cloc cmvesig <- csig mbs <- mbas mbm <- cs mbsig <- csig mbloc <- cloc mbact <- 0 fchct <- 0 rfchct <- 0 rmvnct <- 0 dgkct <- 0 cct <- 0 ogkct <- 0 fmcdct <- 0 cmvect <- 0 mbct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- 0 x[1:val,p] <- pm } if(outliers == 2) { x[1:val, ] <- 0 x[1:val,1] <- pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p ] <- pm } if(outliers == 5) { x[1:val, 1 ] <- pm } if(outliers == 6) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } out <- covrob(x, csteps = steps, locc = loc) mbaloc <- mbaloc + out$center mbasig <- mbasig + out$cov mbam[i] <- out$center[p] mbas[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1 rd2 <- mahalanobis(x, out$mnm, out$covm) const <- median(rd2)/qchi covm <- const * out$covm mbloc <- mbloc + out$mnm mbsig <- mbsig + covm mbm[i] <- out$mnm[p] mbs[i] <- covm[p,p] #scaling does not effect outlier count if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 fchloc <- fchloc + out$mnf fchsig <- fchsig + out$covf fchm[i] <- out$mnf[p] fchs[i] <- out$covf[p,p] rd2 <- mahalanobis(x, out$mnf, out$covf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1 rfchloc <- rfchloc + out$rmnf rfchsig <- rfchsig + out$rcovf rfchm[i] <- out$rmnf[p] rfchs[i] <- 
out$rcovf[p,p] rd2 <- mahalanobis(x, out$rmnf, out$rcovf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1 rmvnloc <- rmvnloc + out$rmnmvn rmvnsig <- rmvnsig + out$rcovmvn rmvnm[i] <- out$rmnmvn[p] rmvns[i] <- out$rcovmvn[p,p] rd2 <- mahalanobis(x, out$rmnmvn, out$rcovmvn) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1 cmveloc <- cmveloc + out$mncmv cmvesig <- cmvesig + out$covcmv rd2 <- mahalanobis(x, out$mncmv, out$covcmv) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1 out <- covdgk(x, csteps = 10) dgkloc <- dgkloc + out$center dgksig <- dgksig + out$cov dgkm[i] <- out$center[p] dgks[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1 center <- apply(x,2,mean) cloc <- cloc + center ccov <- var(x) csig <- csig + ccov cm[i] <- center[p] cs[i] <- ccov[p,p] rd2 <- mahalanobis(x, center, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 out <- covOGK(x,sigmamu = scaleTau2) ogkloc <- ogkloc + out$center ogksig <- ogksig + out$cov ogkm[i] <- out$center[p] ogks[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) ogkct <- ogkct + 1 out <- covMcd(x) #for Det-MCD, use #out <- covMcd(x,nsamp="deterministic") #Use out <- covmb2(x) for the covmb2 estimator. fmcdloc <- fmcdloc + out$center fmcdsig <- fmcdsig + out$cov fmcdm[i] <- out$center[p] fmcds[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1 } mbasv <- n*var(mbas) mbam <- n*var(mbam) mbaloc <- mbaloc/runs mbasig <- mbasig/runs fchsv <- n*var(fchs) fchm <- n*var(fchm) fchloc <- fchloc/runs fchsig <- fchsig/runs rfchsv <- n*var(rfchs) rfchm <- n*var(rfchm) rfchloc <- rfchloc/runs rfchsig <- rfchsig/runs rmvnsv <- n*var(rmvns) rmvnm <- n*var(rmvnm) rmvnloc <- rmvnloc/runs rmvnsig <- rmvnsig/runs dgksv <- n*var(dgks) dgkm <- n*var(dgkm) dgkloc <- dgkloc/runs dgksig <- dgksig/runs ogksv <- n*var(ogks) ogkm <- n*var(ogkm) ogkloc <- ogkloc/runs ogksig <- ogksig/runs csv <- n*var(cs) cm <- n*var(cm) cloc <- cloc/runs csig <- csig/runs fmcdsv <- n*var(fmcds) fmcdm <- n*var(fmcdm) fmcdloc <- fmcdloc/runs fmcdsig <- fmcdsig/runs cmveloc <- cmveloc/runs cmvesig <- cmvesig/runs mbsv <- n*var(mbs) mbm <- n*var(mbm) mbloc <- mbloc/runs mbsig <- mbsig/runs list(mbaloc = mbaloc, fchloc = fchloc, rfchloc = rfchloc, rmvnloc = rmvnloc,dgkloc = dgkloc, cloc = cloc, ogkloc = ogkloc, fmcdloc = fmcdloc, cmveloc = cmveloc, mbloc = mbloc, mbasig = mbasig, fchsig = fchsig, rfchsig = rfchsig, rmvnsig = rmvnsig, dgksig = dgksig, csig = csig, ogksig = ogksig, fmcdsig = fmcdsig, cmvesig = cmvesig, mbsig=mbsig, mbasv = mbasv, fchsv=fchsv, rfchsv = rfchsv, rmvnsv=rmvnsv, dgksv=dgksv, ogksv=ogksv, csv=csv, fmcdsv=fmcdsv, mbsv=mbsv, mbam=mbam, fchm=fchm, rfchm=rfchm, rmvnm=rmvnm, dgkm=dgkm, ogkm=ogkm, cm=cm, fmcdm=fmcdm, mbm=mbm, cct = cct, dgkct=dgkct, mbact=mbact, fchct = fchct, rfchct = rfchct, rmvnct=rmvnct, ogkct=ogkct, fmcdct=fmcdct,cmvect=cmvect,mbct=mbct) } mldsim6<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 100, outliers = 0, pm = 10, kk=5, osteps = 0){ # This R function compares the # FCH, RFCH, CMVE, RCMVE, RMVN, COVMB2, and MB estimators. 
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# Calls cmve, covfch, covrmvn, covmb2.
A <- sqrt(diag(1:p))
fchct <- 0
rfchct <- 0
cmvect <- 0
rcmvect <- 0
rmvnct <- 0
mbct <- 0
covmb2ct <- 0
val <- floor(gam * n)
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
out <- covfch(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
rd2 <- mahalanobis(x, out$rmnf, out$rcovf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1
out <- cmve(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1
rd2 <- mahalanobis(x, out$rmnf, out$rcovf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rcmvect <- rcmvect + 1
rd2 <- mahalanobis(x, out$mnm, out$covm)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
out <- covrmvn(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1
out <- covmb2(x, k=kk, msteps=osteps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1
}
list(fchct = fchct, rfchct = rfchct, cmvect = cmvect, rcmvect = rcmvect,
rmvnct = rmvnct, covmb2ct = covmb2ct, mbct = mbct)
}
mldsim7<-function(n = 100, p = 2, gam = 0.4, runs = 100, outliers = 0, pm = 10, kk=5, osteps = 0){
# This R function examines the COVMB2 estimators.
#The function mldsim6 uses the weighted covariance matrix out$cov.
#The function mldsim7 uses squared Euclidean distances based on Ip,
#or squared Mahalanobis distances using diag(out$cov).
# Counts the number of times all outlier distances exceed the clean distances.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # Calls covmb2 A <- sqrt(diag(1:p)) covv <- diag(p) #identity matrix Ip covmb2ct <- 0 diagct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val ) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val ) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } out <- covmb2(x, k=kk, msteps=osteps) rd2 <- mahalanobis(x, out$center, covv) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1 rd2 <- mahalanobis(x, out$center, diag(diag(out$cov))) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) diagct <- diagct + 1 } list( covmb2ct = covmb2ct, diagct = diagct) } MLRplot<-function(x, Y){ # Response plot and residual plot. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)" if a robust estimator replaces lsfit. # Advance the view with the right mouse button. x <- as.matrix(x) out <- lsfit(x, Y) cook <- ls.diag(out)$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 tem <- cook > min(0.5, (2 * p)/n) bhat <- out$coef FIT <- Y - out$res cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) plot(FIT, Y) abline(0, 1) points(FIT[tem], Y[tem], pch = 15) title("Response Plot") identify(FIT, Y) RES <- Y - FIT plot(FIT, RES) points(FIT[tem], RES[tem], pch = 15) title("Residual Plot") identify(FIT, RES) par(mfrow = c(1, 1)) par(mar=cmar) } mlrplot2 <- function(x, Y) {# Makes the response plot and residual plot for two mbareg estimators. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(MASS)" if a robust estimator replaces lsfit. # Advance the view with the right mouse button, and in R, highlight "stop." x <- as.matrix(x) out <- mbareg(x, Y) bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] par(mfrow = c(2, 2)) plot(FIT, Y) abline(0, 1) identify(FIT, Y) title("MBA Response Plot") RES <- Y - FIT plot(FIT, RES) identify(FIT, RES) title("MBA Residual Plot") # out <- mbalata(x, Y) bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] plot(FIT, Y) abline(0, 1) identify(FIT, Y) title("MBALATA Response Plot") RES <- Y - FIT plot(FIT, RES) identify(FIT, RES) title("MBALATA Residual Plot") par(mfrow=c(1,1)) } mlrplot4<-function(x, Y){ # This function is for R. Use mlrplot2 for R and Splus. # Makes response plot and residual plot with large # Cook's distances and Pena's distances highlighted. # Squares have large Cook's distances # and crosses have large Pena's distances. 
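# Pena's distance for case i computed below is s_i = sum_j D_j h_{ij}^2/(h_{ii} h_{jj}),
# where D_j is Cook's distance for case j and h_{ij} is an element of the hat
# matrix; cases with |s_i - median(s)| > 4.5 mad(s) are plotted as crosses.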
x <- as.matrix(x) out <- lsfit(x, Y) cook <- ls.diag(out)$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 #get Pena's distances s one <- 1 + 0 * 1:n w <- cbind(one, x) cp <- t(w) %*% w h <- w %*% solve(cp) %*% t(w) s <- 0 * 1:n for(i in 1:n) { for(j in 1:n) { s[i] <- s[i] + (cook[j] * h[i, j]^2)/(h[i, i] * h[j, j]) } } tem <- cook > min(0.5, (2 * p)/n) medd <- median(s) madd <- mad(s, constant = 1) tem2 <- abs(s - medd) > 4.5 * madd bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) plot(FIT, Y) abline(0, 1) points(FIT[tem], Y[tem], pch = 0) points(FIT[tem2], Y[tem2], pch = 3) identify(FIT, Y) title("Response Plot") RES <- Y - FIT plot(FIT, RES) points(FIT[tem], RES[tem], pch = 0) points(FIT[tem2], RES[tem2], pch = 3) identify(FIT, RES) title("Residual Plot") par(mfrow = c(1, 1)) par(mar=cmar) } mplot<-function(x) {# Need p > 1. # Makes a DD plot only using the MDi, the RDi are not used. p <- dim(x)[2] center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) md <- sqrt(md2) rd <- md const <- sqrt(qchisq(0.5, p))/median(rd) rd <- const * rd plot(md, rd) abline(0, 1) identify(md, rd) } nav<-function(alpha = 0.01, k = 5) {#gets n(asy var) for the alpha trimmed mean #and T_(A,n)(k) if errors are N(0,1) z <- - qnorm(alpha) den <- 1 - (2 * z * dnorm(z))/(2 * pnorm(z) - 1 ) val <- den/(1 - 2 * alpha) ntmav <- val + (2 * alpha * z^2)/(1 - 2 * alpha )^2 zj <- k * qnorm(0.75) alphaj <- pnorm( - zj) alphaj <- ceiling(100 * alphaj)/100 zj <- - qnorm(alphaj) den <- 1 - (2 * zj * dnorm(zj))/(2 * pnorm(zj) - 1) val <- den/(1 - 2 * alphaj) natmav <- val + (2 * alphaj * zj^2)/(1 - 2 * alphaj)^2 list(ntmav=ntmav, natmv=natmav) } nltv<-function(gam = 0.5) {# Gets asy var for lts(h) and lta(h) at standard normal # where h/n -> gam. k <- qnorm(0.5 + gam/2) den <- gam - 2 * k * dnorm(k) ltsv <- 1/den tem <- (1 - exp( - (k^2)/2))^2 ltav <- (2 * pi * gam)/(4 * tem) list(ltsv=ltsv, ltav=ltav) } oddata<-function(n = 100, q = 5, k = 1) {# Generates overdispersion (negative binomial) data # for Poisson regression. y <- 1:n beta <- 0 * 1:q beta[1:3] <- 1 alpha <- -2.5 x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- 0.5 * x + 1 SP <- alpha + x %*% beta for(i in 1:n) { pr <- k/(exp(SP[i]) + k) y[i] <- rnbinom(1, size = k, pr) } list(x = x, y = y) } olskout<-function(n=100,pm=20,k=1){ #Shows that OLS can fit well if there are #k outliers and n inliers if n is huge. #Outliers approx N2( (pm,-pm)^T, 0.001 I) = #near point mass at (x,y) = (pm, -pm)^T. x <- runif(n,-10,10) y <- x + rnorm(n) z <- matrix(rnorm(k * 2, sd = 0.01), ncol = 2, nrow = k) z[,1] <- z[,1] + pm z[,2] <- z[,2] - pm x <- c(x,z[,1]) y<-c(y,z[,2]) plot(x,y) abline(lsfit(x,y)) } pifclean<-function(k, gam) { p <- floor(log(3/k)/log(1 - gam)) list(p = p) } piplot<-function(x, y, alpha = 0.05) {# For Unix, use X11() to turn on the graphics device before # using this function. # Makes a response plot with prediction limits added. 
x <- as.matrix(x) p <- dim(x)[2] + 1 n <- length(y) up <- 1:n low <- up out <- lsfit(x, y) tem <- ls.diag(out) lev <- tem$hat res <- out$residuals FIT <- y - res Y <- y corfac <- (1 + 15/n)*sqrt(n/(n - p)) val2 <- quantile(res, c(alpha/2, 1 - alpha/2)) #get lower and upper PI limits for each case for(i in 1:n) { val <- sqrt(1 + lev[i]) val3 <- as.single(corfac * val2[1] * val) val4 <- as.single(corfac * val2[2] * val) up[i] <- FIT[i] + val4 low[i] <- FIT[i] + val3 } zy <- c(min(low), Y, max(up)) zx <- c(min(FIT), FIT, max(FIT)) #change labels so plot labels are good ff <- FIT yy <- Y Y <- zy FIT <- zx plot(FIT, Y, type = "n") points(ff, yy) abline(0, 1) points(ff, up, pch = 17) points(ff, low, pch = 17) } pisim<-function(n = 100,q = 7,nruns = 100,alpha = 0.05,eps = 0.1,shift = 9,type = 1) {# compares new and classical PIs for multiple linear regression # if type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors # constant = 1 so there are p = q+1 coefficients b <- 0 * 1:q + 1 cpicov <- 0 npicov <- 0 acpicov <- 0 opicov <- 0 val3 <- 1:nruns val4 <- val3 val5 <- val3 pilen <- matrix(0, nrow = nruns, ncol = 4) coef <- matrix(0, nrow = nruns, ncol = q + 1) corfac <- (1 + 15/n) * sqrt(n/(n - q - 1)) corfac2 <- sqrt(n/(n - q - 1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) xf <- rnorm(q) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 xf <- rnorm(q) yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) xf <- rnorm(q) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1, sd = 1 + rbinom(1, 1, eps ) * shift) } out <- lsfit(x, y) fres <- out$resid coef[i, ] <- out$coef yfhat <- out$coef[1] + xf %*% out$coef[-1] w <- cbind(1, x) xtxinv <- solve(t(w) %*% w) xf <- c(1, xf) hf <- xf %*% xtxinv hf <- hf %*% xf val <- sqrt(1 + hf) #get classical PI mse <- sum(fres^2)/(n - q - 1) val2 <- qt(1 - alpha/2, n - q - 1) * sqrt(mse) * val up <- yfhat + val2 low <- yfhat - val2 pilen[i, 1] <- up - low if(low < yf && up > yf) cpicov <- cpicov + 1 #get semiparametric PI val2 <- quantile(fres, c(alpha/2, 1 - alpha/2)) val3[i] <- as.single(corfac * val2[1] * val) val4[i] <- as.single(corfac * val2[2] * val) up <- yfhat + val4[i] low <- yfhat + val3[i] pilen[i, 2] <- up - low if(low < yf && up > yf) npicov <- npicov + 1 # asymptotically conservative PI val6 <- corfac2 * max(abs(val2)) val5[i] <- val6 * val up <- yfhat + val5[i] low <- yfhat - val5[i] pilen[i, 3] <- up - low if(low < yf && up > yf) acpicov <- acpicov + 1 # asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alpha)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac * val * rup low <- yfhat + corfac * val * rlow pilen[i, 4] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- apply(pilen, 2, mean) mnbhat <- apply(coef, 2, mean) lcut <- mean(val3) hcut <- mean(val4) accut <- mean(val5) cpicov <- cpicov/nruns npicov <- npicov/nruns acpicov <- acpicov/nruns opicov <- opicov/nruns list(mnbhat = 
mnbhat, pimenlen = pimnlen, cpicov = cpicov, spicov = npicov, acpicov = acpicov, opicov = opicov, lcut = lcut, hcut = hcut, accut = accut) } pisim3<-function(n = 100, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, type = 1, modt=1){ #need to type library(mgcv) #compares asymptotically optimal PIs for GAM #applied to Y = 0 + x1 + x1^2 + 0 x2 + 0 x3 + e #if type = 1 for N(0,1) errors, 2 for t3 errors, #3 for exp(1) - 1 errors #4 for uniform(-1,1) errors, #5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors #constant = 1 so there are p = q+1 coefficients #modt = 1: Y = x1 + x1^2 #modt = 2: Y = sin(x1) + cos(x2) + log(|x3|) #modt = 3: Y = sqrt(abs(x1)) + sqrt(abs(x2)) + sqrt(abs(x3)) q <- 3 p <- q + 1 npicov <- 0 opicov <- 0 val3 <- 1:nruns val4 <- val3 pilen <- matrix(0, nrow = nruns, ncol = 2) corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} else {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) xf <- rnorm(q) if(modt==1){ymn <- x[,1] + x[,1]^2 yfmn <- xf[1] + xf[1]^2} if(modt==2){ymn <- sin(x[,1]) + cos(x[,2]) + log(abs(x[,3])) yfmn <- sin(xf[1]) + cos(xf[2]) + log(abs(xf[3]))} if(modt==3){ymn <- sqrt(abs(x[,1])) + sqrt(abs(x[,2])) + sqrt(abs(x[,3])) yfmn <- sqrt(abs(xf[1])) + sqrt(abs(xf[2])) + sqrt(abs(xf[3]))} if(type == 1) { y <- ymn + rnorm(n) yf <- yfmn + rnorm(1) } if(type == 2) { y <- ymn + rt(n, df = 3) yf <- yfmn + rt(1, df = 3) } if(type == 3) { y <- ymn + rexp(n) - 1 yf <- yfmn + rexp(1) - 1 } if(type == 4) { y <- ymn + runif(n, min = -1, max = 1) yf <- yfmn + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- ymn + err yf <- yfmn + rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) } ###change 5 of the next 6 lines if q is changed from 3 x1 <- x[,1] x2 <- x[,2] x3 <- x[,3] out <- gam(y ~ s(x1) + s(x2) + s(x3)) fres <- out$resid yfhat <- predict.gam(out,newdata=data.frame(x1=xf[1],x2=xf[2],x3=xf[3])) #get semiparametric PI val2 <- quantile(fres, c(alphan/2, 1 - alphan/2)) val3[i] <- val2[1] val4[i] <- val2[2] up <- yfhat + corfac*val4[i] low <- yfhat + corfac*val3[i] pilen[i, 1] <- up - low if(low < yf && up > yf) npicov <- npicov + 1 # asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i, 2] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- apply(pilen, 2, mean) lcut <- mean(val3) hcut <- mean(val4) npicov <- npicov/nruns opicov <- opicov/nruns list(pimenlen = pimnlen, spicov = npicov, opicov = opicov, lcut = lcut, hcut = hcut) } pisim4<-function(n = 100, q = 7, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, type = 1, mod = 1){ #this function is for SPLUS # compares asymptotically optimal PIs for multiple linear regression # if type = 1 for N(0,1) errors, 2 for t3 errors, #3 for exp(1) - 1 errors #4 for uniform(-1,1) errors, #5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors #constant = 1 so there are p = q+1 coefficients #mod = 1 for least squares #mod = 2 for l1fit #mod = 3 for rrfit set.seed(974) p <- q + 1 b <- 0 * 1:q + 1 npicov <- 0 opicov <- 0 val3 <- 1:nruns val4 <- val3 pilen <- matrix(0, nrow = nruns, ncol = 2) coef <- 
matrix(0, nrow = nruns, ncol = q + 1) corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} else {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) xf <- rnorm(q) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 xf <- rnorm(q) yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) xf <- rnorm(q) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1, sd = 1 + rbinom(1, 1, eps ) * shift) } if(mod == 1) out <- lsfit(x, y) if(mod == 2) out <- l1fit(x, y) if(mod == 3) out <- rreg(x, y) fres <- out$resid coef[i, ] <- out$coef yfhat <- out$coef[1] + xf %*% out$coef[-1] #get semiparametric PI val2 <- quantile(fres, c(alphan/2, 1 - alphan/2)) val3[i] <- val2[1] val4[i] <- val2[2] up <- yfhat + corfac*val4[i] low <- yfhat + corfac*val3[i] pilen[i, 1] <- up - low if(low < yf && up > yf) npicov <- npicov + 1 # asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i, 2] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- apply(pilen, 2, mean) mnbhat <- apply(coef, 2, mean) lcut <- mean(val3) hcut <- mean(val4) npicov <- npicov/nruns opicov <- opicov/nruns list(mnbhat = mnbhat, pimenlen = pimnlen, spicov = npicov, opicov = opicov, lcut = lcut, hcut = hcut) } pisim5<-function(n = 100, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, type = 1){ #Better for Splus. Need to keep nruns small, say 500, with R. 
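# A hedged usage sketch: out <- pisim5(n=100, nruns=500, type=1)
# out$spicov and out$opicov are the simulated coverages and
# out$pimenlen the mean PI lengths for the two intervals.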
#Compares asymptotically optimal PIs for nonlinear regression
#applied to Y = 0 + x1 + x1^2 + 0 x2 + 0 x3 + e
#if type = 1 for N(0,1) errors, 2 for t3 errors,
#3 for exp(1) - 1 errors
#4 for uniform(-1,1) errors,
#5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors
#p = q+1 coefficients
#even with start values = true values, nls often won't converge
#for 5000 runs, so use 500 runs
#some versions of R can use nls.control(warnOnly=TRUE)
q <- 3
p <- q + 1
npicov <- 0
opicov <- 0
val3 <- 1:nruns
val4 <- val3
pilen <- matrix(0, nrow = nruns, ncol = 2)
corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) )
if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} else {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)}
pn <- qn
if(pn < 1 - alpha + 0.001) qn <- 1 - alpha
alphan <- 1 - qn
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  if(type == 1) {
    y <- x[,1] + x[,1]^2 + rnorm(n)
    xf <- rnorm(q)
    yf <- xf[1] + xf[1]^2 + rnorm(1)
  }
  if(type == 2) {
    y <- x[,1] + x[,1]^2 + rt(n, df = 3)
    xf <- rnorm(q)
    yf <- xf[1] + xf[1]^2 + rt(1, df = 3)
  }
  if(type == 3) {
    y <- x[,1] + x[,1]^2 + rexp(n) - 1
    xf <- rnorm(q)
    yf <- xf[1] + xf[1]^2 + rexp(1) - 1
  }
  if(type == 4) {
    y <- x[,1] + x[,1]^2 + runif(n, min = -1, max = 1)
    xf <- rnorm(q)
    yf <- xf[1] + xf[1]^2 + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- x[,1] + x[,1]^2 + err
    xf <- rnorm(q)
    yf <- xf[1] + xf[1]^2 + rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
  }
  ###change 5 of the next 6 lines if q is changed from 3
  x1 <- x[,1]
  x2 <- x[,2]
  x3 <- x[,3]
  out <- nls(y ~ b1*x1 + b2*I(x1^2) + b3*x2 + b4*I(x2^2) + b5*x3 +
  b6*I(x3^2),start=list(b1=1.1,b2=1.1,b3=0.1,b4=0.1,b5=0.1,b6=0.1))
  #out <- nls(y ~ b1*x1 + b2*I(x1^2) + b3*x2 + b4*I(x2^2) + b5*x3 +
  #b6*I(x3^2),start=list(b1=1.1,b2=1.1,b3=0.1,b4=0.1,b5=0.1,b6=0.1),
  #control = nls.control(minFactor = 1/4056))
  fres <- residuals(out)
  yfhat <- predict(out,newdata=data.frame(x1=xf[1],x2=xf[2],x3=xf[3]))
  #get semiparametric PI
  val2 <- quantile(fres, c(alphan/2, 1 - alphan/2))
  val3[i] <- val2[1]
  val4[i] <- val2[2]
  up <- yfhat + corfac*val4[i]
  low <- yfhat + corfac*val3[i]
  pilen[i, 1] <- up - low
  if(low < yf && up > yf) npicov <- npicov + 1
  # asymptotically optimal PI
  sres <- sort(fres)
  cc <- ceiling(n * (1 - alphan))
  rup <- sres[cc]
  rlow <- sres[1]
  olen <- rup - rlow
  if(cc < n) {
    for(j in (cc + 1):n) {
      zlen <- sres[j] - sres[j - cc + 1]
      if(zlen < olen) {
        olen <- zlen
        rup <- sres[j]
        rlow <- sres[j - cc + 1]
      }
    }
  }
  up <- yfhat + corfac*rup
  low <- yfhat + corfac*rlow
  pilen[i, 2] <- up - low
  if(low < yf && up > yf) opicov <- opicov + 1
}
pimnlen <- apply(pilen, 2, mean)
lcut <- mean(val3)
hcut <- mean(val4)
npicov <- npicov/nruns
opicov <- opicov/nruns
list(pimenlen = pimnlen, spicov = npicov, opicov = opicov, lcut = lcut,
hcut = hcut)
}
prdata <- function(n = 100, q = 5)
{# Generates data for Poisson regression.
y <- 0 * 1:n
beta <- 0 * 1:q
beta[1:3] <- 1
alpha <- -2.5
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
x <- 0.5*x + 1
SP <- alpha + x%*%beta
y <- rpois(n,lambda=exp(SP))
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] +
# x[, 4] + x[,5], family = poisson)
#list(alpha = alpha, beta = beta, prcoef = out$coef,x=x,y=y)
list(x=x,y=y)
}
predreg<-function(x, alpha = 0.05){
# Makes a prediction region for the rows of x.
# If p = 1, the shorth interval should work better.
#Also computes the distance for the 0 vector
#to check if the 0 vector is in the prediction region.
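# Example: x <- matrix(rnorm(300), ncol=3); out <- predreg(x, alpha=0.05)
# out$cuplim is the cutoff for the nonparametric prediction region and
# out$inr = 1 if the zero vector lies inside the region.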
x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] inr<-0 zero <- 0*(1:p) up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) # MD is the classical distance MD <- sqrt(md2) #get nonparametric prediction region boundary cuplim <- sqrt(quantile(md2, up)) D0 <- sqrt(mahalanobis(zero, center, cov)) if(D0 <= cuplim) inr <- 1 list(MD=MD, center=center,cov=cov,cuplim = cuplim, D0=D0, inr=inr) } predsim<-function(n = 100, p = 4, nruns = 100, xtype = 1, dd = 1, eps = 0.25, alpha = 0.1) {# MAY NOT WORK IF p = 1. # Gets coverages of nonparametric, semiparametric and # parametric MVN prediction regions. # Multiply x by A where xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 (with delta = eps) for delta Np(0,I) + (1-delta) Np(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # mahalanobis gives squared Maha distances # set.seed(974) ccvr <- 0 scvr <- 0 rcvr <- 0 volc <- 1:nruns vols <- volc volr <- volc #up <- 1 - alpha up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha A <- sqrt(diag(1:p)) m <- n + 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(m * p), nrow = m, ncol = p) if(xtype == 2) { zu <- runif(m) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(m) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(m) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(m, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(m, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(m, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(m, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(m) x[zu < eps, ] <- x[zu < eps, ] * 5 } x <- x %*% A xx <- x[ - m, ] center <- apply(xx, 2, mean) cov <- var(xx) md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) if(mahalanobis(t(x[m, ]), center, cov) <= hsq) ccvr <- ccvr + 1 volc[i] <- sqrt(hsq)^p * prod(diag(chol(cov))) out <- covrmvn(xx) center <- out$center cov <- out$cov md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) dsq <- mahalanobis(t(x[m, ]), center, cov) if(dsq <= hsq) scvr <- scvr + 1 sqrtdet <- prod(diag(chol(cov))) vols[i] <- sqrt(hsq)^p * sqrtdet hsq <- qchisq(up, p) if(dsq <= hsq) rcvr <- rcvr + 1 volr[i] <- sqrt(hsq)^p * sqrtdet } ccvr <- ccvr/nruns scvr <- scvr/nruns rcvr <- rcvr/nruns #get a measure of efficiency wrt vols, so eff(vols) = 1 vols <- mean(vols) volc <- mean(volc)/vols volr <- mean(volr)/vols vols <- 1 list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln = volc, vols = vols, volm = volr, up = up) } predsim2<-function(n = 50, p = 50, nv=19, nruns = 100, xtype = 1, dtype = 1, alpha = 0.05) {# MAY NOT WORK IF p = 1. # Gets coverages of the data splitting prediction region. # Multiply x by A where xtype = 1 for MVN Np(0,I), 2 for lognormal #xtype = 3: don't multiply by A so x is Np(0,I) #dtype = 1 if (T,C) = (xbar, Ip ), dtype = 2 if (T,C) = (coord median, Ip) # mahalanobis gives squared Maha distances #want n/2 > nv #coverage is very sensitive to quantile type. 
#Use type = 1: inverse quantile
#function so percentile = Y(ceiling(nv*(1-alpha)))
cvr <- 0
nv <- min(nv, floor(n/2))
nH <- n - nv
uv <- min(nv, ceiling( (1 - alpha)*(1+nv) ) )
up <- uv/(nv+1) #quantile for prediction region
A <- sqrt(diag(1:p))
m <- n + 1
C <- diag(p)
indx <- 1:n
mnhsq <- 1:nruns
for(i in 1:nruns) {
  #make data
  x <- matrix(rnorm(m * p), nrow = m, ncol = p)
  if(xtype == 2) x <- exp(x)
  if(xtype != 3) x <- x %*% A
  xx <- x[ - m, ]
  #find sets H and V
  perm <- sample(indx,n)
  H <- perm[1:nH]
  xH <- xx[H,]
  xV <- xx[-H,]
  if(dtype==1){
    center <- apply(xH, 2, mean)
    disp = C}
  if(dtype==2){
    center <- apply(xH, 2, median)
    disp = C}
  md2 <- mahalanobis(xV, center, cov=disp)
  hsq <- quantile(md2, up, type=1)
  mnhsq[i] <- hsq
  if(mahalanobis(t(x[m, ]), center, cov=disp) <= hsq) cvr <- cvr + 1
}
cvr <- cvr/nruns
mnhsq <- mean(mnhsq)
list(cvr = cvr, up=up, mnhsq=mnhsq)
}
pressp <- function(x,y)
{# Makes the response plot for Poisson regression.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
#
#ESP <- x %*% out$coef[-1] + out$coef[1]
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson)
out <- glm(y~., family=poisson, data=data.frame(cbind(x,y)))
ESP <- predict(out)
Y <- y
plot(ESP,Y)
abline(mean(y),0)
fit <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx],fit[indx])
lines(lowess(ESP,y),type="s")
}
prplot<- function(x, y)
{# Makes response plot, OD plot, weighted forward response
# and residual plots for Poisson regression.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
#
#If q is changed, change the formula in the glm statement.
#q <- 5
# change formula to x[,1]+ ... + x[,q] with q
#out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson)
out <- glm(y~., family=poisson, data=data.frame(cbind(x,y)))
ESP <- predict(out)
par(mfrow = c(2, 2))
Y <- y
plot(ESP, Y)
abline(mean(Y), 0)
Ehat <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx], Ehat[indx])
lines(lowess(ESP, Y), type = "s")
title("a) Response Plot")
Vhat <- (Y - Ehat)^2
plot(Ehat, Vhat)
abline(0, 1)
abline(0, 4)
#lines(lowess(Ehat, Vhat))
#abline(lsfit(Ehat, Vhat)$coef)
title("b) OD Plot")
Z <- Y
Z[Y < 1] <- Z[Y < 1] + 0.5
WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1])
WFIT <- sqrt(Z) * log(Z) - WRES
plot(WFIT, sqrt(Z) * log(Z))
abline(0, 1)
#abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef)
title("c) WFRP")
plot(WFIT, WRES)
title("d) Wtd Residual Plot")
par(mfrow = c(1, 1))
}
prplot2 <- function(ESP, x, y)
{# Makes response plot and OD plot for Poisson regression.
#Can be used for a Poisson GAM or GLM.
# If t<-1:13
# y<-c(12,14,33,50,67,74,123,141,165,204,253,246,240)
# If out <- glm(y~t+I(t^2),family=poisson)
# use ESP <- predict(out).
# If outgam <- gam(y~s(t),poisson)
# use ESP <- predict.gam(outgam).
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
#WRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1]) #for a GLM
x<-as.matrix(x)
par(mfrow = c(2, 2))
Y <- y
plot(ESP, Y)
abline(mean(Y), 0)
Ehat <- exp(ESP)
indx <- sort.list(ESP)
lines(ESP[indx], Ehat[indx])
lines(lowess(ESP, Y), type = "s")
title("a) Response Plot")
Vhat <- (Y - Ehat)^2
plot(Ehat, Vhat)
abline(0, 1)
abline(0, 4)
#lines(lowess(Ehat, Vhat))
#abline(lsfit(Ehat, Vhat)$coef)
title("b) OD Plot")
Z <- Y
Z[Y < 1] <- Z[Y < 1] + 0.5
WRES <- sqrt(Z) * (log(Z) - ESP)
WFIT <- sqrt(Z) * log(Z) - WRES
plot(WFIT, sqrt(Z) * log(Z))
abline(0, 1)
#abline(lsfit(WFIT, sqrt(Z) * log(Z))$coef)
title("c) WFRP")
plot(WFIT, WRES)
title("d) Wtd Residual Plot")
par(mfrow = c(1, 1))
}
prsim<-function(n = 100, nruns = 1, type = 1, k = 1)
{# Runs prplot nruns times on simulated data.
# Type = 1 for Poisson regression data,
#Type = 2 for negative binomial regression data
# Calls prdata, oddata, prplot.
q <- 5
for(i in 1:nruns) {
  if(type == 1) out <- prdata(n, q) else out <- oddata(n, q, k)
  x <- out$x
  y <- out$y
  prplot(x, y)
  #identify(WFIT, WRES)
}
}
ratmn<-function(x, k1 = 6, k2 = 6)
{#robust 2 stage asymmetrically trimmed mean
madd <- mad(x, constant = 1)
med <- median(x)
LM <- sum(x < (med - k1 * madd))
nmUM <- sum(x > (med + k2 * madd))
n <- length(x)
# ll (hh) is the percentage to be trimmed to the left (right)
ll <- ceiling((100 * LM)/n)
hh <- ceiling((100 * (nmUM))/n)
tem <- sort(x)
ln <- floor((ll * n)/100)
un <- floor((n * (100 - hh))/100)
low <- ln + 1
val1 <- tem[low]
val2 <- tem[un]
mean(x[(x >= val1) & (x <= val2)])
}
rcisim<-function(n,runs = 500,type = 1,eps = 0.25,shift = 100,df = 1,
kaa = 6,kss = 3.5)
{# Used to simulate one CI at a time.
# type = 1: normal data, type = 2: contaminated normal data
# type = 3: t(df) data, type = 4: double exponential
# type = 5: exponential
if(type == 1) x <- matrix(rnorm(n * runs), nrow = runs, ncol = n)
if(type == 2) {
  x <- matrix(rnorm(n * runs), nrow = runs, ncol = n)
  x <- x + shift * matrix(rbinom(n * runs, 1, eps), nrow = runs, ncol = n)
}
if(type == 3) x <- matrix(rt(n * runs, df = df), nrow = runs, ncol = n)
if(type == 4) {
  x <- matrix(rexp(n * runs), nrow = runs, ncol = n)
  w <- matrix(rbinom(n * runs, 1, 0.5), nrow = runs, ncol = n)
  w <- 2 * w - 1
  x <- w * x
}
if(type == 5) x <- matrix(rexp(n * runs), nrow = runs, ncol = n)
cov <- 0
## change this value with each interval
epar <- 1.0
low <- 1:runs
up <- low
for(i in 1:runs) {
  ## change the following line with each interval
  out <- cci(x[i, ])
  low[i] <- out$int[1]
  up[i] <- out$int[2]
  if(type == 5) {
    if(low[i] < epar && up[i] > epar) cov <- cov + 1
  } else {
    if(low[i] < 0 && up[i] > 0) cov <- cov + 1
  }
}
cov <- cov/runs
len <- sqrt(n) * mean(up - low)
list(cov = cov, len = len)
}
rcovsim<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20,
outliers = T)
{# Need p > 1.
# This function demonstrates that covmba estimates mu
# but is slightly biased for sigma if n is small. The function rmba
# is better for small n. Similar results hold for covfch and rfch.
# R users need to type "library(MASS)."
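# Rough usage sketch (needs covmba, rmba, covfch from rpack and cov.mcd,
# so type library(MASS) in R): rcovsim(n=100, p=2, runs=20) returns the
# averaged location and dispersion estimates of the competing estimators.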
A <- sqrt(diag(1:p))
cloc <- 0 * (1:p)
csig <- 0 * A
mbaloc <- cloc
mbasig <- csig
rmbaloc <- cloc
rmbasig <- csig
fmcdloc <- cloc
fmcdsig <- csig
fchloc <- cloc
fchsig <- csig
rfchloc <- cloc
rfchsig <- csig
for(i in 1:runs) {
  x <- matrix(rnorm(n * p), ncol = p, nrow = n)
  # code below would give mean = (10, ..., 10)^T to the outliers
  # if(outliers == T) {
  # val <- floor(gam * n)
  # tem <- 10 + 0 * 1:p
  # x <- x %*% A
  # x[1:val, ] <- x[1:val, ] + tem
  # }
  # else {
  # x <- x %*% A
  # }
  ## code below: outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
  if(outliers == T) {
    val <- floor(gam * n)
    tem <- 10 + 0 * 1:p
    x[1:val, ] <- x[1:val, ] + tem
  }
  x <- x %*% A
  out <- covmba(x, csteps = steps)
  mbaloc <- mbaloc + out$center
  mbasig <- mbasig + out$cov
  out <- rmba(x)
  rmbaloc <- rmbaloc + out$center
  rmbasig <- rmbasig + out$cov
  cloc <- cloc + apply(x, 2, mean)
  csig <- csig + var(x)
  out <- cov.mcd(x)
  fmcdloc <- fmcdloc + out$center
  fmcdsig <- fmcdsig + out$cov
  out <- covfch(x, csteps = steps)
  fchloc <- fchloc + out$center
  fchsig <- fchsig + out$cov
  rfchloc <- rfchloc + out$rmnf
  rfchsig <- rfchsig + out$rcovf
}
mbaloc <- mbaloc/runs
mbasig <- mbasig/runs
rmbaloc <- rmbaloc/runs
rmbasig <- rmbasig/runs
cloc <- cloc/runs
csig <- csig/runs
fmcdloc <- fmcdloc/runs
fmcdsig <- fmcdsig/runs
fchloc <- fchloc/runs
fchsig <- fchsig/runs
rfchloc <- rfchloc/runs
rfchsig <- rfchsig/runs
list(mbaloc = mbaloc, rmbaloc = rmbaloc, fchloc = fchloc, rfchloc = rfchloc,
cloc = cloc, fmcdloc = fmcdloc, mbasig = mbasig, rmbasig = rmbasig,
fchsig = fchsig, rfchsig = rfchsig, csig = csig, fmcdsig = fmcdsig)
}
rcplot <-function(x, Y)
{# Makes an RC plot.
# Workstation need to activate a graphics
# device with command "X11()" or "motif()."
# Advance the view with the right mouse button, and in R, highlight "stop."
x <- as.matrix(x)
out <- lsfit(x, Y)
zz <- ls.diag(out)
CD <- zz$cooks
n <- dim(x)[1]
p <- dim(x)[2] + 1
RES <- out$resid
#tem <- zz$std.dev
#tem <- p * tem * (n - p)^2
#sortr <- sort(RES)
#quad <- (n * (sortr^2))/tem
plot(RES, CD)
#lines(sortr, quad)
identify(RES,CD)
title("RC Plot")
}
regboot<-function(x,y, B = 1000){
#bootstrap with residuals for MLR
#asymptotically the same as MSE (X'X)^(-1), so use to compare vselboot
#out <- regboot(belx,bely)
#ddplot4(out$betas) #this does not work for vselboot: too many 0s
#mplot(out$betas)
#cov(out$betas)
full <- lsfit(x,y)
res <- full$resid
fit <- y - res
n <- length(y)
x <- as.matrix(x)
p <- dim(x)[2]+1
betas <- matrix(0,nrow=B,ncol=p)
for(i in 1:B){
  yb <- fit + sample(res,n,replace=T)
  betas[i,] <- lsfit(x,yb)$coef
}
list(betas=betas)
}
regbootsim2<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1,
shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){
#Simulates residual bootstrap for regression (full model).
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
# constant = 1 so there are p = q+1 coefficients
#1 <= k <= p-1, k is the number of nonnoise variables
#Need k < p-1 so there are some zeroes in the model.
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j
# when the correlation exists.
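# e.g. q = 3 and psi = 0.9 give cor(X_i,X_j) = (1.8 + 0.81)/(1 + 1.62)
# = 0.996, nearly collinear predictors, while psi = 0 gives cor = 0.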
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y,B=BB) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 tem <- predreg(out$betas[,(k+2):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} regbootsim3<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, type = 1, psi=0.0, BB=1000, alph = 0.05){ ##Gets CIs and does test with pred reg, hybrid, and Bickel and Ren methods. #Simulates residual bootstrap for regression (full model). #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients #1 <= k <= p-1, k is the number of nonnoise variables #Need k < p-1 so there are some zeroes in the model. #need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. 
q <- p-1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp6<-p+6; pp5<-p+5; pp4<-p+4;pp3<-p+3; pp1<-p+1; pp2<-p+2 cicov <- 0*(1:pp6) avelen <- 0*(1:pp6) rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 one <- as.vector(0*1:(k+1) + 1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y,B=BB) #residual bootstrap bhat <- lsfit(x,y)$coef #bhat_OLS for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p-k-1 values of beta are 0 gg <- p - k - 1 tstat <- bhat[(k+2):p] tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph) if(tem$D0 <= tem$cuplim) #pred. reg. method cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim if(tem$br0 <= tem$cuplim) #hybrid method cicov[pp2] <- cicov[pp2] + 1 avelen[pp2] <- avelen[pp1] #same cutoff so same length if(tem$br0 <= tem$brlim) #Bickel and Ren method cicov[pp3] <- cicov[pp3] + 1 avelen[pp3] <- avelen[pp3] + tem$brlim #test whether the first k+1 values of beta are 1 gg <- k + 1 tstat <- bhat[1:(k+1)] tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph) D0 <- sqrt(mahalanobis(one, tem$center, tem$cov)) if(D0 <= tem$cuplim) #pred. reg. method cicov[pp4] <- cicov[pp4] + 1 avelen[pp4] <- avelen[pp4] + tem$cuplim D1 <- sqrt(mahalanobis(one, tstat, tem$cov)) if(D1 <= tem$cuplim) #hybrid method cicov[pp5] <- cicov[pp5] + 1 avelen[pp5] <- avelen[pp4] #same cutoff so same length if(D1 <= tem$brlim) #Bickel and Ren method cicov[pp6] <- cicov[pp6] + 1 avelen[pp6] <- avelen[pp6] + tem$brlim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} rmaha<-function(x) {# Need p > 1 or x a matrix. # Produces robust Mahalanobis distances (scaled for normal data). 
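# Usage sketch (cov.mcd needs library(MASS) in R):
# x <- matrix(rnorm(200), ncol = 2); rd <- rmaha(x)
# large values of rd flag potential outliers.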
p <- dim(x)[2] out <- cov.mcd(x) center <- out$center cov <- out$cov rd <- mahalanobis(x, center, cov) const <- sqrt(qchisq(0.5, p))/median(rd) return(const * sqrt(rd)) } rmba<-function(x, csteps = 5) {# gets the reweighted MBA estimator, works for p = 1 zx <- x x <- as.matrix(x) p <- dim(x)[2] ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) } if(p == 1) { mns <- mean(zx[md2 <= medd2]) } covs <- var(x[md2 <= medd2, ]) } covb <- covs mnb <- mns ##get the square root of det(covb) critb <- prod(diag(chol(covb))) ##get the MB estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) } if(p == 1) { mns <- mean(zx[md2 <= medd2]) } covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) } if(p == 1) { mns <- mean(zx[md2 <= medd2]) } covs <- var(x[md2 <= medd2, ]) } crit <- prod(diag(chol(covs))) if(crit < critb) { critb <- crit covb <- covs mnb <- mns } ##scale for better performance at MVN rd2 <- mahalanobis(x, mnb, covb) const <- median(rd2)/(qchisq(0.5, p)) covb <- const * covb ##reweight the above MBA estimator (mnb,covb) for efficiency rd2 <- mahalanobis(x, mnb, covb) up <- qchisq(0.975, p) if(p > 1) { rmnb <- apply(x[rd2 <= up, ], 2, mean) } if(p == 1){ rmnb = mean(zx[rd2 <= up]) } rcovb <- var(x[rd2 <= up, ]) rd2 <- mahalanobis(x, rmnb, rcovb) const <- median(rd2)/(qchisq(0.5, p)) rcovb <- const * rcovb ## reweight again rd2 <- mahalanobis(x, rmnb, rcovb) up <- qchisq(0.975, p) if(p > 1){ rmnb <- apply(x[rd2 <= up, ], 2, mean) } if(p == 1){ rmnb = mean(zx[rd2 <= up]) } rcovb <- var(x[rd2 <= up, ]) rd2 <- mahalanobis(x, rmnb, rcovb) const <- median(rd2)/(qchisq(0.5, p)) rcovb <- const * rcovb list(center = rmnb, cov = rcovb) } rmbsim <-function(n = 100, p = 2, csteps = 5, gam = 0.4, runs = 20, outliers = 0, pm = 10){# This R function simulates the MB and RMB estimators. 
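# Sketch: out <- rmbsim(n=100, p=2, outliers=3, pm=10) returns averaged
# location/scatter estimates and the counts cct, mbct, rmbct of runs where
# the outlier distances all exceeded the clean distances; needs covrmb.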
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a point mass on major axis (0,...,0,pm)' # 2 for outliers a point mass on minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # 6 for outliers a tight cluster at major axis (0,...,0,pm)' #set.seed(974) A <- sqrt(diag(1:p)) cloc <- 0 * (1:p) csig <- 0 * A mbloc <- cloc mbsig <- csig rmbloc <- cloc rmbsig <- csig cct <- 0 mbct <- 0 rmbct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- 0 x[1:val,p] <- pm } if(outliers == 2) { x[1:val, ] <- 0 x[1:val,1] <- pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p ] <- pm } if(outliers == 5) { x[1:val, 1 ] <- pm } if(outliers == 6) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } out <- covrmb(x, csteps = csteps) rmbloc <- rmbloc + out$center rmbsig <- rmbsig + out$cov rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmbct <- rmbct + 1 mbloc <- mbloc + out$mnm mbsig <- mbsig + out$covm rd2 <- mahalanobis(x, out$mnm, out$covm) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 cmn <- apply(x,2,mean) cloc <- cloc + cmn ccov <- var(x) csig <- csig + ccov rd2 <- mahalanobis(x, cmn, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 } cloc <- cloc/runs csig <- csig/runs mbloc <- mbloc/runs mbsig <- mbsig/runs rmbloc <- rmbloc/runs rmbsig <- rmbsig/runs list(cloc = cloc, mbloc = mbloc, rmbloc = rmbloc, csig = csig, mbsig=mbsig, rmbsig=rmbsig, cct = cct, mbct=mbct, rmbct=rmbct) } rmregboot<-function(x,y, B = 1000){ #bootstrap with residuals for MLR using rmreg3 #out <- rmregboot(belx,bely) #ddplot4(out$betas); ddplot5(out$betas) #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- rmreg3(x,yb)$Bhat } list(betas=betas) } rmregbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Very Slow. #Simulates residual bootstrap for rmreg3 with m = 1. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones, p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-rmregboot(x,y) #residual bootstrap using rmreg3 for (j in 1:p){ tem <- shorth2(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} rmreg2<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multivariate linear regression based on RMVN. # Here x contains the nontrivial predictors. # Advance the plot by highlighting Stop with the right mouse button. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= 
up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the multivariate regression after ellipsoidal trimming for(i in 1:m){ out <- lsfit(xw,yw[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } for(i in 1:m){ plot(fit[,i],y[,i]) abline(0, 1) title("Response Plot") identify(fit[,i],y[,i]) plot(fit[,i], res[,i]) title("Residual Plot") identify(fit[,i], res[,i]) } par(mfrow = c(1, 1)) par(mar=cmar) list(fit = fit, res = res, Bhat = Bhat) } rmreg3<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multiple linear regression based on RMVN. # Here x contains the nontrivial predictors. # This is rmreg2 except plots are not made. # Used by hbreg. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the multivariate regression after ellipsoidal trimming for(i in 1:m){ out <- lsfit(xw,yw[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } list(fit = fit, res = res, Bhat = Bhat) } robci <-function(x, alpha = 0.05, trmp = 0.25, ka = 6, ks = 3.5) {#Gets several robust 100 (1-alpha)% CI's for data x. 
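# e.g. robci(rnorm(100)) returns six 95% CIs: the classical tint, the
# T_A,n interval taint, the T_S,n interval tsint, two median intervals
# mint and vint, and the trimmed mean interval trint.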
#defaults are alpha = .05
n <- length(x)
up <- 1 - alpha/2
med <- median(x)
madd <- mad(x, constant = 1)
d <- sort(x)
dtem <- d
## get the CI for T_A,n
LM <- sum(x < (med - ka * madd))
nmUM <- sum(x > (med + ka * madd))
# ll (hh) is the percentage to be trimmed to the left (right)
ll <- ceiling((100 * LM)/n)
hh <- ceiling((100 * (nmUM))/n)
ln <- floor((ll * n)/100)
un <- floor((n * (100 - hh))/100)
low <- ln + 1
val1 <- dtem[low]
val2 <- dtem[un]
tstmn <- mean(x[(x >= val1) & (x <= val2)])
#have obtained the two stage asymmetrically trimmed mean
if(ln > 0) {
  d[1:ln] <- d[low]
}
if(un < n) {
  d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - low
rval <- qt(up, rdf) * sqrt(swv/n)
talo <- tstmn - rval
tahi <- tstmn + rval
##got low and high endpoints of robust T_A,n CI
##get robust T_S,n CI
d <- dtem
lo <- sum(x < (med - ks * madd))
hi <- sum(x > (med + ks * madd))
low <- ceiling((100 * lo)/n)
high <- ceiling((100 * hi)/n)
tp <- min(max(low, high)/100, 0.5)
tstmn <- mean(x, trim = tp)
#have obtained the two stage symmetrically trimmed mean
ln <- floor(n * tp)
un <- n - ln
if(ln > 0) {
  d[1:ln] <- d[(ln + 1)]
}
if(un < n) {
  d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
tslo <- tstmn - rval
tshi <- tstmn + rval
##got low and high endpoints of robust T_S,n CI
##get median CI that uses a scaled Winsorized variance (CI V)
d <- dtem
lnbg <- floor(n/2) - ceiling(sqrt(n/4))
unbg <- n - lnbg
lowbg <- lnbg + 1
if(lnbg > 0) {
  d[1:lnbg] <- d[(lowbg)]
}
if(unbg < n) {
  d[(unbg + 1):n] <- d[unbg]
}
den <- ((unbg - lnbg)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- unbg - lnbg - 1
cut <- qt(up, rdf)
rval <- cut * sqrt(swv/n)
rlo <- med - rval
rhi <- med + rval
##got median CI that uses a scaled Winsorized variance
##get MED CI that uses BG SE
se2 <- 0.5 * (d[unbg] - d[lowbg])
rval <- cut * se2
rlo2 <- med - rval
rhi2 <- med + rval
#got low and high endpoints of BG CI
## get classical CI
mn <- mean(x)
v <- var(x)
se <- sqrt(v/n)
val <- qt(up, n - 1) * se
lo <- mn - val
hi <- mn + val
##got classical CI endpoints
## get trimmed mean CI
d <- dtem
ln <- floor(n * trmp)
un <- n - ln
trmn <- mean(x, trim = trmp)
if(ln > 0) {
  d[1:ln] <- d[(ln + 1)]
}
if(un < n) {
  d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
trlo <- trmn - rval
trhi <- trmn + rval
##got trimmed mean CI endpoints
list(tint = c(lo, hi), taint = c(talo, tahi), tsint = c(tslo, tshi),
mint = c(rlo2, rhi2), vint = c(rlo, rhi), trint = c(trlo, trhi))
}
rrplot<-function(x, y, nsamps = 7){
# For Splus since R does not have l1fit.
# In Unix, use X11() to turn on the graphics device before
# using this function.
# Makes an RR plot. Needs hbreg and mbareg.
n <- length(y)
x <- as.matrix(x)
rmat <- matrix(nrow = n, ncol = 6)
lsres <- lsfit(x, y)$residuals
print("got OLS")
l1res <- l1fit(x, y)$residuals
print("got L1")
almsres <- lmsreg(x, y)$resid
print("got ALMS")
altsres <- ltsreg(x, y)$residuals
print("got ALTS")
mbacoef <- mbareg(x, y, nsamp = nsamps)$coef
MBAres <- y - mbacoef[1] - x %*% mbacoef[-1]
print("got MBA")
bbcoef <- hbreg(x, y)$bbcoef
BBres <- y - bbcoef[1] - x %*% bbcoef[-1]
print("got BB")
rmat[, 1] <- lsres
rmat[, 2] <- l1res
rmat[, 3] <- almsres
rmat[, 4] <- altsres
rmat[, 5] <- MBAres
rmat[, 6] <- BBres
pairs(rmat, labels = c("OLS residuals", "L1 residuals", "ALMS residuals",
"ALTS residuals", "MBA residuals", "BB residuals"))
}
rrplot2<-function(x, y, nsamps = 7)
{# In Unix, use X11() to turn on the graphics device before
# using this function. For R, first type library(MASS).
# Makes an RR plot. Needs hbreg, mbareg, mbalata, and rmreg3.
n <- length(y)
x <- as.matrix(x)
rmat <- matrix(nrow = n, ncol = 7)
lsres <- lsfit(x, y)$residuals
print("got OLS")
almsres <- lmsreg(x, y)$resid
print("got ALMS")
altsres <- ltsreg(x, y)$residuals
print("got ALTS")
mbacoef <- mbareg(x, y, nsamp = nsamps)$coef
MBAres <- y - mbacoef[1] - x %*% mbacoef[-1]
print("got MBA")
bbcoef <- hbreg(x, y)$bbcoef
BBres <- y - bbcoef[1] - x %*% bbcoef[-1]
print("got BB")
mbalcoef <- mbalata(x, y, nsamp = nsamps)$coef
MBALATA <- y - mbalcoef[1] - x %*% mbalcoef[-1]
print("got MBALATA")
RMREG2 <- rmreg3(x,y)$res
print("got RMREG2")
rmat[, 1] <- lsres
rmat[, 2] <- almsres
rmat[, 3] <- altsres
rmat[, 4] <- MBAres
rmat[, 5] <- BBres
rmat[, 6] <- MBALATA
rmat[, 7] <- RMREG2
pairs(rmat, labels = c("OLS resid", "ALMS resid", "ALTS resid",
"MBA resid", "BB resid", "MBALATA", "RMREG2"))
}
rstmn<-function(x, k1 = 6, k2=6)
{#robust symmetrically trimmed 2 stage mean
#truncates too many cases when the contamination is asymmetric
madd <- mad(x, constant = 1)
med <- median(x)
LM <- sum(x < (med - k1 * madd))
nmUM <- sum(x > (med + k2 * madd))
n <- length(x)
#ll (hh) is the percentage trimmed to the left (right)
# tp is the trimming proportion
ll <- ceiling((100 * LM)/n)
hh <- ceiling((100 * nmUM)/n)
tp <- min(max(ll, hh)/100, 0.5)
mean(x, trim = tp)
}
sctplt<-function(n = 10,p = 2,steps = 5,gam = 0.4,outliers = 0,pm = 10,jit = F)
{# Use for 1 < p < 11.
# This R function makes a DD plot and scatterplot matrix
# for sample data set from function mldsim.
# Click left mouse button to identify points.
# Click right mouse button after examining the DD plot
# and in R, highlight "stop".
#
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a point mass on major axis (0,...,0,pm)'
# 2 for outliers a point mass on minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
#
#generate data set
A <- sqrt(diag(1:p))
cloc <- 0 * (1:p)
csig <- 0 * A
val <- floor(gam * n)
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
  x[1:val, ] <- 0
  x[1:val,p] <- pm
}
if(outliers == 2) {
  x[1:val, ] <- 0
  x[1:val,1] <- pm
}
if(outliers == 3) {
  val <- floor(gam * n)
  tem <- pm + 0 * 1:p
  x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
  x[1:val, p ] <- pm
}
if(outliers == 5) {
  x[1:val, 1 ] <- pm
}
ddplot2(x)
par(mfrow=c(1,1))
z <- x
if(jit == T){
  z <- x + matrix(runif(n * p, max=0.4), ncol = p, nrow = n)
}
pairs(z)
}
shorth<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = ceiling[n(1-alpha)].
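# e.g. shorth(rnorm(1000)) returns Ln and Un, the endpoints of the
# shortest interval covering cc = ceiling(1000*0.95) = 950 of the
# sorted values.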
n <- length(y)
cc <- ceiling(n * (1 - alpha))
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
  for(j in (cc + 1):n){
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
list(Ln = rlow, Un = rup)
}
shorth3<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = cc.
#shorth lists Ln and Un using Frey's correction
n <- length(y)
cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n)))
cc <- min(n,cc)
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
  for(j in (cc + 1):n){
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
Ln <- rlow; Un <- rup
list(shorth=c(Ln, Un))
}
sir<-function(x, y, h)
{# Obtained from STATLIB. Contributed by Thomas Koetter.
# Calculates the effective dimension-reduction (e.d.r.)
# directions by Sliced Inverse Regression (K.C. Li 1991, JASA 86, 316-327)
#
# Input: x n x p matrix, explanatory variable
# y n x 1 vector, dependent variable
# h scalar: if h >= 2 number of slices
# if h <= -2 number of elements within a slice
# 0 < h < 1 width of a slice: h = slicewidth / range
#
# Output: list(edr, evalues)
# edr p x p matrix, estimates for the e.d.r. directions
# evalues p x 1 vector, the eigenvalues to the directions
#
# written by Thomas Koetter (thomas@wiwi.hu-berlin.de) 1995
# last modification: 7/18/95
# based on the implementation in XploRe
# a full description of the XploRe program can be found in (chapter 11)
# 'XploRe: An interactive statistical computing environment',
# W. Haerdle, S. Klinke, B.A. Turlach, Springer, 1995
#
# This software can be freely used for non-commercial purposes and freely
# distributed.
#+-----------------------------------------------------------------------------+
#| Thomas Koetter |
#| Institut fuer Statistik und Oekonometrie |
#| Fakultaet Wirtschaftswissenschaften |
#| Humboldt-Universitaet zu Berlin, 10178 Berlin, GERMANY |
#+-----------------------------------------------------------------------------+
#| Tel. voice: +49 30 2468-321 |
#| Tel. FAX: +49 30 2468-249 |
#| E-mail: thomas@wiwi.hu-berlin.de |
#+-----------------------------------------------------------------------------+
n <- nrow(x)
ndim <- ncol(x)
if(n != length(c(y))) {
  stop("length of y doesn't match the number of rows of x !!")
}
if( - h > n) {
  stop("Number of elements within slices can't exceed number of data !!")
}
# standardize the x variable to z (mean 0 and cov I)
xb <- apply(x, 2, mean)
si2 <- solve(chol(var(x)))
xt <- (x - matrix(xb, nrow(x), ncol(x), byrow = T)) %*% si2
# sort the data regarding y.
# x values are now packed into slices
ord1 <- order(y)
data <- cbind(y[ord1], xt[ord1, ])
# determine slicing strategy
if(h <= -2) {
  # abs(h) is number of elements per slice
  h <- abs(h)
  ns <- floor(n/h)
  condit <- 1:n
  choice <- (1:ns) * h
  # if there are observations left, add them to the first and last slice
  if(h * ns != n) {
    hk <- floor((n - h * ns)/2)
    choice <- choice + hk
    choice[ns] <- n # to avoid numerical problems
  }
} else if(h >= 2) {
  # h is number of slices
  ns <- h
  slwidth <- (data[n, 1] - data[1, 1])/ns
  slend <- seq(data[1, 1] + slwidth, length = ns, by = slwidth)
  slend[ns] <- data[n, 1]
  condit <- c(data[, 1])
  choice <- slend
} else if((0 < h) && (h < 1)) {
  # h is width of a slice divided by the range of y
  ns <- floor(1/h)
  slwidth <- (data[n, 1] - data[1, 1]) * h
  slend <- seq(data[1, 1] + slwidth, length = ns, by = slwidth)
  slend[ns] <- data[n, 1] # to avoid numerical problems
  condit <- c(data[, 1])
  choice <- slend
} else stop("values of third parameter not valid")
v <- matrix(0, ndim, ndim) # estimate for Cov(E[z|y])
ind <- rep(T, n) # index for already sliced elements
ndim <- ndim + 1
j <- 1 # loop counter
while(j <= ns) {
  sborder <- (condit <= choice[j]) & ind # index of slice j
  if(any(sborder)) {
    # are there elements in slice j ?
    ind <- ind - sborder
    xslice <- data[sborder, 2:ndim]
    if(sum(sborder) == 1) {
      # xslice is a vector !
      xmean <- xslice
      v <- v + outer(xmean, xmean, "*")
    } else {
      xmean <- apply(xslice, 2, mean)
      v <- v + outer(xmean, xmean, "*") * nrow(xslice)
    }
  }
  j <- j + 1
}
if(any(ind)) {
  print("Error: elements unused !!")
  print(ind)
}
v <- (v + t(v))/(2 * n) # to prevent numerical errors (v is symmetric)
eig <- eigen(v)
b <- si2 %*% eig$vectors # estimates for e.d.r. directions
data <- sqrt(apply(b * b, 2, sum))
b <- t(b)/data
return(list(edr = t(b), evalues = eig$values))
}
sirviews<-function(x, Y, ii = 10)
{
# Uses the STATLIB function "sir."
# Trimmed views for 0, 10, ..., 90 percent trimming.
# Use ii = 10 for 0, ii = 9 for 10 then 0, ...,
# ii = 1 for 90 then 80 ..., then 0 percent trimming.
# Allows visualization of m
# and crude estimation of c beta in models
# of the form y = m(x^T beta) + e.
# beta is obtained from SIR.
# Workstation need to activate a graphics
# device with command "X11()" or "motif()."
# Advance the view with the right mouse button, and
# in R, highlight "stop."
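# Sketch (needs the sir and covfch functions): sirviews(x, Y, ii = 10)
# shows only the 0% trimmed view; smaller ii adds the heavier trimming
# proportions first.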
x <- as.matrix(x)
q <- dim(x)[2]
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%")
tem <- seq(0.1, 1, 0.1)
h <- q + 7
for(i in ii:10) {
  val <- quantile(rd2, tem[i])
  b <- sir(x[rd2 <= val, ], Y[rd2 <= val], h)$edr[, 1]
  ESP <- x %*% b
  plot(ESP, Y)
  title(labs[i])
  identify(ESP, Y)
  print(b)
}
}
stmci<-function(x, alpha = 0.05, ks = 3.5)
{#gets se for sample median and the corresponding robust 100 (1-alpha)% CI
#defaults are alpha = .05
n <- length(x)
up <- 1 - alpha/2
med <- median(x)
madd <- mad(x, constant = 1)
lo <- sum(x < (med - ks * madd))
hi <- sum(x > (med + ks * madd))
low <- ceiling((100 * lo)/n)
high <- ceiling((100 * hi)/n)
tp <- min(max(low, high)/100, 0.5)
tstmn <- mean(x, trim = tp)
#have obtained the two stage symmetrically trimmed mean
ln <- floor(n * tp)
un <- n - ln
d <- sort(x)
if(ln > 0) {
  d[1:ln] <- d[(ln + 1)]
  d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
tslo <- tstmn - rval
tshi <- tstmn + rval
list(int = c(tslo, tshi), tp = tp)
}
symviews<-function(x, Y)
{# Need number of predictors > 1.
# Makes trimmed views for 90, 80, ..., 0
# percent trimming and sometimes works even if m
# is symmetric about E(x^t beta) where
# y = m(x^T beta ) + e.
# For work stations, activate a graphics
# device with command "X11()" or "motif()."
# Use the rightmost mouse button to advance
# the view, and in R, highlight ``stop."
x <- as.matrix(x)
tem <- seq(0.1, 1, 0.1)
bols <- lsfit(x, Y)$coef
fit <- x %*% bols[-1]
temx <- x[fit > median(fit), ]
temy <- Y[fit > median(fit)]
out <- covmba(temx)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(temx, center, cov)
for(i in 1:10) {
  val <- quantile(rd2, tem[i])
  bhat <- lsfit(temx[rd2 <= val, ], temy[rd2 <= val])$coef
  ESP <- x %*% bhat[-1] + bhat[1]
  plot(ESP, Y)
  identify(ESP, Y)
  print(bhat)
}
}
tmci<-function(x, alpha = 0.05, tp = 0.25)
{#gets se for the tp trimmed mean and the corresponding robust 100 (1-alpha)% CI
#defaults are alpha = .05
n <- length(x)
up <- 1 - alpha/2
tmn <- mean(x, trim = tp)
ln <- floor(n * tp)
un <- n - ln
d <- sort(x)
if(ln > 0) {
  d[1:ln] <- d[(ln + 1)]
  d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
tmlo <- tmn - rval
tmhi <- tmn + rval
list(int = c(tmlo, tmhi), tp = tp)
}
tmn<-function(x, tp = 0.25)
{# computes 100tp% trimmed mean
mean(x, trim = tp)
}
tplot<-function(x, y){
# Use the rightmost mouse button to advance
# the plot, and in R, highlight ``stop."
# Values of y should be positive.
# Assume the MLR model contains a constant, but
# x is the design matrix without the vector of ones.
# This function makes transformation plots from the MLR
# of y^L on x where L = -1, -0.5, -1/3, 0, 1/3, 0.5, or 1
# except log(Y) is used if L = 0.
# If plot is linear for L, use y^L (or log(Y) for L = 0) instead
# of Y. This is a graphical method for a response transform.
lam <- c(-1, -1/2, -1/3, 0, 1/3, 1/2, 1)
xl <- c("1/Z", "1/sqrt(Z)", "Z**(-1/3)", "LOG(Z)", "Z**(1/3)", "sqrt(Z)", "Z")
for(i in 1:length(lam)) {
  if(lam[i] == 0) ytem <- log(y) else ytem <- y^lam[i]
  fit <- ytem - lsfit(x,ytem)$resid
  plot(fit, ytem, xlab = "TZHAT", ylab = xl[i])
  abline(0, 1)
  identify(fit, ytem)
}
}
Tplt<-function(x, y)
{# For Unix, use X11() to turn on the graphics device before using this function.
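# Note: unlike tplot, the loop below uses the scaled power transform
# (y^L - 1)/L for L != 0, 1, so eleven candidate transformations are shown.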
Tplt<-function(x, y){
# For Unix, use X11() to turn on the graphics device before
# using this function.
# This function plots y^L vs the OLS fit. If the plot is linear
# for L, use y^L instead of y.
# This is a graphical method for a response transformation.
  olsfit <- y - lsfit(x, y)$resid
  lam <- c(-1, -2/3, -1/2, -1/3, -1/4, 0, 1/4, 1/3, 1/2, 2/3, 1)
  xl <- c("Y**(-1)", "Y**(-2/3)", "Y**(-0.5)", "Y**(-1/3)",
    "Y**(-1/4)", "LOG(Y)", "Y**(1/4)", "Y**(1/3)", "Y**(1/2)",
    "Y**(2/3)", "Y")
  for(i in 1:length(lam)) {
    if(lam[i] == 0) ytem <- log(y)
    else if(lam[i] == 1) ytem <- y
    else ytem <- (y^lam[i] - 1)/lam[i]
    plot(olsfit, ytem, xlab = "YHAT", ylab = xl[i])
    abline(lsfit(olsfit, ytem)$coef)
    identify(olsfit, ytem)
  }
}
trviews<-function(x, Y, ii = 1, type = 3){
# Trimmed views for 0, 10, ..., 90 percent trimming.
# Use ii = 10 for 0, ii = 9 for 10 then 0, ...,
# ii = 1 for 90 then 80 ..., then 0 percent trimming.
# Allows visualization of m and crude estimation of
# c beta in models of the form y = m(x^T beta) + e.
# Workstation: activate a graphics device
# with the command "X11()" or "motif()."
# R needs the command "library(MASS)."
# Advance the view with the right mouse button and,
# in R, highlight "stop."
  x <- as.matrix(x)
  if(type == 1) out <- cov.mcd(x)
  if(type == 2) out <- covmba(x)
  if(type == 3) out <- covfch(x)
  if(type == 4) out <- covrmvn(x)
  if(type <= 4){
    center <- out$center
    cov <- out$cov
  }
  #type = 5 for the rfch estimator
  if(type == 5){
    out <- covfch(x)
    center <- out$rmnf
    cov <- out$rcovf
  }
  rd2 <- mahalanobis(x, center, cov)
  labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%",
    "10%", "0%")
  tem <- seq(0.1, 1, 0.1)
  for(i in ii:10) {
    val <- quantile(rd2, tem[i])
    bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
    ESP <- x %*% bhat[-1] + bhat[1]
    plot(ESP, Y)
    title(labs[i])
    identify(ESP, Y)
    print(bhat)
  }
}
tvreg<-function(x, Y, ii = 1, type = 4){
# Trimmed views (TV) regression for 0, 10, ..., 90 percent
# trimming. Use ii = 10 for 0, ii = 9 for 10 then 0, ...,
# ii = 1 for 90 then 80 ..., then 0 percent trimming.
# Workstation: activate a graphics device
# with the command "X11()" or "motif()."
# R needs the command "library(MASS)."
# Advance the view with the right mouse button and,
# in R, highlight "stop."
  x <- as.matrix(x)
  if(type == 1) out <- cov.mcd(x)
  if(type == 2) out <- covmba(x)
  if(type == 3) out <- covfch(x)
  if(type > 3) out <- covrmvn(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%",
    "10%", "0%")
  tem <- seq(0.1, 1, 0.1)
  for(i in ii:10) {
    val <- quantile(rd2, tem[i])
    bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
    FIT <- x %*% bhat[-1] + bhat[1]
    plot(FIT, Y)
    abline(0, 1)
    title(labs[i])
    identify(FIT, Y)
    print(bhat)
  }
}
tvreg2<-function(X, Y, M = 0, type = 4){
# Trimmed views regression for M percent trimming.
# Workstation: activate a graphics device
# with the command "X11()" or "motif()."
# R needs the command "library(MASS)."
  X <- as.matrix(X)
  if(type == 1) out <- cov.mcd(X)
  if(type == 2) out <- covmba(X)
  if(type == 3) out <- covfch(X)
  if(type > 3) out <- covrmvn(X)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(X, center, cov)
  tem <- (100 - M)/100
  val <- quantile(rd2, tem)
  bhat <- lsfit(X[rd2 <= val, ], Y[rd2 <= val])$coef
  FIT <- X %*% bhat[-1] + bhat[1]
  plot(FIT, Y)
  abline(0, 1)
  identify(FIT, Y)
  list(coef = bhat)
}
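##A minimal sketch for trviews, kept as comments. Assumes covfch (the
##default type = 3) has been loaded from this file; use library(MASS)
##first if type = 1. The cubic single index model is an illustrative
##assumption.
#x <- matrix(rnorm(300), ncol = 3)
#y <- as.vector(x %*% c(1, 1, 1))^3 + rnorm(100)
#trviews(x, y, ii = 1, type = 3)  #the printed bhat crudely estimates c beta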
vecw <- function(z){
#Computes vec(z): stacks the columns of z into a column vector.
  z <- as.matrix(z)
  vecz <- as.vector(z[,1])
  p <- dim(z)[2]
  if(p > 1){
    for(i in 2:p)
      vecz <- c(vecz, as.vector(z[,i]))
  }
  list(vecz = vecz)
}
vsbootsim3<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1,
  shift = 9, type = 1, psi = 0.0, BB = 1000, alph = 0.05){
#Needs library(leaps). Has psi and uses the prediction region method.
#PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS.
#Simulates the bootstrap for forward selection variable selection.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors,
# 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
#The constant = 1 so there are p = q+1 coefficients.
#1 <= k <= p-1 where k is the number of nonnoise variables.
#Need k < p-1 so there are some zeroes in the model.
#Need p > 2; beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and
#p-k-1 zeroes.
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2] for i not = j
# when the correlation exists.
  q <- p - 1
  b <- 0 * 1:q
  b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
  beta <- c(1, b)
  pp1 <- p + 1
  cicov <- 0*(1:pp1)
  avelen <- 0*(1:pp1)
  rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
  A <- matrix(psi, nrow = q, ncol = q)
  diag(A) <- 1
  for(i in 1:nruns) {
    x <- matrix(rnorm(n * q), nrow = n, ncol = q)
    x <- x %*% A
    if(type == 1) {
      y <- 1 + x %*% b + rnorm(n)
    }
    if(type == 2) {
      y <- 1 + x %*% b + rt(n, df = 3)
    }
    if(type == 3) {
      y <- 1 + x %*% b + rexp(n) - 1
    }
    if(type == 4) {
      y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    }
    if(type == 5) {
      err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
      y <- 1 + x %*% b + err
    }
    #make an MLR data set
    out <- fselboot(x, y, B = BB) #bootstrap the forward sel minimum Cp model
    for(j in 1:p){
      tem <- shorth3(out$betas[,j], alpha = alph)
      #tem <- locpi2(out$betas[,j],alpha=alph)
      #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2])
      if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
        cicov[j] <- cicov[j] + 1
      avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
      #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1]
    }
    #test whether the last p-k-1 values of beta are 0
    tem <- predreg(out$betas[,(k+2):p], alpha = alph)
    if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1
    avelen[pp1] <- avelen[pp1] + tem$cuplim
  }
  cicov <- cicov/nruns
  avelen <- avelen/nruns
  list(cicov = cicov, avelen = avelen, beta = beta, k = k)
}
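##The cor(X_i,X_j) formula in the comments of vsbootsim3 above (and of
##vsbootsim5 below) can be checked empirically; this sketch, kept as
##comments, simulates a large sample under the same x <- x %*% A design:
#q <- 3; psi <- 0.5
#A <- matrix(psi, nrow = q, ncol = q); diag(A) <- 1
#x <- matrix(rnorm(100000 * q), ncol = q) %*% A
#cor(x)[1, 2]                             #sample correlation
#(2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)  #claimed value, about 0.833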
vsbootsim5<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1,
  shift = 9, cc = 0.01, augm = F, type = 1, psi = 0.0, BB = 1000,
  alph = 0.05){
#Needs library(leaps). Has psi and uses 3 confidence regions.
#Gets CIs and does the tests with the pred reg, hybrid, and
#Bickel and Ren methods.
#PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS.
#Simulates the bootstrap for forward selection variable selection
#and bhatMIX. Needs fselboot2 and confreg.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors,
# 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
#If augm is not F, adds cB full model bootstrap samples so that S*_T
#is better conditioned.
#The constant = 1 so there are p = q+1 coefficients.
#1 <= k <= p-1 where k is the number of nonnoise variables.
#Need k < p-1 so there are some zeroes in the model.
#Need p > 2; beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and
#p-k-1 zeroes.
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2] for i not = j
# when the correlation exists.
  q <- p - 1
  b <- 0 * 1:q
  b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
  beta <- c(1, b)
  pp1 <- p+1; pp2 <- p+2; pp3 <- p+3; pp4 <- p+4; pp5 <- p+5; pp6 <- p+6
  cicov <- 0*(1:pp6)
  avelen <- 0*(1:pp6)
  cicovmix <- cicov
  avelenmix <- avelen
  rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
  A <- matrix(psi, nrow = q, ncol = q)
  diag(A) <- 1
  one <- as.vector(0*1:(k+1) + 1)
  for(i in 1:nruns) {
    x <- matrix(rnorm(n * q), nrow = n, ncol = q)
    x <- x %*% A
    if(type == 1) {
      y <- 1 + x %*% b + rnorm(n)
    }
    if(type == 2) {
      y <- 1 + x %*% b + rt(n, df = 3)
    }
    if(type == 3) {
      y <- 1 + x %*% b + rexp(n) - 1
    }
    if(type == 4) {
      y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    }
    if(type == 5) {
      err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
      y <- 1 + x %*% b + err
    }
    #make an MLR data set
    out <- fselboot2(x, y, B = BB, c = cc, aug = augm)
    #bootstrap the forward sel minimum Cp model
    for(j in 1:p){
      tem <- shorth3(out$betas[,j], alpha = alph)
      if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
        cicov[j] <- cicov[j] + 1
      avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
      tem <- shorth3(out$btmix[,j], alpha = alph)
      if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
        cicovmix[j] <- cicovmix[j] + 1
      avelenmix[j] <- avelenmix[j] + tem$shorth[2] - tem$shorth[1]
    }
    #test whether the last p-k-1 values of beta are 0
    gg <- p - k - 1
    tstat <- out$bhatimin0[(k+2):p]
    tem <- confreg(out$betas[,(k+2):p], g = gg, that = tstat, alpha = alph)
    if(tem$D0 <= tem$cuplim) #pred. reg. method
      cicov[pp1] <- cicov[pp1] + 1
    avelen[pp1] <- avelen[pp1] + tem$cuplim
    if(tem$br0 <= tem$cuplim) #hybrid method
      cicov[pp2] <- cicov[pp2] + 1
    avelen[pp2] <- avelen[pp1] #same cutoff so same length
    if(tem$br0 <= tem$brlim) #Bickel and Ren method
      cicov[pp3] <- cicov[pp3] + 1
    avelen[pp3] <- avelen[pp3] + tem$brlim
    tem <- confreg(out$btmix[,(k+2):p], g = gg, that = tstat, alpha = alph)
    if(tem$D0 <= tem$cuplim) #pred. reg. method
      cicovmix[pp1] <- cicovmix[pp1] + 1
    avelenmix[pp1] <- avelenmix[pp1] + tem$cuplim
    if(tem$br0 <= tem$cuplim) #hybrid method
      cicovmix[pp2] <- cicovmix[pp2] + 1
    avelenmix[pp2] <- avelenmix[pp1] #same cutoff so same length
    if(tem$br0 <= tem$brlim) #Bickel and Ren method
      cicovmix[pp3] <- cicovmix[pp3] + 1
    avelenmix[pp3] <- avelenmix[pp3] + tem$brlim
    #test whether the first k+1 values of beta are 1
    gg <- k + 1
    tstat <- out$bhatimin0[1:(k+1)]
    tem <- confreg(out$betas[,1:(k+1)], g = gg, that = tstat, alpha = alph)
    D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
    if(D0 <= tem$cuplim) #pred. reg. method
      cicov[pp4] <- cicov[pp4] + 1
    avelen[pp4] <- avelen[pp4] + tem$cuplim
    D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
    if(D1 <= tem$cuplim) #hybrid method
      cicov[pp5] <- cicov[pp5] + 1
    avelen[pp5] <- avelen[pp4] #same cutoff so same length
    if(D1 <= tem$brlim) #Bickel and Ren method
      cicov[pp6] <- cicov[pp6] + 1
    avelen[pp6] <- avelen[pp6] + tem$brlim
    tem <- confreg(out$btmix[,1:(k+1)], g = gg, that = tstat, alpha = alph)
    D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
    if(D0 <= tem$cuplim) #pred. reg. method
      cicovmix[pp4] <- cicovmix[pp4] + 1
    avelenmix[pp4] <- avelenmix[pp4] + tem$cuplim
    D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
    if(D1 <= tem$cuplim) #hybrid method
      cicovmix[pp5] <- cicovmix[pp5] + 1
    avelenmix[pp5] <- avelenmix[pp4] #same cutoff so same length
    if(D1 <= tem$brlim) #Bickel and Ren method
      cicovmix[pp6] <- cicovmix[pp6] + 1
    avelenmix[pp6] <- avelenmix[pp6] + tem$brlim
  }
  cicov <- cicov/nruns
  avelen <- avelen/nruns
  cicovmix <- cicovmix/nruns
  avelenmix <- avelenmix/nruns
  list(cicov = cicov, avelen = avelen, cicovmix = cicovmix,
    avelenmix = avelenmix, beta = beta, k = k)
}
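##A minimal sketch for vsbootsim5, kept as comments. Assumes library(leaps)
##and that fselboot2, confreg, and shorth3 have been loaded from this file;
##the small nruns and BB below are only to keep the run time down.
#library(leaps)
#out <- vsbootsim5(n = 100, p = 4, k = 1, nruns = 10, BB = 200)
#out$cicov   #observed coverages: p CIs, then the tests by the three methods
#out$avelen  #average CI lengths and cutoffs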
vselboot<-function(x, y, B = 1000){
#Needs library(leaps).
#Need n and p small, 2 < p < 30.
#All subsets minimum Cp regression.
#It does not make sense to do variable selection if there
#is only one nontrivial predictor.
  x <- as.matrix(x)
  n <- length(y)
  p <- 1 + dim(x)[2]
  vars <- as.vector(1:(p-1))
  #get the full model
  full <- lsfit(x, y)
  res <- full$resid
  fit <- y - res
  #get the minimum Cp submodel
  out <- leaps(x, y)
  mincp <- out$which[out$Cp == min(out$Cp)]
  vin <- vars[mincp]
  sub <- lsfit(x[,vin], y)
  betas <- matrix(0, nrow = B, ncol = p)
  #bootstrap the minimum Cp submodel
  for(i in 1:B){
    yb <- fit + sample(res, n, replace = T)
    out <- leaps(x, y = yb)
    mincp <- out$which[out$Cp == min(out$Cp)]
    vin <- vars[mincp]
    indx <- c(1, 1 + vin)
    betas[i, indx] <- lsfit(x[,vin], yb)$coef
  }
  list(full = full, sub = sub, betas = betas)
}
wddplot<-function(x){
# Shows the southwest corner of the DD plot. Need p > 1.
  n <- dim(x)[1]
  wt <- 0 * (1:n)
  p <- dim(x)[2]
  center <- apply(x, 2, mean)
  cov <- var(x)
  md2 <- mahalanobis(x, center, cov)
  out <- covfch(x)
  center <- out$center
  cov <- out$cov
  rd2 <- mahalanobis(x, center, cov)
  md <- sqrt(md2)
  rd <- sqrt(rd2)
  const <- sqrt(qchisq(0.5, p))/median(rd)
  rd <- const * rd
  wt[rd < sqrt(qchisq(0.975, p))] <- 1
  MD <- md[wt > 0]
  RD <- rd[wt > 0]
  plot(MD, RD)
}
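##Minimal sketches for vselboot and wddplot, kept as comments. vselboot
##needs library(leaps); wddplot needs covfch from this file. The data and
##the percentile intervals are illustrative assumptions.
#library(leaps)
#x <- matrix(rnorm(300), ncol = 3)
#y <- 1 + x[, 1] + rnorm(100)
#out <- vselboot(x, y, B = 200)  #bootstrap the minimum Cp submodel
#apply(out$betas, 2, quantile, probs = c(0.025, 0.975))
#wddplot(x)  #for clean MVN data the points should cluster about the identity line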