*R code for high dimensinal statistics homework problems; Copy and paste code one problem part at a time. The easiest way to get slpack.txt and sldata.txt is to copy and paste the following two commands into R. source("http://parker.ad.siu.edu/Olive/slpack.txt") source("http://parker.ad.siu.edu/Olive/sldata.txt") #The following commands are often used. library(glmnet) library(pls) library(leaps) #If slpack.txt and sldata.txt are on J drive, get lregpack and lregdata with #the following commands: #source("J:/slpack.txt") #source("J:/sldata.txt") You will often need packages and may need to install packages the first time you use a given computer. install.packages("glmnet") install.packages("pls") install.packages("leaps") install.packages("e1071") install.packages("randomForest") install.packages("ISLR") #install.packages("gbm") #install.packages("tree") #install.packages("ROCR") #These commands can fail if your computer has a firewall. #Then an administrator may be needed to install the packages. ##HW1 E) Problem 1.10 a) n<-100000; q<-7 b <- 0 * 1:q + 1 x <- matrix(rnorm(n * q), nrow = n, ncol = q) y <- 1 + x %*% b + rnorm(n) out<-lsfit(x,y) res<-out$res yhat<-y-res dd<-length(out$coef) AERplot(yhat,y,res=res,d=dd,alph=1) #usual response plot # 1.10 b) AERplot(yhat,y,res=res,d=dd,alph=0.01) #plots data outside the 99% pointwise PIs ##HW2 A) a) x<-trees[,-3] vol<-trees[,3] zw<-cbind(x,vol) #pairs(zw) OPLSplot(x,vol) #right click Stop on the 1st plot and then the second plot ##HW2 A) b) OPLSEEplot(x,vol) #right click Stop on the plot ##HW2 A) c) rcovxy(x,vol) ##HW2 B) a) x<-matrix(rnorm(10000),nrow=100,ncol=100) y <- rnorm(100) out<-covxycis(x,y) plot(out$cis[,1],ylim=c(-0.5,0.5)) points(out$cis[,2],pch=3) sum(out$cis[,1] <= 0 & out$cis[,2] >= 0) ##HW2 B) b) w<-matrix(rnorm(10000),nrow=100,ncol=100) p<-100 dd <- sqrt(1:p) A <- diag(dd) x <- w%*%A diag(cov(x)) #pop cov(x) = diag(1,2,3,4,...,p) b <- 0*dd b[c(1,2)]<-c(1,1) #pop beta OLS = c(1,1,0,...,0)^T y <- x%*%b + rnorm(100) cov(x,y) #pop cov(x,y) = pop cov(x) b = (1,2,0,0,...,0)^T covxy <- b covxy[c(1,2)]<-c(1,2) out<-covxycis(x,y) cov <- 0 for(i in 1:p){ if(out$cis[i,1] <= covxy[i] & out$cis[i,2] >= covxy[i]) cov <- cov+1} cov #hit Enter ##HW2 B) c) b <- 0*dd+1 #beta = (1,...,1)^T y <- x%*%b + rnorm(100) #pop cov(x,y) = pop cov(x) b = (1,2,3,4,...,p)^T covxy <- 1:p out<-covxycis(x,y) cov <- 0 for(i in 1:p){ if(out$cis[i,1] <= covxy[i] & out$cis[i,2] >= covxy[i]) cov <- cov+1} cov #hit Enter ##HW3 A) a) MLRplot(cbrainx,cbrainy) #right click twice outf<-lsfit(cbrainx,cbrainy) ls.print(outf) ##HW3 A) b) x<-cbrainx[,c(1,4,5,7,9,10)] MLRplot(x,cbrainy) #right click twice out2<-lsfit(x,cbrainy) ls.print(out2) ##HW3 A) c) #i) xx <-cbrainx[,-c(1,4,5,7,9,10)] MLRplot(xx,cbrainy) #right click twice #iii) out3<-lsfit(xx,cbrainy) ls.print(out3) ##HW3 B) Problem 1.15 a) mldsim6(n=100,p=10,outliers=1,gam=0.25,pm=20, osteps=9) mldsim6(n=100,p=45,outliers=1,gam=0.25,pm=60, osteps=9) ##Problem 1.15 b) #takes a few minutes since p = 1000 #gam = 0.4 = default value mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=0) mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=9) ##Problem 1.15 c) mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=0) mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=9) ##HW5 5) #5a) predsim2(n=50,p=5,nv=19,xtype=2,dtype=1) #5b) predsim2(n=50,p=5,nv=19,xtype=2,dtype=2) #5c) predsim2(n=50,p=50,nv=19,xtype=2,dtype=1) #5d) predsim2(n=50,p=50,nv=19,xtype=2,dtype=2) #5e) predsim2(n=50,p=500,nv=19,xtype=2,dtype=1) #5f) predsim2(n=50,p=500,nv=19,xtype=2,dtype=2) ##HW7 D) read the top of the homework library(pls) tpls(cbrainx,cbrainy) ##HW7 E) read the top of the homework #E a) library(glmnet) mlrsplitsim(n=100,p=100,k=1,nruns=100,J=5,dtype=1,psi=0.0) #E b) mlrsplitsim(n=100,p=100,k=10,nruns=100,J=7,dtype=1,psi=0.0) ##HW8 do not forget the two source commands ##HW8D a) oplswls(n=100,p=4,etype=1,wtype=2) #right click Stop twice #D b) oplswls(n=100,p=4,etype=3,wtype=2) #right click stop twice #D c) oplswls(n=100,p=100,etype=1,wtype=2) #right click Stop #D d) oplswls(n=100,p=100,etype=3,wtype=2) #right click stop #HW8 E a) library(glmnet) prsplit(n=100,p=100,k=1,nruns=100,J=5,psi=0.0) #E b) prsplit(n=100,p=100,k=10,nruns=100,J=7,psi=0.1) ##HW9 E a) do not forget the two source commands hdhot1sim(n=100,p=100) #E b) hdhot1sim(n=100,p=100,delta=0.1) ##HW10 do not forget the two source commands #HW10 C a) hdhot2sim(n1=100,n2=100,p=100) #HW10 C b) hdhot2sim(n1=100,n2=100,p=100,delta=0.13) #HW10 D) leukemia_small <- read.csv("http://hastie.su.domains/CASI_files/DATA/leukemia_small.csv") golub1<-leukemia_small[,1:27] #data for leukemia ALL golub2<-leukemia_small[,28:38] #data for leukemia AML m<-3051 p<-rep(0,m) library(stats) #compute the pvalues with a 2 sample t test (pooled?) for(i in 1:m) p[i]<-t.test(golub1[i,],golub2[i,])$p.value k<-sum(sort(p)<=0.05*(1:m)/m) #khat=645=number of pvalues where Ho is rejected R<-(1:m)[p<=0.05*k/m] #rejection set #plot pvalues with an enlargement showing the 645 par(mfrow=c(1,2)) plot(1:m,sort(p),col=c(rep(2,k),rep(3,m-k)),type="p", pch=20,cex=0.7,xlab="index",ylab="pvalues") points(1:m,0.05*(1:m)/m,col=1,type="l",lwd=2) #1st plot plot(1:750,sort(p)[1:750],type="p",col=c(rep(2,k),rep(3,750-k)), pch=20,cex=0.7,xlab="index",ylab="pvalues") points(1:750,0.05*(1:750)/m,col=1,type="l",lwd=2) #2nd plot # #HW10 E) #Problem 4.10 HW7 C: Binary logistic regression with lasso #E) a) set.seed(1976) #Binary regression library(glmnet) n<-100 m<-1 #binary regression q <- 100 #100 nontrivial predictors, 95 inactive k <- 5 #k_S = 5 population active predictors y <- 1:n mv <- m + 0 * y vars <- 1:q beta <- 0 * 1:q beta[1:k] <- beta[1:k] + 1 beta alpha <- 0 x <- matrix(rnorm(n * q), nrow = n, ncol = q) SP <- alpha + x[,1:k] %*% beta[1:k] pv <- exp(SP)/(1 + exp(SP)) y <- rbinom(n,size=m,prob=pv) y out<-cv.glmnet(x,y,family="binomial") lam <- out$lambda.min bhat <- as.vector(predict(out,type="coefficients",s=lam)) ahat <- bhat[1] #alphahat bhat<-bhat[-1] vin <- vars[bhat!=0] #want 1-5, overfit # [1] 1 2 3 4 5 6 16 59 61 74 75 76 96 ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM tem <- glm(y~.,family="binomial",data=ind) tem$coef lrplot3(tem=tem,x=x[,vin]) #binary response plot #E) b) #now use MLR lasso outm<-cv.glmnet(x,y) lamm <- outm$lambda.min bm <- as.vector(predict(outm,type="coefficients",s=lamm)) am <- bm[1] #alphahat bm<-bm[-1] vm <- vars[bm!=0] #1 more variable than GLM lasso vm inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM tm <- glm(y~.,family="binomial",data=inm) lrplot3(tem=tm,x=x[,vm]) #binary response plot #E) c) library(leaps) out<-fsel(x,y) vin<-out$vin vin #severe underfit #[1] 4 inm <- as.data.frame(cbind(y,x[,vin])) tm <- glm(y~.,family="binomial",data=inm) lrplot3(tem=tm,x=x[,vin]) #binary response plot #HW10 F) Problem 4.11 lasso for Poisson regression #F) a) set.seed(1976) #Poisson regression library(glmnet) n<-100 q <- 100 #100 nontrivial predictors, 95 inactive k <- 5 #k_S = 5 population active predictors y <- 1:n mv <- m + 0 * y vars <- 1:q beta <- 0 * 1:q beta[1:k] <- beta[1:k] + 1 beta alpha <- 0 x <- matrix(rnorm(n * q), nrow = n, ncol = q) SP <- alpha + x[,1:k] %*% beta[1:k] y <- rpois(n,lambda=exp(SP)) out<-cv.glmnet(x,y,family="poisson") lam <- out$lambda.min bhat <- as.vector(predict(out,type="coefficients",s=lam)) ahat <- bhat[1] #alphahat bhat<-bhat[-1] vin <- vars[bhat!=0] #want 1-5, overfit vin ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM out <- glm(y~.,family="poisson",data=ind) ESP <- predict(out) prplot2(ESP,x=x[,vin],y) #response and OD plots #F) b) #now use MLR lasso outm<-cv.glmnet(x,y) lamm <- outm$lambda.min bm <- as.vector(predict(outm,type="coefficients",s=lamm)) am <- bm[1] #alphahat bm<-bm[-1] vm <- vars[bm!=0] vm #less overfit than GLM lasso inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM out <- glm(y~.,family="poisson",data=inm) ESP <- predict(out) prplot2(ESP,x=x[,vm],y) #response and OD plots #F) c) Now use MLR forward selection with EBIC since n < 10p. library(leaps) out<-fsel(x,y) vin<-out$vin vin #severe underfit causes poor fit and overdispersion #[1] 5 inm <- as.data.frame(cbind(y,x[,vin])) out <- glm(y~.,family="poisson",data=inm) ESP <- predict(out) prplot2(ESP,x=x[,vin],y) #response and OD plots ##HW11 do not forget the two source commands #A) Problem 5.15 Classification Tree library(rpart) fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis) plot(fit) text(fit) #Problem 5.16 HW11 Problem B) pottery data with two groups: Bagging, Random Forests, SVM #a) y <- pottery[pottery[,1]!=5,1] y <- (as.integer(y!=1)) + 1 y <- 2*(y-1.5) #levels -1 and 1 x <- pottery[pottery[,1]!=5,-1] shards <-data.frame(y=as.factor(y),x) set.seed(1) library(randomForest) #bagging: use mtry = number of predictors = 20 bagshards <- randomForest(y~.,data=shards,mtry=20,importance=TRUE) pred<-predict(bagshards) table(predict=pred,truth=shards$y) #b) random forests default uses mtry approx sqrt(p) rfshards <- randomForest(y~.,data=shards,importance=TRUE) pred<-predict(rfshards) table(predict=pred,truth=shards$y) #c) SVM with a fixed cost library(e1071) svmfit=svm(y~., data=shards, kernel="linear",cost=10,scale=FALSE) pred <- predict(svmfit) table(predict=pred,truth=shards$y) #d) SVM with cost chosen by 10 fold CV set.seed(1) #tuneout takes a minute tuneout <- tune(svm,y~.,data=shards,kernel="linear", ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100))) summary(tuneout) #10 fold CV on several models pred<-predict(tuneout$best.mod) table(predict=pred,truth=shards$y) ##Problem 5.17 HW11 Problem C) Gladstone data with Gender as response #a) bagging AER brainwt <- cbrainy y <- cbrainx[,10]; y[y==0] <- -1 y<-as.factor(y) #-1=F, 1 = M brainx<-cbind(cbrainx[,-10],brainwt) brainx<-brainx[,-c(2,4)] brainx<-brainx[-c(230,254:258),] #remove outliers y<-y[-c(230,254:258)] brain<-data.frame(y,brainx) set.seed(1) train=sample(1:nrow(brain),200) library(randomForest) #bagging: use mtry = number of predictors = 9 bagbrain <- randomForest(y~.,data=brain,mtry=9,importance=TRUE) pred<-predict(bagbrain) table(predict=pred,truth=brain$y) #get AER #b) bagging with a validation set error rate bagbrain <- randomForest(y~.,data=brain,subset=train,mtry=9,importance=TRUE) pred<-predict(bagbrain,newdata=brain[-train,]) table(predict=pred,truth=brain$y[-train]) #c) random forests default uses mtry approx sqrt(p) rfbrain <- randomForest(y~.,data=brain,importance=TRUE) pred<-predict(rfbrain) table(predict=pred,truth=brain$y) #d) random forests with a validation set error rate rfbrain <- randomForest(y~.,data=brain,subset=train,importance=TRUE) pred<-predict(rfbrain,newdata=brain[-train,]) table(predict=pred,truth=brain$y[-train]) #e) SVM with cost chosen by 10 fold CV library(e1071) set.seed(1) #tuneout takes a minute tuneout <- tune(svm,y~.,data=brain,kernel="linear", ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100))) summary(tuneout) #10 fold CV on several models pred<-predict(tuneout$best.mod) table(predict=pred,truth=brain$y) #f) SVM with cost chosen by 10 fold CV: validation set error rate set.seed(1) #tuneout takes a minute tuneout <- tune(svm,y~.,data=brain,subset=train,kernel="linear", ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100))) summary(tuneout) #10 fold CV on several models pred<-predict(tuneout$best.mod,newdata=brain[-train,]) table(predict=pred,truth=brain$y[-train])