*R code  for high dimensinal statistics homework problems;

Copy and paste code one problem part at a time.


The easiest way to get hdpack.txt and hddata.txt is to copy and paste the following 
two commands into R.

source("http://parker.ad.siu.edu/Olive/hdpack.txt")
source("http://parker.ad.siu.edu/Olive/hddata.txt")


#The following commands are often used.

library(glmnet)
library(pls)
library(leaps)


#If hdpack.txt and hddata.txt are on J drive, get lregpack and lregdata with
#the following commands:

#source("J:/hdpack.txt") 
#source("J:/hddata.txt")

You will often need packages and may need to install packages
the first time you use a given computer.

install.packages("glmnet")
install.packages("pls")
install.packages("leaps")

install.packages("e1071")
install.packages("randomForest")
install.packages("ISLR")


#install.packages("gbm")
#install.packages("tree")
#install.packages("ROCR")

#These commands can fail if your computer has a firewall.
#Then an administrator may be needed to install the packages.



##HW1 E) Problem 1.10 a) 

n<-100000; q<-7
b <- 0 * 1:q + 1
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
y <- 1 + x %*% b + rnorm(n)
out<-lsfit(x,y)
res<-out$res
yhat<-y-res
dd<-length(out$coef)
AERplot(yhat,y,res=res,d=dd,alph=1) #usual response plot

# 1.10 b) 

AERplot(yhat,y,res=res,d=dd,alph=0.01)
#plots data outside the 99% pointwise PIs



##HW2 A) a) 

x<-trees[,-3]
vol<-trees[,3]
zw<-cbind(x,vol)
#pairs(zw)
OPLSplot(x,vol) #right click Stop on the 1st plot and then the second plot

##HW2 A) b) 

OPLSEEplot(x,vol) #right click Stop on the plot

##HW2 A) c) 

rcovxy(x,vol)


##HW2 B) a)

x<-matrix(rnorm(10000),nrow=100,ncol=100)
y <- rnorm(100)
out<-covxycis(x,y)
plot(out$cis[,1],ylim=c(-0.5,0.5))
points(out$cis[,2],pch=3)
sum(out$cis[,1] <= 0 & out$cis[,2] >= 0)


##HW2 B) b)


w<-matrix(rnorm(10000),nrow=100,ncol=100)
p<-100
dd <- sqrt(1:p)
A <- diag(dd)
x <- w%*%A
diag(cov(x)) #pop cov(x) = diag(1,2,3,4,...,p)
b <- 0*dd
b[c(1,2)]<-c(1,1)   #pop beta OLS = c(1,1,0,...,0)^T
y <- x%*%b + rnorm(100)
cov(x,y) #pop cov(x,y) = pop cov(x) b = (1,2,0,0,...,0)^T
covxy <- b
covxy[c(1,2)]<-c(1,2)
out<-covxycis(x,y)
cov <- 0
for(i in 1:p){
if(out$cis[i,1] <= covxy[i] & out$cis[i,2] >= covxy[i])
cov <- cov+1}
cov  #hit Enter


##HW2 B) c)

b <- 0*dd+1 #beta = (1,...,1)^T
y <- x%*%b + rnorm(100)
#pop cov(x,y) = pop cov(x) b = (1,2,3,4,...,p)^T
covxy <- 1:p
out<-covxycis(x,y)
cov <- 0
for(i in 1:p){
if(out$cis[i,1] <= covxy[i] & out$cis[i,2] >= covxy[i])
cov <- cov+1}
cov #hit Enter

##HW3 A)  a)

MLRplot(cbrainx,cbrainy)  #right click twice
outf<-lsfit(cbrainx,cbrainy)
ls.print(outf)

##HW3 A)  b)

x<-cbrainx[,c(1,4,5,7,9,10)]
MLRplot(x,cbrainy) #right click twice
out2<-lsfit(x,cbrainy)
ls.print(out2)

##HW3 A)  c)
#i) 
xx <-cbrainx[,-c(1,4,5,7,9,10)]
MLRplot(xx,cbrainy) #right click twice

#iii)
out3<-lsfit(xx,cbrainy)
ls.print(out3)


##HW3 B) Problem 1.15 a) 

mldsim6(n=100,p=10,outliers=1,gam=0.25,pm=20, osteps=9)

mldsim6(n=100,p=45,outliers=1,gam=0.25,pm=60, osteps=9)


##Problem 1.15 b) #takes a few minutes since p = 1000
#gam = 0.4 = default value

mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=0)

mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=9)


##Problem 1.15 c)

mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=0)

mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=9)


##HW5 5)

#5a) 

predsim2(n=50,p=5,nv=19,xtype=2,dtype=1)

#5b) 

predsim2(n=50,p=5,nv=19,xtype=2,dtype=2)

#5c)

predsim2(n=50,p=50,nv=19,xtype=2,dtype=1)

#5d)

predsim2(n=50,p=50,nv=19,xtype=2,dtype=2)

#5e)

predsim2(n=50,p=500,nv=19,xtype=2,dtype=1)

#5f)

predsim2(n=50,p=500,nv=19,xtype=2,dtype=2)



##HW7 D)  read the top of the homework 

library(pls)
tpls(cbrainx,cbrainy)


##HW7 E)  read the top of the homework 

#E a)

library(glmnet)
mlrsplitsim(n=100,p=100,k=1,nruns=100,J=5,dtype=1,psi=0.0)

#E b) 

mlrsplitsim(n=100,p=100,k=10,nruns=100,J=7,dtype=1,psi=0.0)




##HW8 do not forget the two source commands

##HW8D a)

oplswls(n=100,p=4,etype=1,wtype=2) #right click Stop twice


#D b)     

oplswls(n=100,p=4,etype=3,wtype=2)  #right click stop twice


#D c)

oplswls(n=100,p=100,etype=1,wtype=2) #right click Stop 


#D d)     

oplswls(n=100,p=100,etype=3,wtype=2)  #right click stop 


#HW8 E a)

library(glmnet)
prsplit(n=100,p=100,k=1,nruns=100,J=5,psi=0.0)

#E b) 

prsplit(n=100,p=100,k=10,nruns=100,J=7,psi=0.1)



##HW9 E a) do not forget the two source commands

 hdhot1sim(n=100,p=100)

#E b)

hdhot1sim(n=100,p=100,delta=0.1)



##HW10 do not forget the two source commands

#HW10 C a) 

hdhot2sim(n1=100,n2=100,p=100)


#HW10 C b) 

hdhot2sim(n1=100,n2=100,p=100,delta=0.13)


#HW10 D)

leukemia_small <- 
read.csv("http://hastie.su.domains/CASI_files/DATA/leukemia_small.csv")

golub1<-leukemia_small[,1:27] #data for leukemia ALL
golub2<-leukemia_small[,28:38] #data for leukemia AML
m<-3051
p<-rep(0,m)
library(stats)
#compute the pvalues with a 2 sample t test (pooled?)
for(i in 1:m) p[i]<-t.test(golub1[i,],golub2[i,])$p.value
k<-sum(sort(p)<=0.05*(1:m)/m)  #khat=645=number of pvalues where Ho is rejected
R<-(1:m)[p<=0.05*k/m]   #rejection set
#plot pvalues with an enlargement showing the 645
par(mfrow=c(1,2))
plot(1:m,sort(p),col=c(rep(2,k),rep(3,m-k)),type="p",
pch=20,cex=0.7,xlab="index",ylab="pvalues")
points(1:m,0.05*(1:m)/m,col=1,type="l",lwd=2) #1st plot
plot(1:750,sort(p)[1:750],type="p",col=c(rep(2,k),rep(3,750-k)),
pch=20,cex=0.7,xlab="index",ylab="pvalues")
points(1:750,0.05*(1:750)/m,col=1,type="l",lwd=2) #2nd plot
#

#HW10 E) 

#Problem 4.10 HW7 C: Binary logistic regression with lasso

#E) a)

set.seed(1976)  #Binary regression 
library(glmnet)
n<-100
m<-1 #binary regression
q <- 100 #100 nontrivial predictors, 95 inactive
k <- 5 #k_S = 5 population active predictors
y <- 1:n
mv <- m + 0 * y
vars <- 1:q
beta <- 0 * 1:q
beta[1:k] <- beta[1:k] + 1
beta
alpha <- 0
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x[,1:k] %*% beta[1:k]
pv <- exp(SP)/(1 + exp(SP))
y <- rbinom(n,size=m,prob=pv)
y
out<-cv.glmnet(x,y,family="binomial")
lam <- out$lambda.min
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0] #want 1-5, overfit
# [1]  1  2  3  4  5  6 16 59 61 74 75 76 96
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
tem <- glm(y~.,family="binomial",data=ind)
tem$coef
lrplot3(tem=tem,x=x[,vin]) #binary response plot

#E) b)

#now use MLR lasso
outm<-cv.glmnet(x,y)
lamm <- outm$lambda.min
bm <- as.vector(predict(outm,type="coefficients",s=lamm))
am <- bm[1] #alphahat
bm<-bm[-1]
vm <- vars[bm!=0] #1 more variable than GLM lasso
vm
inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM
tm <- glm(y~.,family="binomial",data=inm)
lrplot3(tem=tm,x=x[,vm]) #binary response plot

#E) c)

library(leaps)
out<-fsel(x,y)
vin<-out$vin
vin  #severe underfit
#[1] 4
inm <- as.data.frame(cbind(y,x[,vin])) 
tm <- glm(y~.,family="binomial",data=inm)
lrplot3(tem=tm,x=x[,vin]) #binary response plot


#HW10 F) Problem 4.11 lasso for Poisson regression

#F) a)

set.seed(1976)  #Poisson regression 
library(glmnet)
n<-100
q <- 100 #100 nontrivial predictors, 95 inactive
k <- 5 #k_S = 5 population active predictors
y <- 1:n
mv <- m + 0 * y
vars <- 1:q
beta <- 0 * 1:q
beta[1:k] <- beta[1:k] + 1
beta
alpha <- 0
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x[,1:k] %*% beta[1:k]
y <- rpois(n,lambda=exp(SP))
out<-cv.glmnet(x,y,family="poisson")
lam <- out$lambda.min
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0] #want 1-5, overfit
vin
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
out <- glm(y~.,family="poisson",data=ind)
ESP <- predict(out)
prplot2(ESP,x=x[,vin],y) #response and OD plots

#F) b)

#now use MLR lasso
outm<-cv.glmnet(x,y)
lamm <- outm$lambda.min
bm <- as.vector(predict(outm,type="coefficients",s=lamm))
am <- bm[1] #alphahat
bm<-bm[-1]
vm <- vars[bm!=0] 
vm #less overfit than GLM lasso
inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM
out <- glm(y~.,family="poisson",data=inm)
ESP <- predict(out)
prplot2(ESP,x=x[,vm],y) #response and OD plots

#F) c) Now use MLR forward selection with EBIC since n < 10p.

library(leaps)
out<-fsel(x,y)
vin<-out$vin
vin  #severe underfit causes poor fit and overdispersion
#[1] 5
inm <- as.data.frame(cbind(y,x[,vin]))
out <- glm(y~.,family="poisson",data=inm)
ESP <- predict(out)
prplot2(ESP,x=x[,vin],y) #response and OD plots


##HW11 do not forget the two source commands

#A) Problem 5.15  Classification  Tree

library(rpart)

fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis)
plot(fit)
text(fit)




#Problem 5.16 HW11 Problem B) pottery data with two groups: Bagging, Random Forests, SVM

#a) 

y <- pottery[pottery[,1]!=5,1]
y <- (as.integer(y!=1)) + 1
y <- 2*(y-1.5) #levels -1 and 1
x <- pottery[pottery[,1]!=5,-1]
shards <-data.frame(y=as.factor(y),x)

set.seed(1)
library(randomForest)

#bagging: use mtry = number of predictors = 20
bagshards <- randomForest(y~.,data=shards,mtry=20,importance=TRUE)
pred<-predict(bagshards)
table(predict=pred,truth=shards$y) 

#b) random forests default uses mtry approx sqrt(p)
rfshards <- randomForest(y~.,data=shards,importance=TRUE)
pred<-predict(rfshards)
table(predict=pred,truth=shards$y) 

#c) SVM with a fixed cost
library(e1071)
svmfit=svm(y~., data=shards, kernel="linear",cost=10,scale=FALSE)
pred <- predict(svmfit)
table(predict=pred,truth=shards$y) 

#d) SVM with cost chosen by 10 fold CV
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=shards,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10 fold CV on several models
pred<-predict(tuneout$best.mod)
table(predict=pred,truth=shards$y) 


##Problem 5.17 HW11 Problem C) Gladstone data with Gender as response

#a) bagging AER

brainwt <- cbrainy
y <- cbrainx[,10]; y[y==0] <- -1 
y<-as.factor(y) #-1=F, 1 = M
brainx<-cbind(cbrainx[,-10],brainwt)
brainx<-brainx[,-c(2,4)]
brainx<-brainx[-c(230,254:258),] #remove outliers
y<-y[-c(230,254:258)]
brain<-data.frame(y,brainx)
set.seed(1)
train=sample(1:nrow(brain),200)

library(randomForest)

#bagging: use mtry = number of predictors = 9
bagbrain <- randomForest(y~.,data=brain,mtry=9,importance=TRUE)
pred<-predict(bagbrain)
table(predict=pred,truth=brain$y) #get AER

#b) bagging  with a validation set error rate

bagbrain <- randomForest(y~.,data=brain,subset=train,mtry=9,importance=TRUE)
pred<-predict(bagbrain,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train]) 

#c) random forests default uses mtry approx sqrt(p)
rfbrain <- randomForest(y~.,data=brain,importance=TRUE)
pred<-predict(rfbrain)
table(predict=pred,truth=brain$y) 

#d) random forests with a validation set error rate

rfbrain <- randomForest(y~.,data=brain,subset=train,importance=TRUE)
pred<-predict(rfbrain,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train]) 

#e) SVM with  cost chosen by 10 fold CV
library(e1071)
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=brain,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10 fold CV on several models
pred<-predict(tuneout$best.mod)
table(predict=pred,truth=brain$y) 


#f) SVM with cost chosen by 10 fold CV: validation set error rate
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=brain,subset=train,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10 fold CV on several models
pred<-predict(tuneout$best.mod,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])