#R < code3.txt > code3output.txt &
#This is the unix command to run this code in the background and save the output in the file code3output.txt

postscript("code3plot.ps")
par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5)
set.seed(3333, kind = NULL)

repeat.times<-100
q<-.1
J<-5
iterations<-1000
n<-200
d<-20
m<-1000
#m is the size of the hold-out sample (i.e., the test sample)

misclassAB.store<-matrix(0,repeat.times,iterations)
misclassLB.store<-matrix(0,repeat.times,iterations)
#throughout the code object names ending in "AB" correspond to AdaBoost
#names ending in "LB" correspond to LogitBoost
bayeserror.store<-rep(0,repeat.times)
nnerror.store<-rep(0,repeat.times)
rferror.store<-rep(0,repeat.times)

library(class)
library(rpart)

#You can also try using Random Forests for comparison by uncommenting the code provided later
#I commented it out because Random Forests is not part of the basic R package and must be installed separately
#I put mine in the directory "temp" using the following commands (from within R)
#options(CRAN = "http://cran.us.r-project.org/")
#install.packages("randomForest",lib="temp")
#library(randomForest, lib.loc="temp")
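#Each replication below simulates n=200 training and m=1000 hold-out observations with d=20
#independent Uniform(0,1) features; the label is drawn from
#   P(Y = 1 | x) = q + (1 - 2*q) * 1{ x[1] + ... + x[J] > J/2 }
#with q = 0.1 and J = 5, so only the first J features carry signal.
#The Bayes rule is the indicator 1{ x[1] + ... + x[J] > J/2 } and its error rate is exactly q = 0.1;
#"bayeserror" below is the hold-out estimate of that quantity.
#Optional sanity check of the q = 0.1 Bayes error on a fresh simulated sample
#(the object names xx.chk and yy.chk are illustrative only and are not used elsewhere; uncomment to run):
#xx.chk<-matrix(runif(100000*d),100000,d)
#yy.chk<-1*(runif(100000)<(q+(1-2*q)*1*(rowSums(xx.chk[,1:J])>(J/2))))
#mean(yy.chk!=1*(rowSums(xx.chk[,1:J])>(J/2)))   #should be close to 0.1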
for (iui in 1:repeat.times){

  x<-matrix(0,n,d)
  for (ddd in 1:d){
    x[,ddd]<-runif(n)
  }
  hold.out.x<-matrix(0,m,d)
  for (k in 1:d) {
    hold.out.x[,k]<-runif(m)
  }
  y<-rep(0,n)
  for (i in 1:n){
    y[i]<-1*(runif(1)<(q+(1-2*q)*1*(sum((x[i,1:J]))>(J/2))))
  }
  hold.out.y<-rep(0,m)
  for (i in 1:(m)){
    hold.out.y[i]<-1*(runif(1)<(q+(1-2*q)*1*(sum((hold.out.x[i,1:J]))>(J/2))))
  }

  #error for bayes rule
  bayespred<-rep(0,m)
  for (i in 1:(m)){
    bayespred[i]<-1*(sum((hold.out.x[i,1:J]))>(J/2))
  }
  bayeserror<-1-sum((as.numeric(bayespred)==hold.out.y))/length(bayespred)
  bayeserror.store[iui]<-bayeserror

  #nearest neighbor
  fit<-knn(x,hold.out.x,y,k=1,prob=TRUE)
  ppp<-attributes(fit)$prob
  ppp[as.numeric(fit)==1]<-1-ppp[as.numeric(fit)==1]
  nnerror<-(1-sum((ppp==hold.out.y)/length(ppp)))
  nnerror.store[iui]<-nnerror

  #AdaBoost
  y<-(-1)+2*y
  hold.out.y<-(-1)+2*hold.out.y
  fAB<-rep(0,n)
  hold.out.fAB<-0
  i<-1
  teAB<-rep(0,iterations)
  misclassAB<-rep(0,iterations)
  while(i<=iterations){
    wAB<-exp(-y*fAB)
    wAB<-wAB/sum(wAB)
    treeeAB<-rpart(y~.,data.frame(x),wAB,method="class",
      control=rpart.control(minsplit=0,minbucket=0,cp=-1,maxcompete=0,
        maxsurrogate=0,usesurrogate=0,xval=0,maxdepth=3))
    gminAB<-(-1)+2*(predict(treeeAB,data.frame(x),type="prob")[,1]<
      predict(treeeAB,data.frame(x),type="prob")[,2])
    hold.out.gminAB<-(-1)+2*(predict(treeeAB,data.frame(hold.out.x),type="prob")[,1]<
      predict(treeeAB,data.frame(hold.out.x),type="prob")[,2])
    eAB<-sum(wAB*(1-(y==gminAB)))
    alphaAB<-.5*log((1-eAB)/eAB)
    hold.out.fAB<-hold.out.fAB+alphaAB*hold.out.gminAB
    fAB<-fAB+alphaAB*gminAB
    teAB[i]<-sum(1*fAB*y<0)/n                                             #training error
    misclassAB[i]<-sum(1*hold.out.fAB*hold.out.y<0)/length(hold.out.fAB)  #hold-out error
    print(c(i,teAB[i],misclassAB[i]))
    i<-i+1
  }

  #Random Forests (you need to install the package as described above if you want to use this)
  #y<-as.factor(y)
  #rf<-randomForest(x, y)
  #classest<-predict(rf,hold.out.x)
  #rferror<-(1-sum((classest==hold.out.y)/length(hold.out.y)))
  #rferror.store[iui]<-rferror

  #LogitBoost
  y<-as.factor(y)

  #Marcel Dettling's and Peter Buhlmann's code (http://stat.ethz.ch/~dettling/boosting.html)
  #with two small modifications to use 8-node trees instead of stumps
  #(the score() and crossval() helpers from their original code are only needed when
  #presel > 0 or estimate > 0, which are not used here)
  logitboost<-function (xlearn, ylearn, xtest, mfinal, presel = 0, estimate = 0, verbose = FALSE)
  {
    if (nlevels(as.factor(ylearn)) == 2) {
      if (presel > 0) {
        s <- apply(xlearn, 2, score, ylearn)
        quality <- apply(rbind(s, -s + (sum(ylearn == 0) * sum(ylearn == 1))), 2, max)
        genes <- rev(order(quality))[1:presel]
        xlearn <- xlearn[, genes]
        xtest <- xtest[, genes, drop = FALSE]
      }
      if (estimate > 0) {
        if (verbose) {
          print("Stopping Parameter Estimation")
        }
        likeli <- numeric(mfinal)
        probabs <- crossval(xlearn, ylearn, estimate, mfinal)$probs
        for (k in 1:mfinal) {
          a <- pmax(log(probabs[, k]), -1e+36)
          b <- pmax(log(1 - probabs[, k]), -1e+36)
          likeli[k] <- (ylearn %*% a) + ((1 - ylearn) %*% b)
        }
      }
      learn <- dim(xlearn)[1]
      test <- dim(xtest)[1]
      Flearn <- numeric(learn)
      Ftest <- numeric(test)
      flearn <- numeric(learn)
      ftest <- numeric(test)
      z <- numeric(learn)
      w <- numeric(learn)
      plearn <- rep(1/2, learn)
      ptest <- matrix(0, test, mfinal)
      if (verbose) {
        print("Boosting Iterations")
      }
      for (m in 1:mfinal) {
        w <- pmax(plearn * (1 - plearn), 1e-24)
        z <- (ylearn - plearn)/w
        cntrl <- rpart.control(maxdepth = 3, minsplit = 0, maxcompete = 0,
          maxsurrogate = 0, cp = 0, xval = 0)
        xx <- xlearn
        fit <- rpart(z ~ xx, weights = w/mean(w), control = cntrl)
        #print(fit)
        flearn <- predict(fit)
        xx <- xtest
        ftest <- predict(fit, newdata = data.frame(xx))
        Flearn <- Flearn + (1/2) * flearn
        Ftest <- Ftest + (1/2) * ftest
        plearn <- 1/(1 + exp((-2) * Flearn))
        ptest[, m] <- 1/(1 + exp((-2) * Ftest))
      }
      output <- list(probs = ptest)
      if (estimate > 0) {
        output <- list(probs = ptest, loglikeli = matrix(likeli, nrow = test,
          ncol = mfinal, byrow = TRUE))
      }
    }
    output
  }
  #end of Marcel Dettling's and Peter Buhlmann's code

  y<-as.numeric(y)-1
  logit.fit<-logitboost(x, y, hold.out.x, (iterations), presel = 0, estimate = 0, verbose = FALSE)$probs
  logit.fit.class<-(-1)+2*(logit.fit>.5)
  for (yyy in 1:iterations){
    misclassLB.store[iui,yyy]<-sum(logit.fit.class[,yyy]!=hold.out.y)/m
  }

  print("bayes error")
  print(bayeserror)
  print("nn error")
  print(nnerror)
  #print("random Forest error")
  #print(rferror)
  misclassAB.store[iui,]<-misclassAB
}

mean.misclassAB<-apply(misclassAB.store,2,mean)
mean.misclassLB<-apply(misclassLB.store,2,mean)
plot(seq(1,iterations),mean.misclassAB,ylim=c(.22,.38),xlab="Iterations",
  ylab="Misclassification Error",type="n",main="")
lines(seq(1,iterations),mean.misclassAB,col="red")
lines(seq(1,iterations),mean.misclassLB,col="blue")
dev.off()

print("###############################################################")
print("Nearest Neighbor Mean and Standard Deviation")
print(mean(nnerror.store))
print(sqrt(var(nnerror.store)))
#print("Random Forests Mean and Standard Deviation")
#print(mean(rferror.store))
#print(sqrt(var(rferror.store)))
print("Bayes Error Mean and Standard Deviation")
print(mean(bayeserror.store))
print(sqrt(var(bayeserror.store)))
print("LogitBoost Final Mean and Standard Deviation")
print(mean(misclassLB.store[,iterations]))
print(sqrt(var(misclassLB.store[,iterations])))
print("AdaBoost Final Mean and Standard Deviation")
print(mean(misclassAB.store[,iterations]))
print(sqrt(var(misclassAB.store[,iterations])))
print("Mean and Standard Deviation of Difference")
print(mean(misclassLB.store[,iterations]-misclassAB.store[,iterations]))
print(sqrt(var(misclassLB.store[,iterations]-misclassAB.store[,iterations])))
print("Proportion of Positive Differences")
print(sum(1*(misclassLB.store[,iterations]-misclassAB.store[,iterations])>0)/repeat.times)
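#Optional illustration (not part of the experiment above): a minimal standalone call to the
#logitboost() function defined earlier. It assumes 0/1 class labels and returns a matrix of
#class-1 probabilities with one row per test observation and one column per boosting iteration.
#The object names xl, yl, xt, pr and the settings (50/20 observations, 5 features, mfinal=25)
#are illustrative only; uncomment to run.
#xl<-matrix(runif(50*5),50,5)
#yl<-1*(xl[,1]>.5)
#xt<-matrix(runif(20*5),20,5)
#pr<-logitboost(xl, yl, xt, mfinal=25)$probs
#pred<-1*(pr[,25]>.5)   #0/1 class predictions from the 25th boosting iteration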