#R < section4-code1.R > section4-code1output.txt &
#This is the unix command to run this code in the background and save the output in the file section4-code1output.txt

postscript("section4-code1plot.ps")
par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5)

set.seed(1111, kind = NULL)

repeat.times<-100   #number of simulation repetitions
q<-.1               #label noise level (the Bayes error of the true model)
iterations<-1000    #number of AdaBoost iterations
n<-400              #training sample size
d<-20               #number of predictors (only the first one matters)
m<-1000             #m is the size of the hold-out sample (i.e., the test sample)

misclass.store<-matrix(0,repeat.times,iterations)
misclass8.store<-matrix(0,repeat.times,iterations)
#Throughout the code, object names ending in "8" correspond to AdaBoost with the 8-node trees;
#those without "8" are for the stumps.
bayeserror.store<-rep(0,repeat.times)
nnerror.store<-rep(0,repeat.times)
rferror.store<-rep(0,repeat.times)

library(class)   #for knn
library(rpart)   #for the tree base learners

#You can also try using Random Forests for comparison by uncommenting the code provided later.
#It is commented out because Random Forests is not part of the basic R package and must be installed separately.
#I put mine in the directory "temp" using the following commands (from within R):
#options(CRAN = "http://cran.us.r-project.org/")
#install.packages("randomForest",lib="temp")
#library(randomForest, lib.loc="temp")

for (iui in 1:repeat.times){

  #Generate training and hold-out predictors, uniform on [0,1]^d
  x<-matrix(0,n,d)
  for (ddd in 1:d){
    x[,ddd]<-runif(n)
  }
  hold.out.x<-matrix(0,m,d)
  for (k in 1:d){
    hold.out.x[,k]<-runif(m)
  }

  #mod2(int) returns int modulo 2 for an integer argument
  mod2<-function(int){
    if (round(int)==int) return(1-1*(round(int/2,0)==int/2))
    else return("error: this function only works on integers")
  }

  #Labels depend on x1 only: P(y=1|x) = q + (1-2q)*1{floor(10*x1) is odd}
  y<-rep(0,n)
  for (i in 1:n){
    y[i]<-1*(runif(1)<(q+(1-2*q)*mod2(floor(10*x[i,1]))))
  }
  hold.out.y<-rep(0,m)
  for (i in 1:m){
    hold.out.y[i]<-1*(runif(1)<(q+(1-2*q)*mod2(floor(10*hold.out.x[i,1]))))
  }

  #Error for the Bayes rule, which predicts mod2(floor(10*x1)) and errs exactly
  #when the label was flipped, so its error is q=.1 in expectation
  bayespred<-rep(0,m)
  for (i in 1:m){
    bayespred[i]<-mod2(floor(10*hold.out.x[i,1]))
  }
  bayeserror<-1-sum(bayespred==hold.out.y)/length(bayespred)
  bayeserror.store[iui]<-bayeserror

  #Nearest neighbor (k=1) test error
  fit<-knn(x,hold.out.x,y,k=1,prob=T)
  ppp<-attributes(fit)$prob
  #flip the winning-class vote proportion so ppp is the proportion of votes for class 1;
  #with k=1 this is exactly the predicted class as a 0/1 value
  ppp[as.numeric(fit)==1]<-1-ppp[as.numeric(fit)==1]
  nnerror<-1-sum(ppp==hold.out.y)/length(ppp)
  nnerror.store[iui]<-nnerror

  #AdaBoost: recode the labels to -1/+1
  y<-(-1)+2*y
  hold.out.y<-(-1)+2*hold.out.y
  f<-rep(0,n)       #boosted score on the training sample
  f8<-rep(0,n)
  hold.out.f<-0     #boosted score on the hold-out sample
  hold.out.f8<-0
  i<-1
  te<-rep(0,iterations)         #training errors
  te8<-rep(0,iterations)
  misclass<-rep(0,iterations)   #hold-out errors
  misclass8<-rep(0,iterations)

  while(i<=iterations){
    #Reweight the training data by the exponential loss of the current fit
    w<-exp(-y*f)
    w8<-exp(-y*f8)
    w<-w/sum(w)
    w8<-w8/sum(w8)
    #Fit the weighted base learners: maxdepth=1 gives stumps, maxdepth=3 gives 8-node trees
    treee<-rpart(y~.,data.frame(x),w,method="class",
      control=rpart.control(minsplit=0,minbucket=0,cp=-1,maxcompete=0,
        maxsurrogate=0,usesurrogate=0,xval=0,maxdepth=1))
    treee8<-rpart(y~.,data.frame(x),w8,method="class",
      control=rpart.control(minsplit=0,minbucket=0,cp=-1,maxcompete=0,
        maxsurrogate=0,usesurrogate=0,xval=0,maxdepth=3))
    #Convert the class-probability predictions to -1/+1 classifications
    gmin<-(-1)+2*(predict(treee,data.frame(x),type="prob")[,1]<
      predict(treee,data.frame(x),type="prob")[,2])
    gmin8<-(-1)+2*(predict(treee8,data.frame(x),type="prob")[,1]<
      predict(treee8,data.frame(x),type="prob")[,2])
    hold.out.gmin<-(-1)+2*(predict(treee,data.frame(hold.out.x),type="prob")[,1]<
      predict(treee,data.frame(hold.out.x),type="prob")[,2])
    hold.out.gmin8<-(-1)+2*(predict(treee8,data.frame(hold.out.x),type="prob")[,1]<
      predict(treee8,data.frame(hold.out.x),type="prob")[,2])
    #Weighted training error of the base learner and the usual AdaBoost weight alpha
    e<-sum(w*(1-(y==gmin)))
    e8<-sum(w8*(1-(y==gmin8)))
    alpha<-.5*log((1-e)/e)
    alpha8<-.5*log((1-e8)/e8)
    hold.out.f<-hold.out.f+alpha*hold.out.gmin
    hold.out.f8<-hold.out.f8+alpha8*hold.out.gmin8
    f<-f+alpha*gmin
    f8<-f8+alpha8*gmin8
    te[i]<-sum(f*y<0)/n
    te8[i]<-sum(f8*y<0)/n
    misclass[i]<-sum(hold.out.f*hold.out.y<0)/length(hold.out.f)
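    #te/misclass above (and te8/misclass8 just below) track the training and
    #hold-out error of sign(f). As an optional diagnostic (an addition, not part
    #of the original script), one could also monitor the hold-out exponential
    #loss that AdaBoost minimizes on the training side; uncomment to print it:
    #print(mean(exp(-hold.out.y*hold.out.f)))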
    misclass8[i]<-sum(hold.out.f8*hold.out.y<0)/length(hold.out.f8)
    print(c(i,te[i],misclass[i],te8[i],misclass8[i]))
    i<-i+1
  }

  #Random Forests (you need to install the package as described above if you want to use this)
  #y<-as.factor(y)
  #rf<-randomForest(x, y)
  #classest<-predict(rf,hold.out.x)
  #rferror<-(1-sum((classest==hold.out.y)/length(hold.out.y)))
  #rferror.store[iui]<-rferror

  print("bayes error")
  print(bayeserror)
  print("nn error")
  print(nnerror)
  #print("random Forest error")
  #print(rferror)

  misclass.store[iui,]<-misclass
  misclass8.store[iui,]<-misclass8
}

#Plot the hold-out error averaged over the repetitions: stumps in black, 8-node trees in red
mean.misclass<-apply(misclass.store,2,mean)
mean.misclass8<-apply(misclass8.store,2,mean)
plot(seq(1,iterations),mean.misclass,ylim=c(.18,.30),xlab="AdaBoost Iterations",
  ylab="Misclassification Error",type="n",main="")
lines(seq(1,iterations),mean.misclass)
lines(seq(1,iterations),mean.misclass8,col="red")
dev.off()

print("###############################################################")
print("Nearest Neighbor Mean and Standard Deviation")
print(mean(nnerror.store))
print(sqrt(var(nnerror.store)))
#print("Random Forests Mean and Standard Deviation")
#print(mean(rferror.store))
#print(sqrt(var(rferror.store)))
print("Bayes Error Mean and Standard Deviation")
print(mean(bayeserror.store))
print(sqrt(var(bayeserror.store)))
print("Stumps Final Mean and Standard Deviation")
print(mean(misclass.store[,iterations]))
print(sqrt(var(misclass.store[,iterations])))
print("8-Node Trees Final Mean and Standard Deviation")
print(mean(misclass8.store[,iterations]))
print(sqrt(var(misclass8.store[,iterations])))
print("Mean and Standard Deviation of Difference")
print(mean(misclass.store[,iterations]-misclass8.store[,iterations]))
print(sqrt(var(misclass.store[,iterations]-misclass8.store[,iterations])))
print("Proportion of Positive Differences")
print(sum(1*(misclass.store[,iterations]-misclass8.store[,iterations])>0)/repeat.times)
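#A possible follow-up (an addition, not part of the original script): the
#positive-difference proportion above is essentially a sign test of stumps vs.
#8-node trees, so a paired t-test on the same 100 final differences gives a
#rough significance check. t.test() is in base R; uncomment to run it:
#print(t.test(misclass.store[,iterations],misclass8.store[,iterations],paired=TRUE))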