#R < section4-code9.R > section4-code9output.txt &
#This is the unix command to run this code in the background and save the
#output in the file section4-code9output.txt

set.seed(9999, kind = NULL)
repeat.times<-1      #number of simulation repetitions
q<-.1                #label noise level (the Bayes error rate of the true model)
iterations<-1000     #number of boosting iterations
n<-400               #training sample size
misclass.store<-matrix(0,repeat.times,iterations)
bayeserror.store<-rep(0,repeat.times)
nnerror.store<-rep(0,repeat.times)
rferror.store<-rep(0,repeat.times)

library(class)
library(rpart)

#You can also try Random Forests for comparison by uncommenting the code
#provided later.  I commented it out because randomForest is not part of the
#basic R installation and must be installed separately.  I put mine in the
#directory "temp" using the following commands (from within R):
#options(CRAN = "http://cran.us.r-project.org/")
#install.packages("randomForest",lib="temp")
#library(randomForest, lib.loc="temp")

for (iui in 1:repeat.times){

#Use a Latin hypercube design for x (the training data) for plotting purposes
x<-matrix(0,n,2)
for (dd in 1:2){
  x[,dd]<-seq(1,n)/n-.5/n
  o<-order(runif(n))     #random permutation of the grid in each coordinate
  x[,dd]<-x[o,dd]
}

#mod2(int) returns int modulo 2 (1 if int is odd, 0 if int is even)
mod2<-function(int){
  if (round(int)==int) return(1-1*(round(int/2,0)==int/2))
  else return("error: this function only works on integers")
}

#Generate the labels: P(y=1|x) alternates between 1-q and q on vertical bands
#of width .1, so the Bayes rule depends only on the first coordinate
y<-rep(0,n)
for (i in 1:n){
  y[i]<-1*(runif(1)<(q+(1-2*q)*mod2(floor(10*x[i,1]))))
}

#Use a stratified design of size (n/2)^2 for the hold-out x for plotting purposes
hold.out.x<-matrix(0,(n/2)^2,2)
hold.out.x[,1]<-rep((seq(1,n/2)/(n/2)-.5/(n/2)),n/2)
hold.out.x[,2]<-rep((seq(1,n/2)/(n/2)-.5/(n/2)),each=n/2)

#We label all the points in the hold-out sample according to the Bayes rule,
#so the misclassification error below measures the percentage of points
#classified differently from the Bayes rule
hold.out.y<-rep(0,(n/2)^2)
for (i in 1:((n/2)^2)){
  hold.out.y[i]<-1*mod2(floor(10*hold.out.x[i,1]))
}

#Error for the Bayes rule (zero by construction, since the hold-out labels
#are the Bayes rule itself)
bayespred<-rep(0,(n/2)^2)
for (i in 1:((n/2)^2)){
  bayespred[i]<-1*mod2(floor(10*hold.out.x[i,1]))
}
bayeserror<-1-sum(as.numeric(bayespred)==hold.out.y)/length(bayespred)
bayeserror.store[iui]<-bayeserror

#One nearest neighbor
fit<-knn(x,hold.out.x,y,k=1,prob=T)
ppp<-attributes(fit)$prob
ppp[as.numeric(fit)==1]<-1-ppp[as.numeric(fit)==1]  #ppp is now the predicted class (0 or 1)
nnerror<-1-sum(ppp==hold.out.y)/length(ppp)
nnerror.store[iui]<-nnerror

#AdaBoost with trees of maxdepth 3 (up to 8 terminal nodes) as the base learner
y<-(-1)+2*y                  #recode the labels from {0,1} to {-1,+1}
hold.out.y<-(-1)+2*hold.out.y
f<-rep(0,n)
hold.out.f<-0
i<-1
te<-rep(0,iterations)
misclass<-rep(0,iterations)
while(i<=iterations){
  w<-exp(-y*f)               #exponential-loss weights for the current fit f
  w<-w/sum(w)
  treee<-rpart(y~.,data.frame(x),weights=w,method="class",
    control=rpart.control(minsplit=0,minbucket=0,cp=-1,maxcompete=0,
    maxsurrogate=0,usesurrogate=0,xval=0,maxdepth=3))
  gmin<-(-1)+2*(predict(treee,data.frame(x),type="prob")[,1]<
    predict(treee,data.frame(x),type="prob")[,2])
  hold.out.gmin<-(-1)+2*(predict(treee,data.frame(hold.out.x),type="prob")[,1]<
    predict(treee,data.frame(hold.out.x),type="prob")[,2])
  e<-sum(w*(1-(y==gmin)))    #weighted error of the base classifier
  alpha<-.5*log((1-e)/e)
  hold.out.f<-hold.out.f+alpha*hold.out.gmin
  f<-f+alpha*gmin
  te[i]<-sum(f*y<0)/n        #training misclassification error of sign(f)
  misclass[i]<-sum(hold.out.f*hold.out.y<0)/length(hold.out.f)
  print(c(i,te[i],misclass[i]))
  i<-i+1
}

#Random Forests (requires the randomForest package, as described above)
#y<-as.factor(y)
#rf<-randomForest(x, y)
#classest<-predict(rf,hold.out.x)
#rferror<-1-sum(classest==hold.out.y)/length(hold.out.y)
#rferror.store[iui]<-rferror

print("bayes error")
print(bayeserror)
print("nn error")
print(nnerror)
#print("random forest error")
#print(rferror)

misclass.store[iui,]<-misclass
}

mean.misclass<-apply(misclass.store,2,mean)
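#The vector mean.misclass computed above (the hold-out misclassification
#error at each boosting iteration, averaged over the repeat.times runs) is
#not used anywhere below.  The commented-out sketch that follows shows one
#way it could be plotted; it is only an illustration, and the output file
#name "section4-code9plotc.ps" is a hypothetical choice.  Uncomment to run:
#postscript("section4-code9plotc.ps")
#plot(seq(1,iterations),mean.misclass,type="l",log="x",
#  xlab="Iterations",ylab="Mean Hold-Out Misclassification Error")
#abline(h=mean(nnerror.store),lty=2)  #one nearest neighbor error, for reference
#dev.off()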
postscript("section4-code9plota.ps") par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5,pty="s") plot(x[,1],x[,2],xlab=expression(x^{(1)}),ylab=expression(x^{(2)}),type="n",xlim=c(0,1),ylim=c(0,1)) points(hold.out.x[,1],hold.out.x[,2],pch=15,col=(-1*(hold.out.f>0)+6),cex=.23) points(x[,1],x[,2],pch=20,col=(6*(y>0)+1)) dev.off() #One Nearest Neighbor Plot postscript("section4-code9plotb.ps") par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5,pty="s") plot(x[,1],x[,2],xlab=expression(x^{(1)}),ylab=expression(x^{(2)}),type="n",xlim=c(0,1),ylim=c(0,1)) points(hold.out.x[,1],hold.out.x[,2],pch=15,col=(-ppp+6),cex=.23) points(x[,1],x[,2],pch=20,col=(6*(y>0)+1)) dev.off() print("###############################################################") print("Nearest Neighbor Mean and Standard Deviation") print(mean(nnerror.store)) print(sqrt(var(nnerror.store))) #print("Random Forests Mean and Standard Deviation") #print(mean(rferror.store)) #print(sqrt(var(rferror.store))) print("Bayes Error Mean and Standard Deviation") print(mean(bayeserror.store)) print(sqrt(var(bayeserror.store))) print("AdaBoost Final Mean and Standard Deviation") print(mean(misclass.store[,iterations])) print(sqrt(var(misclass.store[,iterations]))) print("AdaBoost First Iteration Mean and Standard Deviation") print(mean(misclass.store[,1])) print(sqrt(var(misclass.store[,1]))) print("Mean and Standard Deviation of Difference") print(mean(misclass.store[,1]-misclass.store[,iterations])) print(sqrt(var(misclass.store[,1]-misclass.store[,iterations]))) print("Percent of Postive Differences") print(sum(1*(misclass.store[,1]-misclass.store[,iterations])>0)/repeat.times)