#Rsection4-code10output.txt &
#This is the unix command to run this code in the background and save the output in the file section4-code10output.txt

postscript("section4-code10plot.ps")
par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5)

set.seed(10101010, kind = NULL)
repeat.times<-100   #number of independent replications of the whole experiment
q<-.1               #label-noise rate; the Bayes error of this problem equals q
iterations<-1000    #number of AdaBoost rounds
n<-10000            #training sample size
d<-20               #number of features (only the first one carries signal)
m<-1000             #m is the size of the hold-out sample (i.e., the test sample)
misclass.store<-matrix(0,repeat.times,iterations)
bayeserror.store<-rep(0,repeat.times)
nnerror.store<-rep(0,repeat.times)
rferror.store<-rep(0,repeat.times)

library(class)   #for knn()
library(rpart)   #for the tree weak learner
#You can also try Random Forests for comparison by uncommenting the code provided later.
#It is commented out because randomForest is not part of the base R distribution and must be installed separately.
#I put mine in the directory "temp" using the following commands (from within R):
#options(CRAN = "http://cran.us.r-project.org/")
#install.packages("randomForest",lib="temp")
#library(randomForest, lib.loc="temp")

for (iui in 1:repeat.times){
  #training features: n points uniform on [0,1]^d
  x<-matrix(0,n,d)
  for (ddd in 1:d){
    x[,ddd]<-runif(n)
  }
  #hold-out features: m points uniform on [0,1]^d
  hold.out.x<-matrix(0,m,d)
  for (k in 1:d) {
    hold.out.x[,k]<-runif(m)
  }

  #mod2(int) returns int modulo 2 for integer arguments
  mod2<-function(int) {
    if (round(int)==int) return(1-1*(round(int/2,0)==int/2))
    else return("error: this function only works on integers")
  }

  #labels: the Bayes rule is the parity of floor(10*x1); each label is flipped with probability q
  y<-rep(0,n)
  for (i in 1:n){
    y[i]<-1*(runif(1)<(q+(1-2*q)*mod2(floor(10*x[i,1]))))
  }
  hold.out.y<-rep(0,m)
  for (i in 1:m){
    hold.out.y[i]<-1*(runif(1)<(q+(1-2*q)*mod2(floor(10*hold.out.x[i,1]))))
  }

  #error of the Bayes rule on the hold-out sample
  bayespred<-rep(0,m)
  for (i in 1:m){
    bayespred[i]<-mod2(floor(10*hold.out.x[i,1]))
  }
  bayeserror<-1-sum(as.numeric(bayespred)==hold.out.y)/length(bayespred)
  bayeserror.store[iui]<-bayeserror

  #1-nearest-neighbor error on the hold-out sample
  fit<-knn(x,hold.out.x,y,k=1,prob=T)
  ppp<-attributes(fit)$prob
  ppp[as.numeric(fit)==1]<-1-ppp[as.numeric(fit)==1]   #convert the vote proportion into the predicted label (0 or 1)
  nnerror<-1-sum(ppp==hold.out.y)/length(ppp)
  nnerror.store[iui]<-nnerror

  #AdaBoost with depth-8 classification trees as the weak learner
  y<-(-1)+2*y                    #recode labels to -1/+1
  hold.out.y<-(-1)+2*hold.out.y
  f<-rep(0,n)                    #cumulative score on the training sample
  hold.out.f<-0                  #cumulative score on the hold-out sample
  i<-1
  te<-rep(0,iterations)          #training error by round
  misclass<-rep(0,iterations)    #hold-out error by round
  while(i<=iterations){
    w<-exp(-y*f)                 #exponential-loss weights
    w<-w/sum(w)
    treee<-rpart(y~.,data.frame(x),w,method="class",
      control=rpart.control(minsplit=0,minbucket=0,cp=-1,maxcompete=0,
        maxsurrogate=0,usesurrogate=0,xval=0,maxdepth=8))
    gmin<-(-1)+2*(predict(treee,data.frame(x),type="prob")[,1]<
      predict(treee,data.frame(x),type="prob")[,2])
    hold.out.gmin<-(-1)+2*(predict(treee,data.frame(hold.out.x),type="prob")[,1]<
      predict(treee,data.frame(hold.out.x),type="prob")[,2])
    e<-sum(w*(1-(y==gmin)))      #weighted training error of the weak learner
    alpha<-.5*log((1-e)/e)       #weak-learner coefficient
    hold.out.f<-hold.out.f+alpha*hold.out.gmin
    f<-f+alpha*gmin
    te[i]<-sum(f*y<0)/n
    misclass[i]<-sum(hold.out.f*hold.out.y<0)/length(hold.out.f)
    print(c(i,te[i],misclass[i]))
    i<-i+1
  }

  #Random Forests (you need to install the package as described above if you want to use this)
  #y<-as.factor(y)
  #rf<-randomForest(x, y)
  #classest<-predict(rf,hold.out.x)
  #rferror<-(1-sum((classest==hold.out.y)/length(hold.out.y)))
  #rferror.store[iui]<-rferror

  print("bayes error")
  print(bayeserror)
  print("nn error")
  print(nnerror)
  #print("random Forest error")
  #print(rferror)
  misclass.store[iui,]<-misclass
}

#plot the hold-out error, averaged over the replications, by boosting round
mean.misclass<-apply(misclass.store,2,mean)
plot(seq(1,iterations),mean.misclass,ylim=c(.1,.2),xlab="AdaBoost Iterations",
  ylab="Misclassification Error",type="n",main="")
lines(seq(1,iterations),mean.misclass,col="red")
dev.off()
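#----------------------------------------------------------------------
#Aside (editor's addition, not part of the original script): under this
#data-generating process the label disagrees with the Bayes rule
#h*(x) = mod2(floor(10*x1)) with probability exactly q, so the Bayes
#errors stored above should concentrate around q = 0.1. A minimal
#standalone check, using fresh names so nothing above is disturbed:
parity<-function(k) k %% 2   #same as mod2 on integer arguments
xx.check<-runif(100000)
yy.check<-1*(runif(100000)<(q+(1-2*q)*parity(floor(10*xx.check))))
print(mean(yy.check!=parity(floor(10*xx.check))))   #should be close to q = 0.1
#----------------------------------------------------------------------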
#summary statistics over the repeat.times replications
print("###############################################################")
print("Nearest Neighbor Mean and Standard Deviation")
print(mean(nnerror.store))
print(sqrt(var(nnerror.store)))
#print("Random Forests Mean and Standard Deviation")
#print(mean(rferror.store))
#print(sqrt(var(rferror.store)))
print("Bayes Error Mean and Standard Deviation")
print(mean(bayeserror.store))
print(sqrt(var(bayeserror.store)))
print("AdaBoost Final Mean and Standard Deviation")
print(mean(misclass.store[,iterations]))
print(sqrt(var(misclass.store[,iterations])))
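#----------------------------------------------------------------------
#Aside (editor's addition, not part of the original script): the weights
#computed inside the boosting loop as w<-exp(-y*f); w<-w/sum(w) from the
#cumulative score f match the textbook sequential AdaBoost update
#w_i <- w_i*exp(-alpha_t*y_i*g_t(x_i)), renormalized each round, because
#f accumulates sum_t alpha_t*g_t(x). A minimal sketch with toy values:
toy.y<-c(1,-1,1,1,-1)                                #toy labels
toy.g<-matrix(sample(c(-1,1),15,replace=TRUE),5,3)   #toy weak-learner outputs, one column per round
toy.alpha<-c(.4,.2,.7)                               #toy coefficients
w.cum<-exp(-toy.y*as.vector(toy.g%*%toy.alpha))      #cumulative form, as in the script
w.cum<-w.cum/sum(w.cum)
w.seq<-rep(1/5,5)                                    #sequential form
for (t in 1:3){
  w.seq<-w.seq*exp(-toy.alpha[t]*toy.y*toy.g[,t])
  w.seq<-w.seq/sum(w.seq)
}
print(all.equal(w.cum,w.seq))                        #TRUE: the two formulations agree
#----------------------------------------------------------------------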