#R < section4-code9.R > section4-code9output.txt &
#This is the unix command to run this code in the background and save the
#output in the file section4-code9output.txt

set.seed(9999, kind = NULL)
repeat.times<-1      #number of simulation repetitions
q<-.1                #label noise level (the Bayes error rate of the true model)
iterations<-1000     #number of boosting iterations
n<-400               #training sample size
misclass.store<-matrix(0,repeat.times,iterations)
bayeserror.store<-rep(0,repeat.times)
nnerror.store<-rep(0,repeat.times)
rferror.store<-rep(0,repeat.times)

library(class)
library(rpart)

#You can also try Random Forests for comparison by uncommenting the code
#provided later.  I commented it out because randomForest is not part of the
#basic R installation and must be installed separately.  I put mine in the
#directory "temp" using the following commands (from within R):
#options(CRAN = "http://cran.us.r-project.org/")
#install.packages("randomForest",lib="temp")
#library(randomForest, lib.loc="temp")

for (iui in 1:repeat.times){

#Use a Latin hypercube design for x (the training data) for plotting purposes
x<-matrix(0,n,2)
for (dd in 1:2){
  x[,dd]<-seq(1,n)/n-.5/n
  o<-order(runif(n))     #random permutation of the grid in each coordinate
  x[,dd]<-x[o,dd]
}

#mod2(int) returns int modulo 2 (1 if int is odd, 0 if int is even)
mod2<-function(int){
  if (round(int)==int) return(1-1*(round(int/2,0)==int/2))
  else return("error: this function only works on integers")
}

#Generate the labels: P(y=1|x) alternates between 1-q and q on vertical bands
#of width .1, so the Bayes rule depends only on the first coordinate
y<-rep(0,n)
for (i in 1:n){
  y[i]<-1*(runif(1)<(q+(1-2*q)*mod2(floor(10*x[i,1]))))
}

#Use a stratified design of size (n/2)^2 for the hold-out x for plotting purposes
hold.out.x<-matrix(0,(n/2)^2,2)
hold.out.x[,1]<-rep((seq(1,n/2)/(n/2)-.5/(n/2)),n/2)
hold.out.x[,2]<-rep((seq(1,n/2)/(n/2)-.5/(n/2)),each=n/2)

#We label all the points in the hold-out sample according to the Bayes rule,
#so the misclassification error below measures the percentage of points
#classified differently from the Bayes rule
hold.out.y<-rep(0,(n/2)^2)
for (i in 1:((n/2)^2)){
  hold.out.y[i]<-1*mod2(floor(10*hold.out.x[i,1]))
}

#Error for the Bayes rule (zero by construction, since the hold-out labels
#are the Bayes rule itself)
bayespred<-rep(0,(n/2)^2)
for (i in 1:((n/2)^2)){
  bayespred[i]<-1*mod2(floor(10*hold.out.x[i,1]))
}
bayeserror<-1-sum(as.numeric(bayespred)==hold.out.y)/length(bayespred)
bayeserror.store[iui]<-bayeserror

#One nearest neighbor
fit<-knn(x,hold.out.x,y,k=1,prob=T)
ppp<-attributes(fit)$prob
ppp[as.numeric(fit)==1]<-1-ppp[as.numeric(fit)==1]  #ppp is now the predicted class (0 or 1)
nnerror<-1-sum(ppp==hold.out.y)/length(ppp)
nnerror.store[iui]<-nnerror

#AdaBoost with trees of maxdepth 3 (up to 8 terminal nodes) as the base learner
y<-(-1)+2*y                  #recode the labels from {0,1} to {-1,+1}
hold.out.y<-(-1)+2*hold.out.y
f<-rep(0,n)
hold.out.f<-0
i<-1
te<-rep(0,iterations)
misclass<-rep(0,iterations)
while(i<=iterations){
  w<-exp(-y*f)               #exponential-loss weights for the current fit f
  w<-w/sum(w)
  treee<-rpart(y~.,data.frame(x),weights=w,method="class",
    control=rpart.control(minsplit=0,minbucket=0,cp=-1,maxcompete=0,
    maxsurrogate=0,usesurrogate=0,xval=0,maxdepth=3))
  gmin<-(-1)+2*(predict(treee,data.frame(x),type="prob")[,1]<
    predict(treee,data.frame(x),type="prob")[,2])
  hold.out.gmin<-(-1)+2*(predict(treee,data.frame(hold.out.x),type="prob")[,1]<
    predict(treee,data.frame(hold.out.x),type="prob")[,2])
  e<-sum(w*(1-(y==gmin)))    #weighted error of the base classifier
  alpha<-.5*log((1-e)/e)
  hold.out.f<-hold.out.f+alpha*hold.out.gmin
  f<-f+alpha*gmin
  te[i]<-sum(f*y<0)/n        #training misclassification error of sign(f)
  misclass[i]<-sum(hold.out.f*hold.out.y<0)/length(hold.out.f)
  print(c(i,te[i],misclass[i]))
  i<-i+1
}

#Random Forests (requires the randomForest package, as described above)
#y<-as.factor(y)
#rf<-randomForest(x, y)
#classest<-predict(rf,hold.out.x)
#rferror<-1-sum(classest==hold.out.y)/length(hold.out.y)
#rferror.store[iui]<-rferror

print("bayes error")
print(bayeserror)
print("nn error")
print(nnerror)
#print("random forest error")
#print(rferror)

misclass.store[iui,]<-misclass
}

mean.misclass<-apply(misclass.store,2,mean)
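#The vector mean.misclass computed above (the hold-out misclassification
#error at each boosting iteration, averaged over the repeat.times runs) is
#not used anywhere below.  The commented-out sketch that follows shows one
#way it could be plotted; it is only an illustration, and the output file
#name "section4-code9plotc.ps" is a hypothetical choice.  Uncomment to run:
#postscript("section4-code9plotc.ps")
#plot(seq(1,iterations),mean.misclass,type="l",log="x",
#  xlab="Iterations",ylab="Mean Hold-Out Misclassification Error")
#abline(h=mean(nnerror.store),lty=2)  #one nearest neighbor error, for reference
#dev.off()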
postscript("section4-code9plota.ps") par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5,pty="s") plot(x[,1],x[,2],xlab=expression(x^{(1)}),ylab=expression(x^{(2)}),type="n",xlim=c(0,1),ylim=c(0,1)) points(hold.out.x[,1],hold.out.x[,2],pch=15,col=(-1*(hold.out.f>0)+6),cex=.23) points(x[,1],x[,2],pch=20,col=(6*(y>0)+1)) dev.off() #One Nearest Neighbor Plot postscript("section4-code9plotb.ps") par(mfrow=c(1,1),cex=1.5,cex.axis=1.5,cex.lab=1.5,pty="s") plot(x[,1],x[,2],xlab=expression(x^{(1)}),ylab=expression(x^{(2)}),type="n",xlim=c(0,1),ylim=c(0,1)) points(hold.out.x[,1],hold.out.x[,2],pch=15,col=(-ppp+6),cex=.23) points(x[,1],x[,2],pch=20,col=(6*(y>0)+1)) dev.off() print("###############################################################") print("Nearest Neighbor Mean and Standard Deviation") print(mean(nnerror.store)) print(sqrt(var(nnerror.store))) #print("Random Forests Mean and Standard Deviation") #print(mean(rferror.store)) #print(sqrt(var(rferror.store))) print("Bayes Error Mean and Standard Deviation") print(mean(bayeserror.store)) print(sqrt(var(bayeserror.store))) print("AdaBoost Final Mean and Standard Deviation") print(mean(misclass.store[,iterations])) print(sqrt(var(misclass.store[,iterations]))) print("AdaBoost First Iteration Mean and Standard Deviation") print(mean(misclass.store[,1])) print(sqrt(var(misclass.store[,1]))) print("Mean and Standard Deviation of Difference") print(mean(misclass.store[,1]-misclass.store[,iterations])) print(sqrt(var(misclass.store[,1]-misclass.store[,iterations]))) print("Percent of Postive Differences") print(sum(1*(misclass.store[,1]-misclass.store[,iterations])>0)/repeat.times)