# Generate data for illustrations of classification and clustering

# HTFgen generates data according to a generalization of the model 
# of HTF, as described on p. 16.

# The model of HTF has 2 variables.
# There are 10 parent distributions that are "blue"
# and 10 parent distributions that are "orange".

# The means of the parent distributions are called mk.

# The mk are distributed as
# N((1,0),I) for blue
# N((0,1),I) for orange

# The HTF data consists of 200 observations distributed as
# N(mk,I/5).
# Of these, 100 were blue observations each from a random blue mk
# and 100 were orange observations each from a random orange mk.

# Generalizations:

# 4 variables
# controllable separation of the means of blues and oranges.

# means of parent distributions are called m4bk and m4ok

# m4bk from N((mxb,0,mzb,mwb),I) are blue
# m4ok from N((0,my0,mzo,mwo),I) are orange

# nmb blue parent distributions are generated
# nmo orange parent distributions are generated

# Data consists of nb blue observations distributed as N(m4bk,Vb)
# and no orange observations distributed as N(m4ok,Vo).
# In each case, for each observation, the mean is uniformly from the parents.

# The output is a nb+no X 5 matrix in which the first column is an indicator:
#  1 = blue
#  0 = orange
# The groups are randomly distributed among the rows.

# Uses:

# Simulate replication of HTF data:
#   mxb = 1
#   myo = 1
#   mzb, mwb, mzo, mwo are arbitrary
#   nmb = 10
#   nmo = 10
#   nb = 100
#   no = 100
#   Vb = I/5
#   Vo = I/5
#   (actually, Vb[3:4,3:4] Vo[3:4,3:4] and are arbitrary)


HTFgen <- function(mxb=1,mzb=0,mwb=0,myo=1,mzo=0,mwo=0,
          nmb=10,nmo=10,nb=100,no=100,
          Vb=matrix(c(.2,0,0,0,0,.2,0,0,0,0,.2,0,0,0,0,.2),ncol=4),
          Vo=matrix(c(.2,0,0,0,0,.2,0,0,0,0,.2,0,0,0,0,.2),ncol=4)){
#   Generate means of parent populations
      m4bk <- cbind(rnorm(nmb,mxb),rnorm(nmb,0),rnorm(nmb,mzb),rnorm(nmb,mwb))
      m4ok <- cbind(rnorm(nmb,0),rnorm(nmb,myo),rnorm(nmb,mzo),rnorm(nmb,mwo))
#   Generate blues
      Xb <- rbind(m4bk[sample(nmb,nb,replace=TRUE),])
      Vbc <- chol(Vb)
      Xb <- matrix(rnorm(4*nb),nrow=nb)%*%Vbc + Xb
#   Generate oranges
      Xo <- rbind(m4ok[sample(nmo,no,replace=TRUE),])
      Voc <- chol(Vo)
      Xo <- matrix(rnorm(4*no),nrow=no)%*%Voc + Xo
#   Randomize
      n <- nb+no
      X <- rbind(cbind(rep(1,nb),Xb),cbind(rep(0,no),Xo))[sample(n,n),]
  }

#  Samples

#  HTF replication
  
x <- HTFgen()
plot(x[,2],x[,3],col=2*(x[,1]+1))

#  More separation

mxb <- 3
myo <- 3
x <- HTFgen(mxb=mxb,myo=myo)
plot(x[,2],x[,3],col=2*(x[,1]+1))

#  Two meaningless predictors
plot(x[,4],x[,5],col=2*(x[,1]+1))

#  One meaningless predictor
plot(x[,2],x[,5],col=2*(x[,1]+1))

################################################################################

# Determine the percentage of correct binary classifications using k nearest neighbors
# Correct classification means more than half of the k correctly classify.

#  ind  is the vector of class indicators, assumed to be 0 or 1.

pctnn <- function(ind, x, knn){
### For now, this function does not accept the dist matrix.  
### For now, this function will compute it and coerce it to a full matrix.
    distm <- as.matrix(dist(x))
    n <- dim(distm)[1]
    correct <- 0
    for (i in 1:n){
        tmp <- which(rank(distm[-i,i])<=knn)
        if (abs(sum(ind[tmp[1:knn]])/knn-ind[i])<.5) correct <- correct+1
    }
    correct
}


pctnn(x[,1],x=x[,c(2,3)],10)
pctnn(x[,1],x=x[,c(4,5)],10)