# Generate data for illustrations of classification and clustering # HTFgen generates data according to a generalization of the model # of HTF, as described on p. 16. # The model of HTF has 2 variables. # There are 10 parent distributions that are "blue" # and 10 parent distributions that are "orange". # The means of the parent distributions are called mk. # The mk are distributed as # N((1,0),I) for blue # N((0,1),I) for orange # The HTF data consists of 200 observations distributed as # N(mk,I/5). # Of these, 100 were blue observations each from a random blue mk # and 100 were orange observations each from a random orange mk. # Generalizations: # 4 variables # controllable separation of the means of blues and oranges. # means of parent distributions are called m4bk and m4ok # m4bk from N((mxb,0,mzb,mwb),I) are blue # m4ok from N((0,my0,mzo,mwo),I) are orange # nmb blue parent distributions are generated # nmo orange parent distributions are generated # Data consists of nb blue observations distributed as N(m4bk,Vb) # and no orange observations distributed as N(m4ok,Vo). # In each case, for each observation, the mean is uniformly from the parents. # The output is a nb+no X 5 matrix in which the first column is an indicator: # 1 = blue # 0 = orange # The groups are randomly distributed among the rows. # Uses: # Simulate replication of HTF data: # mxb = 1 # myo = 1 # mzb, mwb, mzo, mwo are arbitrary # nmb = 10 # nmo = 10 # nb = 100 # no = 100 # Vb = I/5 # Vo = I/5 # (actually, Vb[3:4,3:4] Vo[3:4,3:4] and are arbitrary) HTFgen <- function(mxb=1,mzb=0,mwb=0,myo=1,mzo=0,mwo=0, nmb=10,nmo=10,nb=100,no=100, Vb=matrix(c(.2,0,0,0,0,.2,0,0,0,0,.2,0,0,0,0,.2),ncol=4), Vo=matrix(c(.2,0,0,0,0,.2,0,0,0,0,.2,0,0,0,0,.2),ncol=4)){ # Generate means of parent populations m4bk <- cbind(rnorm(nmb,mxb),rnorm(nmb,0),rnorm(nmb,mzb),rnorm(nmb,mwb)) m4ok <- cbind(rnorm(nmb,0),rnorm(nmb,myo),rnorm(nmb,mzo),rnorm(nmb,mwo)) # Generate blues Xb <- rbind(m4bk[sample(nmb,nb,replace=TRUE),]) Vbc <- chol(Vb) Xb <- matrix(rnorm(4*nb),nrow=nb)%*%Vbc + Xb # Generate oranges Xo <- rbind(m4ok[sample(nmo,no,replace=TRUE),]) Voc <- chol(Vo) Xo <- matrix(rnorm(4*no),nrow=no)%*%Voc + Xo # Randomize n <- nb+no X <- rbind(cbind(rep(1,nb),Xb),cbind(rep(0,no),Xo))[sample(n,n),] } # Samples # HTF replication x <- HTFgen() plot(x[,2],x[,3],col=2*(x[,1]+1)) # More separation mxb <- 3 myo <- 3 x <- HTFgen(mxb=mxb,myo=myo) plot(x[,2],x[,3],col=2*(x[,1]+1)) # Two meaningless predictors plot(x[,4],x[,5],col=2*(x[,1]+1)) # One meaningless predictor plot(x[,2],x[,5],col=2*(x[,1]+1)) ################################################################################ # Determine the percentage of correct binary classifications using k nearest neighbors # Correct classification means more than half of the k correctly classify. # ind is the vector of class indicators, assumed to be 0 or 1. pctnn <- function(ind, x, knn){ ### For now, this function does not accept the dist matrix. ### For now, this function will compute it and coerce it to a full matrix. distm <- as.matrix(dist(x)) n <- dim(distm)[1] correct <- 0 for (i in 1:n){ tmp <- which(rank(distm[-i,i])<=knn) if (abs(sum(ind[tmp[1:knn]])/knn-ind[i])<.5) correct <- correct+1 } correct } pctnn(x[,1],x=x[,c(2,3)],10) pctnn(x[,1],x=x[,c(4,5)],10)