File        a12_sortingDataTablesandMatrices
By          Daniel B. Carr
Copyright   2005, 2006, 2007 Class use permitted

Read        Companion Notes:  Ranking Objects

Purpose     Different ways to sort rows and columns
            Color matrix (heat maps) view
            Plotting variables as data
                1. Scaled gene mRNA expression data
                2. Data table color matrix (heat table) views
                3. Repeat of 2 subtracting variable means 
                4. Singular value decompositions and matrix approximation
                5. Plotting variables as data
                6. Plotting correlation matrices
                7. Correlation, Data, and Dissimilarity Matrix Visualization

Data         gemIdGene.csv from earlier the class data sets
             geneMstRowSubs.txt, geneMstColSubs.txt   # schedule link
             centerMstRowOrder.txt, centerMstColOrder.txt   # schedule link

Functions    panel Layout function  # already have installed
             colorCorMatBars.R      # from schedule link 	
             colorMatView.R         # from schedule link

Due          Plots from
             2, 3, 4.5, 5
             6.3 
             7.3 One pdf file of your choice
             7.4 pdf file
				
1. Scaled gene mRNA expression data

This data has been described and used previously

##Run

gemIdGene = read.csv(file='gemIdGene.csv',row.names=1)
genes = gemIdGene[,1:9]
mat = as.matrix(genes)
## End
 
# 2. Color Matrix Data Table (heat table) views======================================= 

##Run

# find breaks for 20% in each class
vals = sort(as.vector(mat))
f = ppoints(vals)
brks = c(min(vals)-.1,approx(f,vals,c(.2,.4,.6,.8))$y,
        max(vals))

# Set colors, make sure colors 4,5,6,7,8 are good colors
colors = matrix(c(
0,0,0,
1,1,1,
.5,.5,.5,
0,.4,.8,
.5,.75,1,
.7,.7,.7,
1,.75,.5,
.8,.4,0),
 ncol=3,byrow=T)

windows(width=8,height=10.5)
tmpPalette = rgb(colors[,1],colors[,2],colors[,3])
palette(tmpPalette)

# device and layout
pan = panelLayout(nrow=1,ncol=6,
       leftMar=0,topMar=.6,colSep=.15)
panHead = panelLayout(nrow=1,ncol=1,
          leftMar=0,topMar=.6)
cex = .8
loc1 = 1.1
loc2 = .3
nr = 112
nc = 9

# rectangle boundaries for plotting
# the R function rect() could be used 
px = c(-.5,.5,.5,-.5,NA)
py = c(-.5,-.5,.5,.5,NA)
polyx = rep(px,nc*nr) + rep(1:nc,rep(5*nr,nc))
polyy = rep(rep(py,nr)+rep(nr:1,rep(5,nr)),nc)

# as is__________________________________________________

matcol = as.numeric(cut(mat,brks))+3
panelSelect(pan,1,1)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Original',side=3,line=loc1,cex=cex)
mtext('Order',side=3,line=loc2,cex=cex)
	
# row and columns by median________________________________
rmed = apply(mat,1,median)
cmed = apply(mat,2,median)
newmat= mat[order(rmed),order(cmed)]
matcol = as.numeric(cut(newmat,brks))+3

#Could just order a color matrix 

panelSelect(pan,1,2)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Row and Column',side=3,line=loc1,cex=cex)
mtext('Medians',side=3,line=loc2,cex=cex)

# principal components__________________________________

rprin = prcomp(mat,center=F,retx=T)$x[,1]
cprin = prcomp(t(mat),center=F,retx=T)$x[,1]
newmat= mat[rev(order(rprin)),order(cprin)]
matcol = as.numeric(cut(newmat,brks))+3

panelSelect(pan,1,3)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('"Principal"',side=3,
      line=loc1,cex=cex)
mtext('Components',side=3,
      line=loc2,cex=cex)

# singular values________________________________________
svdResult = svd(mat)

# Don't need to transpose mat
# t(uDv') == vD'u'
# diff(svd(t(mat))$u[,1] - svd(mat)$v[,1])
newmat = mat[rev(order(svdResult$u[,1])),order(svdResult$v[,1])]
matcol = as.numeric(cut(newmat,brks))+3 
 
panelSelect(pan,1,4)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('First SVD ',side=3,
      line=loc1,cex=cex)
mtext('Eigenvectors',side=3,
      line=loc2,cex=cex)

# dendrogram

# The original clustering augment data with slopes
# Since the dissimilarities are obtained using
# the Euclidean metric, this effectively 
# reduced the weight given to the beginning 
# and ending values in the time series.  
# augMat = t(apply(genes[,1:9],1,diff))  
# geneMat = cbind(genes[,1:9],augMat) 

geneDis = dist(mat)
geneClusSummary  = hclust(geneDis,method='average')
geneReorder = geneClusSummary$order

varDis = dist(t(mat))
varClusSummary = hclust(varDis,method='average')
varReorder = varClusSummary$order
newmat = mat[geneReorder,varReorder] 

matcol = as.numeric(cut(newmat,brks))+3  
panelSelect(pan,1,5)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Dendrogram',side=3,
      line=loc1,cex=cex)

# minimal spanning tree___________________________________

# need breadth traveral algorithm for R
# rmst = mstree(mat)
# cmst = mstree(t(mat)) 
# newmat= mat[rev(rmst$order[,1]),rev(cmst$order[,1])]
# reverse sort to make similar to principal components
# in this particular case 

rowOrder = scan('geneMstRowSubs.txt') # from Splus
colOrder = scan('geneMstColSubs.txt') # from Splus
geneMatMst= mat[rowOrder,colOrder]
matcol = as.numeric(cut(geneMatMst,brks))+3
#Could just order a color matrix 
panelSelect(pan,1,6)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Spanning Tree',side=3,line=loc1,cex=cex)
mtext('Traversals',side=3,line=loc2,cex=cex)

# title______________________________________________

panelSelect(panHead,mar='top')
panelScale()
text(.5,.85,'Sorting Uncentered Data Matrix: "PC" really SVD',adj=.5)
##End


The median sort has most of the blue at the top as expected.

Here the first principal component and singular value decomposition
first left and right eigenvector sorts are the same.  Actually
both are really using SVD. Since the mean has  not be subtracted from
the variables, it might argued that the label principal components
is inappropriate.  The row order has been reversed to put the 
blue at the top.  Then there is a lot of dark blue at the top
and a lot of dark orange at the bottom.  The columns might
also be reverse to look more like the median sort.   

The dendrogram and have visual breaks where clusters are juxtaposed.   
At each of the n-1 joins the two clusters being 
joined can be individually flipped and the sides for joining chosen.
Of course previous flips and joins can be reconsidered.  
The default for joining clusters is based on the orginal data
order and the subcluster  creation order. 
It is easy to proposed heuristic algorithms that are likely to be
better in terms of minimize traversal lengths than joining
based on a happenstance ordering ordering of the data.   

The minimal spanning tree breadth traversal ordering tends to put
nearest neighbors together. This is likely the reason the matrix
appear better organized or simpler.  Observe that when looking from
the top down, the orange brown moves from top right to the center,
back to the right and then over to the left at the bottom.  There
is blue at the top left and blue at the bottom right .  The points
at the top and bottom are pretty far a part.  Of course all the sorting
methods are trying, in different ways, to put points far from each other
at the top and the bottom
 
An option not shown is to use multidimensional scaling into 1 dimension
to provide the sort order.  This has connection to other methods   

3. Variable centered data=========================================

If we subtract the variable means, the Euclidean distance between
cases stays the same.  In terms of principal components analysis,
it is conceptually pleasing to think about centering the data and
then rotating it so orthogonal projections shows the large variance
on the x-axis, the second larges variance on the y-axis and so on.
It is convenient to imagine rotating a p-dimension ellipsoid
so our viewing coordinates show the ellipsoid axis lengths in 
descending order.  

Note that even with multivariate normal data, whose density contours are 
ellipsoids, we may not see a lot of ellipsoids.  We don't assess
data density well from scatterplots.  Rather our attention is more
likely to focus on happenstance clusters (perceptual grouping)
and on edges such as the fringes of the data cloud,    

##Run

cmat = scale(mat,center=T,scale=F)

# find breaks for 20% in each class
vals = sort(as.vector(cmat))
f = ppoints(vals)
brks = c(min(vals)-.1,approx(f,vals,c(.2,.4,.6,.8))$y,
        max(vals))

# Set colors, make sure colors 4,5,6,7,8 are good colors
windows(width=8,height=10)
palette(tmpPalette)

# device and layout
pan = panelLayout(nrow=1,ncol=6,
       leftMar=0,topMar=.6,colSep=.15)
panHead = panelLayout(nrow=1,ncol=1,
         leftMar=0,topMar=.6,colSep=.15)

# as is__________________________________________________

matcol = as.numeric(cut(cmat,brks))+3
panelSelect(pan,1,1)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Original',side=3,line=loc1,cex=cex)
mtext('Order',side=3,line=loc2,cex=cex)
	
# row and columns by median________________________________
rmed = apply(cmat,1,median)
cmed = apply(cmat,2,median)
newmat= cmat[order(rmed),order(cmed)]
matcol = as.numeric(cut(newmat,brks))+3

panelSelect(pan,1,2)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Row and Column',side=3,line=loc1,cex=cex)
mtext('Medians',side=3,line=loc2,cex=cex)

# principal components__________________________________
rprin = prcomp(cmat,center=F,retx=T)$x[,1]  # already centered
cprin = prcomp(t(cmat),center=F,retx=T)$x[,1] # not centered
newmat= cmat[order(rprin),order(cprin)]
matcol = as.numeric(cut(newmat,brks))+3

panelSelect(pan,1,3)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Principal',side=3,
      line=loc1,cex=cex)
mtext('Components',side=3,
      line=loc2,cex=cex)

# singular values________________________________________
svdResult = svd(cmat)
# note that scaling by the first singluar value
#      does not change the order
newmatSvd = cmat[order(svdResult$u[,1]),order(svdResult$v[,1])]
matcol = as.numeric(cut(newmatSvd,brks))+3 
 
panelSelect(pan,1,4)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('1st SVD',side=3,
      line=loc1,cex=cex)
mtext('Eigenvectors',side=3,
      line=loc2,cex=cex)

# dendrograph______________________________________________

# The original clustering augment data with slopes
# Since the dissimilarities are obtained using
# the Euclidean metric, this effectively 
# reduced the weight given to the beginning 
# and ending values in the time series.  
# augMat = t(apply(genes[,1:9],1,diff))
# geneMat = cbind(genes[,1:9],augMat) 

geneDis = dist(cmat)
geneClusSummary  = hclust(geneDis,method='average')
geneReorder = geneClusSummary$order

varDis = dist(t(cmat))
varClusSummary = hclust(varDis,method='average')
varReorder = varClusSummary$order
newmat = cmat[geneReorder,varReorder] 

matcol = as.numeric(cut(newmat,brks))+3  
panelSelect(pan,1,5)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Dendrogram',side=3,
      line=loc1,cex=cex)

# minimal spanning tree___________________________________

# need breadth traveral algorithm for R
# rmst = mstree(cmat)
# cmst = mstree(t(cmat)) 
# newmat= cmat[rev(rmst$order[,1]),rev(cmst$order[,1])]
# reverse use to make similar to principal components
# in this particular case 

rowOrder = scan('centerMstRowOrder.txt') # from Splus
colOrder = scan('centerMstColOrder.txt') # from Splus
geneMatMst= cmat[rowOrder,colOrder]
matcol = as.numeric(cut(geneMatMst,brks))+3
#Could just order a color matrix 
panelSelect(pan,1,6)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline()
mtext('Spanning Tree',side=3,line=loc1,cex=cex)
mtext('Traversals',side=3,line=loc2,cex=cex)

# title______________________________________________

panelSelect(panHead,mar='top')
panelScale()
text(.5,.85,'Sorting Variable Mean Centered Data Matrix ',adj=.5)
##End


The column order of the dendrogram might be reversed to look a bit
more like the other sorts.  

A further plot could look as centered and scaled data.  


4. Singular Value Decompositions (SVD) and uses==============

The SVD algorithm decomposes a matrix Mnxp into UDV'
   Unxp is a matrix whose columns are called left eigenvectors
   Dpxp is vector of diagonal elements for a matrix of singular values
   Vpxp is a column matrix of right eigenvectors
Note that in UDV', V' is the transpose of V

R syntax:

svdResult = svd(mat)

svdResult$u is an n by p matrix of p left eigenvectors
svdResult$d is vector of singular values from a p x p diagonal matrix
svdResult$v is a p x p matrix of right eigenvectors
mat = svdResult$u %*% diag(svdResult$d) %*% t(svdResult$v) 

The singular value decomposition (SVD)has many uses.

4.1 Centered Data and Principal Components

As indicated above when the data is centered 
svd can be used to obtain principal components.
Svd is the preferred approach to obtain principal
components from center data for centered and scaled data.  
The computations are fewer and results are more accurate.  

The transformation matrix to produce principal components can be
obtained by spectral decomposition of a covariance matrice
or a correlation matrix.  The two matrices are derived from
centered or centered and scaled data respectively. The principal
components transformation can be viewed as a decorrelation
transformation since the principal components are orthogonal. 

In most applications the variables are scaled by their
standard deviations so the variance of all variables is
1.  The values are then unitless and linear combinations of
variables are not mixing measurements in different units.
In some applications the analyst chooses not to scale
the variables.  These applications will mostly be a subset
of applications involving variables that are already in
the same units.   

If the singular valued decomposition is applied to
a square matrix symmetric matrix such as a covariance
or correlation matrix, the left and right eigvectors will
be the same and the match the eigenvectors from spectral
decomposition.  The square root of the singular values
will the match the eigenvalues from the spectral decomposition.


## Run: Compare principal components from a principal components
#       algorithm and from svd for a centered matrix, mat
 
cmat = scale(mat,center=T,scale=F)
cmatPrin = princomp(cmat)
cmatPrinScores = cmatPrin$score

cmatSvd = svd(cmat)
cmatSvdPrin = cmatSvd$u %*% diag(cmatSvd$d)
one = rep(1,nrow(cmatSvdPrin))
matSign = cbind(one,-one,-one,-one,-one,-one,one,-one,one)
round(cmatPrinScores*matSign - cmatSvdPrin,6)
## End

Apparently there are differences in sign in the eigenvectors  

4.2 Sorting rows and columns of a matrix

Some examples use the first left eigenvector and
first right eigenvectors to sort rows and columns
respectively. 

4.3 Nonlinear dimension reduction

SVD has been used in nonlinear dimension reduction. After an appropriate
matrix construction a few rows of the left eigenvectors become new
plotting coordinates. Sometimes, as in the Fiedler projection, 
eigenvectors associates with the small non-zero eigenvalues are the
most important.

4.4  Error in variables:   

SVD is sometimes used to reduce error
At COMPSTAT'2004 Sabine Van Huffel gave an invited talk on 
total least squares and error-in-variables modeling.
The general idea is that all the variables are measured with error.
With some assumptions about the relative error magnitudes for the
variables, SVD can be used to de-noise the data in the regression
context.  It would seem that additional knowledge can be brought
to bear in the table smoothing context  

See S. Van Huffe. 2004.  "Total Least Squares and Error-in-Variables
Modeling: Bridging the Gap between Statistics, Computational Mathematics
and Engineering"  Proceeding in Computational Statistics,
Physica-Verlag, Springer, 539-555.

4.5 Matrix (table) approximation

This assignment provides an illustration.  At the moment
I don't have specific examples in mind where this has been used. It likely
is useful, but may in some variant of what is shown here.
   
For each i = 1,..., p, the outer product, %o% in R, of 
the ith column of U and the ith column of V'
scaled by the ith diagonal element of D produce a n x p matrix
or table

(U[,i]%o% V[,i]') * D[i,i] 

The sum of these p matrices is the original matrix M.
Constituent matrices with zero eigenvalues are zero
and contribute nothing.  Constituent matrices that
were computed using small eigenvalues are sometimes
omitted from the sum (or subtract from the original
matrix M) to produce as an approximation to the matrix
M.  Under some assumptions the matrices dropped may
be considered noise, and the result a "smoothed matrix."  
 
##Run

# For help in making visual comparisons, start using a data table with  
# rows and columns by sorted medians

rmed = apply(mat,1,median)
cmed = apply(mat,2,median)
matMedian= mat[order(rmed),order(cmed)]

# 4.5.1  Remove smallest singular value table 

svdResult = svd(matMedian)

# $u is a n by p variables matrix of p eigenvectors
# $d is vector of singular values from a p x p diagaonal matrix
# $v is a p x p matrix of eigenvectors
# Remember in general M = UDV' 

svdResult$d
#[1] 19.0341707  5.9078618  3.9923952  2.8743387  1.7641930
#     1.6330355  1.4790630  1.2416417  0.6924514
# Note the smallest eigenvalue, svdresult$d[9], is not zero


# 1 means the first matrix 
# M9 is made up to mean minus the 9th matrix
# M89 means minus the 8th and 9th matrix
mat1 = (svdResult$u[,1] %o% svdResult$v[,1])*svdResult$d[1]
mat12 = mat1 + (svdResult$u[,2] %o% svdResult$v[,2])*svdResult$d[2]
matM9 = matMedian - (svdResult$u[,9] %o% svdResult$v[9,])*svdResult$d[9]
matM89 = matM9 - (svdResult$u[,8] %o% svdResult$v[,8])*svdResult$d[8]


# 4.5.2 Graphics set up_________________________________

# Set break points so that roughly 20 percent of the cells
# in the first original matrix are in each color class.
# 
# Add two new categories for more extreme values in the 
# in the approximations: black = low, red= high

vals = sort(as.vector(matMedian))
f = ppoints(vals)
brks = c(min(vals)-.0001,approx(f,vals,c(.2,.4,.6,.8))$y,
        max(vals))
globalRange=range(matMedian,mat1,mat12,matM9,matM89)
brks = c(globalRange[1]-.001,brks,globalRange[2]+.001)

colors = matrix(c(
0,0,0,
1,1,1,
.5,.5,.5,
0,0,0,
0,.4,.8,
.5,.75,1,
.7,.7,.7,
1,.75,.5,
.7,.35,0,
1,.1,.1),
ncol=3,byrow=T)

windows(width=8,height=10)
tailsPalette = rgb(colors[,1],colors[,2],colors[,3])
palette(tailsPalette)

pan = panelLayout(nrow=1,ncol=5,
       leftMar=0,topMar=.6,colSep=.1)
panLabel = panelLayout(nrow=1,ncol=1,
       leftMar=0,topMar=.6)
cex = .8
loc = .5

nr = 112
nc = 9

# rectangle boundaries for plotting
px = c(-.5,.5,.5,-.5,NA)
py = c(-.5,-.5,.5,.5,NA)
polyx = rep(px,nc*nr) + rep(1:nc,rep(5*nr,nc))
polyy = rep(rep(py,nr)+rep(nr:1,rep(5,nr)),nc)

# 4.5.3 Panel 1 Median Sort Order__________________

matcol = as.numeric(cut(matMedian,brks))+3
panelSelect(pan,1,1)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline(col=3)
mtext('Median Sort',side=3,line=loc,cex=cex)

# 4.5.4 Panel 5 mat1 ____________________________
 
range(matMedian)
#[1] 0 1

range(mat1)
#[1] 0.04483357 1.06767840
# Should be some red cells
matcol = as.numeric(cut(mat1,brks))+3
 
panelSelect(pan,1,5)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline(col=3)
mtext('SVD 1',side=3,line=loc,cex=cex)

# 4.5.5 Panel 4 mat12________________________________

range(mat12)
# [1] -0.2206048  1.0701367
# Should be some black and red cells
matcol = as.numeric(cut(mat12,brks))+3
 
panelSelect(pan,1,4)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline(col=3)
mtext('SVD 12',side=3,line=loc,cex=cex)


# 4.5.6 Panel 3 matM89 ___________________________

range(matM89)
#[1] -0.07294524  1.03671208
# Should be some black and red
matcol = as.numeric(cut(matM89,brks))+3 

panelSelect(pan,1,3)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline(col=3)
mtext('SVD-89',side=3,line=loc,cex=cex)

# 4.5.7 Panel 2 matM9

range(matM9)
#[1] -0.02274396  1.02946160
matcol = as.numeric(cut(matM9,brks))+3

panelSelect(pan,1,2)
panelScale(c(.5,nc+.5),c(.5,nr+.5))
polygon(polyx,polyy,col=matcol,border=F)
polygon(polyx,polyy,col=3,density=F)
panelOutline(col=3)
mtext('SVD-9',side=3,line=loc,cex=cex)

panelSelect(panLabel,mar='top')
panelScale()
text(.5,.9,"More Extreme Values Shown as Black and Red") 

##End


4.6 Application:  Latent Semantic Index

One application is latent semantic indexing where rows are word
frequencies and columns are documents. This will be briefly described
in class.  

#5. Plotting variables as data===============================

##Run
correl = cor(mat)
dis = 1-abs(correl)
# dis = 1-correl^2
dis
locs = cmdscale(dis,eig=T,k=2)

x = locs$points[,1]
y = locs$points[,2]

windows()
palette(tmpPalette)
plot(x,y,type='n')
text(x,y,names(genes))
title("Plotting variables based on correlation dissimilarity")

##End

6. Plotting correlation matrices ============================

6.1  Discussion

It is simple to class the elements of correlation matrix as a
basis for assigning colors and producing a color matrix. The
merit is that this allows visualization of fairly large
correlation matrices. The matrix size that can be handled
is limited by both the number of pixels available and the
ability to see the colors. Labeling and knowing which cell
is which becomes problematic with large matrices. This is a
bit of a problem with say 100,000 variables.  Mouseovers in an
dynamic setting can help when the number of variables is modest.   

Remember that color is poor encoding in terms of perceptual
accuracy of extraction.  Back in the 1980s I used to sort
the correlations by a minimal spanning tree traversal when
there was no other strong basis for ordering.  When the matrices
were of modest size, I encoded the correlation magnitudes using
the area of a square plotted at the center of correlation 
matrix cell. I used two colors such as yellow and cyan to
distinguish between positive and negative correlations.  

Example 6.2 shows how to get a spanning tree traversal for a
correlation matrix in the Splus context. This involved
constructing pseudo data since the Splus mstree () algorithm
the include the breadth traversal start with data and
not dissimilarities. This is commented out at least
until I find or implement a transversal algorithm in 
R.  We don't use this actually use this ordering with the genes
since there is an already a meaningful age ordering.
  
Example 6.3 revises the 1980s work to show bars.  This upgrades
the area encoding to position along a scale.  The drawbacks include
asymmetry and space required. The function can be refined a bit as
indicated in its header.

# 6.2  Generating fake data when starting with dissimilarities.     

# The Splus mstree algorithm expects data, but
# we are starting with a dissimilarity matrix.
# Hence we use this matrix to generate suitable
# fake data 

# corMat = cor(mat)
# corDis = 1-abs(corMat)    # a dissimilarity measure

# fakeDat = cmdscale(corDis,k=5)  
   # the dissimilarity matrix is not full rank
   # Distances between 5D points preserve
   # the dissimilarities  
# mst = mstree(fakeDat) 

# mst$order[,1]				# rough the correct Euclidean
# 9 8 7 6 5 4 3 2 1
# newCorrel = correl[mst$order[,1],mst$order[,1]]

6.3  Using bars to show correlation magnitudes
 
##Run  install needed function for the call web site
source('colorCorMatBars.R')
windows(width=8,height=5.7)
colorCorMatBars(cor(as.matrix(genes)))
##End

Reading down the columns one can see the large correlation between
E21 and P0. 

#7. Correlation, Data, and Dissimilarity Matrix Visualization========

7.1  Discussion

For small data sets color matrices for correlations, data, and case
dissimilarities can be put together in a single plot.  This will 
be done in 7.3 and 7.4.  

This juxtaposition was illustrated with some extensions in
Chen,C-H, H-G Hwu, W-J Jang, C-H Kao, Y-J,Tien, SL Tzeng, and H-M Wu, 2004.
Matrix Visualization and Information Mining,
COMPSTAT'2004, Physica-Verlag/Springer, 85-100.

The idea of using color matrices and sorting is not new. 
Assignments for this class have been using them for years.
The juxtaposition of the three matrices in the above paper has some merit. 
This raises some interesting design issues of space and simplicity.

In one page plot with many cases, juxtaposition can lead to a tiny
correlation matrix that is hard to study.  In an 
an interactive setting and enlargement can address this.  
In a static setting, separate enlarged plots seem desirable.  

The three different matrices require three different color scales.  
For simplicity there is merit in using the same color scheme with
different interpretations.  The example below does this. 
However a double-ended scale seems more natural for correlations
while sequential scales seem a little more natural for data and
dissimilarities.   

Transformations provide one way to make the same type of color scheme
seem more appropriate.  For examples the correlation
matrices can be converted to p-values so a sequential scale
makes more sense.   Alternatively a central value can be subtracted
from data to provide a context for a double-ended scale.  The
dissimilarity matrices might be thought of relative to the median value
so a double-ended scale might see a little more natural. 
There are also some drawbacks to these ideas.  
   
The choice below currently defaults to a double ended scale.  
You are free to substitute you own color scale and may well
prefer a sequential scale.    

7.2  Some colors for palettes

##Run
#blueRed5
cmat = matrix(c(
 0.00, 0.00, 0.00,   # black
 1.00, 1.00, 1.00,   # white
  .90,  .90,  .90,   # light gray
 0.18, 0.28, 1.00,   # blue+slight_green, slightly desaturated
 0.53, 0.69, 1.00,   # blue+slight_green, moderately desaturated
 0.85, 0.85, 0.85,   # light gray
 1.00, 0.60, 0.60,   # red, moderately desaturated
 1.00, 0.05, 0.05),   # red
  ncol=3,byrow=T)
blueRed5 = rgb(cmat[,1],cmat[,2], cmat[,3])

# blueRed6
cmat = matrix(c(
 0.00, 0.00, 0.00,   # black
 1.00, 1.00, 1.00,   # white
  .90,  .90,  .90,   # light gray
 0.20, 0.35, 1.00,   # slightly desaturated blue (slightly green)
 0.50, 0.60, 1.00,   # moderately desaturated blue (slightly green)
 0.80, 0.86, 0.96,   # desaturated blue (slightly green)
 0.97, 0.76, 0.76,   # desaturated red
 1.00, 0.50, 0.50,   # moderately desaturated red
 1.00, 0.05, 0.05),   # red 
  ncol=3,byrow=T)
blueRed6 = rgb(cmat[,1],cmat[,2], cmat[,3])

#blueOrange5
cmat = matrix(c(
   0,    0,    0,
1.00, 1.00, 1.00,
 .90,  .90,  .90,
   0,  .35,  .70,
 .35,  .60, 1.00,
 .80,  .80,  .80,
1.00,  .58,  .35,
 .70,  .35,    0),
 ncol=3,byrow=T)
blueOrange5 =  rgb(cmat[,1],cmat[,2], cmat[,3])

# yellowBrown5
cmat = matrix(c(
   0,   0,   0,  # black
 255, 255, 255,  # white
 240, 240, 240,  # light gray
 255, 255, 212,  # very light yellow  
 254, 217, 142,  # 
 254, 153,  41,  # 
 217,  94,  14,  # 
 154,  52,   4), # brown
  ncol=3,byrow=T)
yellowBrown5  =  rgb(cmat[,1],cmat[,2], cmat[,3],max=255)


# blueOrange7 = 
cmat= matrix(c(
   0,    0,    0,
1.00, 1.00, 1.00,
 .85,  .85,  .85,
   0,  .35,  .70,
   0,  .50, 1.00,
 .50,  .75, 1.00,
 .80,  .80,  .80,
1.00,  .75,  .50,
1.00,   .5,    0,
 .70,  .35,    0),
 ncol=3,byrow=T)
blueOrange7 = rgb(cmat[,1],cmat[,2], cmat[,3])
##End

7.3 Minimal spanning tree sort 

This uses geneMatMst from 2. above

## Run  Install needed function from the class web site.

source('colorMatView.R')
geneMstDis = dist(geneMatMst,method="euc") 
dissimMst  = as.matrix(geneMstDis) # dissimilarities
corMst = cor(geneMatMst)         # correlation matrix

titles = c("Scaled Gene mRNA Data",
            "Minimal Spanning Tree Sort: Rows and Columns")
  
# view with 5 color classes
pdf(file='colorMat5.pdf',width=8.5,height=11)
colorMatView(geneMatMst,corMst,dissimMst,titles,colors=blueOrange5) 
dev.off()

# view with 7 color classes
pdf(file='colorMat7.pdf',width=8.5,height=11)
colorMatView(geneMatMst,corMst,dissimMst,titles,colors=blueOrange7)
dev.off()
 
##End

7.4  Sorting data with clusters

If one has clustered the data, a reasonable next step
for viewing the data is to order the cases within each cluster
This still leave sorting options.  With Splus I gave used
minimal spanning tree sorts.  For R I am temporarily using
SVD sorts.  Variations such as reversing the sort order may
better show the edges between adjacent clusters.  Some are hard
to see in the example below.

##Run

# The authors of the paper clustered the genes into six clusters
# The following reorder the rows so the genes in the same cluster
# are together.  
#
# The genes of each cluster are ordered by SVD sort (or
# a minimal spanning tree traversal 

mat = as.matrix(gemIdGene[,1:9]) # basic gene dat from 2. above
clusid = gemIdGene[,14]	   #  cluster membership 

# Define an MST sorting function for rows of a matrix 
#sortclass= function(class){
#  rmst = mstree(class)
#  return (class[rev(rmst$order[,1]),])
#  }

# Define an SVD eigenvector sorting function for rows of a matrix 
sortSVDclass = function(datClass)
   {tmpSvd =  svd(datClass,nu=1,nv=1)
    return (datClass[order(tmpSvd$u[,1]),])
   }

geneDataClus = sortSVDclass(mat[clusid==1,])
for (i in 2:6){
geneDataClus = rbind(geneDataClus,sortSVDclass(mat[clusid==i,]))
}

# Obtain dissimilarities and correlations
geneClusDis= dist(geneDataClus,method="euc") 
dissimClus = as.matrix(geneClusDis)    # dissimilarity matrix
corClus = cor(geneDataClus)            # correlation matrix

# plot the data, correlation and dissimilarities
titles = c("Scaled Gene mRNA Data",
            "Minimal Spanning Tree Sorting Within Clusters: Rows")
pdf(file='colorMatSorted7.pdf',width=8.5,height=11)
colorMatView(geneDataClus,corClus,dissimClus,titles,colors=blueOrange7)  
dev.off()

##End