File a12_sortingDataTablesandMatrices By Daniel B. Carr Copyright 2005, 2006, 2007 Class use permitted Read Companion Notes: Ranking Objects Purpose Different ways to sort rows and columns Color matrix (heat maps) view Plotting variables as data 1. Scaled gene mRNA expression data 2. Data table color matrix (heat table) views 3. Repeat of 2 subtracting variable means 4. Singular value decompositions and matrix approximation 5. Plotting variables as data 6. Plotting correlation matrices 7. Correlation, Data, and Dissimilarity Matrix Visualization Data gemIdGene.csv from earlier the class data sets geneMstRowSubs.txt, geneMstColSubs.txt # schedule link centerMstRowOrder.txt, centerMstColOrder.txt # schedule link Functions panel Layout function # already have installed colorCorMatBars.R # from schedule link colorMatView.R # from schedule link Due Plots from 2, 3, 4.5, 5 6.3 7.3 One pdf file of your choice 7.4 pdf file 1. Scaled gene mRNA expression data This data has been described and used previously ##Run gemIdGene = read.csv(file='gemIdGene.csv',row.names=1) genes = gemIdGene[,1:9] mat = as.matrix(genes) ## End # 2. Color Matrix Data Table (heat table) views======================================= ##Run # find breaks for 20% in each class vals = sort(as.vector(mat)) f = ppoints(vals) brks = c(min(vals)-.1,approx(f,vals,c(.2,.4,.6,.8))$y, max(vals)) # Set colors, make sure colors 4,5,6,7,8 are good colors colors = matrix(c( 0,0,0, 1,1,1, .5,.5,.5, 0,.4,.8, .5,.75,1, .7,.7,.7, 1,.75,.5, .8,.4,0), ncol=3,byrow=T) windows(width=8,height=10.5) tmpPalette = rgb(colors[,1],colors[,2],colors[,3]) palette(tmpPalette) # device and layout pan = panelLayout(nrow=1,ncol=6, leftMar=0,topMar=.6,colSep=.15) panHead = panelLayout(nrow=1,ncol=1, leftMar=0,topMar=.6) cex = .8 loc1 = 1.1 loc2 = .3 nr = 112 nc = 9 # rectangle boundaries for plotting # the R function rect() could be used px = c(-.5,.5,.5,-.5,NA) py = c(-.5,-.5,.5,.5,NA) polyx = rep(px,nc*nr) + rep(1:nc,rep(5*nr,nc)) polyy = rep(rep(py,nr)+rep(nr:1,rep(5,nr)),nc) # as is__________________________________________________ matcol = as.numeric(cut(mat,brks))+3 panelSelect(pan,1,1) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Original',side=3,line=loc1,cex=cex) mtext('Order',side=3,line=loc2,cex=cex) # row and columns by median________________________________ rmed = apply(mat,1,median) cmed = apply(mat,2,median) newmat= mat[order(rmed),order(cmed)] matcol = as.numeric(cut(newmat,brks))+3 #Could just order a color matrix panelSelect(pan,1,2) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Row and Column',side=3,line=loc1,cex=cex) mtext('Medians',side=3,line=loc2,cex=cex) # principal components__________________________________ rprin = prcomp(mat,center=F,retx=T)$x[,1] cprin = prcomp(t(mat),center=F,retx=T)$x[,1] newmat= mat[rev(order(rprin)),order(cprin)] matcol = as.numeric(cut(newmat,brks))+3 panelSelect(pan,1,3) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('"Principal"',side=3, line=loc1,cex=cex) mtext('Components',side=3, line=loc2,cex=cex) # singular values________________________________________ svdResult = svd(mat) # Don't need to transpose mat # t(uDv') == vD'u' # diff(svd(t(mat))$u[,1] - svd(mat)$v[,1]) newmat = mat[rev(order(svdResult$u[,1])),order(svdResult$v[,1])] matcol = as.numeric(cut(newmat,brks))+3 panelSelect(pan,1,4) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('First SVD ',side=3, line=loc1,cex=cex) mtext('Eigenvectors',side=3, line=loc2,cex=cex) # dendrogram # The original clustering augment data with slopes # Since the dissimilarities are obtained using # the Euclidean metric, this effectively # reduced the weight given to the beginning # and ending values in the time series. # augMat = t(apply(genes[,1:9],1,diff)) # geneMat = cbind(genes[,1:9],augMat) geneDis = dist(mat) geneClusSummary = hclust(geneDis,method='average') geneReorder = geneClusSummary$order varDis = dist(t(mat)) varClusSummary = hclust(varDis,method='average') varReorder = varClusSummary$order newmat = mat[geneReorder,varReorder] matcol = as.numeric(cut(newmat,brks))+3 panelSelect(pan,1,5) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Dendrogram',side=3, line=loc1,cex=cex) # minimal spanning tree___________________________________ # need breadth traveral algorithm for R # rmst = mstree(mat) # cmst = mstree(t(mat)) # newmat= mat[rev(rmst$order[,1]),rev(cmst$order[,1])] # reverse sort to make similar to principal components # in this particular case rowOrder = scan('geneMstRowSubs.txt') # from Splus colOrder = scan('geneMstColSubs.txt') # from Splus geneMatMst= mat[rowOrder,colOrder] matcol = as.numeric(cut(geneMatMst,brks))+3 #Could just order a color matrix panelSelect(pan,1,6) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Spanning Tree',side=3,line=loc1,cex=cex) mtext('Traversals',side=3,line=loc2,cex=cex) # title______________________________________________ panelSelect(panHead,mar='top') panelScale() text(.5,.85,'Sorting Uncentered Data Matrix: "PC" really SVD',adj=.5) ##End The median sort has most of the blue at the top as expected. Here the first principal component and singular value decomposition first left and right eigenvector sorts are the same. Actually both are really using SVD. Since the mean has not be subtracted from the variables, it might argued that the label principal components is inappropriate. The row order has been reversed to put the blue at the top. Then there is a lot of dark blue at the top and a lot of dark orange at the bottom. The columns might also be reverse to look more like the median sort. The dendrogram and have visual breaks where clusters are juxtaposed. At each of the n-1 joins the two clusters being joined can be individually flipped and the sides for joining chosen. Of course previous flips and joins can be reconsidered. The default for joining clusters is based on the orginal data order and the subcluster creation order. It is easy to proposed heuristic algorithms that are likely to be better in terms of minimize traversal lengths than joining based on a happenstance ordering ordering of the data. The minimal spanning tree breadth traversal ordering tends to put nearest neighbors together. This is likely the reason the matrix appear better organized or simpler. Observe that when looking from the top down, the orange brown moves from top right to the center, back to the right and then over to the left at the bottom. There is blue at the top left and blue at the bottom right . The points at the top and bottom are pretty far a part. Of course all the sorting methods are trying, in different ways, to put points far from each other at the top and the bottom An option not shown is to use multidimensional scaling into 1 dimension to provide the sort order. This has connection to other methods 3. Variable centered data========================================= If we subtract the variable means, the Euclidean distance between cases stays the same. In terms of principal components analysis, it is conceptually pleasing to think about centering the data and then rotating it so orthogonal projections shows the large variance on the x-axis, the second larges variance on the y-axis and so on. It is convenient to imagine rotating a p-dimension ellipsoid so our viewing coordinates show the ellipsoid axis lengths in descending order. Note that even with multivariate normal data, whose density contours are ellipsoids, we may not see a lot of ellipsoids. We don't assess data density well from scatterplots. Rather our attention is more likely to focus on happenstance clusters (perceptual grouping) and on edges such as the fringes of the data cloud, ##Run cmat = scale(mat,center=T,scale=F) # find breaks for 20% in each class vals = sort(as.vector(cmat)) f = ppoints(vals) brks = c(min(vals)-.1,approx(f,vals,c(.2,.4,.6,.8))$y, max(vals)) # Set colors, make sure colors 4,5,6,7,8 are good colors windows(width=8,height=10) palette(tmpPalette) # device and layout pan = panelLayout(nrow=1,ncol=6, leftMar=0,topMar=.6,colSep=.15) panHead = panelLayout(nrow=1,ncol=1, leftMar=0,topMar=.6,colSep=.15) # as is__________________________________________________ matcol = as.numeric(cut(cmat,brks))+3 panelSelect(pan,1,1) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Original',side=3,line=loc1,cex=cex) mtext('Order',side=3,line=loc2,cex=cex) # row and columns by median________________________________ rmed = apply(cmat,1,median) cmed = apply(cmat,2,median) newmat= cmat[order(rmed),order(cmed)] matcol = as.numeric(cut(newmat,brks))+3 panelSelect(pan,1,2) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Row and Column',side=3,line=loc1,cex=cex) mtext('Medians',side=3,line=loc2,cex=cex) # principal components__________________________________ rprin = prcomp(cmat,center=F,retx=T)$x[,1] # already centered cprin = prcomp(t(cmat),center=F,retx=T)$x[,1] # not centered newmat= cmat[order(rprin),order(cprin)] matcol = as.numeric(cut(newmat,brks))+3 panelSelect(pan,1,3) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Principal',side=3, line=loc1,cex=cex) mtext('Components',side=3, line=loc2,cex=cex) # singular values________________________________________ svdResult = svd(cmat) # note that scaling by the first singluar value # does not change the order newmatSvd = cmat[order(svdResult$u[,1]),order(svdResult$v[,1])] matcol = as.numeric(cut(newmatSvd,brks))+3 panelSelect(pan,1,4) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('1st SVD',side=3, line=loc1,cex=cex) mtext('Eigenvectors',side=3, line=loc2,cex=cex) # dendrograph______________________________________________ # The original clustering augment data with slopes # Since the dissimilarities are obtained using # the Euclidean metric, this effectively # reduced the weight given to the beginning # and ending values in the time series. # augMat = t(apply(genes[,1:9],1,diff)) # geneMat = cbind(genes[,1:9],augMat) geneDis = dist(cmat) geneClusSummary = hclust(geneDis,method='average') geneReorder = geneClusSummary$order varDis = dist(t(cmat)) varClusSummary = hclust(varDis,method='average') varReorder = varClusSummary$order newmat = cmat[geneReorder,varReorder] matcol = as.numeric(cut(newmat,brks))+3 panelSelect(pan,1,5) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Dendrogram',side=3, line=loc1,cex=cex) # minimal spanning tree___________________________________ # need breadth traveral algorithm for R # rmst = mstree(cmat) # cmst = mstree(t(cmat)) # newmat= cmat[rev(rmst$order[,1]),rev(cmst$order[,1])] # reverse use to make similar to principal components # in this particular case rowOrder = scan('centerMstRowOrder.txt') # from Splus colOrder = scan('centerMstColOrder.txt') # from Splus geneMatMst= cmat[rowOrder,colOrder] matcol = as.numeric(cut(geneMatMst,brks))+3 #Could just order a color matrix panelSelect(pan,1,6) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline() mtext('Spanning Tree',side=3,line=loc1,cex=cex) mtext('Traversals',side=3,line=loc2,cex=cex) # title______________________________________________ panelSelect(panHead,mar='top') panelScale() text(.5,.85,'Sorting Variable Mean Centered Data Matrix ',adj=.5) ##End The column order of the dendrogram might be reversed to look a bit more like the other sorts. A further plot could look as centered and scaled data. 4. Singular Value Decompositions (SVD) and uses============== The SVD algorithm decomposes a matrix Mnxp into UDV' Unxp is a matrix whose columns are called left eigenvectors Dpxp is vector of diagonal elements for a matrix of singular values Vpxp is a column matrix of right eigenvectors Note that in UDV', V' is the transpose of V R syntax: svdResult = svd(mat) svdResult$u is an n by p matrix of p left eigenvectors svdResult$d is vector of singular values from a p x p diagonal matrix svdResult$v is a p x p matrix of right eigenvectors mat = svdResult$u %*% diag(svdResult$d) %*% t(svdResult$v) The singular value decomposition (SVD)has many uses. 4.1 Centered Data and Principal Components As indicated above when the data is centered svd can be used to obtain principal components. Svd is the preferred approach to obtain principal components from center data for centered and scaled data. The computations are fewer and results are more accurate. The transformation matrix to produce principal components can be obtained by spectral decomposition of a covariance matrice or a correlation matrix. The two matrices are derived from centered or centered and scaled data respectively. The principal components transformation can be viewed as a decorrelation transformation since the principal components are orthogonal. In most applications the variables are scaled by their standard deviations so the variance of all variables is 1. The values are then unitless and linear combinations of variables are not mixing measurements in different units. In some applications the analyst chooses not to scale the variables. These applications will mostly be a subset of applications involving variables that are already in the same units. If the singular valued decomposition is applied to a square matrix symmetric matrix such as a covariance or correlation matrix, the left and right eigvectors will be the same and the match the eigenvectors from spectral decomposition. The square root of the singular values will the match the eigenvalues from the spectral decomposition. ## Run: Compare principal components from a principal components # algorithm and from svd for a centered matrix, mat cmat = scale(mat,center=T,scale=F) cmatPrin = princomp(cmat) cmatPrinScores = cmatPrin$score cmatSvd = svd(cmat) cmatSvdPrin = cmatSvd$u %*% diag(cmatSvd$d) one = rep(1,nrow(cmatSvdPrin)) matSign = cbind(one,-one,-one,-one,-one,-one,one,-one,one) round(cmatPrinScores*matSign - cmatSvdPrin,6) ## End Apparently there are differences in sign in the eigenvectors 4.2 Sorting rows and columns of a matrix Some examples use the first left eigenvector and first right eigenvectors to sort rows and columns respectively. 4.3 Nonlinear dimension reduction SVD has been used in nonlinear dimension reduction. After an appropriate matrix construction a few rows of the left eigenvectors become new plotting coordinates. Sometimes, as in the Fiedler projection, eigenvectors associates with the small non-zero eigenvalues are the most important. 4.4 Error in variables: SVD is sometimes used to reduce error At COMPSTAT'2004 Sabine Van Huffel gave an invited talk on total least squares and error-in-variables modeling. The general idea is that all the variables are measured with error. With some assumptions about the relative error magnitudes for the variables, SVD can be used to de-noise the data in the regression context. It would seem that additional knowledge can be brought to bear in the table smoothing context See S. Van Huffe. 2004. "Total Least Squares and Error-in-Variables Modeling: Bridging the Gap between Statistics, Computational Mathematics and Engineering" Proceeding in Computational Statistics, Physica-Verlag, Springer, 539-555. 4.5 Matrix (table) approximation This assignment provides an illustration. At the moment I don't have specific examples in mind where this has been used. It likely is useful, but may in some variant of what is shown here. For each i = 1,..., p, the outer product, %o% in R, of the ith column of U and the ith column of V' scaled by the ith diagonal element of D produce a n x p matrix or table (U[,i]%o% V[,i]') * D[i,i] The sum of these p matrices is the original matrix M. Constituent matrices with zero eigenvalues are zero and contribute nothing. Constituent matrices that were computed using small eigenvalues are sometimes omitted from the sum (or subtract from the original matrix M) to produce as an approximation to the matrix M. Under some assumptions the matrices dropped may be considered noise, and the result a "smoothed matrix." ##Run # For help in making visual comparisons, start using a data table with # rows and columns by sorted medians rmed = apply(mat,1,median) cmed = apply(mat,2,median) matMedian= mat[order(rmed),order(cmed)] # 4.5.1 Remove smallest singular value table svdResult = svd(matMedian) # $u is a n by p variables matrix of p eigenvectors # $d is vector of singular values from a p x p diagaonal matrix # $v is a p x p matrix of eigenvectors # Remember in general M = UDV' svdResult$d #[1] 19.0341707 5.9078618 3.9923952 2.8743387 1.7641930 # 1.6330355 1.4790630 1.2416417 0.6924514 # Note the smallest eigenvalue, svdresult$d[9], is not zero # 1 means the first matrix # M9 is made up to mean minus the 9th matrix # M89 means minus the 8th and 9th matrix mat1 = (svdResult$u[,1] %o% svdResult$v[,1])*svdResult$d[1] mat12 = mat1 + (svdResult$u[,2] %o% svdResult$v[,2])*svdResult$d[2] matM9 = matMedian - (svdResult$u[,9] %o% svdResult$v[9,])*svdResult$d[9] matM89 = matM9 - (svdResult$u[,8] %o% svdResult$v[,8])*svdResult$d[8] # 4.5.2 Graphics set up_________________________________ # Set break points so that roughly 20 percent of the cells # in the first original matrix are in each color class. # # Add two new categories for more extreme values in the # in the approximations: black = low, red= high vals = sort(as.vector(matMedian)) f = ppoints(vals) brks = c(min(vals)-.0001,approx(f,vals,c(.2,.4,.6,.8))$y, max(vals)) globalRange=range(matMedian,mat1,mat12,matM9,matM89) brks = c(globalRange[1]-.001,brks,globalRange[2]+.001) colors = matrix(c( 0,0,0, 1,1,1, .5,.5,.5, 0,0,0, 0,.4,.8, .5,.75,1, .7,.7,.7, 1,.75,.5, .7,.35,0, 1,.1,.1), ncol=3,byrow=T) windows(width=8,height=10) tailsPalette = rgb(colors[,1],colors[,2],colors[,3]) palette(tailsPalette) pan = panelLayout(nrow=1,ncol=5, leftMar=0,topMar=.6,colSep=.1) panLabel = panelLayout(nrow=1,ncol=1, leftMar=0,topMar=.6) cex = .8 loc = .5 nr = 112 nc = 9 # rectangle boundaries for plotting px = c(-.5,.5,.5,-.5,NA) py = c(-.5,-.5,.5,.5,NA) polyx = rep(px,nc*nr) + rep(1:nc,rep(5*nr,nc)) polyy = rep(rep(py,nr)+rep(nr:1,rep(5,nr)),nc) # 4.5.3 Panel 1 Median Sort Order__________________ matcol = as.numeric(cut(matMedian,brks))+3 panelSelect(pan,1,1) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline(col=3) mtext('Median Sort',side=3,line=loc,cex=cex) # 4.5.4 Panel 5 mat1 ____________________________ range(matMedian) #[1] 0 1 range(mat1) #[1] 0.04483357 1.06767840 # Should be some red cells matcol = as.numeric(cut(mat1,brks))+3 panelSelect(pan,1,5) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline(col=3) mtext('SVD 1',side=3,line=loc,cex=cex) # 4.5.5 Panel 4 mat12________________________________ range(mat12) # [1] -0.2206048 1.0701367 # Should be some black and red cells matcol = as.numeric(cut(mat12,brks))+3 panelSelect(pan,1,4) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline(col=3) mtext('SVD 12',side=3,line=loc,cex=cex) # 4.5.6 Panel 3 matM89 ___________________________ range(matM89) #[1] -0.07294524 1.03671208 # Should be some black and red matcol = as.numeric(cut(matM89,brks))+3 panelSelect(pan,1,3) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline(col=3) mtext('SVD-89',side=3,line=loc,cex=cex) # 4.5.7 Panel 2 matM9 range(matM9) #[1] -0.02274396 1.02946160 matcol = as.numeric(cut(matM9,brks))+3 panelSelect(pan,1,2) panelScale(c(.5,nc+.5),c(.5,nr+.5)) polygon(polyx,polyy,col=matcol,border=F) polygon(polyx,polyy,col=3,density=F) panelOutline(col=3) mtext('SVD-9',side=3,line=loc,cex=cex) panelSelect(panLabel,mar='top') panelScale() text(.5,.9,"More Extreme Values Shown as Black and Red") ##End 4.6 Application: Latent Semantic Index One application is latent semantic indexing where rows are word frequencies and columns are documents. This will be briefly described in class. #5. Plotting variables as data=============================== ##Run correl = cor(mat) dis = 1-abs(correl) # dis = 1-correl^2 dis locs = cmdscale(dis,eig=T,k=2) x = locs$points[,1] y = locs$points[,2] windows() palette(tmpPalette) plot(x,y,type='n') text(x,y,names(genes)) title("Plotting variables based on correlation dissimilarity") ##End 6. Plotting correlation matrices ============================ 6.1 Discussion It is simple to class the elements of correlation matrix as a basis for assigning colors and producing a color matrix. The merit is that this allows visualization of fairly large correlation matrices. The matrix size that can be handled is limited by both the number of pixels available and the ability to see the colors. Labeling and knowing which cell is which becomes problematic with large matrices. This is a bit of a problem with say 100,000 variables. Mouseovers in an dynamic setting can help when the number of variables is modest. Remember that color is poor encoding in terms of perceptual accuracy of extraction. Back in the 1980s I used to sort the correlations by a minimal spanning tree traversal when there was no other strong basis for ordering. When the matrices were of modest size, I encoded the correlation magnitudes using the area of a square plotted at the center of correlation matrix cell. I used two colors such as yellow and cyan to distinguish between positive and negative correlations. Example 6.2 shows how to get a spanning tree traversal for a correlation matrix in the Splus context. This involved constructing pseudo data since the Splus mstree () algorithm the include the breadth traversal start with data and not dissimilarities. This is commented out at least until I find or implement a transversal algorithm in R. We don't use this actually use this ordering with the genes since there is an already a meaningful age ordering. Example 6.3 revises the 1980s work to show bars. This upgrades the area encoding to position along a scale. The drawbacks include asymmetry and space required. The function can be refined a bit as indicated in its header. # 6.2 Generating fake data when starting with dissimilarities. # The Splus mstree algorithm expects data, but # we are starting with a dissimilarity matrix. # Hence we use this matrix to generate suitable # fake data # corMat = cor(mat) # corDis = 1-abs(corMat) # a dissimilarity measure # fakeDat = cmdscale(corDis,k=5) # the dissimilarity matrix is not full rank # Distances between 5D points preserve # the dissimilarities # mst = mstree(fakeDat) # mst$order[,1] # rough the correct Euclidean # 9 8 7 6 5 4 3 2 1 # newCorrel = correl[mst$order[,1],mst$order[,1]] 6.3 Using bars to show correlation magnitudes ##Run install needed function for the call web site source('colorCorMatBars.R') windows(width=8,height=5.7) colorCorMatBars(cor(as.matrix(genes))) ##End Reading down the columns one can see the large correlation between E21 and P0. #7. Correlation, Data, and Dissimilarity Matrix Visualization======== 7.1 Discussion For small data sets color matrices for correlations, data, and case dissimilarities can be put together in a single plot. This will be done in 7.3 and 7.4. This juxtaposition was illustrated with some extensions in Chen,C-H, H-G Hwu, W-J Jang, C-H Kao, Y-J,Tien, SL Tzeng, and H-M Wu, 2004. Matrix Visualization and Information Mining, COMPSTAT'2004, Physica-Verlag/Springer, 85-100. The idea of using color matrices and sorting is not new. Assignments for this class have been using them for years. The juxtaposition of the three matrices in the above paper has some merit. This raises some interesting design issues of space and simplicity. In one page plot with many cases, juxtaposition can lead to a tiny correlation matrix that is hard to study. In an an interactive setting and enlargement can address this. In a static setting, separate enlarged plots seem desirable. The three different matrices require three different color scales. For simplicity there is merit in using the same color scheme with different interpretations. The example below does this. However a double-ended scale seems more natural for correlations while sequential scales seem a little more natural for data and dissimilarities. Transformations provide one way to make the same type of color scheme seem more appropriate. For examples the correlation matrices can be converted to p-values so a sequential scale makes more sense. Alternatively a central value can be subtracted from data to provide a context for a double-ended scale. The dissimilarity matrices might be thought of relative to the median value so a double-ended scale might see a little more natural. There are also some drawbacks to these ideas. The choice below currently defaults to a double ended scale. You are free to substitute you own color scale and may well prefer a sequential scale. 7.2 Some colors for palettes ##Run #blueRed5 cmat = matrix(c( 0.00, 0.00, 0.00, # black 1.00, 1.00, 1.00, # white .90, .90, .90, # light gray 0.18, 0.28, 1.00, # blue+slight_green, slightly desaturated 0.53, 0.69, 1.00, # blue+slight_green, moderately desaturated 0.85, 0.85, 0.85, # light gray 1.00, 0.60, 0.60, # red, moderately desaturated 1.00, 0.05, 0.05), # red ncol=3,byrow=T) blueRed5 = rgb(cmat[,1],cmat[,2], cmat[,3]) # blueRed6 cmat = matrix(c( 0.00, 0.00, 0.00, # black 1.00, 1.00, 1.00, # white .90, .90, .90, # light gray 0.20, 0.35, 1.00, # slightly desaturated blue (slightly green) 0.50, 0.60, 1.00, # moderately desaturated blue (slightly green) 0.80, 0.86, 0.96, # desaturated blue (slightly green) 0.97, 0.76, 0.76, # desaturated red 1.00, 0.50, 0.50, # moderately desaturated red 1.00, 0.05, 0.05), # red ncol=3,byrow=T) blueRed6 = rgb(cmat[,1],cmat[,2], cmat[,3]) #blueOrange5 cmat = matrix(c( 0, 0, 0, 1.00, 1.00, 1.00, .90, .90, .90, 0, .35, .70, .35, .60, 1.00, .80, .80, .80, 1.00, .58, .35, .70, .35, 0), ncol=3,byrow=T) blueOrange5 = rgb(cmat[,1],cmat[,2], cmat[,3]) # yellowBrown5 cmat = matrix(c( 0, 0, 0, # black 255, 255, 255, # white 240, 240, 240, # light gray 255, 255, 212, # very light yellow 254, 217, 142, # 254, 153, 41, # 217, 94, 14, # 154, 52, 4), # brown ncol=3,byrow=T) yellowBrown5 = rgb(cmat[,1],cmat[,2], cmat[,3],max=255) # blueOrange7 = cmat= matrix(c( 0, 0, 0, 1.00, 1.00, 1.00, .85, .85, .85, 0, .35, .70, 0, .50, 1.00, .50, .75, 1.00, .80, .80, .80, 1.00, .75, .50, 1.00, .5, 0, .70, .35, 0), ncol=3,byrow=T) blueOrange7 = rgb(cmat[,1],cmat[,2], cmat[,3]) ##End 7.3 Minimal spanning tree sort This uses geneMatMst from 2. above ## Run Install needed function from the class web site. source('colorMatView.R') geneMstDis = dist(geneMatMst,method="euc") dissimMst = as.matrix(geneMstDis) # dissimilarities corMst = cor(geneMatMst) # correlation matrix titles = c("Scaled Gene mRNA Data", "Minimal Spanning Tree Sort: Rows and Columns") # view with 5 color classes pdf(file='colorMat5.pdf',width=8.5,height=11) colorMatView(geneMatMst,corMst,dissimMst,titles,colors=blueOrange5) dev.off() # view with 7 color classes pdf(file='colorMat7.pdf',width=8.5,height=11) colorMatView(geneMatMst,corMst,dissimMst,titles,colors=blueOrange7) dev.off() ##End 7.4 Sorting data with clusters If one has clustered the data, a reasonable next step for viewing the data is to order the cases within each cluster This still leave sorting options. With Splus I gave used minimal spanning tree sorts. For R I am temporarily using SVD sorts. Variations such as reversing the sort order may better show the edges between adjacent clusters. Some are hard to see in the example below. ##Run # The authors of the paper clustered the genes into six clusters # The following reorder the rows so the genes in the same cluster # are together. # # The genes of each cluster are ordered by SVD sort (or # a minimal spanning tree traversal mat = as.matrix(gemIdGene[,1:9]) # basic gene dat from 2. above clusid = gemIdGene[,14] # cluster membership # Define an MST sorting function for rows of a matrix #sortclass= function(class){ # rmst = mstree(class) # return (class[rev(rmst$order[,1]),]) # } # Define an SVD eigenvector sorting function for rows of a matrix sortSVDclass = function(datClass) {tmpSvd = svd(datClass,nu=1,nv=1) return (datClass[order(tmpSvd$u[,1]),]) } geneDataClus = sortSVDclass(mat[clusid==1,]) for (i in 2:6){ geneDataClus = rbind(geneDataClus,sortSVDclass(mat[clusid==i,])) } # Obtain dissimilarities and correlations geneClusDis= dist(geneDataClus,method="euc") dissimClus = as.matrix(geneClusDis) # dissimilarity matrix corClus = cor(geneDataClus) # correlation matrix # plot the data, correlation and dissimilarities titles = c("Scaled Gene mRNA Data", "Minimal Spanning Tree Sorting Within Clusters: Rows") pdf(file='colorMatSorted7.pdf',width=8.5,height=11) colorMatView(geneDataClus,corClus,dissimClus,titles,colors=blueOrange7) dev.off() ##End