setwd("/Users/ovitek/Dropbox/Olga/Teaching/CS6220/Fall15/LectureNotes/5-unsupervised") ################################################################ # Principle components ################################################################ load('ALL_bcrneg.RData') # prcomp - based on singular value decomposition - preferred ?prcomp # princomp - decomposition of covariance/correlation matrix # exists for backward compatibility ?princomp # Goal: visualize samples in a space of genes # Therefore samples are observations; genes are dimensions pc <- prcomp(x=t(X), center=TRUE, scale.=TRUE) summary(pc) # parts of the output names(pc) # proportion of explained variance # look up the description of 'sdev' in the help of prcomp barplot( pc$sdev^2/sum(pc$sdev^2) , xlab="principle component", ylab="% of variance") barplot( cumsum( pc$sdev^2/sum(pc$sdev^2) ) , xlab="principle component", ylab="cumulative % of variance" ) # Conclusion: the two first principle components are insufficient # define a color for each sample myColor <- rep(NA, length(Y)) myColor[Y == "BCR/ABL"] <- "red" myColor[Y == "NEG"] <- "blue" # score plot plot(pc$x[,1:2], col=myColor, pch=16, cex=2) legend("topleft", pch=16, col=c("red", "blue"), c("BCR/ABL", "NEG")) # Conclusion: not enough separation in the first two dimensions ################################################################ # Principle components on an 'informative' subset of genes ################################################################ # principle component pc_subset <- prcomp(x=t(Xsubset), center=TRUE, scale.=TRUE) # score plot plot(pc_subset$x[,1:2], col=myColor, pch=16, cex=2) legend("topleft", pch=16, col=c("red", "blue"), c("BCR/ABL", "NEG")) # Conclusion: better separation when only use 'informative' genes # (However cannot do this in practice!) # proportion of explained variance barplot( cumsum( pc_subset$sdev^2/sum(pc_subset$sdev^2) ) , xlab="principle component", ylab="cumulative % of variance" ) # Conclusion: two PC are still not enough in this case ################################################################ # Additional views ################################################################ # simultaneously plots sample coordinates after rotation (scores) # and weights of each gene (loadings) biplot(pc_subset) # scaling vs no scaling pc_subset_noscale <- prcomp(x=t(Xsubset), center=TRUE, scale.=FALSE) plot(pc_subset_noscale$x[,1:2], col=myColor, pch=16, cex=2) legend("topleft", pch=16, col=c("red", "blue"), c("BCR/ABL", "NEG")) barplot( cumsum( pc_subset_noscale$sdev^2/sum(pc_subset_noscale$sdev^2) ) , xlab="principle component", ylab="cumulative % of variance" ) ################################################################ ################################################################ #################### H E A T M A P S ######################## # Iconic graphics for high-throughput data ################## CAUTION: HIDDEN PITFALLS #################### ################################################################ ################################################################ ################################################################ # Heatmap of individual values of DE genes ################################################################ heatmap(Xsubset) ?heatmap # Check http://www.biomedcentral.com/1471-2105/13/S16/S10 for more details ################################################################ # Note 1: choice of the distance metric matters ################################################################ library(bioDist) # toy example: 3 genes, 5 samples gene1 <- c(1, 6, 2, 4, 7) gene2 <- gene1 + 4 gene3 <- gene2/3+ c(0, 2, 0, 4, 0) e <- rbind(gene1, gene2, gene3) dimnames(e) <- list(paste("gene", 1:3, sep=""), paste("sample", 1:5, sep="")) e # plot plot(e[1,], col="red", type="l", ylim=c(0,15), xlab="samples", ylab="abundance", main="Original abundance values", lwd=3) lines(e[2,], col="blue", lwd=3) lines(e[3,], col="green", lwd=3) legend("topright", col=c("red", "blue", "green"), paste("gene", 1:3, sep=""), lty=1, lwd=3, cex=1.5) # default: distance fnction eucledian library(bioDist) heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian distance") # correlation heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation distance") ################################################################ # Note 2: Clustering genes ################################################################ # genes are multivariate objects (=observations) in the space of samples (=variables) # if we are not interested in similarity of abundance # but in similarity of the direction of change, # then it will help to standardize the observations across variables # now each gene has mean 0 and sample variance 1 across arrays library(genefilter) e_centerScale <- ( e - rowMeans(e) ) / rowSds(e) # plot plot(e_centerScale[1,], col="red", type="l", ylim=c(-2,2), xlab="samples", ylab="abundance", main="Original abundance values") lines(e_centerScale[2,], col="blue") lines(e_centerScale[3,], col="green") legend("topright", col=c("red", "blue", "green"), paste("gene", 1:3, sep=""), lty=1) # Eucledian distance is sensitive to centering and scaling heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, raw") heatmap(e_centerScale, distfun=euc, margins=c(10,10), main="Eucledian, centered and scaled") # Correlation distance is not sensitive to centering and scaling heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation, raw") heatmap(e_centerScale, distfun=cor.dist, margins=c(10,10), main="Correlation, centered and scaled") # (note that centering and scaling of genes across samples + correlation distance # affect the order of the samples) # my choice is to use correlation on the original values, # or to compute dendrograms separately, save the order of genes or samples, and use "image" to visualize ################################################################ # Note 3: The heatmap has an option "scale" by default # However it only affects the color, and not the calculation of dendrograms. # It's usually better to make custom color definitions ################################################################ library(RColorBrewer) # scaling of color matters for unscaled data heatmap(e, distfun=euc, margins=c(10,10), scale="none", main="Eucledian, raw") heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, scaled") # Conclusion: genes 1 and 2 are far from each other on the dendrogram # (because of Eucledian distance) but have same color patterns of # intensities (because of scaled colors) # scaling of color irrelevant if data themselves are scaled already heatmap(e_centerScale, distfun=euc, margins=c(10,10)) heatmap(e_centerScale, distfun=euc, margins=c(10,10), scale="none") # Example with real dataset # Default: raw data, scaled by row (i.e. gene) heatmap(Xsubset) # change color code: non-standardized expressions # equal-spaced breaks for values of abundance summary(as.vector(Xsubset)) # min ~2, max ~ 13 exprs_brk <- seq(from=2, to=13, length=11) heatmap(Xsubset, col=brewer.pal(length(exprs_brk)-1, "RdBu"), breaks=exprs_brk) # change color code: gene-standardized expressions # quantile-spaced breaks in positive and negative directions ALL_e_centerScale <- ( Xsubset - rowMeans(Xsubset) ) / rowSds(Xsubset) exprs_brk <- c( quantile(ALL_e_centerScale[ALL_e_centerScale<0], seq(0, 1, length=5)), 0, quantile(ALL_e_centerScale[ALL_e_centerScale>0], seq(0, 1, length=5))) heatmap(ALL_e_centerScale, margins=c(10,10), col=brewer.pal(length(exprs_brk)-1, "RdBu"), breaks=exprs_brk) ################################################################ # Note 4: Label sample types ################################################################ heatmap(ALL_e_centerScale, margins=c(10,10), col=brewer.pal(length(exprs_brk)-1, "RdBu"), breaks=exprs_brk, ColSideColors=myColor) ################################################################ # Note 5: The choice of the linkage method in clustering matters ################################################################ # clusters in the sample space: note the use of (t) # method "complete" by default hc <- hclust( dist( t(ALL_e_centerScale)) ) plot(hc) # sinle linkage hc <- hclust( dist( t(ALL_e_centerScale)), metho="single" ) plot(hc) # cut the tree at a specific group hc <- hclust( dist( t(ALL_e_centerScale)) ) cc <- cutree(hc, k=2) cc ################################################################ # K-means ################################################################ # K-means clustering is a partitioning method where the number of clusters is pre-defined. # The basic R implementation uses Euclidean distance. km <- kmeans(t(ALL_e_centerScale), 2) km$cluster