setwd("/Users/ovitek/Dropbox/Olga/Teaching/CS6220/Fall15/LectureNotes/5-unsupervised")

################################################################
# Principle components
################################################################
load('ALL_bcrneg.RData')


# prcomp - based on singular value decomposition - preferred
?prcomp

# princomp - decomposition of covariance/correlation matrix 
# exists for backward compatibility
?princomp

# Goal: visualize samples in a space of genes
# Therefore samples are observations; genes are dimensions
pc <- prcomp(x=t(X), center=TRUE, scale.=TRUE)
summary(pc)

# parts of the output
names(pc)

# proportion of explained variance
# look up the description of 'sdev' in the help of prcomp
barplot( pc$sdev^2/sum(pc$sdev^2) , xlab="principle component", 
         ylab="% of variance")
barplot( cumsum( pc$sdev^2/sum(pc$sdev^2) ) , xlab="principle component", 
         ylab="cumulative % of variance" )
# Conclusion: the two first principle components are insufficient         

# define a color for each sample 
myColor <- rep(NA, length(Y))
myColor[Y == "BCR/ABL"] <- "red" 
myColor[Y == "NEG"] <- "blue" 

# score plot
plot(pc$x[,1:2], col=myColor, pch=16, cex=2)
legend("topleft", pch=16, col=c("red", "blue"), c("BCR/ABL", "NEG"))
# Conclusion: not enough separation in the first two dimensions       


################################################################
# Principle components on an 'informative' subset of genes
################################################################
# principle component
pc_subset <- prcomp(x=t(Xsubset), center=TRUE, scale.=TRUE)

# score plot
plot(pc_subset$x[,1:2], col=myColor, pch=16, cex=2)
legend("topleft", pch=16, col=c("red", "blue"), c("BCR/ABL", "NEG"))
# Conclusion: better separation when only use 'informative' genes
# (However cannot do this in practice!)

# proportion of explained variance
barplot( cumsum( pc_subset$sdev^2/sum(pc_subset$sdev^2) ) , xlab="principle component", 
         ylab="cumulative % of variance" )
# Conclusion: two PC are still not enough in this case         


################################################################
# Additional views
################################################################
# simultaneously plots sample coordinates after rotation (scores) 
# and weights of each gene (loadings)
biplot(pc_subset)

# scaling vs no scaling
pc_subset_noscale <- prcomp(x=t(Xsubset), center=TRUE, scale.=FALSE)

plot(pc_subset_noscale$x[,1:2], col=myColor, pch=16, cex=2)
legend("topleft", pch=16, col=c("red", "blue"), c("BCR/ABL", "NEG"))
barplot( cumsum( pc_subset_noscale$sdev^2/sum(pc_subset_noscale$sdev^2) ) , xlab="principle component", 
         ylab="cumulative % of variance" )


################################################################
################################################################
####################   H E A T M A P S  ########################
#           Iconic graphics for high-throughput data
################## CAUTION: HIDDEN PITFALLS ####################
################################################################
################################################################

################################################################
# Heatmap of individual values of DE genes
################################################################
heatmap(Xsubset)
?heatmap
# Check http://www.biomedcentral.com/1471-2105/13/S16/S10 for more details

################################################################
# Note 1: choice of the distance metric matters
################################################################
library(bioDist)

# toy example: 3 genes, 5 samples
gene1 <- c(1, 6, 2, 4, 7)
gene2 <- gene1 + 4 
gene3 <- gene2/3+ c(0, 2, 0, 4, 0)
e <- rbind(gene1, gene2, gene3)
dimnames(e) <- list(paste("gene", 1:3, sep=""), paste("sample", 1:5, sep=""))
e

# plot
plot(e[1,], col="red", type="l", ylim=c(0,15), xlab="samples", ylab="abundance",
     main="Original abundance values", lwd=3)
lines(e[2,], col="blue", lwd=3)
lines(e[3,], col="green", lwd=3)
legend("topright", col=c("red", "blue", "green"), paste("gene", 1:3, sep=""), lty=1, lwd=3, cex=1.5)


# default: distance fnction eucledian
library(bioDist)
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian distance")
# correlation
heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation distance")


################################################################
# Note 2: Clustering genes
################################################################
# genes are multivariate objects (=observations) in the space of samples (=variables)
# if we are not interested in similarity of abundance 
# but in similarity of the direction of change, 
# then it will help to standardize the observations across variables 

# now each gene has mean 0 and sample variance 1 across arrays 

library(genefilter)
e_centerScale <- ( e - rowMeans(e) ) / rowSds(e)

# plot
plot(e_centerScale[1,], col="red", type="l", ylim=c(-2,2), xlab="samples", ylab="abundance",
     main="Original abundance values")
lines(e_centerScale[2,], col="blue")
lines(e_centerScale[3,], col="green")
legend("topright", col=c("red", "blue", "green"), paste("gene", 1:3, sep=""), lty=1)

# Eucledian distance is sensitive to centering and scaling
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, raw")
heatmap(e_centerScale, distfun=euc, margins=c(10,10), main="Eucledian, centered and scaled")

# Correlation distance is not sensitive to centering and scaling
heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation, raw")
heatmap(e_centerScale, distfun=cor.dist, margins=c(10,10), main="Correlation, centered and scaled")
# (note that centering and scaling of genes across samples + correlation distance
# affect the order of the samples)
# my choice is to use correlation on the original values, 
# or to compute dendrograms separately, save the order of genes or samples, and use "image" to visualize 


################################################################
# Note 3: The heatmap has an option "scale" by default
# However it only affects the color, and not the calculation of dendrograms.
# It's usually better to make custom color definitions
################################################################
library(RColorBrewer)

# scaling of color matters for unscaled data
heatmap(e, distfun=euc, margins=c(10,10), scale="none", main="Eucledian, raw")
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, scaled")
# Conclusion: genes 1 and 2 are far from each other on the dendrogram
# (because of Eucledian distance) but have same color patterns of
# intensities (because of scaled colors)


# scaling of color irrelevant if data themselves are scaled already
heatmap(e_centerScale, distfun=euc, margins=c(10,10))
heatmap(e_centerScale, distfun=euc, margins=c(10,10), scale="none")


# Example with real dataset
# Default: raw data, scaled by row (i.e. gene)
heatmap(Xsubset)

# change color code: non-standardized expressions
# equal-spaced breaks for values of abundance
summary(as.vector(Xsubset)) # min ~2, max ~ 13
exprs_brk <- seq(from=2, to=13, length=11)
heatmap(Xsubset, col=brewer.pal(length(exprs_brk)-1, "RdBu"), breaks=exprs_brk)


# change color code: gene-standardized expressions
# quantile-spaced breaks in positive and negative directions
ALL_e_centerScale <- 
 ( Xsubset - rowMeans(Xsubset) ) / rowSds(Xsubset)

exprs_brk <- c( quantile(ALL_e_centerScale[ALL_e_centerScale<0], seq(0, 1, length=5)),
                0,
                quantile(ALL_e_centerScale[ALL_e_centerScale>0], seq(0, 1, length=5)))


heatmap(ALL_e_centerScale, margins=c(10,10), 
				col=brewer.pal(length(exprs_brk)-1, "RdBu"), breaks=exprs_brk)


################################################################
# Note 4: Label sample types
################################################################
heatmap(ALL_e_centerScale, margins=c(10,10), col=brewer.pal(length(exprs_brk)-1, "RdBu"), breaks=exprs_brk, 
        ColSideColors=myColor)


################################################################
# Note 5: The choice of the linkage method in clustering matters
################################################################

# clusters in the sample space: note the use of (t)
# method "complete" by default
hc <- hclust( dist( t(ALL_e_centerScale)) )
plot(hc)

# sinle linkage
hc <- hclust( dist( t(ALL_e_centerScale)), metho="single" )
plot(hc)

# cut the tree at a specific group
hc <- hclust( dist( t(ALL_e_centerScale)) )
cc <- cutree(hc, k=2)
cc


################################################################
# K-means
################################################################


# K-means clustering is a partitioning method where the number of clusters is pre-defined.
# The basic R implementation uses Euclidean distance.
km <- kmeans(t(ALL_e_centerScale), 2)
km$cluster