Biostatistical Computing, PHC 6068

Clustering algorithm

Zhiguang Huo (Caleb)

Wednesday October 17, 2018

Outline

Unsupervised machine learning

https://cdn-images-1.medium.com/max/1600/1*6hfFWPITJxbtw4ztoC1YeA.png

Motivating example in R

library(MASS)
set.seed(32611)
## three well-separated Gaussian clusters, 50 points each
a <- mvrnorm(50, c(0, 0), matrix(c(1, 0, 0, 1), 2))
b <- mvrnorm(50, c(5, 6), matrix(c(1, 0, 0, 1), 2))
c <- mvrnorm(50, c(10, 10), matrix(c(1, 0, 0, 1), 2))
d0 <- rbind(a, b, c)
plot(d0)

Motivating example in R (applying kmeans)

set.seed(32611)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

\(k\)-means

\(k\)-means objective function

\[\min_C \sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\] where \(C = \{C_1, \ldots, C_K\}\) is a partition of the subjects into \(K\) clusters and \(\bar{x}_{C_k} = \frac{1}{|C_k|}\sum_{i \in C_k} x_i\) is the centroid (mean) of cluster \(C_k\).
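
The objective can be computed directly from a clustering and compared with kmeans() output. A minimal sketch, assuming d0 and result_km from the motivating example above (the total should match result_km$tot.withinss):

## total within-cluster sum of squares for a given clustering
km_objective <- function(X, cluster){
  sum(sapply(unique(cluster), function(k){
    Xk <- X[cluster == k, , drop = FALSE]
    center_k <- colMeans(Xk)              # cluster centroid \bar{x}_{C_k}
    sum(sweep(Xk, 2, center_k)^2)         # squared distances to the centroid
  }))
}
km_objective(d0, result_km$cluster)
result_km$tot.withinss   # should agree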

\(k\)-means algorithm

  1. Initialize \(K\) centers \(c_1, \ldots, c_K\) as starting values.
  2. (Updating clustering labels) Form the clusters \(C_1, \ldots, C_K\) as follows.
    • Denote the clustering labels for subject \(i\): \(g_i = \arg \min_k \|x_i - c_k\|^2\).
    • \(C_k = \{i: g_i = k\}\)
  3. (Updating clustering centers) For \(k = 1, \ldots, K\), set the \(k^{th}\) clustering center of \(c_k\) as: \[c_k \rightarrow \frac{1}{|C_k|} \sum_{i\in C_k} x_i\]
  4. Repeat steps 2 and 3 until convergence (the cluster labels no longer change).
  5. Output: centers \(c_1, \ldots, c_K\) and clusters \(C_1, \ldots, C_K\).
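
A compact from-scratch sketch of steps 1-5, assuming d0 from the motivating example and ignoring empty-cluster edge cases (a step-by-step demonstration follows on later slides; in practice use kmeans()):

my_kmeans <- function(X, K, seed = 32611, max_iter = 100){
  set.seed(seed)
  centers <- X[sample(nrow(X), K), , drop = FALSE]  # step 1: random starting centers
  groups <- rep(0, nrow(X))
  for(iter in 1:max_iter){
    ## step 2: assign each point to its closest center
    dist2 <- sapply(1:K, function(k) colSums((t(X) - centers[k, ])^2))
    new_groups <- apply(dist2, 1, which.min)
    if(all(new_groups == groups)) break               # step 4: converged
    groups <- new_groups
    ## step 3: recompute each center as its cluster mean
    for(k in 1:K) centers[k, ] <- colMeans(X[groups == k, , drop = FALSE])
  }
  list(cluster = groups, centers = centers)           # step 5: output
}
fit <- my_kmeans(d0, K = 3)
table(fit$cluster)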

\(k\)-means 2-dimensional example

set.seed(32611)
N <- 100
## three overlapping Gaussian clusters with correlated coordinates
d1 <- mvrnorm(N, c(0, 3), matrix(c(2, 1, 1, 2), 2, 2))
d2 <- mvrnorm(N, c(-2, -2), matrix(c(2, 1, 1, 2), 2, 2))
d3 <- mvrnorm(N, c(2, -2), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2, d3)
colnames(d) <- c("x", "y")
label <- c(rep("1", N), rep("2", N), rep("3", N))
plot(d, pch = 19, col = as.numeric(label), main = "underlying true labels")
legend("topleft", legend = unique(label), col = unique(as.numeric(label)), pch = 19)

\(k\)-means demonstration

K <- 3
set.seed(32611)
## step 1: random starting centers
centers <- mvrnorm(K, mu = c(0,0), Sigma = diag(c(1,1)))
colnames(centers) <- c("x", "y")

plot(d, pch=19)
points(centers, col = 2:4, pch=9, cex=2)


## squared Euclidean norm of a vector
l2n <- function(avec){
  return(sum(avec^2))
}

## step 2: update group labels (assign each point to its closest center)
groupsDist <- matrix(0,nrow=nrow(d),ncol=K)
for(k in 1:K){
  vecDiff <- t(d) - centers[k,]
  al2n <- apply(vecDiff,2,l2n)
  groupsDist[,k] <- al2n
}
groups <- apply(groupsDist,1,which.min)

plot(d, pch=19, col=groups + 1)
points(centers, pch=9, cex=2)



## step 3: update each center as the mean of its cluster
for(k in 1:K){
  asubset <- d[groups==k,]
  centers[k,] <- colMeans(asubset)
}

groups0 <- groups ## save the previous clustering result, in order to test convergence

plot(d, pch=19)
points(centers, col = 2:4, pch=9, cex=2)
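
To finish the demonstration, one possible convergence loop simply alternates the two update steps until the labels stop changing (reusing d, K, centers, groups, groupsDist, and l2n defined above):

## repeat steps 2 and 3 until the cluster labels no longer change
repeat{
  groups0 <- groups
  ## step 2: update group labels
  for(k in 1:K){
    groupsDist[, k] <- apply(t(d) - centers[k, ], 2, l2n)
  }
  groups <- apply(groupsDist, 1, which.min)
  ## step 3: update centers
  for(k in 1:K){
    centers[k, ] <- colMeans(d[groups == k, , drop = FALSE])
  }
  if(identical(groups, groups0)) break
}
plot(d, pch = 19, col = groups + 1)
points(centers, col = 2:4, pch = 9, cex = 2)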

Apply \(k\)-means on the iris data (\(p > 2\))

iris.data <- iris[, 1:4]
ir.pca <- prcomp(iris.data,
                 center = TRUE,
                 scale. = TRUE)
PC1 <- ir.pca$x[, "PC1"]
PC2 <- ir.pca$x[, "PC2"]
variance <- ir.pca$sdev^2 / sum(ir.pca$sdev^2)
v1 <- paste0("variance: ", signif(variance[1] * 100, 3), "%")
v2 <- paste0("variance: ", signif(variance[2] * 100, 3), "%")
plot(PC1, PC2, col = as.numeric(iris$Species), pch = 19, xlab = v1, ylab = v2)
legend("topright", legend = levels(iris$Species), col = unique(as.numeric(iris$Species)), pch = 19)

Apply \(k\)-means on the iris data

set.seed(32611)
kmeans_iris <- kmeans(iris.data, 3)

plot(PC1, PC2, col=as.numeric(kmeans_iris$cluster),pch=19, xlab=v1, ylab=v2)
legend("topright", legend = unique(kmeans_iris$cluster), col =  unique(kmeans_iris$cluster), pch = 19)

Put together (original labels and \(k\)-means labels)

par(mfrow=c(1,2))
plot(PC1, PC2, col=as.numeric(iris$Species),pch=19, xlab=v1, ylab=v2, main="true label")
legend("topright", legend = levels(iris$Species), col =  unique(iris$Species), pch = 19)

plot(PC1, PC2, col=as.numeric(kmeans_iris$cluster),pch=19, xlab=v1, ylab=v2, main="kmeans label")
legend("topright", legend = unique(kmeans_iris$cluster), col =  unique(kmeans_iris$cluster), pch = 19)

Some issues with the \(k\)-means algorithm

Is \(k\)-means a deterministic algorithm or a random one? The three runs below use the same data d0 but different random seeds for the initial centers; compare the resulting cluster labels.

set.seed(32611)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

set.seed(32610)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

set.seed(32619)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

How to determine the optimal K? – Elbow method
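
A possible sketch of the elbow method, assuming d from the 2-dimensional example: plot the total within-cluster sum of squares against the number of clusters K and look for the "elbow" where the decrease levels off.

set.seed(32611)
wss <- sapply(1:8, function(k) kmeans(d, centers = k, nstart = 20)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "number of clusters K",
     ylab = "total within-cluster sum of squares", main = "elbow method")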

How to determine the optimal K? – Gap statistics

Choose the \(K\) that maximizes the gap statistic \[Gap_n(k) = E_n^*(\log(W(k))) - \log(W(k)),\] where \(W(k)\) measures the within-cluster dispersion when using \(k\) clusters and \(E_n^*\) denotes its expectation under a reference distribution with no cluster structure.

Gap statistics

library(cluster)
gsP.Z <- clusGap(d, FUNcluster = kmeans, K.max = 8, B = 50)
plot(gsP.Z, main = "k = 3 is optimal")

gsP.Z
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = d, FUNcluster = kmeans, K.max = 8, B = 50)
## B=50 simulated reference sets, k = 1..8; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstSEmax', SE.factor=1): 3
##          logW   E.logW       gap     SE.sim
## [1,] 5.769038 6.051226 0.2821886 0.01881499
## [2,] 5.412645 5.733809 0.3211639 0.02536302
## [3,] 5.118928 5.532351 0.4134237 0.02039893
## [4,] 5.015425 5.344578 0.3291527 0.02571314
## [5,] 4.909918 5.234112 0.3241944 0.01961303
## [6,] 4.840721 5.133776 0.2930550 0.02270741
## [7,] 4.708611 5.048810 0.3401989 0.02019276
## [8,] 4.653654 4.976211 0.3225574 0.02284422

How to evaluate the clustering results?

Rand index and adjusted Rand index
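
For two partitions of the same \(n\) subjects, the Rand index measures pair-level agreement: \[RI = \frac{a + b}{\binom{n}{2}},\] where \(a\) is the number of pairs placed in the same cluster in both partitions and \(b\) is the number of pairs placed in different clusters in both partitions. The adjusted Rand index rescales \(RI\) so that its expected value under random labeling is 0 (and its maximum is 1).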

suppressMessages(library(fossil))
set.seed(32611)
g1 <- sample(1:2, size=10, replace=TRUE)
g2 <- sample(1:3, size=10, replace=TRUE)
rand.index(g1, g2)
## [1] 0.4444444
adj.rand.index(g1, g2)
## [1] 0.1697417

Hierarchical Clustering example

set.seed(32611)
index <- sample(nrow(iris), 30)
iris_subset <- iris[index, 1:4]
species_subset <- iris$Species[index]

hc <- hclust(dist(iris_subset))
plot(hc)

Hierarchical Clustering example (with color)

suppressMessages(library(dendextend))
dend <- as.dendrogram(hc)
## color the leaf labels by true species, matching the dendrogram ordering
labels_colors(dend) <- as.numeric(species_subset)[order.dendrogram(dend)]

plot(dend)
legend("topright", legend=levels(species_subset), col=1:3, pch=1)

\(K\)-means and hierarchical clustering

Hierarchical methods

Hierarchical clustering illustration

Distance between clusters

Select a distance measurement \(d\) (e.g., Euclidean distance) between observations; the distance between two clusters is then defined by a linkage rule, e.g., single (minimum), complete (maximum), or average linkage, as illustrated in the sketch below.
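
A brief illustration of these choices, assuming d0 from the motivating example: dist() sets the distance between observations, and the method argument of hclust() sets the linkage (distance between clusters).

D_euc <- dist(d0, method = "euclidean")   # distance between observations

## linkage: how the distance between two clusters is defined
hc_single   <- hclust(D_euc, method = "single")    # minimum pairwise distance
hc_complete <- hclust(D_euc, method = "complete")  # maximum pairwise distance
hc_average  <- hclust(D_euc, method = "average")   # average pairwise distance

par(mfrow = c(1, 3))
plot(hc_single, labels = FALSE, main = "single linkage")
plot(hc_complete, labels = FALSE, main = "complete linkage")
plot(hc_average, labels = FALSE, main = "average linkage")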

Hierarchical tree ordering

Hierarchical clustering algorithm

  1. Input: dissimilarity matrix (distance matrix \(d \in \mathbb{R}^{n \times n}\))
  2. Let \(T_n = \{C_1, C_2, \ldots, C_n\}\), where each \(C_i = \{x_i\}\) (every subject starts in its own cluster).
  3. For \(j = n - 1\) to \(1\):
    1. Find \(l,m\) to minimize \(d(C_l, C_m)\) over all \(C_l, C_m \in T_{j+1}\)
    2. Let \(T_j\) be the same as \(T_{j+1}\) except that \(C_l\) and \(C_m\) are replaced with \(C_l \cup C_m\)
  4. Return the sets of clusters \(T_1, \ldots, T_n\)
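
A naive (and slow) sketch of this agglomerative loop with complete linkage, assuming d0 from the motivating example and stopping once 3 clusters remain; in practice hclust() does the same thing efficiently.

naive_hclust <- function(X, K_stop = 3){
  D <- as.matrix(dist(X))                # pairwise dissimilarity matrix
  clusters <- as.list(seq_len(nrow(X)))  # T_n: every point is its own cluster
  while(length(clusters) > K_stop){
    m <- length(clusters)
    best <- c(Inf, NA, NA)               # (distance, index l, index j)
    for(l in 1:(m - 1)){
      for(j in (l + 1):m){
        ## complete linkage: largest pairwise distance between the two clusters
        d_lj <- max(D[clusters[[l]], clusters[[j]]])
        if(d_lj < best[1]) best <- c(d_lj, l, j)
      }
    }
    ## merge the two closest clusters
    clusters[[best[2]]] <- c(clusters[[best[2]]], clusters[[best[3]]])
    clusters[[best[3]]] <- NULL
  }
  clusters
}

res <- naive_hclust(d0, K_stop = 3)
sapply(res, length)   # cluster sizes after merging down to 3 clusters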

Cut Hierarchical trees

plot(d0)

hc <- hclust(dist(d0))
plot(hc)

res_cutree <- cutree(hc, k = 1:5) #k = 1 is trivial

set.seed(32611)
result_km <- kmeans(d0, centers = 3)

## cross-tabulate the 3-cluster cut of the tree against the k-means labels
table(res_cutree[,3], result_km$cluster)
##    
##      1  2  3
##   1 50  0  0
##   2  0 50  0
##   3  0  0 50
rand.index(res_cutree[,3], result_km$cluster)
## [1] 1
adj.rand.index(res_cutree[,3], result_km$cluster)
## [1] 1