Biostatistical Computing, PHC 6068

Clustering algorithm

Zhiguang Huo (Caleb)

Wednesday October 9, 2019

Outline

Unsupervised machine learning

https://cdn-images-1.medium.com/max/1600/1*6hfFWPITJxbtw4ztoC1YeA.png


Motivating example in R

library(MASS)   # for mvrnorm()
set.seed(32611)
## simulate three well-separated Gaussian clusters (50 points each)
a <- mvrnorm(50, c(0,0), matrix(c(1,0,0,1),2))
b <- mvrnorm(50, c(5,6), matrix(c(1,0,0,1),2))
c <- mvrnorm(50, c(10,10), matrix(c(1,0,0,1),2))
d0 <- rbind(a,b,c)
plot(d0)

Motivating example in R (applying kmeans)

set.seed(32611)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

\(k\)-means

\(k\)-means objective function

\[\min_C \sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\]

where \(C = (C_1, \ldots, C_K)\) is a partition of the \(n\) subjects into \(K\) clusters and \(\bar{x}_{C_k} = \frac{1}{|C_k|}\sum_{i \in C_k} x_i\) is the mean (centroid) of cluster \(C_k\).
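
As a quick check, this objective can be computed directly from the definition and compared with the tot.withinss value reported by kmeans(); the small sketch below reuses d0 and result_km from the motivating example above (the variable name wss is just for illustration).

## within-cluster sum of squares computed from the definition above
wss <- sum(sapply(1:3, function(k){
  Ck <- d0[result_km$cluster == k, , drop = FALSE]
  sum(sweep(Ck, 2, colMeans(Ck))^2)   # sum of ||x_i - cluster-k mean||^2
}))
wss
result_km$tot.withinss                # kmeans() reports the same quantity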

\(k\)-means algorithm

  1. Initialize \(K\) centers \(c_1, \ldots, c_K\) as starting values.
  2. (Updating clustering labels) Form the clusters \(C_1, \ldots, C_K\) as follows:
    • Assign each subject \(i\) the label \(g_i = \arg \min_k \|x_i - c_k\|^2\).
    • \(C_k = \{i: g_i = k\}\)
  3. (Updating clustering centers) For \(k = 1, \ldots, K\), update the \(k^{th}\) cluster center \(c_k\) as: \[c_k \rightarrow \frac{1}{|C_k|} \sum_{i\in C_k} x_i\]
  4. Repeat steps 2 and 3 until convergence (i.e., the clustering labels no longer change); see the sketch after this list.
  5. Output: centers \(c_1, \ldots, c_K\) and clusters \(C_1, \ldots, C_K\).
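
A compact implementation sketch of steps 1-5; my_kmeans() is an illustrative name (not a base R function), empty clusters are not handled, and kmeans() should be preferred in practice.

## a minimal k-means sketch following the algorithm above
my_kmeans <- function(x, K, max_iter = 100){
  set.seed(32611)
  centers <- x[sample(nrow(x), K), , drop = FALSE]    # step 1: K random starting centers
  groups <- rep(0, nrow(x))
  for(iter in 1:max_iter){
    ## step 2: assign each point to its closest center (squared Euclidean distance)
    dist2 <- sapply(1:K, function(k) colSums((t(x) - centers[k, ])^2))
    newGroups <- apply(dist2, 1, which.min)
    if(all(newGroups == groups)) break                # step 4: stop when labels no longer change
    groups <- newGroups
    ## step 3: move each center to the mean of its cluster
    for(k in 1:K) centers[k, ] <- colMeans(x[groups == k, , drop = FALSE])
  }
  list(cluster = groups, centers = centers)           # step 5: output
}

fit <- my_kmeans(d0, K = 3)
table(fit$cluster, result_km$cluster)                 # compare with the kmeans() result above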

\(k\)-means 2-dimensional example

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(0,3), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-2,-2), matrix(c(2, 1, 1, 2), 2, 2))
d3<-mvrnorm(N, c(2,-2), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2, d3)
colnames(d) <- c("x", "y")
label <- c(rep("1", N), rep("2", N), rep("3", N))
plot(d, pch = 19, col=as.numeric(label), main = "underlying true labels")
legend("topleft", legend = unique(label), col=unique(as.numeric(label)), pch=19)

\(k\)-means demonstration

K <- 3
set.seed(32611)
## step 1: initialize K cluster centers (drawn at random around the origin)
centers <- mvrnorm(K, mu = c(0,0), Sigma = diag(c(1,1)))
colnames(centers) <- c("x", "y")

plot(d, pch=19)
points(centers, col = 2:4, pch=9, cex=2)

## step 2: update group labels
groupsDist <- matrix(0, nrow=nrow(d), ncol=K)
for(k in 1:K){
  vecDiff <- t(d) - centers[k,]                      # difference between every point and center k
  adist <- apply(vecDiff, 2, function(x) sum(x^2))   # squared Euclidean distance to center k
  groupsDist[,k] <- adist
}
groups <- apply(groupsDist, 1, which.min)            # assign each point to its closest center

plot(d, pch=19, col=groups + 1)
points(centers, pch=9, cex=2)



## step 3: update centers (mean of the points currently assigned to each cluster)
for(k in 1:K){
  asubset <- d[groups==k,]
  centers[k,] <- colMeans(asubset)
}

groups0 <- groups ## save the previous clustering result, in order to test convergence

plot(d, pch=19)
points(centers, col = 2:4, pch=9, cex=2)
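
In a full run, steps 2 and 3 are repeated until the labels stop changing; below is a minimal continuation of this demonstration, reusing d, centers, groups, and K from the chunks above.

## iterate steps 2 and 3 until the cluster labels no longer change
repeat{
  groups0 <- groups                        # labels from the previous iteration
  ## step 2: reassign each point to its closest center
  groupsDist <- sapply(1:K, function(k) colSums((t(d) - centers[k, ])^2))
  groups <- apply(groupsDist, 1, which.min)
  if(all(groups == groups0)) break         # converged: assignments unchanged
  ## step 3: recompute each center as the mean of its cluster
  for(k in 1:K) centers[k, ] <- colMeans(d[groups == k, , drop = FALSE])
}

plot(d, pch = 19, col = groups + 1)
points(centers, col = 2:4, pch = 9, cex = 2)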

Apply kmeans on the iris data example (\(p>2\))

iris.data <- iris[,1:4]   # four numeric features; Species is held out as the true label
## PCA to visualize the 4-dimensional data in 2 dimensions
ir.pca <- prcomp(iris.data,
                 center = TRUE,
                 scale = TRUE) 
PC1 <- ir.pca$x[,"PC1"]
PC2 <- ir.pca$x[,"PC2"]
variance <- ir.pca$sdev^2 / sum(ir.pca$sdev^2)
v1 <- paste0("variance: ",signif(variance[1] * 100,3), "%")
v2 <- paste0("variance: ",signif(variance[2] * 100,3), "%")
plot(PC1, PC2, col=as.numeric(iris$Species),pch=19, xlab=v1, ylab=v2)
legend("topright", legend = levels(iris$Species), col =  unique(iris$Species), pch = 19)

Apply \(k\)-means on the iris data

set.seed(32611) 
kmeans_iris <- kmeans(iris.data, 3)

plot(PC1, PC2, col=as.numeric(kmeans_iris$cluster),pch=19, xlab=v1, ylab=v2)
legend("topright", legend = unique(kmeans_iris$cluster), col =  unique(kmeans_iris$cluster), pch = 19)

Put together (original labels and \(k\)-means labels)

par(mfrow=c(1,2))
plot(PC1, PC2, col=as.numeric(iris$Species),pch=19, xlab=v1, ylab=v2, main="true label")
legend("topright", legend = levels(iris$Species), col =  unique(iris$Species), pch = 19)

plot(PC1, PC2, col=as.numeric(kmeans_iris$cluster),pch=19, xlab=v1, ylab=v2, main="kmeans label")
legend("topright", legend = unique(kmeans_iris$cluster), col =  unique(kmeans_iris$cluster), pch = 19)

Some issues with the kmeans algorithm

Is kmeans a deterministic algorithm, or a random one?

set.seed(32611)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

set.seed(32610)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

set.seed(32619)
result_km <- kmeans(d0, centers = 3)
plot(d0, col=result_km$cluster)
legend("topleft", legend = 1:3, col = 1:3, pch=1)

kmeans(d0, centers = 3, nstart = 20)
## K-means clustering with 3 clusters of sizes 50, 50, 50
## 
## Cluster means:
##         [,1]        [,2]
## 1  5.0201835  5.84577942
## 2 10.0223462  9.93004155
## 3  0.1262428 -0.05688753
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 99.84741 87.84906 87.79105
##  (between_SS / total_SS =  94.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

How to determine the optimal K? – Elbow method
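
The elbow method fits k-means over a range of K and plots the total within-cluster sum of squares against K; the "elbow", after which the curve stops dropping sharply, suggests a reasonable K. Below is a minimal sketch on the simulated data d from the 2-dimensional example above (K_range and totWSS are illustrative names).

## elbow method: total within-cluster sum of squares for K = 1, ..., 8
set.seed(32611)
K_range <- 1:8
totWSS <- sapply(K_range, function(k) kmeans(d, centers = k, nstart = 20)$tot.withinss)
plot(K_range, totWSS, type = "b", pch = 19,
     xlab = "number of clusters K", ylab = "total within-cluster SS")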

How to determine the optimal K? – Gap statistics

\[Gap_n(k) = \frac{1}{B}\sum_{b = 1}^B\log(W_b^*(k)) - \log(W(k)),\]

where \(W(k)\) is the within-cluster dispersion of the observed data with \(k\) clusters and \(W_b^*(k)\) is the same quantity computed on the \(b^{th}\) of \(B\) reference data sets generated under a null (no-cluster) distribution. The estimated number of clusters is the \(k\) that maximizes \(Gap_n(k)\) (clusGap applies a one-standard-error rule, 'firstSEmax', by default).

Gap statistics

library(cluster)
gsP.Z <- clusGap(d, FUN = kmeans, K.max = 8, B = 50)
plot(gsP.Z, main = "k = 3 clusters is optimal")

gsP.Z
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = d, FUNcluster = kmeans, K.max = 8, B = 50)
## B=50 simulated reference sets, k = 1..8; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstSEmax', SE.factor=1): 3
##          logW   E.logW       gap     SE.sim
## [1,] 5.769038 6.051298 0.2822604 0.01953592
## [2,] 5.412645 5.742842 0.3301974 0.03763254
## [3,] 5.118928 5.536650 0.4177221 0.02187489
## [4,] 5.031797 5.344077 0.3122792 0.01963785
## [5,] 4.904344 5.234730 0.3303857 0.02161604
## [6,] 4.789622 5.132684 0.3430620 0.02399886
## [7,] 4.779464 5.052895 0.2734309 0.02399082
## [8,] 4.648428 4.975084 0.3266552 0.02359000

How to evaluate the clustering results?

Rand index and adjusted Rand index
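
Given two partitions of the same \(n\) subjects, the Rand index is the proportion of subject pairs on which the two partitions agree:

\[RI = \frac{a + b}{\binom{n}{2}},\]

where \(a\) is the number of pairs placed in the same cluster by both partitions and \(b\) is the number of pairs placed in different clusters by both. The adjusted Rand index corrects for chance agreement, so it is close to 0 for unrelated labelings and equals 1 for identical partitions.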

suppressMessages(library(fossil))
set.seed(32611)
g1 <- sample(1:2, size=10, replace=TRUE)
g2 <- sample(1:3, size=10, replace=TRUE)
rand.index(g1, g2)
## [1] 0.4444444
adj.rand.index(g1, g2)
## [1] 0.1697417

Hierarchical Clustering example

set.seed(32611)
index <- sample(nrow(iris), 30)
iris_subset <- iris[index, 1:4]
species_subset <- iris$Species[index]

hc <- hclust(dist(iris_subset))
plot(hc)

Hierarchical Clustering example (with color)

suppressMessages(library(dendextend))
dend = as.dendrogram(hc)
labels_colors(dend) = as.numeric(species_subset)[order.dendrogram(dend)]

plot(dend)
legend("topright", legend=levels(species_subset), col=1:3, pch=1)

\(K\)-means and hierarchical clustering

Hierarchical methods

Hierarchical clustering illustration

Distance between clusters

Select a distance measure \(d\) between observations (e.g., Euclidean distance); a linkage rule then extends \(d\) to a distance between clusters, e.g., single linkage (minimum pairwise distance), complete linkage (maximum), or average linkage (mean). A quick comparison is sketched below.
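
In hclust(), the between-cluster distance is chosen with the method argument; here is a side-by-side comparison on iris_subset from the example above.

## linkage determines how the distance between two clusters is computed
hc_single   <- hclust(dist(iris_subset), method = "single")     # minimum pairwise distance
hc_complete <- hclust(dist(iris_subset), method = "complete")   # maximum pairwise distance (hclust default)
hc_average  <- hclust(dist(iris_subset), method = "average")    # average pairwise distance

par(mfrow = c(1, 3))
plot(hc_single, main = "single linkage")
plot(hc_complete, main = "complete linkage")
plot(hc_average, main = "average linkage")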

Hierarchical tree ordering

Hierarchical clustering algorithm

  1. Input: dissimilarity matrix (distance matrix \(d \in \mathbb{R}^{n \times n}\))
  2. Let \(T_n = \{C_1, C_2, \ldots, C_n\}\), where \(C_i = \{x_i\}\), i.e., each observation starts as its own cluster.
  3. For \(j = n - 1\) to \(1\):
    1. Find \(l,m\) to minimize \(d(C_l, C_m)\) over all \(C_l, C_m \in T_{j+1}\)
    2. Let \(T_j\) be the same as \(T_{j+1}\) except that \(C_l\) and \(C_m\) are replaced with \(C_l \cup C_m\)
  4. Return the sets of nested clusters \(T_1, \ldots, T_n\), which define the dendrogram (see the sketch after this list).
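
A minimal R sketch of this agglomerative loop, using single linkage (the minimum pairwise distance between clusters) on a tiny simulated data set; in practice hclust() performs these merges, and the object names below are illustrative.

## toy agglomerative clustering with single linkage
set.seed(32611)
x <- matrix(rnorm(10), ncol = 2)            # 5 observations in 2 dimensions
D <- as.matrix(dist(x))                     # n x n dissimilarity matrix
clusters <- as.list(seq_len(nrow(x)))       # T_n: every observation starts as its own cluster

while(length(clusters) > 1){
  ## find the pair of clusters with the smallest single-linkage distance
  best <- c(1, 2); bestDist <- Inf
  for(l in 1:(length(clusters) - 1)){
    for(m in (l + 1):length(clusters)){
      d_lm <- min(D[clusters[[l]], clusters[[m]]])
      if(d_lm < bestDist){ bestDist <- d_lm; best <- c(l, m) }
    }
  }
  cat("merge {", clusters[[best[1]]], "} and {", clusters[[best[2]]],
      "} at height", round(bestDist, 3), "\n")
  ## replace C_l and C_m with their union
  clusters[[best[1]]] <- c(clusters[[best[1]]], clusters[[best[2]]])
  clusters[[best[2]]] <- NULL
}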

Cut Hierarchical trees

plot(d0)

hc <- hclust(dist(d0))
plot(hc)

res_cutree <- cutree(hc, k = 1:5) # column k holds the k-cluster assignment; k = 1 is trivial

set.seed(32611)
result_km <- kmeans(d0, centers = 3)

table(res_cutree[,3], result_km$cluster)
##    
##      1  2  3
##   1 50  0  0
##   2  0 50  0
##   3  0  0 50
rand.index(res_cutree[,3], result_km$cluster)
## [1] 1
adj.rand.index(res_cutree[,3], result_km$cluster)
## [1] 1