Zhiguang Huo (Caleb)
Wednesday October 14, 2020
\[\min_C \sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\]
## Initialize K cluster centers by drawing K points from a bivariate
## standard normal; the seed is fixed so the draw is reproducible.
K <- 3
set.seed(32611)
centers <- mvrnorm(K, mu = c(0,0), Sigma = diag(c(1,1)))
colnames(centers) <- c("x", "y")
## Show the data (assumes `d` is a 2-column matrix defined earlier)
## with the initial centers highlighted in distinct colors.
plot(d, pch = 19)
points(centers, col = 1 + seq_len(K), pch = 9, cex = 2)
## Assignment step: label each point with its nearest center.
## groupsDist[i, k] holds the squared Euclidean distance from
## point i to center k.
groupsDist <- matrix(0, nrow = nrow(d), ncol = K)
for (k in seq_len(K)) {
  delta <- t(d) - centers[k, ]         # recycle center k across columns
  groupsDist[, k] <- colSums(delta^2)  # squared distance per point
}
groups <- apply(groupsDist, 1, which.min)
## Color points by their new cluster label; centers drawn on top.
plot(d, pch = 19, col = groups + 1)
points(centers, pch = 9, cex = 2)
## Update step: move each center to the mean of its assigned points.
## FIX: the original `d[groups == k, ]` drops to a plain vector when a
## cluster has exactly one member (colMeans then errors), and an empty
## cluster would produce NaN centers; `drop = FALSE` plus an emptiness
## guard handles both edge cases.
for (k in seq_len(K)) {
  members <- d[groups == k, , drop = FALSE]  # always a matrix
  if (nrow(members) > 0) {                   # skip empty clusters
    centers[k, ] <- colMeans(members)
  }
}
groups0 <- groups ## save the previous clustering result, in order to test convergence
plot(d, pch = 19)
points(centers, col = 2:4, pch = 9, cex = 2)
## PCA on the four numeric iris measurements, then a scatter plot of
## the first two principal components colored by species.
iris.data <- iris[, 1:4]
## Center and scale each variable before rotation. FIX: the formal
## argument is `scale.` — the original `scale = TRUE` worked only via
## partial argument matching.
ir.pca <- prcomp(iris.data,
                 center = TRUE,
                 scale. = TRUE)
PC1 <- ir.pca$x[, "PC1"]
PC2 <- ir.pca$x[, "PC2"]
## Proportion of total variance explained by each component.
variance <- ir.pca$sdev^2 / sum(ir.pca$sdev^2)
v1 <- paste0("variance: ", signif(variance[1] * 100, 3), "%")
v2 <- paste0("variance: ", signif(variance[2] * 100, 3), "%")
plot(PC1, PC2, col = as.numeric(iris$Species), pch = 19, xlab = v1, ylab = v2)
## FIX: pass integer color codes explicitly; the original handed a
## factor to `col`, relying on implicit coercion of factor levels.
legend("topright", legend = levels(iris$Species),
       col = seq_along(levels(iris$Species)), pch = 19)
## Side-by-side comparison in PC1/PC2 space: true species labels (left)
## vs. k-means cluster labels (right; `kmeans_iris` is defined earlier).
## NOTE(review): par() is modified without saving/restoring the old
## settings — later plots in the session inherit the 1x2 layout.
par(mfrow = c(1, 2))
plot(PC1, PC2, col = as.numeric(iris$Species), pch = 19,
     xlab = v1, ylab = v2, main = "true label")
## FIX: pass integer color codes explicitly; the original handed a
## factor to `col`, relying on implicit coercion of factor levels.
legend("topright", legend = levels(iris$Species),
       col = seq_along(levels(iris$Species)), pch = 19)
plot(PC1, PC2, col = as.numeric(kmeans_iris$cluster), pch = 19,
     xlab = v1, ylab = v2, main = "kmeans label")
legend("topright", legend = unique(kmeans_iris$cluster),
       col = unique(kmeans_iris$cluster), pch = 19)
## K-means clustering with 3 clusters of sizes 50, 50, 50
##
## Cluster means:
## [,1] [,2]
## 1 5.0201835 5.84577942
## 2 10.0223462 9.93004155
## 3 0.1262428 -0.05688753
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2
##
## Within cluster sum of squares by cluster:
## [1] 99.84741 87.84906 87.79105
## (between_SS / total_SS = 94.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
\[Gap_n(k) = \frac{1}{B}\sum_{b = 1}^B \log(W_b^*(k)) - \log(W(k)), \qquad \hat{k} = \arg\max_k Gap_n(k)\]
library(cluster)
## Gap statistic: compare the observed log within-cluster dispersion
## log(W(k)) with its expectation over B = 50 reference data sets, for
## k = 1..8. FIX: the formal argument is `FUNcluster` — the original
## `FUN = kmeans` worked only via partial argument matching.
gsP.Z <- clusGap(d, FUNcluster = kmeans, K.max = 8, B = 50)
plot(gsP.Z, main = "k = 3 cluster is optimal")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = d, FUNcluster = kmeans, K.max = 8, B = 50)
## B=50 simulated reference sets, k = 1..8; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 3
## logW E.logW gap SE.sim
## [1,] 5.769038 6.051298 0.2822604 0.01953592
## [2,] 5.412645 5.742842 0.3301974 0.03763254
## [3,] 5.118928 5.536650 0.4177221 0.02187489
## [4,] 5.031797 5.344077 0.3122792 0.01963785
## [5,] 4.904344 5.234730 0.3303857 0.02161604
## [6,] 4.789622 5.132684 0.3430620 0.02399886
## [7,] 4.779464 5.052895 0.2734309 0.02399082
## [8,] 4.648428 4.975084 0.3266552 0.02359000
suppressMessages(library(fossil))
## Rand index between two random partitions of 10 items (one into 2
## groups, one into 3); the seed is fixed so the draws are reproducible.
set.seed(32611)
g1 <- sample(2, size = 10, replace = TRUE)  # labels drawn from 1:2
g2 <- sample(3, size = 10, replace = TRUE)  # labels drawn from 1:3
rand.index(g1, g2)
## [1] 0.4444444
The adjusted Rand index is the corrected-for-chance version of the Rand index:
\[AdjustedRandIndex = \frac{RI - expected(RI)}{MaxRI-expected(RI)}\]
## [1] 0.1697417
Hierarchical clustering methods produce a tree, or dendrogram.
There is no need to specify the number of clusters in advance.
Select a distance measurement \(d\) (e.g. Euclidean distance)
single linkage \[d(A, B) = \min_{x\in A, y \in B} d(x,y)\]
average linkage \[d(A, B) = \frac{1}{N_AN_B}\sum_{x\in A, y \in B} d(x,y)\]
centroid linkage \[d(A, B) = d(\bar{x}, \bar{y}),\] where \(\bar{x}\) and \(\bar{y}\) are the centroids (mean vectors) of clusters \(A\) and \(B\)
complete linkage \[d(A, B) = \max_{x\in A, y \in B} d(x,y)\]
## Cut the dendrogram `hc` (built earlier) at k = 1..5; cutree() with a
## vector k returns a matrix with one labeling per column.
res_cutree <- cutree(hc, k = 1:5) #k = 1 is trivial
## Seed fixed immediately before kmeans so its random initialization
## (and therefore the printed table below) is reproducible.
set.seed(32611)
result_km <- kmeans(d0, centers = 3)
## Cross-tabulate the hierarchical k = 3 labels (column "3") against the
## k-means labels; a permuted-diagonal table means the two methods agree
## up to cluster relabeling.
table(res_cutree[,3], result_km$cluster)
##
## 1 2 3
## 1 50 0 0
## 2 0 50 0
## 3 0 0 50
## [1] 1
## [1] 1