Zhiguang Huo (Caleb)
Wed Feb 7, 2018
\[\min_C \sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\]
library(MASS)
# Simulate three bivariate Gaussian clusters of N points each. All clusters
# share one covariance matrix and differ only in their mean vector.
set.seed(32611)
N <- 100
shared_cov <- matrix(c(2, 1, 1, 2), nrow = 2, ncol = 2)
# The three draws are kept in this order so the random stream is unchanged.
d1 <- mvrnorm(N, mu = c(0, 3), Sigma = shared_cov)
d2 <- mvrnorm(N, mu = c(-2, -2), Sigma = shared_cov)
d3 <- mvrnorm(N, mu = c(2, -2), Sigma = shared_cov)

# Stack the clusters into a single (3 * N) x 2 matrix with named columns.
d <- rbind(d1, d2, d3)
colnames(d) <- c("x", "y")

# True cluster membership, stored as character labels "1", "2", "3".
label <- rep(c("1", "2", "3"), each = N)

# Scatter plot coloured by the true label, with a matching legend.
plot(d, pch = 19, col = as.numeric(label))
legend(
  "topleft",
  legend = unique(label),
  col = unique(as.numeric(label)),
  pch = 19
)
# Initialisation step: draw K random starting centres from a standard
# bivariate normal distribution and overlay them on the unlabelled data.
K <- 3
set.seed(32611)
centers <- mvrnorm(K, mu = rep(0, 2), Sigma = diag(2))
colnames(centers) <- c("x", "y")

# Data in black; the initial centres are drawn in colours 2 to 4.
plot(d, pch = 19)
points(centers, pch = 9, cex = 2, col = 2:4)
# Euclidean (L2) norm of a numeric vector.
#
# avec: numeric vector.
# Returns a single number, the square root of the sum of squared entries.
l2n <- function(avec) {
  squared <- avec^2
  sqrt(sum(squared))
}
## update group labels
# Assignment step: compute the Euclidean distance from every observation to
# every centre, then give each observation the label of its nearest centre.
#
# groupsDist has one row per observation and one column per centre.
groupsDist <- vapply(
  seq_len(K),
  function(k) {
    # t(d) has one column per observation; subtracting centers[k, ] recycles
    # the centre's coordinates down each column.
    apply(t(d) - centers[k, ], 2, l2n)
  },
  numeric(nrow(d))
)

# Index of the closest centre for each observation.
groups <- apply(groupsDist, 1, which.min)

# Points coloured by their assigned group; centres overlaid in black.
plot(d, pch = 19, col = groups + 1)
points(centers, pch = 9, cex = 2)
## update centers
# Update step: move each centre to the column-wise mean of the observations
# currently assigned to it.
for (k in seq_len(K)) {
  # drop = FALSE keeps a one-observation cluster as a 1 x p matrix; without it
  # the subset collapses to a plain vector and colMeans() raises an error.
  asubset <- d[groups == k, , drop = FALSE]
  # An empty cluster has no mean. Keep its previous centre instead of
  # overwriting it with NaN.
  if (nrow(asubset) > 0) {
    centers[k, ] <- colMeans(asubset)
  }
}

# Keep a copy of the labels that produced these centres.
groups0 <- groups

# Unlabelled data with the updated centres drawn in colours 2 to 4.
plot(d, pch = 19)
points(centers, col = 2:4, pch = 9, cex = 2)
# Run the built-in k-means on the simulated data with three clusters.
# The seed is fixed because kmeans() picks its starting centres at random.
set.seed(32611)
akmeans <- kmeans(d, centers = 3)

# Fitted cluster centres, relabelled with the x / y column names.
kmeansCenters <- akmeans[["centers"]]
colnames(kmeansCenters) <- c("x", "y")

# Points coloured by fitted cluster; fitted centres overlaid in black.
plot(d, pch = 19, col = akmeans[["cluster"]] + 1)
points(kmeansCenters, pch = 9, cex = 2)
# PCA of the four numeric iris measurements, used to view the data in 2-D.
iris.data <- iris[, 1:4]

# Centre and scale every variable before rotating. The argument is spelled
# `scale.` in full so the call does not rely on partial argument matching.
ir.pca <- prcomp(iris.data, center = TRUE, scale. = TRUE)

# Scores on the first two principal components.
PC1 <- ir.pca$x[, "PC1"]
PC2 <- ir.pca$x[, "PC2"]

# Proportion of total variance explained by each component, used as axis labels.
variance <- ir.pca$sdev^2 / sum(ir.pca$sdev^2)
v1 <- paste0("variance: ", signif(variance[1] * 100, 3), "%")
v2 <- paste0("variance: ", signif(variance[2] * 100, 3), "%")

# Points are coloured by the integer code of the species factor.
plot(PC1, PC2, col = as.numeric(iris$Species), pch = 19, xlab = v1, ylab = v2)

# Legend colours are the same integer codes, listed in level order, so each
# entry matches its level name regardless of the row order of the data.
legend(
  "topright",
  legend = levels(iris$Species),
  col = seq_along(levels(iris$Species)),
  pch = 19
)
# k-means with three clusters on the raw iris measurements. The seed is fixed
# because the starting centres are chosen at random.
set.seed(32611)
kmeans_iris <- kmeans(iris.data, centers = 3)

# Show the fitted clusters in the PC1 / PC2 plane.
plot(
  PC1, PC2,
  col = as.numeric(kmeans_iris[["cluster"]]),
  pch = 19,
  xlab = v1,
  ylab = v2
)

# Cluster ids appear in the legend in order of first appearance in the data;
# each id is drawn in the colour of the same number.
legend(
  "topright",
  legend = unique(kmeans_iris[["cluster"]]),
  col = unique(kmeans_iris[["cluster"]]),
  pch = 19
)
# Side-by-side comparison of the true species labels and the k-means labels
# in the PC1 / PC2 plane.
# par() returns the previous settings; they are restored below so the
# two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))

# Left panel: true species.
plot(PC1, PC2, col = as.numeric(iris$Species), pch = 19, xlab = v1, ylab = v2,
     main = "true label")
legend("topright", legend = levels(iris$Species), col = unique(iris$Species),
       pch = 19)

# Right panel: k-means cluster assignment.
plot(PC1, PC2, col = as.numeric(kmeans_iris$cluster), pch = 19, xlab = v1,
     ylab = v2, main = "kmeans label")
legend("topright", legend = unique(kmeans_iris$cluster),
       col = unique(kmeans_iris$cluster), pch = 19)

# Back to the layout that was active before this comparison.
par(old_par)

# Number of observations and features in the iris measurement table.
dim(iris.data)
## [1] 150 4
Preprint: https://arxiv.org/abs/1602.07277
This is a heuristic algorithm, can we formulate an objective function?
\[\min_C \sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\]
\[\min_C \sum_{j=1}^p \sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2,\]
\[\min_{C, \textbf{w}} \sum_{j=1}^p w_j \sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 + \|\textbf{w}\|_1,\] such that \(w_j \ge 0, \forall j\).
\[\min_{C, \textbf{w}} \sum_{j=1}^p w_j \sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 + \|\textbf{w}\|_1,\] such that \(w_j \ge 0, \forall j\).
Since the total sum of squares (TSS) can be decomposed into the between-cluster sum of squares (BCSS) plus the within-cluster sum of squares (WCSS), \[TSS = BCSS + WCSS\]
Minimizing WCSS is equivalent to maximizing BCSS
\[\min_C \sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\]
is equivalent to \[\max_C \sum_{i} \|x_i - \bar{x}\|^2 -\sum_{k=1}^K \sum_{i \in C_k} \|x_i - \bar{x}_{C_k}\|^2,\] We can write the objective function of sparse \(k\)-means as: \[\max_{C, \textbf{w}} \sum_{j=1}^p w_j \left(\sum_{i} (x_{ji} - \bar{x}_j)^2 -\sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 \right) + \|\textbf{w}\|_1,\] such that \(w_j \ge 0, \forall j\).
\[\max_{C, \textbf{w}} \sum_{j=1}^p w_j \left(\sum_{i} (x_{ji} - \bar{x}_j)^2 -\sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 \right) + \|\textbf{w}\|_1,\] such that \(w_j \ge 0, \forall j\).
\[\max_{C, \textbf{w}} \sum_{j=1}^p w_j \left(\sum_{i} (x_{ji} - \bar{x}_j)^2 -\sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 \right) + \|\textbf{w}\|_1,\] such that \(w_j \ge 0, \forall j\), \(\| \textbf{w} \|_2^2 \le 1\).
\[\max_{C, \textbf{w}} \sum_{j=1}^p w_j \left(\sum_{i} (x_{ji} - \bar{x}_j)^2 -\sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 \right),\] such that \(w_j \ge 0, \forall j\), \(\| \textbf{w}\|_1 \le \mu\), and \(\| \textbf{w} \|_2^2 \le 1\).
\[\max_{C, \textbf{w}} \sum_{j=1}^p w_j \left(\sum_{i} (x_{ji} - \bar{x}_j)^2 -\sum_{k=1}^K \sum_{i \in C_k} (x_{ji} - \bar{x}_{jC_k})^2 \right),\] such that \(w_j \ge 0, \forall j\), \(\| \textbf{w}\|_1 \le \mu\), and \(\| \textbf{w} \|_2^2 \le 1\).
library(sparcl)
# Toy data for sparse k-means: 50 observations by 70 features of standard
# normal noise.
set.seed(11)
x <- matrix(rnorm(50 * 70), nrow = 50, ncol = 70)

# Shift the first 20 features upward by 1 for the first 25 observations.
x[1:25, 1:20] <- 1 + x[1:25, 1:20]

# Centre every column and scale it to unit standard deviation.
x <- scale(x, center = TRUE, scale = TRUE)

# Run sparse k-means with two clusters and a single weight bound of 3.
km.out <- KMeansSparseCluster(x, K = 2, wbounds = 3)
## 012
print(km.out)
## Wbound is 3 :
## Number of non-zero weights: 13
## Sum of weights: 3.00005
## Clustering: 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2
## 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
plot(km.out)