Zhiguang Huo (Caleb)
Monday September 25, 2017
Several popular dimension reduction approaches
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
dim(iris)
## [1] 150 5
iris.data <- iris[,1:4]
ir.pca <- prcomp(iris.data,
center = TRUE,
scale = TRUE)
PC1 <- ir.pca$x[,"PC1"]
PC2 <- ir.pca$x[,"PC2"]
variance <- ir.pca$sdev^2 / sum(ir.pca$sdev^2)
v1 <- paste0("variance: ",signif(variance[1] * 100,3), "%")
v2 <- paste0("variance: ",signif(variance[2] * 100,3), "%")
plot(PC1, PC2, col=as.numeric(iris$Species),pch=19, xlab=v1, ylab=v2)
legend("topright", legend = levels(iris$Species), col = unique(iris$Species), pch = 19)
biplot(ir.pca)
where \(D = \begin{pmatrix} d_1 & 0 & 0 & \\ 0 & d_2 & & \\ 0 & & \ddots & 0\\ & & 0 & d_G \end{pmatrix},\)
\(d_1 \ge d_2 \ge \ldots \ge d_G \ge 0\) are eigenvalues. \({\bf V} = ({\bf v}_1, {\bf v}_2, \ldots, {\bf v}_G)\) are eigen-vectors.
For any vector \(\alpha \in \mathbb{R}^{G}\), it can be spanned as linear combination of \({\bf v}_i\)’s
\(\begin{aligned} Var(\alpha^\top {\bf x}) &= \alpha^\top \Sigma \alpha \\ &= (\sum_{g=1}^G a_g {\bf v}_g)^\top \Sigma (\sum_{g=1}^G a_g {\bf v}_g) \\ &= (\sum_{g=1}^G a_g {\bf v}_g)^\top(\sum_{g=1}^G a_g d_g {\bf v}_g) \\ &= \sum_{g=1}^G d_g a_g^2 \|{\bf v}_g\|^2 = \sum_{g=1}^G d_g a_g^2 \\ &\le \sum_{g=1}^G d_1 a_g^2 \\ &\le d_1 \\ \end{aligned}\)
\(\begin{aligned} Var(\alpha^\top {\bf x}) &= \alpha^\top \Sigma \alpha \\ &= {\bf v}_1^\top \Sigma {\bf v}_1 \\ &= d_1 \\ \end{aligned}\)
Therefore, \(\alpha = {\bf v}_1\) maximize \(\max_\alpha Var(\alpha^\top {\bf x})\).
And the projected random variable \(L_1 = {\bf v}_1^\top {\bf x}\) has the largest variance \(d_1\)
Similarly, \(L_g = {\bf v}_g^\top {\bf x}\) has the largest variance among all possible projections orthogonal to \(L_1, L_2, \ldots, L_{g-1}\) for \(1 \le g \le G\).
Human being have 23 chromosomes.
Chromosome 1 of HAPMAP data is avaliable on /ufrc/phc6068/share/data/HAPMAP/chr1Data.Rdata
Annotation is on /ufrc/phc6068/share/data/HAPMAP/relationships_w_pops_121708.txt
These data are avaliable on hpg2.rc.ufl.edu
You can either perform the analysis on HiperGator, or downloaded the data to your local computer and perform the PCA analysis.
Repeat these two PCA plot
library(impute)
raw = read.table(url("http://Caleb-Huo.github.io/teaching/data/cellCycle/cellCycle.txt"),header=TRUE,as.is=TRUE)
cellCycle = raw
cellCycle[,2:78]<- impute.knn(as.matrix(raw[,2:78]))$data ## missing value imputation
\(\begin{aligned} V^\top X^\top X V &= V^\top (U E V^\top)^\top U E V^\top V \\ &= V^\top V E U^\top U E V^\top V \\ &= E^2 \end{aligned}\)
iris.data <- iris[,1:4]
iris.data.center <- scale(iris.data)
asvd <- svd(iris.data.center)
UE <- asvd$u %*% diag(asvd$d)
PC1 <- UE[,1]
PC2 <- UE[,2]
variance <- asvd$d^2 / sum(asvd$d^2)
v1 <- paste0("variance: ",signif(variance[1] * 100,3), "%")
v2 <- paste0("variance: ",signif(variance[2] * 100,3), "%")
plot(PC1, PC2, col=as.numeric(iris$Species),pch=19, xlab=v1, ylab=v2)
legend("topright", legend = levels(iris$Species), col = unique(iris$Species), pch = 19)
Multi-dimensional scaling (MDS) aims to map data (distance or dissimilarity) structure to low dimensional space (usually 2D or 3D Euclidian space).
Flight mileage of ten cities obtained from http://www.webyer.com/travel/mileage_calculator/.
z <- read.delim(url("https://Caleb-Huo.github.io/teaching/data/UScityDistance/Mileage.txt"), row.names=1)
knitr::kable(z, caption = "distance between cities (miles)")
BOST | NY | DC | MIAM | CHIC | SEAT | SF | LA | DENV | GAINESVILLE | |
---|---|---|---|---|---|---|---|---|---|---|
BOSTON | 0 | 206 | 429 | 1504 | 963 | 2976 | 3095 | 2979 | 1949 | 1219 |
NY | 206 | 0 | 233 | 1308 | 802 | 2815 | 2934 | 2786 | 1771 | 1000 |
DC | 429 | 233 | 0 | 1075 | 671 | 2684 | 2799 | 2631 | 1616 | 780 |
MIAMI | 1504 | 1308 | 1075 | 0 | 1329 | 3273 | 3053 | 2687 | 1010 | 338 |
CHICAGO | 963 | 802 | 671 | 1329 | 0 | 2013 | 2142 | 2054 | 996 | 1047 |
SEATTLE | 2976 | 2815 | 2684 | 3273 | 2013 | 0 | 808 | 1131 | 1307 | 2985 |
SF | 3095 | 2934 | 2799 | 3053 | 2142 | 808 | 0 | 379 | 1235 | 2780 |
LA | 2979 | 2786 | 2631 | 2687 | 2054 | 1131 | 379 | 0 | 1059 | 2409 |
DENVER | 1949 | 1771 | 1616 | 2037 | 996 | 1307 | 1235 | 1059 | 0 | 1740 |
GAINESVILLE | 1219 | 1000 | 780 | 338 | 1047 | 2985 | 2780 | 2409 | 1740 | 0 |
Perform MDS on the following data, with both classical method and Sammon’s stress.
Note:
z0 <- read.csv(url("https://Caleb-Huo.github.io/teaching/data/SNP_MDS/SNP_PCA.csv"), row.names=1)
knitr::kable(z0, caption = "Genetic dissimilarity between races")
X.Kung | Alur | AndhraBrahmin | CEU | CHB | Chinese | Dalit | Hema | Iban | Irula | JPT | Japanese | Khmer.Cambodian | Luhya | Madiga | Mala | Nguni | Pedi | Pygmy | Sotho.Tswana | Stalskoe | Tamil.Brahmin | Tuscan | Urkarah | Utah.N..European | Vietnamese | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
!Kung | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Alur | 0.050 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Andhra_Brahmin | 0.168 | 0.135 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
CEU | 0.183 | 0.154 | 0.033 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
CHB | 0.216 | 0.187 | 0.075 | 0.110 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Chinese | 0.213 | 0.179 | 0.071 | 0.108 | 0.001 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Dalit | 0.178 | 0.143 | 0.010 | 0.056 | 0.078 | 0.075 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Hema | 0.052 | 0.013 | 0.108 | 0.123 | 0.162 | 0.153 | 0.116 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Iban | 0.216 | 0.185 | 0.075 | 0.112 | 0.024 | 0.020 | 0.078 | 0.160 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Irula | 0.193 | 0.160 | 0.030 | 0.073 | 0.091 | 0.089 | 0.031 | 0.133 | 0.092 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
JPT | 0.218 | 0.188 | 0.076 | 0.112 | 0.007 | 0.009 | 0.079 | 0.163 | 0.030 | 0.092 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Japanese | 0.212 | 0.179 | 0.072 | 0.108 | 0.008 | 0.009 | 0.076 | 0.153 | 0.031 | 0.089 | 0.003 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Khmer Cambodian | 0.199 | 0.162 | 0.055 | 0.094 | 0.012 | 0.009 | 0.059 | 0.136 | 0.013 | 0.075 | 0.018 | 0.018 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Luhya | 0.049 | 0.009 | 0.131 | 0.148 | 0.180 | 0.171 | 0.137 | 0.011 | 0.177 | 0.153 | 0.181 | 0.171 | 0.156 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Madiga | 0.174 | 0.138 | 0.005 | 0.050 | 0.072 | 0.069 | 0.006 | 0.111 | 0.073 | 0.027 | 0.073 | 0.070 | 0.052 | 0.133 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Mala | 0.174 | 0.138 | 0.005 | 0.053 | 0.072 | 0.068 | 0.006 | 0.112 | 0.072 | 0.026 | 0.073 | 0.069 | 0.051 | 0.133 | 0.001 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Nguni | 0.038 | 0.019 | 0.146 | 0.164 | 0.198 | 0.191 | 0.154 | 0.022 | 0.197 | 0.172 | 0.199 | 0.190 | 0.174 | 0.014 | 0.150 | 0.150 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Pedi | 0.036 | 0.016 | 0.140 | 0.157 | 0.192 | 0.184 | 0.147 | 0.018 | 0.190 | 0.165 | 0.193 | 0.184 | 0.167 | 0.010 | 0.143 | 0.143 | 0.004 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Pygmy | 0.071 | 0.065 | 0.196 | 0.209 | 0.239 | 0.240 | 0.207 | 0.077 | 0.240 | 0.219 | 0.240 | 0.238 | 0.230 | 0.073 | 0.204 | 0.203 | 0.075 | 0.072 | NA | NA | NA | NA | NA | NA | NA | NA |
Sotho/Tswana | 0.029 | 0.020 | 0.144 | 0.161 | 0.196 | 0.189 | 0.152 | 0.022 | 0.195 | 0.169 | 0.197 | 0.188 | 0.172 | 0.015 | 0.147 | 0.147 | 0.004 | 0.004 | 0.072 | NA | NA | NA | NA | NA | NA | NA |
Stalskoe | 0.180 | 0.142 | 0.023 | 0.012 | 0.104 | 0.101 | 0.046 | 0.110 | 0.108 | 0.066 | 0.105 | 0.101 | 0.086 | 0.136 | 0.040 | 0.042 | 0.153 | 0.145 | 0.212 | 0.150 | NA | NA | NA | NA | NA | NA |
Tamil_Brahmin | 0.170 | 0.135 | 0.001 | 0.033 | 0.078 | 0.074 | 0.010 | 0.107 | 0.080 | 0.032 | 0.080 | 0.075 | 0.058 | 0.130 | 0.006 | 0.006 | 0.146 | 0.139 | 0.199 | 0.143 | 0.023 | NA | NA | NA | NA | NA |
Tuscan | 0.179 | 0.148 | 0.032 | 0.005 | 0.112 | 0.109 | 0.055 | 0.116 | 0.113 | 0.072 | 0.113 | 0.109 | 0.094 | 0.141 | 0.048 | 0.051 | 0.159 | 0.151 | 0.208 | 0.156 | 0.010 | 0.031 | NA | NA | NA | NA |
Urkarah | 0.183 | 0.150 | 0.028 | 0.014 | 0.109 | 0.106 | 0.051 | 0.119 | 0.111 | 0.069 | 0.111 | 0.105 | 0.091 | 0.144 | 0.044 | 0.047 | 0.161 | 0.154 | 0.212 | 0.158 | 0.007 | 0.027 | 0.013 | NA | NA | NA |
Utah N. European | 0.184 | 0.152 | 0.033 | 0.000 | 0.112 | 0.109 | 0.057 | 0.121 | 0.114 | 0.074 | 0.114 | 0.109 | 0.095 | 0.146 | 0.051 | 0.052 | 0.163 | 0.156 | 0.212 | 0.160 | 0.012 | 0.033 | 0.004 | 0.014 | NA | NA |
Vietnamese | 0.210 | 0.175 | 0.066 | 0.104 | 0.006 | 0.002 | 0.071 | 0.148 | 0.014 | 0.086 | 0.014 | 0.015 | 0.003 | 0.167 | 0.064 | 0.064 | 0.186 | 0.180 | 0.238 | 0.185 | 0.099 | 0.070 | 0.105 | 0.102 | 0.105 | NA |
YRI | 0.051 | 0.015 | 0.141 | 0.157 | 0.184 | 0.177 | 0.146 | 0.018 | 0.181 | 0.161 | 0.185 | 0.177 | 0.164 | 0.009 | 0.142 | 0.143 | 0.014 | 0.011 | 0.076 | 0.016 | 0.147 | 0.140 | 0.151 | 0.154 | 0.155 | 0.174 |
library(ggplot2)
library(ggrepel)
z <- as.dist(cbind(z0,NA))
z_mds = - cmdscale(dist(z),k=2)
mdsDataFrame <- data.frame(Race=rownames(z_mds),x = z_mds[,1], y = z_mds[,2])
ggplot(data=mdsDataFrame, aes(x=x, y=y, label=Race)) + geom_point(aes(color=Race)) + geom_text_repel(data=mdsDataFrame, aes(label=Race))
\[\min_{W\in \mathbb{R}^{p\times r}, H \in \mathbb{R}^{r \times n}} \|X - WH\|_F,\]
where \(X \in \mathbb{R}^{p \times n}\), \(\|X\|_F = \sqrt{\sum_{j=1}^p \sum_{i=1}^n X_{ji}^2}\)
library(NMF)
## Loading required package: pkgmaker
## Loading required package: registry
##
## Attaching package: 'pkgmaker'
## The following object is masked from 'package:base':
##
## isNamespaceLoaded
## Loading required package: rngtools
## Loading required package: cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
## To enable shared memory capabilities, try: install.extras('
## NMF
## ')
iris.data <- iris[,1:4]
anmf <- nmf(iris.data, rank = 2, method = "lee")
W <- anmf@fit@W
H <- anmf@fit@H
plot(W[,1], W[,2], col=as.numeric(iris$Species),pch=19)
legend("topright", legend = levels(iris$Species), col = unique(iris$Species), pch = 19)
basismap(anmf) ## W
coefmap(anmf) ## H
knitr::purl("SVDandPCA.Rmd", output = "SVDandPCA.R ", documentation = 2)
##
##
## processing file: SVDandPCA.Rmd
##
|
| | 0%
|
|... | 5%
|
|...... | 9%
|
|......... | 14%
|
|............ | 18%
|
|............... | 23%
|
|.................. | 27%
|
|..................... | 32%
|
|........................ | 36%
|
|........................... | 41%
|
|.............................. | 45%
|
|................................ | 50%
|
|................................... | 55%
|
|...................................... | 59%
|
|......................................... | 64%
|
|............................................ | 68%
|
|............................................... | 73%
|
|.................................................. | 77%
|
|..................................................... | 82%
|
|........................................................ | 86%
|
|........................................................... | 91%
|
|.............................................................. | 95%
|
|.................................................................| 100%
## output file: SVDandPCA.R
## [1] "SVDandPCA.R "