More examples of dimensionality reduction using PCA and UMAP.
Reading 1 and 2, Recording, Rmarkdown
library(embed)
library(tidyverse)
library(ggrepel)
library(tidymodels)
library(tidytext)
theme479 <- theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f7f7f7"),
panel.border = element_rect(fill = NA, color = "#0c0c0c", size = 0.6),
legend.position = "bottom"
)
theme_set(theme479)
set.seed(479)
moons <- read_csv("https://uwmadison.box.com/shared/static/kdt9qqvonhcz2ssb599p1nqganrg1w6k.csv")
ggplot(moons, aes(X, Y, col = Class)) +
geom_point() +
scale_color_brewer(palette = "Set2")
moons_ <- recipe(~ ., data = moons) %>%
update_role(Class, new_role = "id")
pca_rec <- step_pca(moons_, all_predictors(), num_comp = 1)
umap_rec <- step_umap(moons_, all_predictors(), num_comp = 1)
scores <- bind_cols(
prep(umap_rec) %>% juice() %>% mutate(UMAP1 = scale(UMAP1)),
prep(pca_rec) %>% juice() %>% select(-Class)
) %>%
pivot_longer(-Class, names_to = "method")
ggplot(scores, aes(value, method, col = Class)) +
geom_point(position = position_jitter(h = 0.1), alpha = 0.8) +
scale_color_brewer(palette = "Set2")
taxa
variable.step_log
. The rest of the definition is like
the 2D example above.antibiotic_ <- recipe(~ ., data = antibiotic) %>%
update_role(sample:antibiotic, new_role = "id") %>%
step_log(all_predictors(), offset = 1) %>%
step_normalize(all_predictors())
pca_rec <- step_pca(antibiotic_, all_predictors())
pca_prep <- prep(pca_rec)
scores <- juice(pca_prep)
variances <- tidy(pca_prep, 2, type = "variance")
ggplot(scores, aes(PC1, PC2, col = antibiotic)) +
geom_point(aes(shape = ind), size = 1.5) +
geom_text_repel(aes(label = sample), check_overlap = TRUE, size = 3) +
coord_fixed(sqrt(variances$value[2] / variances$value[1])) +
scale_color_brewer(palette = "Set2")
components_ <- tidy(pca_prep, 3) %>%
filter(component %in% str_c("PC", 1:6)) %>%
mutate(terms_ = reorder_within(terms, abs(value), component)) %>%
group_by(component) %>%
top_n(20, abs(value)) %>%
left_join(taxa)
ggplot(components_, aes(value, terms_, fill = Phylum)) +
geom_col() +
facet_wrap(~ component, scales = "free_y") +
scale_y_reordered() +
labs(y = NULL) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text = element_text(size = 5))
components
with which to interpret the different axes. Instead, a typical
approach to interpret the representation is to find points that are close
together (e.g., using \(K\)-means) and take their average species profile.umap_rec <- step_umap(antibiotic_, all_predictors(), min_dist = 1.5)
umap_prep <- prep(umap_rec)
scores <- juice(umap_prep)
ggplot(scores, aes(UMAP1, UMAP2, col = antibiotic)) +
geom_point(aes(shape = ind), size = 1.5) +
geom_text_repel(aes(label = sample), max.overlaps = 10) +
scale_color_brewer(palette = "Set2")
fashion <- read_csv("https://uwmadison.box.com/shared/static/aur84ttkwa2rqvzo99qo7yhxemoc6om0.csv") %>%
sample_frac(0.2) %>%
mutate(
image = row_number(),
label = as.factor(label)
)
fashion_ <- recipe(~ ., data = fashion) %>%
update_role(label, image, new_role = "id")
pca_rec <- step_pca(fashion_, all_predictors())
pca_prep <- prep(pca_rec)
scores <- juice(pca_prep) %>%
rename(x = PC1, y = PC2)
ggplot(scores, aes(x, y, col = label)) +
geom_point() +
scale_color_brewer(palette = "Set3") +
coord_fixed()
pivot_scores(scores, fashion) %>%
overlay_images()
umap_rec <- step_umap(fashion_, all_predictors(), num_comp = 2, min_dist = 0.5)
umap_prep <- prep(umap_rec)
scores <- juice(umap_prep) %>%
rename(x = UMAP1, y = UMAP2)
ggplot(scores, aes(x, y, col = label)) +
geom_point() +
scale_color_brewer(palette = "Set3") +
coord_fixed()
pivot_scores(scores, fashion, scale_factor = 0.05) %>%
overlay_images(scale_factor = 0.05)
For attribution, please cite this work as
Sankaran (2024, March 31). STAT 436 (Spring 2024): PCA and UMAP Examples. Retrieved from https://krisrs1128.github.io/stat436_s24/website/stat436_s24/posts/2024-12-27-week10-5/
BibTeX citation
@misc{sankaran2024pca, author = {Sankaran, Kris}, title = {STAT 436 (Spring 2024): PCA and UMAP Examples}, url = {https://krisrs1128.github.io/stat436_s24/website/stat436_s24/posts/2024-12-27-week10-5/}, year = {2024} }