More examples of dimensionality reduction using PCA and UMAP.
Reading 1 and 2, Recording, Rmarkdown
library("embed")
library("ggplot2")
library("ggrepel")
library("readr")
library("stringr")
library("tidymodels")
library("tidytext")
theme479 <- theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f7f7f7"),
panel.border = element_rect(fill = NA, color = "#0c0c0c", size = 0.6),
legend.position = "bottom"
)
theme_set(theme479)
set.seed(479)
moons <- read_csv("https://uwmadison.box.com/shared/static/kdt9qqvonhcz2ssb599p1nqganrg1w6k.csv")
ggplot(moons, aes(X, Y, col = Class)) +
geom_point() +
scale_color_brewer(palette = "Set2")
moons_ <- recipe(~ ., data = moons) %>%
update_role(Class, new_role = "id")
pca_rec <- step_pca(moons_, all_predictors(), num_comp = 1)
umap_rec <- step_umap(moons_, all_predictors(), num_comp = 1)
scores <- bind_cols(
prep(umap_rec) %>% juice() %>% mutate(umap_1 = scale(umap_1)),
prep(pca_rec) %>% juice() %>% select(-Class)
) %>%
pivot_longer(-Class, names_to = "method")
ggplot(scores, aes(value, method, col = Class)) +
geom_point(position = position_jitter(h = 0.1), alpha = 0.8) +
scale_color_brewer(palette = "Set2")
taxa
variable.step_log
. The rest of the definition is like the 2D example above.antibiotic_ <- recipe(~ ., data = antibiotic) %>%
update_role(sample:antibiotic, new_role = "id") %>%
step_log(all_predictors(), offset = 1) %>%
step_normalize(all_predictors())
pca_rec <- step_pca(antibiotic_, all_predictors())
pca_prep <- prep(pca_rec)
scores <- juice(pca_prep)
variances <- tidy(pca_prep, 2, type = "variance")
ggplot(scores, aes(PC1, PC2, col = antibiotic)) +
geom_point(aes(shape = ind), size = 1.5) +
geom_text_repel(aes(label = sample), check_overlap = TRUE, size = 3) +
coord_fixed(sqrt(variances$value[2] / variances$value[1])) +
scale_color_brewer(palette = "Set2")
components_ <- tidy(pca_prep, 3) %>%
filter(component %in% str_c("PC", 1:6)) %>%
mutate(terms_ = reorder_within(terms, abs(value), component)) %>%
group_by(component) %>%
top_n(20, abs(value)) %>%
left_join(taxa)
ggplot(components_, aes(value, terms_, fill = Phylum)) +
geom_col() +
facet_wrap(~ component, scales = "free_y") +
scale_y_reordered() +
labs(y = NULL) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text = element_text(size = 5))
components
with which to interpret the different axes. Instead, a typical approach to interpret the representation is to find points that are close together (e.g., using \(K\)-means) and take their average species profile.umap_rec <- step_umap(antibiotic_, all_predictors(), min_dist = 1.5)
umap_prep <- prep(umap_rec)
scores <- juice(umap_prep)
ggplot(scores, aes(umap_1, umap_2, col = antibiotic)) +
geom_point(aes(shape = ind), size = 1.5) +
geom_text_repel(aes(label = sample), max.overlaps = 10) +
scale_color_brewer(palette = "Set2")
fashion_ <- recipe(~ ., data = fashion) %>%
update_role(label, image, new_role = "id")
pca_rec <- step_pca(fashion_, all_predictors())
pca_prep <- prep(pca_rec)
scores <- juice(pca_prep) %>%
rename(x = PC1, y = PC2)
ggplot(scores, aes(x, y, col = label)) +
geom_point() +
scale_color_brewer(palette = "Set3") +
coord_fixed()
pivot_scores(scores, fashion) %>%
overlay_images()
umap_rec <- step_umap(fashion_, all_predictors(), num_comp = 2, min_dist = 0.5)
umap_prep <- prep(umap_rec)
scores <- juice(umap_prep) %>%
rename(x = umap_1, y = umap_2)
ggplot(scores, aes(x, y, col = label)) +
geom_point() +
scale_color_brewer(palette = "Set3") +
coord_fixed()
pivot_scores(scores, fashion, scale_factor = 0.05) %>%
overlay_images(scale_factor = 0.05)