Once we’ve fit a topic model, how should we inspect it?
Reading 1 and 2, Recording, Rmarkdown
library(tidyverse)
library(ggrepel)
library(superheat)
library(tidytext)
library(topicmodels)
theme479 <- theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f7f7f7"),
panel.border = element_rect(fill = NA, color = "#0c0c0c", size = 0.6),
legend.position = "bottom"
)
theme_set(theme479)
ggplot(topics %>% filter(beta > 3e-4), aes(term, beta)) +
geom_col() +
facet_grid(topic ~ .) +
theme(axis.text.x = element_blank())
topics %>%
filter(beta > 3e-4) %>%
pivot_wider(names_from = "term", values_from = "beta", values_fill = 0) %>%
select(-1) %>%
superheat(
pretty.order.cols = TRUE,
legend = FALSE
)
slice_max
function, after
first grouping by topic. Then, we use the same reorder_within
function from
the PCA lectures to reorder words within each topic. The resulting plot is much
more interpretable. Judging from the words that are common in each topic’s
distribution, we can guess that the topics approximately correspond to: 1 ->
Great Expectations, 2 -> 20,000 Leagues Under the Sea, 3 -> Pride & Prejudice, 4
-> War of the Worlds.top_terms <- topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
mutate(term = reorder_within(term, beta, topic))
ggplot(top_terms, aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_fill_brewer(palette = "Set2") +
scale_y_reordered()
p <- seq(0.01, .99, length.out = 50)
df <- expand.grid(p, p) %>%
mutate(D = kl_div(Var1, Var2))
ggplot(df, aes(Var2, Var1)) +
geom_tile(aes(col = D, fill = D)) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
coord_fixed() +
scale_color_distiller(direction = 1) +
scale_fill_distiller(direction = 1) +
labs(
y = expression(beta[kw]),
x = expression(beta[lw])
)
discriminative_terms <- topics %>%
group_by(term) %>%
mutate(D = discrepancy(beta)) %>%
ungroup() %>%
slice_max(D, n = 200) %>%
mutate(term = fct_reorder(term, -D))
discriminative_terms %>%
pivot_wider(names_from = "topic", values_from = "beta") %>%
column_to_rownames("term") %>%
select(-D) %>%
superheat(
pretty.order.rows = TRUE,
left.label.size = 1.5,
left.label.text.size = 3,
bottom.label.size = 0.05,
legend = FALSE
)
memberships <- memberships %>%
mutate(
book = str_extract(document, "[^_]+"),
topic = factor(topic)
)
ggplot(memberships, aes(topic, gamma)) +
geom_boxplot() +
facet_wrap(~book)
ggplot(memberships, aes(topic, gamma, col = book)) +
geom_point(position = position_jitter(h = 0.05, w = 0.3)) +
geom_text_repel(aes(label = document), size = 3) +
facet_wrap(~ book) +
scale_color_brewer(palette = "Set1")
pretty.order.rows
. The takeaways here are similar to those in the
scatterplot above.gamma <- memberships %>%
pivot_wider(names_from = topic, values_from = gamma)
hclust_result <- hclust(dist(gamma[, 3:6]))
document_order <- gamma$document[hclust_result$order]
memberships <- memberships %>%
mutate(document = factor(document, levels = document_order))
ggplot(memberships, aes(gamma, document, fill = topic, col = topic)) +
geom_col(position = position_stack()) +
facet_grid(book ~ ., scales = "free", space = "free") +
scale_x_continuous(expand = c(0, 0)) +
scale_fill_brewer(palette = "Set2") +
scale_color_brewer(palette = "Set2") +
theme(axis.text.y = element_blank())
Color is in general harder to compare than bar height.↩︎
For attribution, please cite this work as
Sankaran (2022, Dec. 31). STAT 436 (Spring 2023): Visualizing Topic Models. Retrieved from https://krisrs1128.github.io/stat436_s23/website/stat436_s23/posts/2022-12-27-week11-3/
BibTeX citation
@misc{sankaran2022visualizing, author = {Sankaran, Kris}, title = {STAT 436 (Spring 2023): Visualizing Topic Models}, url = {https://krisrs1128.github.io/stat436_s23/website/stat436_s23/posts/2022-12-27-week11-3/}, year = {2022} }