Once we’ve fit a topic model, how should we inspect it?
Reading 1 and 2, Recording, Rmarkdown
library(tidyverse)
library(ggrepel)
library(superheat)
library(tidytext)
library(topicmodels)
theme479 <- theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f7f7f7"),
panel.border = element_rect(fill = NA, color = "#0c0c0c", size = 0.6),
legend.position = "bottom"
)
theme_set(theme479)
ggplot(topics %>% filter(beta > 3e-4), aes(term, beta)) +
geom_col() +
facet_grid(topic ~ .) +
theme(axis.text.x = element_blank())
Figure 1: A faceted barplot view of the original topic distributions, with only very limited filtering.
topics %>%
filter(beta > 3e-4) %>%
pivot_wider(names_from = "term", values_from = "beta", values_fill = 0) %>%
select(-1) %>%
superheat(
pretty.order.cols = TRUE,
legend = FALSE
)
Figure 2: An equivalent heatmap view of the above faceted barplot.
slice_max
function, after
first grouping by topic. Then, we use the same reorder_within
function from
the PCA lectures to reorder words within each topic. The resulting plot is much
more interpretable. Judging from the words that are common in each topic’s
distribution, we can guess that the topics approximately correspond to: 1 ->
Great Expectations, 2 -> 20,000 Leagues Under the Sea, 3 -> Pride & Prejudice, 4
-> War of the Worlds.top_terms <- topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
mutate(term = reorder_within(term, beta, topic))
ggplot(top_terms, aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_fill_brewer(palette = "Set2") +
scale_y_reordered()
Figure 3: The top words associated with the four fitted topics from the Great Library Heist example.
p <- seq(0.01, .99, length.out = 50)
df <- expand.grid(p, p) %>%
mutate(D = kl_div(Var1, Var2))
ggplot(df, aes(Var2, Var1)) +
geom_tile(aes(col = D, fill = D)) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
coord_fixed() +
scale_color_distiller(direction = 1) +
scale_fill_distiller(direction = 1) +
labs(
y = expression(beta[kw]),
x = expression(beta[lw])
)
Figure 4: An illustration of the formula used for computing a word’s discrimination between topics. The value of D is large when topic k has much larger probability than topic l.
discriminative_terms <- topics %>%
group_by(term) %>%
mutate(D = discrepancy(beta)) %>%
ungroup() %>%
slice_max(D, n = 200) %>%
mutate(term = fct_reorder(term, -D))
discriminative_terms %>%
pivot_wider(names_from = "topic", values_from = "beta") %>%
column_to_rownames("term") %>%
select(-D) %>%
superheat(
pretty.order.rows = TRUE,
left.label.size = 1.5,
left.label.text.size = 3,
bottom.label.size = 0.05,
legend = FALSE
)
Figure 5: A heatmap of the terms that are most discriminative across the four topics.
memberships <- memberships %>%
mutate(
book = str_extract(document, "[^_]+"),
topic = factor(topic)
)
ggplot(memberships, aes(topic, gamma)) +
geom_boxplot() +
facet_wrap(~book)
Figure 6: A boxplot of the document memberships. It seems that most documents are definitively assigned to one of the four topics.
ggplot(memberships, aes(topic, gamma, col = book)) +
geom_point(position = position_jitter(h = 0.05, w = 0.3)) +
geom_text_repel(aes(label = document), size = 3) +
facet_wrap(~ book) +
scale_color_brewer(palette = "Set1")
Figure 7: A jittered scatterplot of the topic memberships associated with each document.
pretty.order.rows
. The takeaways here are similar to those in the
scatterplot above.gamma <- memberships %>%
pivot_wider(names_from = topic, values_from = gamma)
hclust_result <- hclust(dist(gamma[, 3:6]))
document_order <- gamma$document[hclust_result$order]
memberships <- memberships %>%
mutate(document = factor(document, levels = document_order))
ggplot(memberships, aes(gamma, document, fill = topic, col = topic)) +
geom_col(position = position_stack()) +
facet_grid(book ~ ., scales = "free", space = "free") +
scale_x_continuous(expand = c(0, 0)) +
scale_fill_brewer(palette = "Set2") +
scale_color_brewer(palette = "Set2") +
theme(axis.text.y = element_blank())
Figure 8: A structure plot view of each chapter’s topic memberships.
For attribution, please cite this work as
Sankaran (2024, Jan. 7). STAT 436 (Spring 2024): Visualizing Topic Models. Retrieved from https://krisrs1128.github.io/stat436_s24/website/stat436_s24/posts/2024-12-27-week11-3/
BibTeX citation
@misc{sankaran2024visualizing, author = {Sankaran, Kris}, title = {STAT 436 (Spring 2024): Visualizing Topic Models}, url = {https://krisrs1128.github.io/stat436_s24/website/stat436_s24/posts/2024-12-27-week11-3/}, year = {2024} }