Once we’ve fit a topic model, how should we inspect it?
Reading 1 and 2, Recording, Rmarkdown
library("dplyr")
library("forcats")
library("ggplot2")
library("ggrepel")
library("readr")
library("stringr")
library("superheat")
library("tibble")
library("tidyr")
library("tidytext")
library("topicmodels")
theme479 <- theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f7f7f7"),
panel.border = element_rect(fill = NA, color = "#0c0c0c", size = 0.6),
legend.position = "bottom"
)
theme_set(theme479)
ggplot(topics %>% filter(beta > 3e-4), aes(term, beta)) +
geom_col() +
facet_grid(topic ~ .) +
theme(axis.text.x = element_blank())
topics %>%
filter(beta > 3e-4) %>%
pivot_wider(topic, term, values_from = "beta", values_fill = 0, names_repair = "unique") %>%
select(-1) %>%
superheat(
pretty.order.cols = TRUE,
legend = FALSE
)
slice_max
function, after first grouping by topic. Then, we use the same reorder_within
function from the PCA lectures to reorder words within each topic. The resulting plot is much more interpretable. Judging from the words that are common in each topic’s distribution, we can guess that the topics approximately correspond to: 1 -> Great Expectations, 2 -> 20,000 Leagues Under the Sea, 3 -> Pride & Prejudice, 4 -> War of the Worlds.top_terms <- topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
mutate(term = reorder_within(term, beta, topic))
ggplot(top_terms, aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_fill_brewer(palette = "Set2") +
scale_y_reordered()
p <- seq(0.01, .99, length.out = 50)
df <- expand.grid(p, p) %>%
mutate(D = kl_div(Var1, Var2))
ggplot(df, aes(Var2, Var1)) +
geom_tile(aes(col = D, fill = D)) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
coord_fixed() +
scale_color_distiller(direction = 1) +
scale_fill_distiller(direction = 1) +
labs(
y = expression(beta[kw]),
x = expression(beta[lw])
)
discriminative_terms <- topics %>%
group_by(term) %>%
mutate(D = discrepancy(beta)) %>%
ungroup() %>%
slice_max(D, n = 200) %>%
mutate(term = fct_reorder(term, -D))
discriminative_terms %>%
pivot_wider(term, topic, values_from = "beta") %>%
column_to_rownames("term") %>%
superheat(
pretty.order.rows = TRUE,
left.label.size = 1.5,
left.label.text.size = 3,
bottom.label.size = 0.05,
legend = FALSE
)
memberships <- memberships %>%
mutate(
book = str_extract(document, "[^_]+"),
topic = factor(topic)
)
ggplot(memberships, aes(topic, gamma)) +
geom_boxplot() +
facet_wrap(~book)
ggplot(memberships, aes(topic, gamma, col = book)) +
geom_point(position = position_jitter(h = 0.05, w = 0.3)) +
geom_text_repel(aes(label = document), size = 3) +
facet_wrap(~ book) +
scale_color_brewer(palette = "Set1")
pretty.order.rows
. The takeaways here are similar to those in the scatterplot above.gamma <- memberships %>%
pivot_wider(document:book, topic, values_from = gamma)
hclust_result <- hclust(dist(gamma[, 3:6]))
document_order <- gamma$document[hclust_result$order]
memberships <- memberships %>%
mutate(document = factor(document, levels = document_order))
ggplot(memberships, aes(gamma, document, fill = topic, col = topic)) +
geom_col(position = position_stack()) +
facet_grid(book ~ ., scales = "free", space = "free") +
scale_x_continuous(expand = c(0, 0)) +
scale_fill_brewer(palette = "Set2") +
scale_color_brewer(palette = "Set2") +
theme(axis.text.y = element_blank())
Color is in general harder to compare than bar height.↩︎