# Load every package used in this exercise (pacman::p_load also installs
# any that are missing).
pacman::p_load(tidytext, widyr, wordcloud, DT, ggwordcloud, textplot, lubridate, hms,
               tidyverse, tidygraph, ggraph, igraph)

# Hands-on 5
# Root directory of the newsgroup data set; it is listed with dir() when the
# corpus is loaded.
news20 <- "../data/20news/"

# Read every file in a directory into one long tibble.
#
# infolder: path of the directory whose files should be read.
#
# Returns a tibble with one row per line of text and two columns:
#   id   - base name of the file the line came from
#   text - the line itself
read_folder <- function(infolder) {
  tibble(file = dir(infolder,
                    full.names = TRUE)) %>%
    # One list element per file, each holding that file's lines.
    mutate(text = map(file,
                      read_lines)) %>%
    transmute(id = basename(file),
              text) %>%
    # Expand the list column so every line becomes its own row.
    unnest(text)
}
# Apply read_folder() to every entry directly under `news20` and stack the
# results: one row per line of text, labelled with `newsgroup` (the folder's
# base name) and `id` (the file's base name).
raw_text <- tibble(folder =
                     dir(news20,
                         full.names = TRUE)) %>%
  mutate(folder_out = map(folder,
                          read_folder)) %>%
  unnest(cols = c(folder_out)) %>%
  transmute(newsgroup = basename(folder),
            id, text)

# Save the parsed corpus to disk.
write_rds(raw_text, "../data/rds/news20.rds")

# Read the saved copy back in.
raw_text <- read_rds("../data/rds/news20.rds")
# Horizontal bar chart of how many distinct messages each newsgroup holds.
raw_text %>%
  # Collapse to one row per (newsgroup, message) pair, then tally per group.
  distinct(newsgroup, id) %>%
  count(newsgroup, name = "messages") %>%
  ggplot(mapping = aes(x = messages, y = newsgroup)) +
  geom_col(fill = "lightblue") +
  labs(y = NULL)
# Within each message keep only the lines from the first empty line onwards
# and before the first line that starts with "--".
# NOTE(review): presumably this removes the message header and the
# signature block - confirm against the raw files.
cleaned_text <- raw_text %>%
  group_by(newsgroup, id) %>%
  filter(cumsum(text == "") > 0,
         cumsum(str_detect(
           text, "^--")) == 0) %>%
  ungroup()

# Keep empty lines and lines that contain a letter or digit preceded only by
# non-">" characters (so lines starting with ">" are dropped); also drop
# lines ending in "writes:" / "writes..." and lines starting "In article <".
cleaned_text <- cleaned_text %>%
  filter(str_detect(text, "^[^>]+[A-Za-z\\d]")
         | text == "",
         !str_detect(text,
                     "writes(:|\\.\\.\\.)$"),
         !str_detect(text,
                     "^In article <")
  )

# Tokenise into one word per row, keep tokens ending in a lowercase letter
# or an apostrophe, and remove the words listed in tidytext's `stop_words`.
usenet_words <- cleaned_text %>%
  unnest_tokens(word, text) %>%
  filter(str_detect(word, "[a-z']$"),
         !word %in% stop_words$word)
# Word frequencies over the whole corpus, most frequent first.
usenet_words %>%
  count(word, sort = TRUE)
#> # A tibble: 5,542 × 2
#>    word           n
#>    <chr>      <int>
#>  1 people        57
#>  2 time          50
#>  3 jesus         47
#>  4 god           44
#>  5 message       40
#>  6 br            27
#>  7 bible         23
#>  8 drive         23
#>  9 homosexual    23
#> 10 read          22
#> # ℹ 5,532 more rows
# Word counts per newsgroup: one row per (newsgroup, word) pair, largest
# counts first.
words_by_newsgroup <- usenet_words %>%
  count(newsgroup, word, sort = TRUE) %>%
  ungroup()

# Word cloud of up to 300 entries sized by count.
# NOTE(review): the table has one row per (newsgroup, word), so the same word
# can be passed to wordcloud() more than once - consider aggregating by word
# first if one entry per word is wanted.
wordcloud(words_by_newsgroup$word,
          words_by_newsgroup$n,
          max.words = 300)
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): homosexuals could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): punishment could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): women could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): deaths could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): plaintext could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jesus could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): science could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bits could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): generator could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): client_data could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): sexual could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): stream could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): display could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): canuck could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): eeeee could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): osfselect could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): message could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): warning could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): xtvacreatemanagedwidget could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): wm_save_yourself could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): matthew could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): disciples could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): car could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): people could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): address could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): innocent could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): john could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): prophet could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): notice_shell could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): eternal could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): ideal could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): agree could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): cast could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jets could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): drive could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jews could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): angels could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): earth could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): peter could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): urban could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): people could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): mark could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): christians could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): special could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jeruselem could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): keysym could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): random could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): eternity could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): islam could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bit could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): mac could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bible could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): truth could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): book could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): cock could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): steve could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): wicked could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): book could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): god could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): null could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): serdar could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): looked could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): relationship could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): systems could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): osfcancel could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): hinnom could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): drawn could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): window could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): human could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): thou could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): application could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): told could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): armenians could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): change could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): adults could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): sink could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): rev could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bike could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): post could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): modem could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): canucks could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): fire could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): doctor could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): software could not be fit on page. It will not be plotted.

# Add tf, idf and tf-idf columns, treating each newsgroup as one document,
# and sort so the highest tf-idf rows come first.
tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(word, newsgroup, n) %>%
  arrange(desc(tf_idf))

# Interactive table with per-column filters; the three score columns are
# rounded to 3 decimals and row height is reduced.
DT::datatable(tf_idf, filter = 'top') %>%
  formatRound(columns = c('tf', 'idf',
                          'tf_idf'),
              digits = 3) %>%
  formatStyle(0,
              target = 'row',
              lineHeight='25%')

# Top 12 words by tf-idf in each "sci." newsgroup, one facet per group.
tf_idf %>%
  filter(str_detect(newsgroup, "^sci\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf,
            n = 12) %>%
  ungroup() %>%
  # Order words inside each facet separately: a plain reorder() builds one
  # global ordering, which misplaces any word that appears in several facets.
  mutate(word = reorder_within(word,
                               tf_idf,
                               newsgroup)) %>%
  ggplot(aes(tf_idf,
             word,
             fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  # Strip the facet suffix that reorder_within() appends to the labels.
  scale_y_reordered() +
  facet_wrap(~ newsgroup,
             scales = "free") +
  labs(x = "tf-idf",
       y = NULL)
# Pairwise correlation between newsgroups computed from their word counts,
# strongest correlations first. (Computed once; the identical pipeline was
# previously run twice in a row.)
newsgroup_cors <- words_by_newsgroup %>%
  pairwise_cor(newsgroup,
               word,
               n,
               sort = TRUE)

# Fix the RNG seed before drawing the "fr" layout.
# NOTE(review): presumably so the node positions are reproducible - confirm.
set.seed(2017)

# Network of newsgroups: keep pairs with correlation above 0.025 and map the
# correlation to edge transparency and width.
newsgroup_cors %>%
  filter(correlation > .025) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation,
                     width = correlation)) +
  geom_node_point(size = 6,
                  color = "lightblue") +
  geom_node_text(aes(label = name),
                 color = "red",
                 repel = TRUE) +
  theme_void()
#> Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
#> ℹ Please use the `transform` argument instead.

Reflection:
I learned some basic text-processing techniques in R.