Hands-on 5

pacman::p_load(tidytext, widyr, wordcloud, DT, ggwordcloud, textplot, lubridate, hms,
tidyverse, tidygraph, ggraph, igraph)
# Path to the data folder; each entry directly under it is read as one
# newsgroup when raw_text is built.
news20 <- "../data/20news/"
# Read every file in `infolder` and return a long tibble with one row per line
# of text: `id` holds the file's base name and `text` holds the line itself.
read_folder <- function(infolder) {
  paths <- dir(infolder, full.names = TRUE)
  file_lines <- map(paths, read_lines)

  tibble(id = basename(paths), text = file_lines) %>%
    unnest(text)
}

# Build one tibble for the whole corpus: list the entries under `news20`, read
# each one with read_folder(), and label every line with its newsgroup (the
# entry's base name). Resulting columns: newsgroup, id, text.
raw_text <- tibble(folder = dir(news20, full.names = TRUE)) %>%
  mutate(folder_out = map(folder, read_folder)) %>%
  unnest(cols = folder_out) %>%
  mutate(newsgroup = basename(folder)) %>%
  select(newsgroup, id, text)

# Save the parsed corpus as an RDS file, then load it back from that file.
write_rds(raw_text, "../data/rds/news20.rds")
raw_text <- read_rds("../data/rds/news20.rds")
# Bar chart of how many distinct messages (unique ids) each newsgroup contains.
raw_text %>%
  distinct(newsgroup, id) %>%
  count(newsgroup, name = "messages") %>%
  ggplot(aes(x = messages, y = newsgroup)) +
  geom_col(fill = "lightblue") +
  labs(y = NULL)

# Within each message keep only the body: lines from the first empty line
# onwards, and only lines before the first line that starts with "--".
cleaned_text <- raw_text %>%
  group_by(newsgroup, id) %>%
  mutate(
    past_header = cumsum(text == "") > 0,
    before_signature = cumsum(str_detect(text, "^--")) == 0
  ) %>%
  ungroup() %>%
  filter(past_header, before_signature) %>%
  select(-past_header, -before_signature)
# Keep empty lines plus lines that do not start with ">" and contain a letter
# or digit after their first character; then drop attribution lines
# ("... writes:" / "... writes...") and lines starting with "In article <".
cleaned_text <- cleaned_text %>%
  filter(text == "" | str_detect(text, "^[^>]+[A-Za-z\\d]")) %>%
  filter(!str_detect(text, "writes(:|\\.\\.\\.)$")) %>%
  filter(!str_detect(text, "^In article <"))
# Tokenise into one word per row, keep tokens that end in a lowercase letter
# or an apostrophe, and remove stop words.
usenet_words <- cleaned_text %>%
  unnest_tokens(output = word, input = text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  anti_join(stop_words, by = "word")

# Most frequent words across the whole corpus.
count(usenet_words, word, sort = TRUE)
# A tibble: 5,542 × 2
   word           n
   <chr>      <int>
 1 people        57
 2 time          50
 3 jesus         47
 4 god           44
 5 message       40
 6 br            27
 7 bible         23
 8 drive         23
 9 homosexual    23
10 read          22
# ℹ 5,532 more rows
# Frequency of each word within each newsgroup, most frequent pairs first.
words_by_newsgroup <- usenet_words %>%
  group_by(newsgroup, word) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))
# Word cloud of the 300 most frequent words in the corpus.
# words_by_newsgroup has one row per (newsgroup, word), so passing it straight
# to wordcloud() hands over the same word once per newsgroup and the word is
# drawn (or fails to fit) several times. Collapse to one total per word first.
# The largest font size is reduced from the default so that frequent words are
# less likely to be dropped with "could not be fit on page".
words_by_newsgroup %>%
  count(word, wt = n, name = "total", sort = TRUE) %>%
  with(wordcloud(word,
                 total,
                 max.words = 300,
                 scale = c(3, 0.5)))
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): homosexuals could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): punishment could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): women could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): deaths could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): plaintext could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jesus could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): science could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bits could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): generator could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): client_data could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): sexual could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): stream could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): display could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): canuck could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): eeeee could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): osfselect could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): message could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): warning could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): xtvacreatemanagedwidget could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): wm_save_yourself could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): matthew could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): disciples could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): car could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): people could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): address could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): innocent could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): john could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): prophet could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): notice_shell could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): eternal could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): ideal could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): agree could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): cast could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jets could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): drive could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jews could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): angels could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): earth could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): peter could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): urban could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): people could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): mark could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): christians could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): special could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): jeruselem could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): keysym could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): random could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): eternity could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): islam could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bit could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): mac could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bible could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): truth could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): book could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): cock could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): steve could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): wicked could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): book could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): god could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): null could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): serdar could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): looked could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): relationship could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): systems could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): osfcancel could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): hinnom could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): drawn could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): window could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): human could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): thou could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): application could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): told could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): armenians could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): change could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): adults could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): sink could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): rev could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): bike could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): post could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): modem could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): canucks could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): fire could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): doctor could not be fit on page. It will not be plotted.
Warning in wordcloud(words_by_newsgroup$word, words_by_newsgroup$n, max.words =
300): software could not be fit on page. It will not be plotted.

# Add tf, idf and tf-idf for every word, treating each newsgroup as one
# document, and sort by descending tf-idf.
tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(term = word, document = newsgroup, n = n) %>%
  arrange(-tf_idf)
# Interactive table of the tf-idf results with per-column filters on top,
# the three score columns rounded to 3 digits, and a compact row height.
tf_idf %>%
  DT::datatable(filter = "top") %>%
  DT::formatRound(columns = c("tf", "idf", "tf_idf"), digits = 3) %>%
  DT::formatStyle(columns = 0, target = "row", lineHeight = "25%")
# Top 12 tf-idf words for each "sci." newsgroup, one facet per newsgroup.
# reorder_within() + scale_y_reordered() (tidytext) sort the bars inside each
# facet independently; a plain reorder() builds a single global ordering, so a
# word that appears in more than one facet ends up in the wrong position.
tf_idf %>%
  filter(str_detect(newsgroup, "^sci\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf,
            n = 12) %>%
  ungroup() %>%
  mutate(word = reorder_within(word,
                               tf_idf,
                               newsgroup)) %>%
  ggplot(aes(tf_idf,
             word,
             fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~ newsgroup,
             scales = "free") +
  labs(x = "tf-idf",
       y = NULL)

# Pairwise correlation of word counts between newsgroups: newsgroups are the
# items being compared, words are the features, and `n` is the value.
# Computed once; repeating the identical call only redoes the same work.
newsgroup_cors <- words_by_newsgroup %>%
  pairwise_cor(newsgroup,
               word,
               n,
               sort = TRUE)
# Seed the RNG before the graph layout is computed.
set.seed(2017)

# Network of newsgroups, linked wherever their word-count correlation exceeds
# 0.025; edge width and transparency are both mapped to the correlation.
ggraph(
  graph_from_data_frame(filter(newsgroup_cors, correlation > 0.025)),
  layout = "fr"
) +
  geom_edge_link(aes(alpha = correlation, width = correlation)) +
  geom_node_point(size = 6, color = "lightblue") +
  geom_node_text(aes(label = name), color = "red", repel = TRUE) +
  theme_void()
Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `transform` argument instead.

Reflection:

I learned some basic text-processing techniques in R.