In-class 5

pacman::p_load(tidytext, tidyverse, readtext, quanteda, ggwordcloud)

# Glob pattern matching every file in the mc1 articles folder.
articles <- "../data/mc1/articles/*"

# Read the matched files into a readtext object (doc_id and text columns
# are used further down), then build a quanteda corpus from it.
text_data <- readtext(file = articles)
corpus_text <- corpus(text_data)

# Print the corpus summary for the first 5 documents only.
summary(corpus_text, n = 5)

Corpus consisting of 338 documents, showing 5 documents:

                                   Text Types Tokens Sentences
 Alvarez PLC__0__0__Haacklee Herald.txt   206    433        18
    Alvarez PLC__0__0__Lomark Daily.txt   102    170        12
   Alvarez PLC__0__0__The News Buoy.txt    90    200         9
 Alvarez PLC__0__1__Haacklee Herald.txt    96    187         8
    Alvarez PLC__0__1__Lomark Daily.txt   241    504        21

# Split each article's text into one token per row (column `word`), then
# keep tokens whose last character is a lowercase letter or an apostrophe
# and drop anything listed in the tidytext stop_words lexicon.
usenet_words <- text_data %>%
  unnest_tokens(output = word, input = text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  filter(!word %in% stop_words$word)

# Overall word frequencies across all articles, most frequent first.
count(usenet_words, word, sort = TRUE)

readtext object consisting of 3260 documents and 0 docvars.
# A data frame: 3,260 × 3
  word             n text     
  <chr>        <int> <chr>    
1 fishing       2177 "\"\"..."
2 sustainable   1525 "\"\"..."
3 company       1036 "\"\"..."
4 practices      838 "\"\"..."
5 industry       715 "\"\"..."
6 transactions   696 "\"\"..."
# ℹ 3,254 more rows

# Word frequencies per document, most frequent first. usenet_words is not
# grouped, so count() hands back an ungrouped result and no ungroup() step
# is required.
words_by_doc_id <- count(usenet_words, doc_id, word, sort = TRUE)

# Derive two columns from the file name stored in doc_id, e.g.
# "Alvarez PLC__0__0__Haacklee Herald.txt":
#   Company       - leading run of non-underscore characters ("Alvarez PLC")
#   News_Agencies - run of non-underscore characters that follows "__" and
#                   sits directly before ".txt" ("Haacklee Herald")
text_data_split <- mutate(
  text_data,
  Company = str_extract(doc_id, "^[^_]+"),
  News_Agencies = str_extract(doc_id, "(?<=__)[^_]+(?=\\.txt)")
)

# Alternative split of doc_id into two columns, X and Y, on the literal
# delimiter "__0__". Rows that yield fewer than two pieces are aligned to
# the end, i.e. the single piece lands in Y and X is left missing.
# NOTE(review): for an id such as "Alvarez PLC__0__1__Haacklee Herald.txt"
# this leaves Y as "1__Haacklee Herald.txt" (the second counter stays
# attached) - confirm that is the intended result.
text_data_splitted <- separate_wider_delim(
  text_data,
  cols = "doc_id",
  delim = "__0__",
  names = c("X", "Y"),
  too_few = "align_end"
)

# Same tokenising and filtering as for usenet_words, but starting from
# text_data_split so every token row carries Company and News_Agencies.
usenet_words1 <- text_data_split %>%
  unnest_tokens(output = word, input = text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  filter(!word %in% stop_words$word)

# Word frequencies per news agency, most frequent first. usenet_words1 is
# not grouped, so count() hands back an ungrouped result and no ungroup()
# step is required.
words_by_news_agencies <- count(
  usenet_words1,
  News_Agencies, word,
  sort = TRUE
)