# In-class 5
# Load required packages; pacman::p_load() installs any that are missing
# before attaching them.
pacman::p_load(tidytext, tidyverse, readtext, quanteda, ggwordcloud)
# Read every article file under the MC1 articles folder (glob pattern) and
# build a quanteda corpus from the resulting readtext object.
articles <- "../data/mc1/articles/*"
text_data <- readtext(articles)
corpus_text <- corpus(text_data)
# Show summary statistics for the first 5 documents in the corpus.
summary(corpus_text, 5)
# Output (for reference):
# Corpus consisting of 338 documents, showing 5 documents:
#   Text                                   Types Tokens Sentences
#   Alvarez PLC__0__0__Haacklee Herald.txt   206    433        18
#   Alvarez PLC__0__0__Lomark Daily.txt      102    170        12
#   Alvarez PLC__0__0__The News Buoy.txt      90    200         9
#   Alvarez PLC__0__1__Haacklee Herald.txt    96    187         8
#   Alvarez PLC__0__1__Lomark Daily.txt      241    504        21
# Tokenise the articles into individual words, then keep only word-like
# tokens (ending in a lowercase letter or apostrophe, which drops numbers
# and stray punctuation tokens) that are not stop words.
usenet_words <- text_data %>%
  unnest_tokens(word, text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  filter(!word %in% stop_words$word)
# Most frequent words across all articles.
usenet_words %>%
  count(word, sort = TRUE)
# Output (for reference):
# readtext object consisting of 3260 documents and 0 docvars.
# # A data frame: 3,260 x 3
#   word         n     text
#   <chr>        <int> <chr>
# 1 fishing      2177  "\"\"..."
# 2 sustainable  1525  "\"\"..."
# 3 company      1036  "\"\"..."
# 4 practices     838  "\"\"..."
# 5 industry      715  "\"\"..."
# 6 transactions  696  "\"\"..."
# # i 3,254 more rows
# Word counts per document, most frequent first.
# NOTE: ungroup() is a no-op here since count() on an ungrouped input
# returns an ungrouped result; kept out for clarity.
words_by_doc_id <- usenet_words %>%
  count(doc_id, word, sort = TRUE)

# Split metadata out of the file names, which look like
# "Alvarez PLC__0__0__Haacklee Herald.txt":
#   Company       = everything before the first underscore
#   News_Agencies = the segment between the last "__" and ".txt"
text_data_split <- text_data %>%
  mutate(
    Company = str_extract(doc_id, "^[^_]+"),
    News_Agencies = str_extract(doc_id, "(?<=__)[^_]+(?=\\.txt)")
  )

# Alternative split on the fixed "__0__" delimiter; align_end keeps the
# trailing piece in Y when fewer than two parts are produced.
text_data_splitted <- text_data %>%
  separate_wider_delim("doc_id",
    delim = "__0__",
    names = c("X", "Y"),
    too_few = "align_end"
  )
# Re-tokenise from the split data so each word row carries the Company and
# News_Agencies columns; apply the same word-shape and stop-word filters
# as for usenet_words.
usenet_words1 <- text_data_split %>%
  unnest_tokens(word, text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  filter(!word %in% stop_words$word)
# Word counts per news agency, most frequent first.
# count() on an ungrouped input already returns an ungrouped tibble, so the
# original trailing ungroup() was redundant and has been removed.
words_by_news_agencies <- usenet_words1 %>%
  count(News_Agencies, word, sort = TRUE)