# In-class 6 ----

# Attach the packages used below; pacman::p_load() is called with the bare
# package names corporaexplorer, tidyverse, stringi and rvest.
pacman::p_load(corporaexplorer, tidyverse, stringi, rvest)
# Read the plain-text file at the Project Gutenberg URL (one element per line)
# and collapse it into a single newline-separated string.
bible <- readr::read_lines("http://www.gutenberg.org/cache/epub/10/pg10.txt")
bible <- paste(bible, collapse = "\n")

# Character offsets delimiting the part of the text to keep: from the start of
# the first occurrence of the Genesis heading to the end of the last "Amen.".
start_v <- stri_locate_first_fixed(
  bible, "The First Book of Moses: Called Genesis"
)[1]
end_v <- stri_locate_last_fixed(bible, "Amen.")[2]

# stringi reports NA positions when a pattern is not found; stop here instead
# of letting an NA silently propagate through every later step.
stopifnot(!is.na(start_v), !is.na(end_v))

# Trim everything before the start marker and after the end marker.
bible <- stri_sub(bible, start_v, end_v)
# Split the text wherever five consecutive newlines occur and flatten the
# result to a character vector, then drop element 40.
# NOTE(review): element 40 is presumably a non-book section (e.g. a heading
# between the testaments) -- confirm against the downloaded text.
books <- unlist(stri_split_regex(bible, "\n{5}"))[-40]

# Unwrap hard line breaks inside paragraphs while keeping paragraph breaks:
# runs of 2+ newlines are protected with a placeholder, the remaining single
# newlines become spaces, and the placeholder is turned back into a blank line.
books <- str_replace_all(books, "\n{2,}", "NEW_PARAGRAPH")
books <- str_replace_all(books, "\n", " ")
books <- str_replace_all(books, "NEW_PARAGRAPH", "\n\n")

# Keep elements 3 through 68 (66 entries).
books <- books[3:68]
# Split every book into chapters. A "NEW_CHAPTER" marker is inserted in front
# of each "<digits>:1 " reference (verse 1 of a chapter), each book is split on
# that marker, and the first piece of every split (the text preceding the
# first marker) is discarded. The result is a list with one character vector
# per book.
chapters <- lapply(
  stri_split_regex(
    str_replace_all(books, "(\\d+:1 )", "NEW_CHAPTER\\1"),
    "NEW_CHAPTER"
  ),
  function(pieces) pieces[-1]
)
# Scrape the text of the first-column table cells from the ESV abbreviations
# page and keep cells 13 through 78 (66 entries) as the book titles.
# NOTE(review): the 13:78 window depends on the current layout of that page --
# verify it still lines up with the 66 books.
book_titles <- local({
  page <- read_html("https://www.esv.org/resources/esv-global-study-bible/list-of-abbreviations")
  first_column <- html_text(html_nodes(page, "td:nth-child(1)"))
  first_column[13:78]
})
# Testament label for each of the 66 entries: 39 "Old" followed by 27 "New".
testament <- rep(c("Old", "New"), times = c(39, 27))

# One row per book; Text is a list column holding that book's chapters.
bible_df <- tibble::tibble(
  Text = chapters,
  Book = book_titles,
  Testament = testament
)
# Expand the Text list column so that each chapter gets its own row.
bible_df <- tidyr::unnest(bible_df, Text)

# Build the corporaexplorer object from the chapter-level data frame: the
# corpus is not date based, documents are grouped by "Book", and the
# "Testament" and "Book" columns are passed as document info.
KJB <- prepare_data(
  dataset = bible_df,
  date_based_corpus = FALSE,
  grouping_variable = "Book",
  columns_doc_info = c("Testament", "Book")
)
# Console output recorded from the original run:
#> Starting.
#> Document data frame done.
#> Corpus is not date based. Calendar data frame skipped.
#> Document term matrix: text processed.
#> Document term matrix: tokenising completed.
#> Document term matrix: word list created.
#> Document term matrix done.
#> Done.
# Open the prepared corpus in the corporaexplorer explorer.
explore(KJB)
# Console output recorded from the original (static R Markdown) run; per the
# last message, the Shiny application was not rendered in that document:
#> Exploring 1,175 documents
#> Loading required package: shiny
#> PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.
#> Shiny applications not supported in static R Markdown documents