I am running into an error while trying to make a corpus object from the tm package in R.
The data have been scraped from a website and I have included the full code below so you can run and see how the data were gathered and the tibble was created. The very last line of code is where I am getting stuck! (I have modified the loop so it should run in a few seconds).
Any help would be appreciated. :)
library(tidyverse)
library(rvest)
##########################################
# WEB SCRAPING FROM SCHOLARLYKITCHEN.COM #
##########################################
# create loop that iteratively adds page numbers onto
# keep the loop numbers small for testing before full data is pulled in
output <- character()
for (i in 1:2) {
article.links <- paste0("https://scholarlykitchen.sspnet.org/archives/page/", i ,"/") %>%
read_html() %>%
html_nodes(".list-article__title") %>%
html_nodes("a") %>%
html_attr("href")
output <- c(output, article.links)
}
# get all comments
get.comments <- function(output) {
article.page <- read_html(output)
article.comments <- article.page %>% html_nodes(".comment") %>% html_text() %>% trimws(which = "both")
return(article.comments)
}
text <- sapply(output, FUN = get.comments, USE.NAMES = FALSE)
# get all dates
get.dates <- function(output) {
article.page <- read_html(output)
article.comments <- article.page %>% html_nodes(".comment__meta__date") %>% html_text() %>% trimws(which = "both")
return(article.comments)
}
dates <- sapply(output, FUN = get.dates, USE.NAMES = FALSE)
# create the made df for the analysis
df <- tibble(
text = unlist(text, recursive = TRUE), # unlist is needed because sapply (for some reason) creates a list
dates = unlist(dates, recursive = TRUE)
)
# extract dates from meta data
df$dates <- as.character(gsub(",","",df$dates))
df$dates <- as.Date(df$dates, "%B%d%Y")
###################
# TOPIC MODELLING #
###################
library(tm)
library(topicmodels)
# create df ready for topic modelling
# this needs to have very specifically names columns
df.tm <- df[-2] # create dupelicate for backup (dates not needed for topic modelling yet)
df.tm$doc_id <- row.names(df) # create a unique id for each row as is needed by the tm package
df.tm <- df.tm[c(2,1)] # reorders the columns
# From the comments text, create the corpus
corpus <- VCorpus(DataframeSource(df))
Error is the below
Error in DataframeSource(df) :
all(!is.na(match(c("doc_id", "text"), names(x)))) is not TRUE
DataframeSource()requires the df to have a document index in its first column, and it must be labeled "doc_id".Try: