I'm trying to read many excel files. Each of them is quite large, and they have +200 sheets each. So I'm trying to do this the fastest way possible
library(foreach)
library(doParallel)
library(readxl) # Load the readxl package here
library(readxl)
library(dplyr)
library(tidyr)
library(foreach)
library(doParallel)
library(multidplyr)
# Number of cores to use
num_cores <- 4 # Adjust this based on your system
# Register parallel backend
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Initialize a list to store the results
int_data <- list()
# Parallel loop through each sheet and read it into a separate dataframe
foreach(sheet_name = sheet_names) %dopar% {
print(sheet_name)
# Load the readxl package in the parallel worker
library(readxl)
# Read the sheet into a dataframe
df <- readxl::read_excel(excel_file, sheet = sheet_name, col_names = FALSE)
df <- as.data.frame(t(df))
# Calculate number of dates and generate date sequence
no_dates <- nrow(df)
dates <- rev(seq(as.Date("2024-01-01"), by = "-1 day", length = no_dates))
# Combine data and dates, and set column names
df <- cbind(df, dates)
colnames(df) <- c("int_rate", "date")
# Merge dataframes based on common variable "int_rate" and "Symbol"
merged_df <- merge(df, references_int[, c("Symbol", "Market", "Name")], by.x = "int_rate", by.y = "Symbol", all.x = TRUE)
merged_df <- merged_df %>%
arrange(date) %>%
fill(c(Market, Name), .direction = "down")
# Remove incomplete cases
merged_df <- merged_df[complete.cases(merged_df), ]
# Extract unique market and indicator names
this_name <- unique(merged_df$Market)
indicator_name <- unique(merged_df$Name)
this_elem=paste(this_name, indicator_name, sep="_")
# Remove unnecessary columns, convert int_rate to numeric, and set column names
merged_df <- merged_df[-1, -c(3, 4)]
merged_df$int_rate <- as.numeric(merged_df$int_rate)
colnames(merged_df) <- c(indicator_name, "date")
# Store the dataframe in the list
int_data[[this_elem]] = merged_df
remove(merged_df)
remove(df)
}
# Stop the parallel backend
stopCluster(cl)
registerDoSEQ() # Restore sequential execution
however, when I run this code I get the error that the %>% was not found. Any ideas how to solve it? The idea is pretty simple: need to be able to read as many sheets at the same time as possible.