parallel reading in R

21 Views Asked by At

I'm trying to read many excel files. Each of them is quite large, and they have +200 sheets each. So I'm trying to do this the fastest way possible

library(foreach)
library(doParallel)
library(readxl)  # Load the readxl package here
library(readxl)  
library(dplyr)
library(tidyr)
library(foreach)
library(doParallel)
library(multidplyr)

# Number of cores to use
num_cores <- 4  # Adjust this based on your system

# Register parallel backend
cl <- makeCluster(num_cores)
registerDoParallel(cl)

# Initialize a list to store the results
int_data <- list()

# Parallel loop through each sheet and read it into a separate dataframe
foreach(sheet_name = sheet_names) %dopar% {
  print(sheet_name)
  
  # Load the readxl package in the parallel worker
  library(readxl)
  
  # Read the sheet into a dataframe
  df <- readxl::read_excel(excel_file, sheet = sheet_name, col_names = FALSE)
  df <- as.data.frame(t(df))
  
  # Calculate number of dates and generate date sequence
  no_dates <- nrow(df)
  dates <- rev(seq(as.Date("2024-01-01"), by = "-1 day", length = no_dates))
  
  # Combine data and dates, and set column names
  df <- cbind(df, dates)
  colnames(df) <- c("int_rate", "date")
  
  # Merge dataframes based on common variable "int_rate" and "Symbol"
  merged_df <- merge(df, references_int[, c("Symbol", "Market", "Name")], by.x = "int_rate", by.y = "Symbol", all.x = TRUE)
  merged_df <- merged_df %>%
    arrange(date) %>%
    fill(c(Market, Name), .direction = "down")
  
  # Remove incomplete cases
  merged_df <- merged_df[complete.cases(merged_df), ]
  
  # Extract unique market and indicator names
  this_name <- unique(merged_df$Market)
  indicator_name <- unique(merged_df$Name)
  this_elem=paste(this_name, indicator_name, sep="_")
  
  # Remove unnecessary columns, convert int_rate to numeric, and set column names
  merged_df <- merged_df[-1, -c(3, 4)]
  merged_df$int_rate <- as.numeric(merged_df$int_rate)
  colnames(merged_df) <- c(indicator_name, "date")
  
  # Store the dataframe in the list
 int_data[[this_elem]] = merged_df
remove(merged_df)
remove(df)
}

# Stop the parallel backend
stopCluster(cl)
registerDoSEQ()  # Restore sequential execution

however, when I run this code I get the error that the %>% was not found. Any ideas how to solve it? The idea is pretty simple: need to be able to read as many sheets at the same time as possible.

0

There are 0 best solutions below