Scrape basketball reference boxscores and output specific list elements to dataframe in R

57 Views Asked by At

I'm looking to scrape each of the boxscores for each game from basketball reference, as an example here:

https://www.basketball-reference.com/boxscores/202402220CHI.html

The tables are shown for different periods of the game, e.g. Q1, Q2, Q3, Q4 and you can see these by clicking on the various options:

enter image description here

My goal is to get each of these tables (for EACH TEAM on EACH DAY of the season), join them together, then specify which period the table applies to, e.g. Q1, Q2, etc. most likely by adding a column that says "Q1", "Q2", etc.

My attempt so far:


    library(rvest)
    library(tidyverse)
    
    ##sample only - ultimately this will include all teams and all months and days
    
    month <- c('02')
    year <- c('2024')
    day <- c('220','270','280')   
    team <- c('CHI')

    make_url <- function(team, year, month, day) {
      paste0(
        'https://www.basketball-reference.com/boxscores/', year, month, day, team, '.html'
      )
    }
    
    dates <- expand.grid(
      team = team, year = year, month = month, day = day
    )
    
    urls <- dates |>
      mutate(
        url = make_url(team, year, month, day),
        team = team,
        date = paste(year, month, gsub('.{1}$', '', day), sep = '-'),
        .keep = 'unused'
      )
    
    
    scrape_table <- function(url) {
      page_html <- url %>%
        rvest::read_html()  
      
      page_html %>%
        rvest::html_nodes("table") %>%
        rvest::html_table(header = FALSE)
      
    }
    
    
    safe_scrape_table <- purrr::safely(scrape_table)
    
    tbl_scrape <- purrr::map(urls$url, \(url) {
      Sys.sleep(5)
      safe_scrape_table(url)
    }) |>
      set_names(paste(urls$team, urls$date, sep = '-'))
    
    
    final_result <- tbl_scrape |>
      purrr::transpose() |>
      pluck('result')

enter image description here

This is where I am stuck. I can see that list[[1]] and list[[9]] are the Game outputs, list[[2]] and list[[10]] are the Q1 outputs, and so on.

How can I only get the ones that I need and bind them altogether? I only require Q1, Q2, Q3 and Q4.

I also need to add a column that is effectively the title of each, e.g. 'CHI-2024-02-22' so I know which game these stats relate to.

Lastly, I am looking to add two columns, one for the home team and one for the away team. I know these details appear on each page, but I cannot figure out how to get them?

1

There are 1 best solutions below

5
Dave2e On BEST ANSWER

Try this. Inside the function "getPageTable", I read the game date and the table headings and all of the tables on the page. I filter out only the box scores for the quarters 1, 2, 3, 4. and remove the headings off the rows and then add on the game date and table heading onto the table and then merge into 1 data frame per game.
See the comments for additional details.

Edited

library(rvest)
library(dplyr)
library(tidyr)

##sample only - ultimately this will include all teams and all months and days
month <- c('02')
year <- c('2024')
day <- c('220','270','280')   
team <- c('CHI')

make_url <- function(team, year, month, day) {
   paste0('https://www.basketball-reference.com/boxscores/', year, month, day, team, '.html')
}

dates <- expand.grid(team = team, year = year, month = month, day = day)

urls <- dates |>
   mutate( url = make_url(team, year, month, day),
      team = team,
      date = paste(year, month, gsub('.{1}$', '', day), sep = '-'),
      .keep = 'unused'
   )

getPageTable <- function(url) {
   #read the page
   page <- read_html(url)

   #get the game's date
   gamedate <- page %>% html_element("div.scorebox_meta div") %>% html_text2()
   
   #get game title
   gameInfo <- page %>% html_elements("div.box h1") %>% html_text()
   #get the table headings
   headings <- page %>% html_elements("div.section_wrapper") %>% html_element("h2") %>% html_text()
   
   #find the quarter scores
   quarters <- grep("Q[1|2|3|4]", headings)
   
   #retrieve the tables from the page
   tables <- page %>% html_elements("div.section_wrapper") %>% html_element("table") 

   #select the desired headings and tables
   headings <- headings[quarters]
   tables <- tables[quarters] %>% html_table(header=FALSE)

   #add game date and team name/quater to the results
   tables <- lapply(1:length(tables), function(i) {
      #set column titles to second row
      names(tables[[i]]) <- tables[[i]][2,]
     tables[[i]] <- tables[[i]][-c(1:2),]  
      tables[[i]]$gamedate <- gamedate
      tables[[i]]$team <- headings[i]
      tables[[i]]$title <- gameInfo
      tables[[i]]
   })
   #merge the quarterly status into 1 dataframe
   df <- bind_rows(tables)
   df <- df %>% filter("Starters" != "Reserves"  | "Starters" != "Team Totals" )
   df
}


#loop through the URLS
dfs <- lapply(urls$url, getPageTable)
#merge into one big table
finalResult <- bind_rows(dfs)
finalResult <- finalResult %>% separate("team", into=c("team", "quarter"), " \\(")
finalResult$quarter <- sub("\\)", "", finalResult$quarter)