Frequency count and combination analysis discrepancy in R?

112 Views Asked by At

The following code generates data:

pacman::p_load(tidyverse, expss)

# Set the seed for reproducibility
set.seed(123)

# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"

# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"

# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low
                                        2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low
                                        2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low
                                        2 High")

# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
  PTSD = table(df$PTSD, useNA = "ifany"),
  ANX = table(df$ANX, useNA = "ifany"),
  DEP = table(df$DEP, useNA = "ifany")
)

This portion of the code does some frequency count and summarises data:

# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)

# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()

# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
  sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
  variable_names <- c(variable_names, variable_name)
}

# Create summary data frame
summary_df <- data.frame(
  Variable = variable_names,
  N = sample_sizes
)

# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)

# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))

This portion of the code does what I call "combination analysis" (it is based on the accepted answer of this SO thread):

library(dplyr)

out <- df %>%
  mutate(id = row_number())%>%
  tidyr::pivot_longer(PTSD:DEP) %>%
  filter(value == 2)%>%
  summarise(combination = toString(name),.by=id) %>%
  summarise(n = n(), .by = combination)

Printing the frequency count and summary generates this:

> print(final_result)
     Low High  NA   N
PTSD 164  167 159 490
ANX  157  156 177 490
DEP  168  156 166 490

And printing the frequency count and summary generates this:

# A tibble: 7 × 2
  combination        n
  <chr>          <int>
1 ANX               72
2 ANX, DEP          28
3 PTSD              82
4 DEP               76
5 PTSD, DEP         29
6 PTSD, ANX         33
7 PTSD, ANX, DEP    23

What I am really interested in is the "High" frequencies and their combinations (i.e., PTSD == 2, ANX == 2, and DEP == 2).

Therefore, I expected that PTSD High, ANX High, and DEP High be the same between both tables, but this is not the case!

In order to check what the second table (i.e., tibble table) should have shown, I exported the df to a CSV file and imported it in a spreadsheet.

I used the COUNTIFS function (with the following syntax COUNTIFS(criteria_range1, criteria1, [criteria_range2, criteria2]…)) and I got the following table:

Combination        n
--------------------
PTSD             167
ANX              156
DEP              156
PTSD + ANX        56
PTSD + DEP        52
ANX  + DEP        51
PTSD + ANX + DEP  23

My question:

  • Assuming that the results I obtained through the spreadsheet analysis are correct, what would be the code for the "combination analysis" to reflect the same results?
2

There are 2 best solutions below

0
Nir Graham On BEST ANSWER

The following is a reproduction of what you must have done in excel with countif


library(tidyverse)

library(rlang)
t3 <- c("PTSD","ANX","DEP")

(combs <- map(seq_along(t3),\(n)combn(t3,n,simplify = FALSE)) |> flatten())

(filts <- parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & '))))
(filtsnames <- parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + '))))
names(filts) <- filtsnames

(out2 <- map_int(filts,\(x){
     df %>%
  mutate(id = row_number())%>%
    filter(!!(x))%>%
  summarise(
    n = n())
  } |> pull(n)
))

 enframe(out2)
2
Mark On

The problem you had was that (for example) in a row with 'ANX', 'DEP', and 'PTSD' all 2, you were only counting the combination of all three, whereas you should have been counting that, as well as all combinations of 2 and 1. (you can see this by noticing that, for example, PTSD should equal 167, but in your faulty output, 167 is what you get when adding together all rows with PTSD in them, 82 + 23 + 33 + 29)

Update: new method

# helper function
pacman::p_load(tidyverse, RcppAlgos)

b <- df |> 
 mutate(r = row_number()) |> 
 pivot_longer(-r) |>
 filter(value == 2)

foo <- function(cmb) {
  b |>
  filter(if_all(name, ~ . %in% cmb), .by = r) |> 
  distinct(r) |> 
  nrow() |>
  setNames(paste(cmb, collapse = " + ")) |> unlist()
  }

map(seq_along(names(df)), ~ comboGeneral(names(df), .x, FUN = foo)) |>
  unlist() |>
  as.data.frame(x = _) |>
  setNames("n")

Original way

One way of solving this is to create a combination dataframe, and then join it on the original dataframe:

# helper function
pacman::p_load(RcppAlgos, tidyverse)

get_combs <- \(x) map(seq_along(x), ~ comboGeneral(x, .x, FUN = \(a) paste(a, collapse = "_"))) |> unlist()

comb_df <- get_combs(sort(names(df))) |>
    tibble(comb = _ ) |>
    mutate(x = map(comb, ~ get_combs(str_split(.x, "_")[[1]])))

> comb_df
# A tibble: 7 × 2
  comb         x        
  <chr>        <list>   
1 ANX          <chr [1]>
2 DEP          <chr [1]>
3 PTSD         <chr [1]>
4 ANX_DEP      <chr [3]>
5 ANX_PTSD     <chr [3]>
6 DEP_PTSD     <chr [3]>
7 ANX_DEP_PTSD <chr [7]>

Now to join it with our original data:

b |>
 summarise(name = paste(sort(name), collapse = "_"), .by = r) |>
 left_join(comb_df, by = c("name" = "comb")) |>
 unnest(x) |>
 count(x, sort = TRUE)

Output:

  x                n
  <chr>        <int>
1 PTSD           167
2 ANX            156
3 DEP            156
4 ANX_PTSD        56
5 DEP_PTSD        52
6 ANX_DEP         51
7 ANX_DEP_PTSD    23