Adding p value on a group geom barplot

48 Views Asked by At

I made multiple fisher test data that are compared between five categories and inside 4 groups. I would like to add the p value on top of the resulted grouped barplot. I tried some codes that are not working.

Please see the data below

# Data table
###############################################

data_test=data.frame(PE=sample(20:90, 20),
                     BU=rep(c("A1", "A2", "A3", "A4"), 5),
                     PH=c(rep("NAF", 4), rep("AF",4), rep("UNK", 4), rep("TRU",4), rep("ERP",4)))

   PE BU  PH
1  70 A1 NAF
2  39 A2 NAF
3  75 A3 NAF
4  85 A4 NAF
5  31 A1  AF
6  72 A2  AF
7  29 A3  AF
8  44 A4  AF
9  41 A1 UNK
10 76 A2 UNK
11 34 A3 UNK
12 57 A4 UNK
13 78 A1 TRU
14 32 A2 TRU
15 22 A3 TRU
16 52 A4 TRU
17 23 A1 ERP
18 63 A2 ERP
19 67 A3 ERP
20 53 A4 ERP


data_test$PH=factor(data_test$PH, levels=c("NAF", "AF", "UNK", "TRU", "ERP")) 


# Test value table
###############################################

Stat_test = structure(list(group1 = c("NAF", "NAF", "NAF", "NAF", "AF", "AF", 
                                      "AF", "UNK", "UNK", "TRU", "NAF", "NAF", "NAF", "NAF", "AF", 
                                      "AF", "AF", "UNK", "UNK", "TRU", "NAF", "NAF", "NAF", "NAF", 
                                      "AF", "AF", "AF", "UNK", "UNK", "TRU", "NAF", "NAF", "NAF", "NAF", 
                                      "AF", "AF", "AF", "UNK", "UNK", "TRU"), group2 = c("AF", "UNK", 
                                                                                         "TRU", "ERP", "UNK", "TRU", "ERP", "TRU", "ERP", "ERP", "AF", 
                                                                                         "UNK", "TRU", "ERP", "UNK", "TRU", "ERP", "TRU", "ERP", "ERP", 
                                                                                         "AF", "UNK", "TRU", "ERP", "UNK", "TRU", "ERP", "TRU", "ERP", 
                                                                                         "ERP", "AF", "UNK", "TRU", "ERP", "UNK", "TRU", "ERP", "TRU", 
                                                                                         "ERP", "ERP"), p = c(0.029, 0.065, 0.03, 0.072, 0.074, 0.026, 
                                                                                                              0.041, 0.047, 0.052, 0.019, 0.092, 0.013, 0.079, 0.032, 0.048, 
                                                                                                              0.045, 0.033, 0.058, 0.04, 0.02, 0.077, 0.035, 0.057, 0.078, 
                                                                                                              0.022, 0.017, 0.025, 0.021, 0.061, 0.011, 0.084, 0.06, 0.087, 
                                                                                                              0.043, 0.095, 0.088, 0.082, 0.085, 0.014, 0.054), BU = c("A1", 
                                                                                                                                                                       "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A2", "A2", 
                                                                                                                                                                       "A2", "A2", "A2", "A2", "A2", "A2", "A2", "A2", "A3", "A3", "A3", 
                                                                                                                                                                       "A3", "A3", "A3", "A3", "A3", "A3", "A3", "A4", "A4", "A4", "A4", 
                                                                                                                                                                       "A4", "A4", "A4", "A4", "A4", "A4")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                 -40L))




> Stat_test
   group1 group2     p BU
1     NAF     AF 0.029 A1
2     NAF    UNK 0.065 A1
3     NAF    TRU 0.030 A1
4     NAF    ERP 0.072 A1
5      AF    UNK 0.074 A1
6      AF    TRU 0.026 A1
7      AF    ERP 0.041 A1
8     UNK    TRU 0.047 A1
9     UNK    ERP 0.052 A1
10    TRU    ERP 0.019 A1
11    NAF     AF 0.092 A2
12    NAF    UNK 0.013 A2
13    NAF    TRU 0.079 A2
14    NAF    ERP 0.032 A2
15     AF    UNK 0.048 A2
16     AF    TRU 0.045 A2
17     AF    ERP 0.033 A2
18    UNK    TRU 0.058 A2
19    UNK    ERP 0.040 A2
20    TRU    ERP 0.020 A2
21    NAF     AF 0.077 A3
22    NAF    UNK 0.035 A3
23    NAF    TRU 0.057 A3
24    NAF    ERP 0.078 A3
25     AF    UNK 0.022 A3
26     AF    TRU 0.017 A3
27     AF    ERP 0.025 A3
28    UNK    TRU 0.021 A3
29    UNK    ERP 0.061 A3
30    TRU    ERP 0.011 A3
31    NAF     AF 0.084 A4
32    NAF    UNK 0.060 A4
33    NAF    TRU 0.087 A4
34    NAF    ERP 0.043 A4
35     AF    UNK 0.095 A4
36     AF    TRU 0.088 A4
37     AF    ERP 0.082 A4
38    UNK    TRU 0.085 A4
39    UNK    ERP 0.014 A4
40    TRU    ERP 0.054 A4

I asked a similar question where I had only two categories and four groups. You can find the solution that was working on this link

Add pvalue bracket with stat_pvalue_manual on geom_bar

The problem was that my table with p value did not have xmin and xmax position. I have been explained that the values on the x axis would be equivalent to as.numeric(as.factor(data_test$BU)). So I tried to computed this xmin and xmax position with this following code.

  • First I computed a x_position_BU that correspond to the BU group position -> 1,2,3,4
  • Second I computed the x_position for each PH for group1 and group2 (inside a BU).
  • Third I compute the x_position for each test using x_position_BU, x_position_g1 and x_positiong2

But when I tried to build my plot I had this following error message

@Error in dplyr::mutate(): ℹ In argument: label = as.character(data %>% pull("p")). ℹ In group 1: BU = "A1", group1 = "AF", group2 = "ERP". Caused by error: ! label must be size 1, not 40."


library(dplyr)
library(ggpubr)
library(ggplot2)

# Modified labels                                                                                                                                                                                                                                   
############################

Stat_test_2 = Stat_test %>% 
  mutate(p=ifelse(p<0.05, p, NA))

# Add x position                                                                                                                                                                                                                                    
############################

SStat_test_3 = Stat_test_2 %>%
  mutate(x_position_BU=ifelse(BU=="A1", 1, 
                    ifelse(BU=="A2", 2,
                    ifelse(BU=="A3", 3, 4)))) %>%
  mutate(x_position_g1=ifelse(group1=="NAF",0.64, 
                       ifelse(group1== "AF", 0.82,
                       ifelse(group1== "UNK", 1,
                       ifelse(group1== "TRU", 1.18 , 1.36))))) %>%
  mutate(x_position_g2=ifelse(group2=="NAF", 0.64, 
                       ifelse(group2== "AF", 0.82,
                       ifelse(group2== "UNK", 1,
                       ifelse(group2== "TRU", 1.18, 1.36))))) %>%
  group_by(BU, group1, group2) %>%
  mutate(xmin=ifelse(p<0.05, min(x_position_g1, x_position_g2)+ x_position_BU-1, NA),
         xmax=ifelse(p<0.05, max(x_position_g2, x_position_g1) + x_position_BU-1, NA))  

# Plot
############################

ggplot(data_test, aes(x=BU, y=PE)) +
  geom_bar(aes(fill=PH), stat="identity", position=position_dodge(), color="black") + 
  stat_pvalue_manual(Stat_test_3, y.position = 100, size=4,
                     label = "p", remove.bracket = FALSE, 
                     tip.length = 0.02, bracket.size = 0.8) +
  ylim(c(0,150))



My first question is: what is not working on this code ?

My second question is: is there a way to extract directly the x_position for each bar inside the grouped plot

EDIT:

I found a response to my two questions:

for question 1: the problem was that Stat_test3 was not in a data.frame format, using as.data.frame(Stat_test3) solve the problem.

for question 2 I used ggplot_build as following

plot=ggplot(data_test, aes(x=BU, y=PE)) +
  geom_bar(aes(fill=PH), stat="identity", position=position_dodge(), color="black") 

ggplot_build(plot)$data[[1]]$x
 [1] 0.64 1.64 2.64 3.64 0.82 1.82 2.82 3.82 1.00 2.00 3.00 4.00 1.18 2.18 3.18 4.18 1.36 2.36 3.36
[20] 4.36
attr(,"class")
[1] "mapped_discrete" "numeric" 


0

There are 0 best solutions below