I made multiple fisher test data that are compared between five categories and inside 4 groups. I would like to add the p value on top of the resulted grouped barplot. I tried some codes that are not working.
Please see the data below
# Data table
###############################################
data_test=data.frame(PE=sample(20:90, 20),
BU=rep(c("A1", "A2", "A3", "A4"), 5),
PH=c(rep("NAF", 4), rep("AF",4), rep("UNK", 4), rep("TRU",4), rep("ERP",4)))
PE BU PH
1 70 A1 NAF
2 39 A2 NAF
3 75 A3 NAF
4 85 A4 NAF
5 31 A1 AF
6 72 A2 AF
7 29 A3 AF
8 44 A4 AF
9 41 A1 UNK
10 76 A2 UNK
11 34 A3 UNK
12 57 A4 UNK
13 78 A1 TRU
14 32 A2 TRU
15 22 A3 TRU
16 52 A4 TRU
17 23 A1 ERP
18 63 A2 ERP
19 67 A3 ERP
20 53 A4 ERP
data_test$PH=factor(data_test$PH, levels=c("NAF", "AF", "UNK", "TRU", "ERP"))
# Test value table
###############################################
Stat_test = structure(list(group1 = c("NAF", "NAF", "NAF", "NAF", "AF", "AF",
"AF", "UNK", "UNK", "TRU", "NAF", "NAF", "NAF", "NAF", "AF",
"AF", "AF", "UNK", "UNK", "TRU", "NAF", "NAF", "NAF", "NAF",
"AF", "AF", "AF", "UNK", "UNK", "TRU", "NAF", "NAF", "NAF", "NAF",
"AF", "AF", "AF", "UNK", "UNK", "TRU"), group2 = c("AF", "UNK",
"TRU", "ERP", "UNK", "TRU", "ERP", "TRU", "ERP", "ERP", "AF",
"UNK", "TRU", "ERP", "UNK", "TRU", "ERP", "TRU", "ERP", "ERP",
"AF", "UNK", "TRU", "ERP", "UNK", "TRU", "ERP", "TRU", "ERP",
"ERP", "AF", "UNK", "TRU", "ERP", "UNK", "TRU", "ERP", "TRU",
"ERP", "ERP"), p = c(0.029, 0.065, 0.03, 0.072, 0.074, 0.026,
0.041, 0.047, 0.052, 0.019, 0.092, 0.013, 0.079, 0.032, 0.048,
0.045, 0.033, 0.058, 0.04, 0.02, 0.077, 0.035, 0.057, 0.078,
0.022, 0.017, 0.025, 0.021, 0.061, 0.011, 0.084, 0.06, 0.087,
0.043, 0.095, 0.088, 0.082, 0.085, 0.014, 0.054), BU = c("A1",
"A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A2", "A2",
"A2", "A2", "A2", "A2", "A2", "A2", "A2", "A2", "A3", "A3", "A3",
"A3", "A3", "A3", "A3", "A3", "A3", "A3", "A4", "A4", "A4", "A4",
"A4", "A4", "A4", "A4", "A4", "A4")), class = "data.frame", row.names = c(NA,
-40L))
> Stat_test
group1 group2 p BU
1 NAF AF 0.029 A1
2 NAF UNK 0.065 A1
3 NAF TRU 0.030 A1
4 NAF ERP 0.072 A1
5 AF UNK 0.074 A1
6 AF TRU 0.026 A1
7 AF ERP 0.041 A1
8 UNK TRU 0.047 A1
9 UNK ERP 0.052 A1
10 TRU ERP 0.019 A1
11 NAF AF 0.092 A2
12 NAF UNK 0.013 A2
13 NAF TRU 0.079 A2
14 NAF ERP 0.032 A2
15 AF UNK 0.048 A2
16 AF TRU 0.045 A2
17 AF ERP 0.033 A2
18 UNK TRU 0.058 A2
19 UNK ERP 0.040 A2
20 TRU ERP 0.020 A2
21 NAF AF 0.077 A3
22 NAF UNK 0.035 A3
23 NAF TRU 0.057 A3
24 NAF ERP 0.078 A3
25 AF UNK 0.022 A3
26 AF TRU 0.017 A3
27 AF ERP 0.025 A3
28 UNK TRU 0.021 A3
29 UNK ERP 0.061 A3
30 TRU ERP 0.011 A3
31 NAF AF 0.084 A4
32 NAF UNK 0.060 A4
33 NAF TRU 0.087 A4
34 NAF ERP 0.043 A4
35 AF UNK 0.095 A4
36 AF TRU 0.088 A4
37 AF ERP 0.082 A4
38 UNK TRU 0.085 A4
39 UNK ERP 0.014 A4
40 TRU ERP 0.054 A4
I asked a similar question where I had only two categories and four groups. You can find the solution that was working on this link
Add pvalue bracket with stat_pvalue_manual on geom_bar
The problem was that my table with p value did not have xmin and xmax position. I have been explained that the values on the x axis would be equivalent to as.numeric(as.factor(data_test$BU)). So I tried to computed this xmin and xmax position with this following code.
- First I computed a x_position_BU that correspond to the BU group position -> 1,2,3,4
- Second I computed the x_position for each PH for group1 and group2 (inside a BU).
- Third I compute the x_position for each test using x_position_BU, x_position_g1 and x_positiong2
But when I tried to build my plot I had this following error message
@Error in dplyr::mutate():
ℹ In argument: label = as.character(data %>% pull("p")).
ℹ In group 1: BU = "A1", group1 = "AF", group2 = "ERP".
Caused by error:
! label must be size 1, not 40."
library(dplyr)
library(ggpubr)
library(ggplot2)
# Modified labels
############################
Stat_test_2 = Stat_test %>%
mutate(p=ifelse(p<0.05, p, NA))
# Add x position
############################
SStat_test_3 = Stat_test_2 %>%
mutate(x_position_BU=ifelse(BU=="A1", 1,
ifelse(BU=="A2", 2,
ifelse(BU=="A3", 3, 4)))) %>%
mutate(x_position_g1=ifelse(group1=="NAF",0.64,
ifelse(group1== "AF", 0.82,
ifelse(group1== "UNK", 1,
ifelse(group1== "TRU", 1.18 , 1.36))))) %>%
mutate(x_position_g2=ifelse(group2=="NAF", 0.64,
ifelse(group2== "AF", 0.82,
ifelse(group2== "UNK", 1,
ifelse(group2== "TRU", 1.18, 1.36))))) %>%
group_by(BU, group1, group2) %>%
mutate(xmin=ifelse(p<0.05, min(x_position_g1, x_position_g2)+ x_position_BU-1, NA),
xmax=ifelse(p<0.05, max(x_position_g2, x_position_g1) + x_position_BU-1, NA))
# Plot
############################
ggplot(data_test, aes(x=BU, y=PE)) +
geom_bar(aes(fill=PH), stat="identity", position=position_dodge(), color="black") +
stat_pvalue_manual(Stat_test_3, y.position = 100, size=4,
label = "p", remove.bracket = FALSE,
tip.length = 0.02, bracket.size = 0.8) +
ylim(c(0,150))
My first question is: what is not working on this code ?
My second question is: is there a way to extract directly the x_position for each bar inside the grouped plot
EDIT:
I found a response to my two questions:
for question 1: the problem was that Stat_test3 was not in a data.frame format, using as.data.frame(Stat_test3) solve the problem.
for question 2 I used ggplot_build as following
plot=ggplot(data_test, aes(x=BU, y=PE)) +
geom_bar(aes(fill=PH), stat="identity", position=position_dodge(), color="black")
ggplot_build(plot)$data[[1]]$x
[1] 0.64 1.64 2.64 3.64 0.82 1.82 2.82 3.82 1.00 2.00 3.00 4.00 1.18 2.18 3.18 4.18 1.36 2.36 3.36
[20] 4.36
attr(,"class")
[1] "mapped_discrete" "numeric"