head(sum) File_pdb Res1 Chain1 Res2 Chain2 1: 7LD1_CM GLN 81 M ASN 501 C 2: 7LD1_CM TYR 128 M PHE" /> head(sum) File_pdb Res1 Chain1 Res2 Chain2 1: 7LD1_CM GLN 81 M ASN 501 C 2: 7LD1_CM TYR 128 M PHE" /> head(sum) File_pdb Res1 Chain1 Res2 Chain2 1: 7LD1_CM GLN 81 M ASN 501 C 2: 7LD1_CM TYR 128 M PHE"/>

Correspondance between values in two df R

45 Views Asked by At

I have two df to confrontate. my first df is "sum"

> head(sum)
   File_pdb  Res1      Chain1     Res2      Chain2
1:  7LD1_CM  GLN 81      M       ASN 501      C
2:  7LD1_CM  TYR 128     M       PHE 377      C
3:  7LD1_CM  ILE 78      M       SER 375      C
4:  7LD1_CM  ASN 76      M       ALA 372      C
5:  7LD1_CM  THR 20      M       TYR 369      C
6:  7LD1_CM  ARG 408     C       LEU 131      M

The second one is "mut"

> head(mut)
   RefAA  MutAA LineagesCount
1  VAL 3  GLY 3             1
2  LEU 5  PHE 5             2
3  LEU 8  VAL 8             1
4 SER 13 ILE 13             2
5 LEU 18 PHE 18             5
6 THR 20 ILE 20             1

I have to check if in sum$res1 and sum$res2 there are values equal to mut$refAA. If it's so, I need to add the whole row of mut$refAA near to sum$res1 or sum$res2.

here an example:

    File_pdb  Res1      Chain1     Res2      Chain2 RefAA  MutAA  LineagesCount
1:  7LD1_CM  GLN 81      M       ASN 501      C
2:  7LD1_CM  TYR 128     M       PHE 377      C
3:  7LD1_CM  ILE 78      M       SER 375      C
4:  7LD1_CM  ASN 76      M       ALA 372      C
5:  7LD1_CM  THR 20      M       TYR 369      C     THR 20   ILE 20     1
6:  7LD1_CM  ARG 408     C       LEU 131      M

How I can do this? I was trying something using merge and join functions but I'm not so experienced so I need to practice more. Can someone help me? Thank you!

2

There are 2 best solutions below

1
Roman On BEST ANSWER

I had to fix the data a bit, to easily import the data. Then you can try a tidyverse

library(tidyverse)
SUM %>% 
  mutate(index = 1:n()) %>% 
  pivot_longer(c(Res1, Res2)) %>%   
  left_join(mutate(MUT, value=RefAA), by = "value") %>%  
  group_by(index) %>% 
  fill(MutAA, RefAA, LineagesCount, .direction = "downup") %>% 
  ungroup() %>% 
  pivot_wider(names_from = name, values_from = value, values_fn = toString) %>% 
  mutate(which_Res = ifelse(RefAA == Res1, "Res1", "Res2"))
# A tibble: 6 x 10
  File_pdb Chain1 Chain2 index RefAA MutAA LineagesCount Res1   Res2   which_Res
  <chr>    <chr>  <chr>  <int> <chr> <chr>         <int> <chr>  <chr>  <chr>    
1 7LD1_CM  M      C          1 NA    NA               NA GLN81  ASN501 NA       
2 7LD1_CM  M      C          2 NA    NA               NA TYR128 PHE377 NA       
3 7LD1_CM  M      C          3 NA    NA               NA ILE78  SER375 NA       
4 7LD1_CM  M      C          4 NA    NA               NA ASN76  ALA372 NA       
5 7LD1_CM  M      C          5 THR20 ILE20             1 THR20  TYR369 Res1     
6 7LD1_CM  C      M          6 NA    NA               NA ARG408 LEU131 NA   

The data

SUM <- read.table(text = "   File_pdb  Res1      Chain1     Res2      Chain2
1:  7LD1_CM  GLN81      M       ASN501      C
2:  7LD1_CM  TYR128     M       PHE377      C
3:  7LD1_CM  ILE78      M       SER375      C
4:  7LD1_CM  ASN76      M       ALA372      C
5:  7LD1_CM  THR20      M       TYR369      C
6:  7LD1_CM  ARG408     C       LEU131      M") 
SUM

MUT <- read.table(text = " RefAA  MutAA LineagesCount
1  VAL3  GLY3             1
2  LEU5  PHE5             2
3  LEU8  VAL8             1
4 SER13 ILE13             2
5 LEU18 PHE18             5
6 THR20 ILE20             1")
0
ThomasIsCoding On

Hope this would help

do.call(
  dplyr::coalesce,
  lapply(
    c("Res1", "Res2"),
    function(x) merge(SUM, MUT, by.x = x, by.y = "RefAA", all.x = TRUE)
  )
)

which gives

    Res1 File_pdb Chain1   Res2 Chain2 MutAA LineagesCount
1 ARG408  7LD1_CM      C LEU131      M  <NA>            NA
2  ASN76  7LD1_CM      M ALA372      C  <NA>            NA
3  GLN81  7LD1_CM      M ASN501      C  <NA>            NA
4  ILE78  7LD1_CM      M SER375      C  <NA>            NA
5  THR20  7LD1_CM      M TYR369      C ILE20             1
6 TYR128  7LD1_CM      M PHE377      C  <NA>            NA

Data

> dput(SUM)
structure(list(File_pdb = c("7LD1_CM", "7LD1_CM", "7LD1_CM",
"7LD1_CM", "7LD1_CM", "7LD1_CM"), Res1 = c("GLN81", "TYR128",
"ILE78", "ASN76", "THR20", "ARG408"), Chain1 = c("M", "M", "M",
"M", "M", "C"), Res2 = c("ASN501", "PHE377", "SER375", "ALA372",
"TYR369", "LEU131"), Chain2 = c("C", "C", "C", "C", "C", "M")), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:"))

> dput(MUT)
structure(list(RefAA = c("VAL3", "LEU5", "LEU8", "SER13", "LEU18",
"THR20"), MutAA = c("GLY3", "PHE5", "VAL8", "ILE13", "PHE18", 
"ILE20"), LineagesCount = c(1L, 2L, 1L, 2L, 5L, 1L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))