doing outer for really large data frame hitting memory usage in R

29 Views Asked by At

So i have three data frame having this information

> dim(result)
[1] 652265     10
> dim(rat)
[1] 107190      2
> dim(mouse)
[1] 219016      2
> names(result)
 [1] "ensembl_gene_id"                             "ensembl_transcript_id"                      
 [3] "external_gene_name"                          "mmusculus_homolog_ensembl_gene"             
 [5] "mmusculus_homolog_associated_gene_name"      "rnorvegicus_homolog_ensembl_gene"           
 [7] "rnorvegicus_homolog_associated_gene_name"    "celegans_homolog_ensembl_gene"              
 [9] "celegans_homolog_associated_gene_name"       "celegans_homolog_ensembl_gene_transcript_id"
> names(rat)
[1] "rnorvegicus_homolog_ensembl_gene"          "rnorvegicus_homolog_ensembl_transcript_id"
> names(mouse)
[1] "mmusculus_homolog_ensembl_gene"          "mmusculus_homolog_ensembl_transcript_id"

My dataframes are as such

head(result)
  ensembl_gene_id ensembl_transcript_id external_gene_name mmusculus_homolog_ensembl_gene
1 ENSG00000198888       ENST00000361390             MT-ND1             ENSMUSG00000064341
2            <NA>                  <NA>               <NA>                           <NA>
3 ENSG00000198804       ENST00000361624             MT-CO1             ENSMUSG00000064351
4 ENSG00000198712       ENST00000361739             MT-CO2             ENSMUSG00000064354
5            <NA>                  <NA>               <NA>                           <NA>
6            <NA>                  <NA>               <NA>                           <NA>
  mmusculus_homolog_associated_gene_name rnorvegicus_homolog_ensembl_gene
1                                 mt-Nd1               ENSRNOG00000030644
2                                   <NA>                             <NA>
3                                 mt-Co1               ENSRNOG00000034234
4                                 mt-Co2               ENSRNOG00000030371
5                                   <NA>                             <NA>
6                                   <NA>                             <NA>
  rnorvegicus_homolog_associated_gene_name celegans_homolog_ensembl_gene
1                                   Mt-nd1                WBGene00010959
2                                     <NA>                          <NA>
3                                   Mt-co1                WBGene00010964
4                                   Mt-co2                WBGene00010965
5                                     <NA>                          <NA>
6                                     <NA>                          <NA>
  celegans_homolog_associated_gene_name celegans_homolog_ensembl_gene_transcript_id
1                                                                         MTCE.11.1
2                                  <NA>                                        <NA>
3                                 ctc-1                                   MTCE.26.1
4                                 ctc-2                                   MTCE.31.1
5                                  <NA>                                        <NA>
6                                  <NA>                                        <NA>
> head(rat)
  rnorvegicus_homolog_ensembl_gene rnorvegicus_homolog_ensembl_transcript_id
1               ENSRNOG00000033395                        ENSRNOT00000044979
2               ENSRNOG00000070901                        ENSRNOT00000119508
3               ENSRNOG00000031391                        ENSRNOT00000045306
4               ENSRNOG00000067904                        ENSRNOT00000099225
5               ENSRNOG00000067904                        ENSRNOT00000099225
6               ENSRNOG00000067904                        ENSRNOT00000099225
> head(mouse)
  mmusculus_homolog_ensembl_gene mmusculus_homolog_ensembl_transcript_id
1             ENSMUSG00000064341                      ENSMUST00000082392
2             ENSMUSG00000064345                      ENSMUST00000082396
3             ENSMUSG00000064351                      ENSMUST00000082402
4             ENSMUSG00000064354                      ENSMUST00000082405
5             ENSMUSG00000064356                      ENSMUST00000082407
6             ENSMUSG00000064357                      ENSMUST00000082408

I tried sqldf based solution since dplyr was stuck when i tried

joined_data <- sqldf("
    SELECT *
    FROM result
    LEFT OUTER JOIN rat ON result.rnorvegicus_homolog_ensembl_gene = rat.rnorvegicus_homolog_ensembl_gene
    LEFT OUTER JOIN mouse ON result.mmusculus_homolog_ensembl_gene = mouse.mmusculus_homolog_ensembl_gene
")

The issue is it just hit the upper limit of the system i have with me

Now what is the optimized way to do the same if any suggestion or help would be really appreciated

R memory usage

UPDATE_data

dput(head(result))
structure(list(ensembl_gene_id = c("ENSG00000198888", NA, "ENSG00000198804", 
"ENSG00000198712", NA, NA), ensembl_transcript_id = c("ENST00000361390", 
NA, "ENST00000361624", "ENST00000361739", NA, NA), external_gene_name = c("MT-ND1", 
NA, "MT-CO1", "MT-CO2", NA, NA), mmusculus_homolog_ensembl_gene = c("ENSMUSG00000064341", 
NA, "ENSMUSG00000064351", "ENSMUSG00000064354", NA, NA), mmusculus_homolog_associated_gene_name = c("mt-Nd1", 
NA, "mt-Co1", "mt-Co2", NA, NA), rnorvegicus_homolog_ensembl_gene = c("ENSRNOG00000030644", 
NA, "ENSRNOG00000034234", "ENSRNOG00000030371", NA, NA), rnorvegicus_homolog_associated_gene_name = c("Mt-nd1", 
NA, "Mt-co1", "Mt-co2", NA, NA), celegans_homolog_ensembl_gene = c("WBGene00010959", 
NA, "WBGene00010964", "WBGene00010965", NA, NA), celegans_homolog_associated_gene_name = c("", 
NA, "ctc-1", "ctc-2", NA, NA), celegans_homolog_ensembl_gene_transcript_id = c("MTCE.11.1", 
NA, "MTCE.26.1", "MTCE.31.1", NA, NA)), row.names = c(NA, 6L), class = "data.frame")
 dput(head(rat))
structure(list(rnorvegicus_homolog_ensembl_gene = c("ENSRNOG00000033395", 
"ENSRNOG00000070901", "ENSRNOG00000031391", "ENSRNOG00000067904", 
"ENSRNOG00000067904", "ENSRNOG00000067904"), rnorvegicus_homolog_ensembl_transcript_id = c("ENSRNOT00000044979", 
"ENSRNOT00000119508", "ENSRNOT00000045306", "ENSRNOT00000099225", 
"ENSRNOT00000099225", "ENSRNOT00000099225")), row.names = c(NA, 
6L), class = "data.frame")
 dput(head(mouse))
structure(list(mmusculus_homolog_ensembl_gene = c("ENSMUSG00000064341", 
"ENSMUSG00000064345", "ENSMUSG00000064351", "ENSMUSG00000064354", 
"ENSMUSG00000064356", "ENSMUSG00000064357"), mmusculus_homolog_ensembl_transcript_id = c("ENSMUST00000082392", 
"ENSMUST00000082396", "ENSMUST00000082402", "ENSMUST00000082405", 
"ENSMUST00000082407", "ENSMUST00000082408")), row.names = c(NA, 
6L), class = "data.frame")
1

There are 1 best solutions below

1
Fábio Batista On

For a dataset of this size, you won't be able to simply join your data in a naive way and keep working on it interactively. The best approach will depend on what you expect to to with it.

Maybe you'll need to work in a sample. Maybe split data in chunks of manageable size. Maybe loading them into an Spark instance. Or maybe perform the calculations you need using SQL.

If all you want is the data joined, you can just load them into your database, perform the join and then export to CSV or another format. No need to load them into a data frame.

State your intent and others can help you better.