Home > Software engineering >  Merge two datasets, keep all row differences and add similar rows
Merge two datasets, keep all row differences and add similar rows

Time:01-20

I have two datasets Data and Data1. I want to merge these keeping all difference while adding the numeric values together in the new table for all common rows. Is there any easy tool for this?

head(Data)
  contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1   chr1   905373         .         T         C        2        4          6            0             0        6          0             0
2   chr1   911428         .         C         T        1        2          3            0             0        3          0             0
3   chr1   953279         .         T         C      146      126        272            0             0      273          1             0
4   chr1   962184         .         T         C       14       15         29            0             0       29          0             0
5   chr1  1024129         .         T         G        1        0          1            0             0        1          0             0
6   chr1  1039514         .         C         T        1        1          2            0             0        2          0             0
head(Data1)
  contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1   chr1   905373         .         T         C        2        3          5            0             0        5          0             0
2   chr1   933024         .         C         T        1        0          1            0             0        1          0             0
3   chr1   953279         .         T         C      122      124        246            0             0      248          2             0
4   chr1   962184         .         T         C       17       21         38            0             0       38          0             0
5   chr1  1022518         .         G         T        0        1          1            0             0        1          0             0
6   chr1  1024129         .         T         G        1        2          3            0             0        3          0             0

Wanted Output example

contig  position    variantID   refAllele   altAllele   refCount    altCount    totalCount lowMAPQDepth lowBaseQDepth   rawDepth    otherBases  improperPairs
1   chr1    905373  .   T   C   4   7   11  0   0   11  0   0
2   chr1    911428  .   C   T   1   2   3   0   0   3   0   0
2   chr1    933024  .   C   T   1   0   1   0   0   1   0   0
4   chr1    953279  .   T   C   268 150 518 0   0   521 3   0

As we can see in column position site 905373 is common is added together from column refCount on. while site 911428 and 933024 are both unique to their datasets but inserted into the new dataset. Is their a none painful way of creating the output table?

Data <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), position = c(905373L, 911428L, 953279L, 962184L, 1024129L, 
1039514L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T", 
"C", "T", "T", "T", "C"), altAllele = c("C", "T", "C", "C", "G", 
"T"), refCount = c(2L, 1L, 146L, 14L, 1L, 1L), altCount = c(4L, 
2L, 126L, 15L, 0L, 1L), totalCount = c(6L, 3L, 272L, 29L, 1L, 
2L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L, 
0L, 0L, 0L, 0L, 0L), rawDepth = c(6L, 3L, 273L, 29L, 1L, 2L), 
    otherBases = c(0L, 0L, 1L, 0L, 0L, 0L), improperPairs = c(0L, 
    0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")

Data1 <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), position = c(905373L, 933024L, 953279L, 962184L, 1022518L, 
1024129L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T", 
"C", "T", "T", "G", "T"), altAllele = c("C", "T", "C", "C", "T", 
"G"), refCount = c(2L, 1L, 122L, 17L, 0L, 1L), altCount = c(3L, 
0L, 124L, 21L, 1L, 2L), totalCount = c(5L, 1L, 246L, 38L, 1L, 
3L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L, 
0L, 0L, 0L, 0L, 0L), rawDepth = c(5L, 1L, 248L, 38L, 1L, 3L), 
    otherBases = c(0L, 0L, 2L, 0L, 0L, 0L), improperPairs = c(0L, 
    0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")

CodePudding user response:

Here a posibility:

Data
#>   contig position variantID refAllele altAllele refCount altCount totalCount
#> 1   chr1   905373         .         T         C        2        4          6
#> 2   chr1   911428         .         C         T        1        2          3
#> 3   chr1   953279         .         T         C      146      126        272
#> 4   chr1   962184         .         T         C       14       15         29
#> 5   chr1  1024129         .         T         G        1        0          1
#> 6   chr1  1039514         .         C         T        1        1          2
#>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1            0             0        6          0             0
#> 2            0             0        3          0             0
#> 3            0             0      273          1             0
#> 4            0             0       29          0             0
#> 5            0             0        1          0             0
#> 6            0             0        2          0             0

Data1
#>   contig position variantID refAllele altAllele refCount altCount totalCount
#> 1   chr1   905373         .         T         C        2        3          5
#> 2   chr1   933024         .         C         T        1        0          1
#> 3   chr1   953279         .         T         C      122      124        246
#> 4   chr1   962184         .         T         C       17       21         38
#> 5   chr1  1022518         .         G         T        0        1          1
#> 6   chr1  1024129         .         T         G        1        2          3
#>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1            0             0        5          0             0
#> 2            0             0        1          0             0
#> 3            0             0      248          2             0
#> 4            0             0       38          0             0
#> 5            0             0        1          0             0
#> 6            0             0        3          0             0

aggregate(. ~ contig    position   variantID   refAllele   altAllele, rbind(Data, Data1), sum)
#>   contig position variantID refAllele altAllele refCount altCount totalCount
#> 1   chr1   905373         .         T         C        4        7         11
#> 2   chr1   953279         .         T         C      268      250        518
#> 3   chr1   962184         .         T         C       31       36         67
#> 4   chr1  1024129         .         T         G        2        2          4
#> 5   chr1   911428         .         C         T        1        2          3
#> 6   chr1   933024         .         C         T        1        0          1
#> 7   chr1  1039514         .         C         T        1        1          2
#> 8   chr1  1022518         .         G         T        0        1          1
#>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1            0             0       11          0             0
#> 2            0             0      521          3             0
#> 3            0             0       67          0             0
#> 4            0             0        4          0             0
#> 5            0             0        3          0             0
#> 6            0             0        1          0             0
#> 7            0             0        2          0             0
#> 8            0             0        1          0             0

CodePudding user response:

You need something like this:

library(dplyr)

bind_rows(Data, Data1) %>% as_tibble() %>% 
  group_by(contig, position, refAllele, altAllele) %>% 
  summarise(across(-c(variantID), ~sum(., na.rm = TRUE)))

 contig position refAllele altAllele refCount altCount totalC…¹ lowMA…² lowBa…³ rawDe…⁴ other…⁵ impro…⁶
  <chr>     <int> <chr>     <chr>        <int>    <int>    <int>   <int>   <int>   <int>   <int>   <int>
1 chr1     905373 T         C                4        7       11       0       0      11       0       0
2 chr1     911428 C         T                1        2        3       0       0       3       0       0
3 chr1     933024 C         T                1        0        1       0       0       1       0       0
4 chr1     953279 T         C              268      250      518       0       0     521       3       0
5 chr1     962184 T         C               31       36       67       0       0      67       0       0
6 chr1    1022518 G         T                0        1        1       0       0       1       0       0
7 chr1    1024129 T         G                2        2        4       0       0       4       0       0
8 chr1    1039514 C         T                1        1        2       0       0       2       0       0
# … with abbreviated variable names ¹​totalCount, ²​lowMAPQDepth, ³​lowBaseQDepth, ⁴​rawDepth, ⁵​otherBases,
#   ⁶​improperPairs

CodePudding user response:

Here's another option using join/merge. I'll demo using dplyr, though it is feasible in base R (with a bit more work):

library(dplyr)
full_join(Data, Data1, by = c("contig", "position", "variantID", "refAllele", "altAllele"), 
          suffix = c("", ".y")) %>%
  mutate(
    across(
      where(is.numeric),
      ~ rowSums(cbind(., cur_data()[[ paste0(cur_column(), ".y") ]]), na.rm = TRUE)
    )
  ) %>%
  select(-ends_with(".y"))
#   contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
# 1   chr1   905373         .         T         C        4        7         11            0             0       11          0             0
# 2   chr1   911428         .         C         T        1        2          3            0             0        3          0             0
# 3   chr1   953279         .         T         C      268      250        518            0             0      521          3             0
# 4   chr1   962184         .         T         C       31       36         67            0             0       67          0             0
# 5   chr1  1024129         .         T         G        2        2          4            0             0        4          0             0
# 6   chr1  1039514         .         C         T        1        1          2            0             0        2          0             0
# 7   chr1   933024         .         C         T        1        0          1            0             0        1          0             0
# 8   chr1  1022518         .         G         T        0        1          1            0             0        1          0             0

I joined on those fields since they were character and seemed to me (uninformed) that you would not want to try to aggregate if they were different.

  • Related