I have two datasets Data and Data1. I want to merge these keeping all difference while adding the numeric values together in the new table for all common rows. Is there any easy tool for this?
head(Data)
contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1 chr1 905373 . T C 2 4 6 0 0 6 0 0
2 chr1 911428 . C T 1 2 3 0 0 3 0 0
3 chr1 953279 . T C 146 126 272 0 0 273 1 0
4 chr1 962184 . T C 14 15 29 0 0 29 0 0
5 chr1 1024129 . T G 1 0 1 0 0 1 0 0
6 chr1 1039514 . C T 1 1 2 0 0 2 0 0
head(Data1)
contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1 chr1 905373 . T C 2 3 5 0 0 5 0 0
2 chr1 933024 . C T 1 0 1 0 0 1 0 0
3 chr1 953279 . T C 122 124 246 0 0 248 2 0
4 chr1 962184 . T C 17 21 38 0 0 38 0 0
5 chr1 1022518 . G T 0 1 1 0 0 1 0 0
6 chr1 1024129 . T G 1 2 3 0 0 3 0 0
Wanted Output example
contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1 chr1 905373 . T C 4 7 11 0 0 11 0 0
2 chr1 911428 . C T 1 2 3 0 0 3 0 0
2 chr1 933024 . C T 1 0 1 0 0 1 0 0
4 chr1 953279 . T C 268 150 518 0 0 521 3 0
As we can see in column position site 905373 is common is added together from column refCount on. while site 911428 and 933024 are both unique to their datasets but inserted into the new dataset. Is their a none painful way of creating the output table?
Data <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1"), position = c(905373L, 911428L, 953279L, 962184L, 1024129L,
1039514L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T",
"C", "T", "T", "T", "C"), altAllele = c("C", "T", "C", "C", "G",
"T"), refCount = c(2L, 1L, 146L, 14L, 1L, 1L), altCount = c(4L,
2L, 126L, 15L, 0L, 1L), totalCount = c(6L, 3L, 272L, 29L, 1L,
2L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L,
0L, 0L, 0L, 0L, 0L), rawDepth = c(6L, 3L, 273L, 29L, 1L, 2L),
otherBases = c(0L, 0L, 1L, 0L, 0L, 0L), improperPairs = c(0L,
0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")
Data1 <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1"), position = c(905373L, 933024L, 953279L, 962184L, 1022518L,
1024129L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T",
"C", "T", "T", "G", "T"), altAllele = c("C", "T", "C", "C", "T",
"G"), refCount = c(2L, 1L, 122L, 17L, 0L, 1L), altCount = c(3L,
0L, 124L, 21L, 1L, 2L), totalCount = c(5L, 1L, 246L, 38L, 1L,
3L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L,
0L, 0L, 0L, 0L, 0L), rawDepth = c(5L, 1L, 248L, 38L, 1L, 3L),
otherBases = c(0L, 0L, 2L, 0L, 0L, 0L), improperPairs = c(0L,
0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")
CodePudding user response:
Here a posibility:
Data
#> contig position variantID refAllele altAllele refCount altCount totalCount
#> 1 chr1 905373 . T C 2 4 6
#> 2 chr1 911428 . C T 1 2 3
#> 3 chr1 953279 . T C 146 126 272
#> 4 chr1 962184 . T C 14 15 29
#> 5 chr1 1024129 . T G 1 0 1
#> 6 chr1 1039514 . C T 1 1 2
#> lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1 0 0 6 0 0
#> 2 0 0 3 0 0
#> 3 0 0 273 1 0
#> 4 0 0 29 0 0
#> 5 0 0 1 0 0
#> 6 0 0 2 0 0
Data1
#> contig position variantID refAllele altAllele refCount altCount totalCount
#> 1 chr1 905373 . T C 2 3 5
#> 2 chr1 933024 . C T 1 0 1
#> 3 chr1 953279 . T C 122 124 246
#> 4 chr1 962184 . T C 17 21 38
#> 5 chr1 1022518 . G T 0 1 1
#> 6 chr1 1024129 . T G 1 2 3
#> lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1 0 0 5 0 0
#> 2 0 0 1 0 0
#> 3 0 0 248 2 0
#> 4 0 0 38 0 0
#> 5 0 0 1 0 0
#> 6 0 0 3 0 0
aggregate(. ~ contig position variantID refAllele altAllele, rbind(Data, Data1), sum)
#> contig position variantID refAllele altAllele refCount altCount totalCount
#> 1 chr1 905373 . T C 4 7 11
#> 2 chr1 953279 . T C 268 250 518
#> 3 chr1 962184 . T C 31 36 67
#> 4 chr1 1024129 . T G 2 2 4
#> 5 chr1 911428 . C T 1 2 3
#> 6 chr1 933024 . C T 1 0 1
#> 7 chr1 1039514 . C T 1 1 2
#> 8 chr1 1022518 . G T 0 1 1
#> lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1 0 0 11 0 0
#> 2 0 0 521 3 0
#> 3 0 0 67 0 0
#> 4 0 0 4 0 0
#> 5 0 0 3 0 0
#> 6 0 0 1 0 0
#> 7 0 0 2 0 0
#> 8 0 0 1 0 0
CodePudding user response:
You need something like this:
library(dplyr)
bind_rows(Data, Data1) %>% as_tibble() %>%
group_by(contig, position, refAllele, altAllele) %>%
summarise(across(-c(variantID), ~sum(., na.rm = TRUE)))
contig position refAllele altAllele refCount altCount totalC…¹ lowMA…² lowBa…³ rawDe…⁴ other…⁵ impro…⁶
<chr> <int> <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int>
1 chr1 905373 T C 4 7 11 0 0 11 0 0
2 chr1 911428 C T 1 2 3 0 0 3 0 0
3 chr1 933024 C T 1 0 1 0 0 1 0 0
4 chr1 953279 T C 268 250 518 0 0 521 3 0
5 chr1 962184 T C 31 36 67 0 0 67 0 0
6 chr1 1022518 G T 0 1 1 0 0 1 0 0
7 chr1 1024129 T G 2 2 4 0 0 4 0 0
8 chr1 1039514 C T 1 1 2 0 0 2 0 0
# … with abbreviated variable names ¹totalCount, ²lowMAPQDepth, ³lowBaseQDepth, ⁴rawDepth, ⁵otherBases,
# ⁶improperPairs
CodePudding user response:
Here's another option using join/merge. I'll demo using dplyr
, though it is feasible in base R (with a bit more work):
library(dplyr)
full_join(Data, Data1, by = c("contig", "position", "variantID", "refAllele", "altAllele"),
suffix = c("", ".y")) %>%
mutate(
across(
where(is.numeric),
~ rowSums(cbind(., cur_data()[[ paste0(cur_column(), ".y") ]]), na.rm = TRUE)
)
) %>%
select(-ends_with(".y"))
# contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
# 1 chr1 905373 . T C 4 7 11 0 0 11 0 0
# 2 chr1 911428 . C T 1 2 3 0 0 3 0 0
# 3 chr1 953279 . T C 268 250 518 0 0 521 3 0
# 4 chr1 962184 . T C 31 36 67 0 0 67 0 0
# 5 chr1 1024129 . T G 2 2 4 0 0 4 0 0
# 6 chr1 1039514 . C T 1 1 2 0 0 2 0 0
# 7 chr1 933024 . C T 1 0 1 0 0 1 0 0
# 8 chr1 1022518 . G T 0 1 1 0 0 1 0 0
I joined on those fields since they were character and seemed to me (uninformed) that you would not want to try to aggregate if they were different.