I have the following dataset.
group | value | row_name | overlaps |
---|---|---|---|
group_a | 4 | 1 | 2 |
group_a | 5 | 2 | 3, 5 |
group_a | 48 | 3 | 4, 5 |
group_a | 54 | 4 | 5 |
group_a | 12 | 5 | |
group_b | 12 | 6 | 7 |
group_b | 1 | 7 |
The overlaps column indicates which rows have a certain 'overlap'.
I would like, only for rows whose value is greater than 10, to replace the data in the value column with zero for all rows indicated in the respective 'overlaps' column.
Expected output:
group | value | row_name | overlaps |
---|---|---|---|
group_a | 4 | 1 | 2 |
group_a | 5 | 2 | 3, 5 |
group_a | 48 | 3 | 4, 5 |
group_a | 0 | 4 | 5 |
group_a | 0 | 5 | |
group_b | 12 | 6 | 7 |
group_b | 0 | 7 |
Reproducible example:
data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
"group_a", "group_b", "group_b"),
value = c(4, 5 , 48, 54, 12, 12, 1),
row_name = c("1", "2", "3", "4", "5", "6", "7"),
overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))
I don't know if it's a very complex issue, but I'm stuck for hours and I didn't figure out how to solve it.
Does anyone have any suggestions on how I can solve this, by group, using dplyr or data.table?
CodePudding user response:
strsplit
the overlaps
column, subset to just those where data$value > 10
, then use that distinct set of row_name
s to overwrite the original data with 0
:
gr10 <- data$value > 10
sel <- Map(paste, data$group[gr10], strsplit(data$overlaps, ",\\s ")[gr10], sep="|")
sel <- Reduce(union, sel)
sel
#[1] "group_a|4" "group_a|5" "group_a|" "group_b|7"
data$value[do.call(paste, c(data[c("group","row_name")], sep="|")) %in% sel] <- 0
data
# group value row_name overlaps
#1 group_a 4 1 2
#2 group_a 5 2 3, 5
#3 group_a 48 3 4, 5
#4 group_a 0 4 5
#5 group_a 0 5
#6 group_b 12 6 7
#7 group_b 0 7
If the row_name
s are unique across the entire dataset, you can use simpler logic:
sel <- Reduce(union, strsplit(data$overlaps, ",\\s ")[data$value > 10])
sel
#[1] "4" "5" "7"
data$value[data$row_name %in% sel] <- 0
Bonus data.table solution:
library(data.table)
setDT(data)
data[
data[value > 10, .(row_name=unlist(strsplit(overlaps, ",\\s "))), by=group],
on=.(group, row_name),
value := 0
]
CodePudding user response:
Another solution, based on tidyverse
:
library(tidyverse)
data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
"group_a", "group_b", "group_b"),
value = c(4, 5 , 48, 54, 12, 12, 1),
row_name = c("1", "2", "3", "4", "5", "6", "7"),
overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))
data %>%
separate(
overlaps, into=c("o1", "o2"), sep=", ", fill="right", remove=F) %>%
mutate(across(o1:o2, ~ ifelse(value > 10, get(cur_column()), 0)),
value = ifelse(row_number() %in% c_across(o1:o2), 0, value)) %>%
select(-o1, -o2)
#> group value row_name overlaps
#> 1 group_a 4 1 2
#> 2 group_a 5 2 3, 5
#> 3 group_a 48 3 4, 5
#> 4 group_a 0 4 5
#> 5 group_a 0 5
#> 6 group_b 12 6 7
#> 7 group_b 0 7