R - Set values by group based on a condition in a dataframe-CodePudding

I have the following dataset.

group	value	row_name	overlaps
group_a	4	1	2
group_a	5	2	3, 5
group_a	48	3	4, 5
group_a	54	4	5
group_a	12	5
group_b	12	6	7
group_b	1	7

The overlaps column indicates which rows have a certain 'overlap'.

I would like, only for rows whose value is greater than 10, to replace the data in the value column with zero for all rows indicated in the respective 'overlaps' column.

Expected output:

group	value	row_name	overlaps
group_a	4	1	2
group_a	5	2	3, 5
group_a	48	3	4, 5
group_a	0	4	5
group_a	0	5
group_b	12	6	7
group_b	0	7

Reproducible example:

data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
                             "group_a", "group_b", "group_b"),
                   value = c(4, 5 , 48, 54, 12, 12, 1),
                   row_name = c("1", "2", "3", "4", "5", "6", "7"),
                   overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))

I don't know if it's a very complex issue, but I'm stuck for hours and I didn't figure out how to solve it.

Does anyone have any suggestions on how I can solve this, by group, using dplyr or data.table?

CodePudding user response：

strsplit the overlaps column, subset to just those where data$value > 10, then use that distinct set of row_names to overwrite the original data with 0:

gr10 <- data$value > 10
sel <- Map(paste, data$group[gr10], strsplit(data$overlaps, ",\\s ")[gr10], sep="|")
sel <- Reduce(union, sel)
sel
#[1] "group_a|4" "group_a|5" "group_a|"  "group_b|7"
data$value[do.call(paste, c(data[c("group","row_name")], sep="|")) %in% sel] <- 0
data
#    group value row_name overlaps
#1 group_a     4        1        2
#2 group_a     5        2     3, 5
#3 group_a    48        3     4, 5
#4 group_a     0        4        5
#5 group_a     0        5         
#6 group_b    12        6        7
#7 group_b     0        7

If the row_names are unique across the entire dataset, you can use simpler logic:

sel <- Reduce(union, strsplit(data$overlaps, ",\\s ")[data$value > 10])
sel
#[1] "4" "5" "7"
data$value[data$row_name %in% sel] <- 0

Bonus data.table solution:

library(data.table)
setDT(data)

data[
  data[value > 10, .(row_name=unlist(strsplit(overlaps, ",\\s "))), by=group],
  on=.(group, row_name),
  value := 0
]

CodePudding user response：

Another solution, based on tidyverse:

library(tidyverse)

data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
                             "group_a", "group_b", "group_b"),
                   value = c(4, 5 , 48, 54, 12, 12, 1),
                   row_name = c("1", "2", "3", "4", "5", "6", "7"),
                   overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))

data %>% 
  separate(
    overlaps, into=c("o1", "o2"), sep=", ", fill="right", remove=F) %>% 
  mutate(across(o1:o2, ~ ifelse(value > 10, get(cur_column()), 0)),
         value = ifelse(row_number() %in% c_across(o1:o2), 0, value)) %>% 
  select(-o1, -o2)

#>     group value row_name overlaps
#> 1 group_a     4        1        2
#> 2 group_a     5        2     3, 5
#> 3 group_a    48        3     4, 5
#> 4 group_a     0        4        5
#> 5 group_a     0        5         
#> 6 group_b    12        6        7
#> 7 group_b     0        7