Home > Software engineering >  R - Set values by group based on a condition in a dataframe
R - Set values by group based on a condition in a dataframe

Time:11-18

I have the following dataset.

group value row_name overlaps
group_a 4 1 2
group_a 5 2 3, 5
group_a 48 3 4, 5
group_a 54 4 5
group_a 12 5
group_b 12 6 7
group_b 1 7

The overlaps column indicates which rows have a certain 'overlap'.

I would like, only for rows whose value is greater than 10, to replace the data in the value column with zero for all rows indicated in the respective 'overlaps' column.

Expected output:

group value row_name overlaps
group_a 4 1 2
group_a 5 2 3, 5
group_a 48 3 4, 5
group_a 0 4 5
group_a 0 5
group_b 12 6 7
group_b 0 7

Reproducible example:

data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
                             "group_a", "group_b", "group_b"),
                   value = c(4, 5 , 48, 54, 12, 12, 1),
                   row_name = c("1", "2", "3", "4", "5", "6", "7"),
                   overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))

I don't know if it's a very complex issue, but I'm stuck for hours and I didn't figure out how to solve it.

Does anyone have any suggestions on how I can solve this, by group, using dplyr or data.table?

CodePudding user response:

strsplit the overlaps column, subset to just those where data$value > 10, then use that distinct set of row_names to overwrite the original data with 0:

gr10 <- data$value > 10
sel <- Map(paste, data$group[gr10], strsplit(data$overlaps, ",\\s ")[gr10], sep="|")
sel <- Reduce(union, sel)
sel
#[1] "group_a|4" "group_a|5" "group_a|"  "group_b|7"
data$value[do.call(paste, c(data[c("group","row_name")], sep="|")) %in% sel] <- 0
data
#    group value row_name overlaps
#1 group_a     4        1        2
#2 group_a     5        2     3, 5
#3 group_a    48        3     4, 5
#4 group_a     0        4        5
#5 group_a     0        5         
#6 group_b    12        6        7
#7 group_b     0        7         

If the row_names are unique across the entire dataset, you can use simpler logic:

sel <- Reduce(union, strsplit(data$overlaps, ",\\s ")[data$value > 10])
sel
#[1] "4" "5" "7"
data$value[data$row_name %in% sel] <- 0

Bonus data.table solution:

library(data.table)
setDT(data)

data[
  data[value > 10, .(row_name=unlist(strsplit(overlaps, ",\\s "))), by=group],
  on=.(group, row_name),
  value := 0
]

CodePudding user response:

Another solution, based on tidyverse:

library(tidyverse)

data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
                             "group_a", "group_b", "group_b"),
                   value = c(4, 5 , 48, 54, 12, 12, 1),
                   row_name = c("1", "2", "3", "4", "5", "6", "7"),
                   overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))

data %>% 
  separate(
    overlaps, into=c("o1", "o2"), sep=", ", fill="right", remove=F) %>% 
  mutate(across(o1:o2, ~ ifelse(value > 10, get(cur_column()), 0)),
         value = ifelse(row_number() %in% c_across(o1:o2), 0, value)) %>% 
  select(-o1, -o2)

#>     group value row_name overlaps
#> 1 group_a     4        1        2
#> 2 group_a     5        2     3, 5
#> 3 group_a    48        3     4, 5
#> 4 group_a     0        4        5
#> 5 group_a     0        5         
#> 6 group_b    12        6        7
#> 7 group_b     0        7
  • Related