Home > Software engineering >  Replacing values bigger than threshold with 0 in specified range of columns in R dataframe
Replacing values bigger than threshold with 0 in specified range of columns in R dataframe

Time:11-18

I have a dataset df1 with columns from S_2018_p to S_2021_p where I want to replace the values >= 10 with 0. I would expect a dataset like df2.

library(data.table)
df1 = data.table(
  ID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7"),
  "string1" = c("x2", "g3", "n2", "m3", "2w", "ps2", "kg2"),
  "S_2018_p" = c(3,5,11,3,9,22,6),
  "S_2019_p" = c(3,5,6,21,1,4,0),
  "S_2020_p" = c(0,4,13,9,16,7,9),
  "S_2021_p" = c(4,0,3,8,5,4,6),
  "string2" = c("si", "q2", "oq", "mx", "ix", "p2", "2q"))

  ID string1 S_2018_p S_2019_p S_2020_p S_2021_p string2
1: a1      x2        3        3        0        4      si
2: a2      g3        5        5        4        0      q2
3: a3      n2       11        6       13        3      oq
4: a4      m3        3       21        9        8      mx
5: a5      2w        9        1       16        5      ix
6: a6     ps2       22        4        7        4      p2
7: a7     kg2        6        0        9        6      2q
df2 = data.table(
  ID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7"),
  "string1" = c("x2", "g3", "n2", "m3", "2w", "ps2", "kg2"),
  "S_2018_p" = c(3,5,0,3,9,0,6),
  "S_2019_p" = c(3,5,6,0,1,4,0),
  "S_2020_p" = c(0,4,0,9,0,7,9),
  "S_2021_p" = c(4,0,3,8,5,4,6),
  "string2" = c("si", "q2", "oq", "mx", "ix", "p2", "2q"))

   ID string1 S_2018_p S_2019_p S_2020_p S_2021_p string2
1: a1      x2        3        3        0        4      si
2: a2      g3        5        5        4        0      q2
3: a3      n2        0        6        0        3      oq
4: a4      m3        3        0        9        8      mx
5: a5      2w        9        1        0        5      ix
6: a6     ps2        0        4        7        4      p2
7: a7     kg2        6        0        9        6      2q

I tried to do it with mutete_if, but I don't get the desired output.

df1 %>% 
  mutate_if(is.numeric, ~1 * (. >= 10))

CodePudding user response:

Dplyr (use the latest version) has a nice "across()" function to be used with mutate. Just be sure to update your dplyr package as it is quite recent

library(dplyr)

df1 %>% mutate(across(where(is.numeric), function(x) ifelse(x >= 10, 0, x)))

   ID string1 S_2018_p S_2019_p S_2020_p S_2021_p string2
1: a1      x2        3        3        0        4      si
2: a2      g3        5        5        4        0      q2
3: a3      n2        0        6        0        3      oq
4: a4      m3        3        0        9        8      mx
5: a5      2w        9        1        0        5      ix
6: a6     ps2        0        4        7        4      p2
7: a7     kg2        6        0        9        6      2q

CodePudding user response:

You can use the apply and ifelse functions on the specific columns of interest. For example:

apply(df1[,c(3,4,5,6)], MARGIN = c(1,2), FUN = function(x) ifelse(x >= 10, 0, x))

The apply function will work on the selected rows of the data frame (df1[,c(3,4,5,6)]) and will apply the function FUN = ifelse(x >= 10, 0, x) (if x is greater or equal to ten, replace it with 0, else, replace it with itself (don't replace it)) on each cells of the dataframe (MARGIN = c(1,2)).

You can of course then replace the dataframe part with the corrected part:

df1[,c(3,4,5,6)] <- apply(df1[,c(3,4,5,6)], MARGIN = c(1,2), FUN = function(x) ifelse(x >= 10, 0, x))

CodePudding user response:

Another solution, based on dplyr:

library(tidyverse)
library(data.table)

df1 = data.table(
  ID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7"),
  "string1" = c("x2", "g3", "n2", "m3", "2w", "ps2", "kg2"),
  "S_2018_p" = c(3,5,11,3,9,22,6),
  "S_2019_p" = c(3,5,6,21,1,4,0),
  "S_2020_p" = c(0,4,13,9,16,7,9),
  "S_2021_p" = c(4,0,3,8,5,4,6),
  "string2" = c("si", "q2", "oq", "mx", "ix", "p2", "2q"))

df1 %>% 
  mutate(
    across(
      where(is.numeric),
      ~ if_else(get(cur_column()) > 10, 0, get(cur_column()))))

#>    ID string1 S_2018_p S_2019_p S_2020_p S_2021_p string2
#> 1: a1      x2        3        3        0        4      si
#> 2: a2      g3        5        5        4        0      q2
#> 3: a3      n2        0        6        0        3      oq
#> 4: a4      m3        3        0        9        8      mx
#> 5: a5      2w        9        1        0        5      ix
#> 6: a6     ps2        0        4        7        4      p2
#> 7: a7     kg2        6        0        9        6      2q

And yet another solution, based on purrr::map_if:

library(tidyverse)
library(data.table)

df1 = data.table(
  ID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7"),
  "string1" = c("x2", "g3", "n2", "m3", "2w", "ps2", "kg2"),
  "S_2018_p" = c(3,5,11,3,9,22,6),
  "S_2019_p" = c(3,5,6,21,1,4,0),
  "S_2020_p" = c(0,4,13,9,16,7,9),
  "S_2021_p" = c(4,0,3,8,5,4,6),
  "string2" = c("si", "q2", "oq", "mx", "ix", "p2", "2q"))

df1 %>% map_if(is.numeric, ~ ifelse(.x > 10 , 0, .x)) %>% as.data.table

#>    ID string1 S_2018_p S_2019_p S_2020_p S_2021_p string2
#> 1: a1      x2        3        3        0        4      si
#> 2: a2      g3        5        5        4        0      q2
#> 3: a3      n2        0        6        0        3      oq
#> 4: a4      m3        3        0        9        8      mx
#> 5: a5      2w        9        1        0        5      ix
#> 6: a6     ps2        0        4        7        4      p2
#> 7: a7     kg2        6        0        9        6      2q
  • Related