How to alter values at a set frequency-CodePudding

I am trying to find a way to edit my data. I have a data frame that consists of a location column and then 10 other columns that consist of "Y"s, "N"s, and "-"s.

# Example data as dput() output: 10 rows, a location column V1 (strings such
# as "chr1:10150") followed by ten columns V2..V11 holding "Y", "N" or "-".
# The expression is not assigned here; the later snippets refer to it as `df`.
structure(list(V1 = c("chr1:10150", "chr1:10219", "chr1:10230", 
"chr1:10427", "chr1:10439", "chr1:10440", "chr1:10443", "chr1:13459", 
"chr1:14397", "chr1:15219"), V2 = c("Y", "Y", "Y", "N", "-", 
"N", "Y", "N", "Y", "N"), V3 = c("N", "N", "N", "N", "N", "N", 
"N", "N", "Y", "N"), V4 = c("N", "Y", "-", "-", "-", "N", "Y", 
"N", "-", "N"), V5 = c("Y", "Y", "Y", "Y", "Y", "Y", "Y", "N", 
"Y", "N"), V6 = c("-", "-", "-", "Y", "-", "Y", "Y", "N", "Y", 
"N"), V7 = c("Y", "N", "N", "N", "Y", "Y", "Y", "N", "Y", "N"
), V8 = c("N", "N", "Y", "-", "-", "N", "Y", "N", "N", "N"), 
    V9 = c("N", "N", "N", "N", "-", "N", "Y", "N", "Y", "-"), 
    V10 = c("N", "Y", "N", "N", "N", "Y", "-", "Y", "Y", "N"), 
    V11 = c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N")), row.names = c(NA, 
-10L), class = "data.frame")

I would like to find a way to randomly edit each row at a set frequency. What I mean by this is that, if I wanted to edit 0-5 values in a row, the code will possibly edit 3 values in row 1, then possibly 0 values in row 2, then possibly 5 values in row 3, and so on. The reason for doing this is that I need the data to somewhat match the original dataframe, so I can't completely randomize things. I created a case_when function to try and do this myself, but I am just not sure how to add some kind of frequency to the function. Your answer does not need to include the case_when function. I am adding it just in case it is helpful.

library(dplyr)
# Recode every column except the location column V1:
#   "Y" -> "-", "N" -> "Y", "-" -> "Y".
# case_when() has no fallback branch here, so any other value becomes NA.
# across() replaces the superseded mutate_at() and the deprecated funs().
df2 <- df %>%
  mutate(across(
    -V1,
    ~ case_when(
      .x == "Y" ~ "-",
      .x == "N" ~ "Y",
      .x == "-" ~ "Y"
    )
  ))

CodePudding user response：

Here's a function that should replace a random number (between minEdit and maxEdit) of columns per row with a random different value:

# Symbol -> position in the rotation Y, N, -; names(vals) maps a position
# back to its symbol.
vals <- c("Y" = 1, "N" = 2, "-" = 3)

#' Randomly change a random number of cells in every row.
#'
#' The first column of `df` is left untouched. For each row, a count between
#' `minEdit` and `maxEdit` (inclusive) is drawn, that many distinct columns
#' are picked, and each picked cell is replaced by one of the two other
#' symbols named in `vals`.
#'
#' @param df Data frame whose first column is an identifier and whose other
#'   columns hold the symbols named in `vals`. A picked cell holding anything
#'   else comes back as `NA`.
#' @param minEdit Smallest number of cells to change per row (>= 0).
#' @param maxEdit Largest number of cells to change per row; at most
#'   `ncol(df) - 1`.
#' @return `df` with the selected cells replaced.
alter <- function(df, minEdit, maxEdit) {
  # drop = FALSE keeps a one-column selection as a data frame.
  m <- as.matrix(df[, -1, drop = FALSE])
  stopifnot(
    min(minEdit, maxEdit) >= 0,
    max(minEdit, maxEdit) <= ncol(m)
  )

  # Number of cells to alter in each row. Index into the candidate counts
  # rather than calling sample(minEdit:maxEdit, ...): sample() treats a
  # single number x as 1:x, which is wrong when minEdit == maxEdit.
  counts <- minEdit:maxEdit
  n <- counts[sample.int(length(counts), nrow(m), replace = TRUE)]

  # Two-column (row, column) index of the cells to alter. Built with rep()
  # and lapply() so it is a valid 0 x 2 matrix when nothing is selected.
  rowIdx <- rep(seq_along(n), times = n)
  colIdx <- as.integer(
    unlist(lapply(n, function(k) sample.int(ncol(m), k)))
  )
  mIdx <- cbind(rowIdx, colIdx)

  # With positions 1..3 and a shift of 0 or 1, ((pos + shift) %% 3) + 1
  # always lands on one of the two positions other than `pos`.
  shift <- sample(0:1, sum(n), replace = TRUE)
  m[mIdx] <- names(vals)[((vals[m[mIdx]] + shift) %% length(vals)) + 1L]

  df[, -1] <- m
  df
}

> alter(df, 0L, 5L)
           V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
1  chr1:10150  Y  N  N  N  -  Y  N  N   N   N
2  chr1:10219  Y  N  Y  Y  -  N  N  Y   Y   N
3  chr1:10230  Y  N  -  Y  -  N  -  N   N   Y
4  chr1:10427  N  Y  -  Y  N  -  -  N   N   Y
5  chr1:10439  -  N  N  -  -  N  Y  -   -   N
6  chr1:10440  N  N  N  Y  Y  Y  -  N   Y   -
7  chr1:10443  Y  N  Y  Y  Y  Y  Y  Y   -   N
8  chr1:13459  N  Y  -  N  -  N  N  Y   Y   -
9  chr1:14397  -  -  N  Y  Y  Y  N  Y   -   -
10 chr1:15219  N  N  N  N  N  N  Y  -   N   Y

If instead you want the substituted value to be deterministic instead of random, it would be something like this:

# valsFrom maps a symbol to a position; valsTo holds the replacement found at
# that position, i.e. "Y" -> "-", "N" -> "Y", "-" -> "N".
valsFrom <- c("Y" = 1, "N" = 2, "-" = 3)
valsTo <- c("-", "Y", "N")

#' Change a random number of cells in every row using a fixed mapping.
#'
#' The first column of `df` is left untouched. For each row, a count between
#' `minEdit` and `maxEdit` (inclusive) is drawn, that many distinct columns
#' are picked, and each picked cell is replaced according to
#' `valsFrom`/`valsTo`.
#'
#' @param df Data frame whose first column is an identifier and whose other
#'   columns hold the symbols named in `valsFrom`. A picked cell holding
#'   anything else comes back as `NA`.
#' @param minEdit Smallest number of cells to change per row (>= 0).
#' @param maxEdit Largest number of cells to change per row; at most
#'   `ncol(df) - 1`.
#' @return `df` with the selected cells replaced.
alter <- function(df, minEdit, maxEdit) {
  # drop = FALSE keeps a one-column selection as a data frame.
  m <- as.matrix(df[, -1, drop = FALSE])
  stopifnot(
    min(minEdit, maxEdit) >= 0,
    max(minEdit, maxEdit) <= ncol(m)
  )

  # Number of cells to alter in each row. Index into the candidate counts
  # rather than calling sample(minEdit:maxEdit, ...): sample() treats a
  # single number x as 1:x, which is wrong when minEdit == maxEdit.
  counts <- minEdit:maxEdit
  n <- counts[sample.int(length(counts), nrow(m), replace = TRUE)]

  # Two-column (row, column) index of the cells to alter. Built with rep()
  # and lapply() so it is a valid 0 x 2 matrix when nothing is selected.
  rowIdx <- rep(seq_along(n), times = n)
  colIdx <- as.integer(
    unlist(lapply(n, function(k) sample.int(ncol(m), k)))
  )
  mIdx <- cbind(rowIdx, colIdx)

  # Look up each selected cell's position, then its fixed replacement.
  m[mIdx] <- valsTo[valsFrom[m[mIdx]]]

  df[, -1] <- m
  df
}

CodePudding user response：

Here is another possible (inelegant) solution.

It involves storing code in a column though, which is not advisable.

library(tidyverse)
library(purrrlyr)

# Recode a character vector and tag every recoded value with "*":
# "Y" becomes "-*"; "N" and "-" both become "Y*". Values matching neither
# condition get NA, because case_when() has no fallback branch here.
replacer <- function(x) {
  case_when(
    x == "Y" ~ "-*",
    x %in% c("N", "-") ~ "Y*"
  )
}

# Names of the editable columns: every column except the first one.
cols <- names(df)[-1]

# For every row, draw how many columns to edit (at least one) and which ones,
# then store the selection as the text of a c(...) call, e.g. "c(V2,V3,V4)".
tagged <-
  df %>%
  rowwise() %>%
  mutate(
    how_many_edits = sample(seq_along(cols), size = 1, replace = FALSE)
  ) %>%
  mutate(
    cols_to_edit = list(sample(cols, size = how_many_edits, replace = FALSE)),
    cols_to_edit = paste0(cols_to_edit, collapse = ","),
    cols_to_edit = paste0("c(", cols_to_edit, ")")
  )

# Apply `replacer` one row at a time to the columns named in that row's
# cols_to_edit string: str2lang() parses the stored "c(...)" text into a call
# and `!!` splices it in as the across() column selection.
# NOTE(review): this evaluates code kept as text in a data column, which the
# accompanying answer text itself calls not advisable.
modified <- 
  tagged %>% 
  by_row(..f = function(df) mutate(df, across(.cols = !!str2lang(df$cols_to_edit),
                                              .fns = replacer)), 
         .collate = "rows") %>%  ## apply the replacer function to each row separately
  select(`.row`:ncol(.))   ## output of by_row is duplicated; keep second half

Output:

modified
#> # A tibble: 10 × 8
#>     .row V1         V2    V3    V4    V5    how_many_edits cols_to_edit  
#>    <int> <chr>      <chr> <chr> <chr> <chr>          <int> <chr>         
#>  1     1 chr1:10150 -*    Y*    Y*    Y                  3 c(V2,V3,V4)   
#>  2     2 chr1:10219 -*    Y*    -*    -*                 4 c(V4,V5,V2,V3)
#>  3     3 chr1:10230 -*    Y*    Y*    Y                  3 c(V3,V4,V2)   
#>  4     4 chr1:10427 Y*    N     Y*    -*                 3 c(V4,V2,V5)   
#>  5     5 chr1:10439 -     Y*    Y*    Y                  2 c(V4,V3)      
#>  6     6 chr1:10440 N     N     N     -*                 1 c(V5)         
#>  7     7 chr1:10443 Y     Y*    Y     -*                 2 c(V3,V5)      
#>  8     8 chr1:13459 Y*    Y*    Y*    Y*                 4 c(V5,V2,V3,V4)
#>  9     9 chr1:14397 -*    -*    Y*    -*                 4 c(V4,V3,V2,V5)
#> 10    10 chr1:15219 Y*    Y*    N     Y*                 3 c(V5,V3,V2)

^{Created on 2021-10-25 by the reprex package (v2.0.1)}