I am trying to find a way to edit my data. I have a data frame that consists of a location column and then 10 other columns that consist of "Y"s, "N"s, and "-"s.
# Example data: V1 is the genomic location; V2..V11 hold "Y", "N" or "-".
# Bound to `df` because every snippet below refers to the data by that name.
df <- structure(
  list(
    V1 = c(
      "chr1:10150", "chr1:10219", "chr1:10230", "chr1:10427", "chr1:10439",
      "chr1:10440", "chr1:10443", "chr1:13459", "chr1:14397", "chr1:15219"
    ),
    V2 = c("Y", "Y", "Y", "N", "-", "N", "Y", "N", "Y", "N"),
    V3 = c("N", "N", "N", "N", "N", "N", "N", "N", "Y", "N"),
    V4 = c("N", "Y", "-", "-", "-", "N", "Y", "N", "-", "N"),
    V5 = c("Y", "Y", "Y", "Y", "Y", "Y", "Y", "N", "Y", "N"),
    V6 = c("-", "-", "-", "Y", "-", "Y", "Y", "N", "Y", "N"),
    V7 = c("Y", "N", "N", "N", "Y", "Y", "Y", "N", "Y", "N"),
    V8 = c("N", "N", "Y", "-", "-", "N", "Y", "N", "N", "N"),
    V9 = c("N", "N", "N", "N", "-", "N", "Y", "N", "Y", "-"),
    V10 = c("N", "Y", "N", "N", "N", "Y", "-", "Y", "Y", "N"),
    V11 = c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N")
  ),
  row.names = c(NA, -10L),
  class = "data.frame"
)
I would like to find a way to randomly edit each row at a set frequency. What I mean by this is that, if I wanted to edit 0-5 values in a row, the code would possibly edit 3 values in row 1, then possibly 0 values in row 2, then possibly 5 values in row 3, and so on. The reason for doing this is that I need the data to somewhat match the original data frame, so I can't completely randomize things. I created a case_when function to try and do this myself, but I am just not sure how to add some kind of frequency to the function. Your answer does not need to include the case_when function. I am adding it just in case it is helpful.
library(dplyr)
# Recode every column except V1: "Y" becomes "-", while "N" and "-" become "Y".
# Uses mutate(across()) in place of the superseded mutate_at() and the
# deprecated funs() helper.
df2 <- df %>%
  mutate(across(
    -V1,
    ~ case_when(
      .x == "Y" ~ "-",
      .x == "N" ~ "Y",
      .x == "-" ~ "Y"
    )
  ))
CodePudding user response:
Here's a function that should replace a random number (between minEdit and maxEdit) of columns per row with a random different value:
# Position of each allowed cell value in the cycle Y -> N -> - -> Y.
vals <- c("Y" = 1, "N" = 2, "-" = 3)

#' Randomly alter a random number of cells in each row
#'
#' For every row, a number of cells between `minEdit` and `maxEdit` is drawn,
#' that many distinct columns (excluding the first) are picked at random, and
#' each picked cell is replaced by one of the two *other* values in `vals`.
#'
#' @param df A data frame whose first column is an identifier and whose
#'   remaining columns contain only the values named in `vals`.
#' @param minEdit,maxEdit Bounds (inclusive) on the number of cells edited per
#'   row; neither may be negative or exceed `ncol(df) - 1`.
#' @return `df` with the selected cells replaced; the first column is untouched.
alter <- function(df, minEdit, maxEdit) {
  # drop = FALSE keeps a one-column selection as a data frame, so its column
  # name survives the conversion to a matrix.
  m <- as.matrix(df[, -1, drop = FALSE])

  counts <- minEdit:maxEdit
  stopifnot(all(counts >= 0), all(counts <= ncol(m)))

  # Draw positions into `counts` instead of calling sample(counts, ...):
  # when minEdit == maxEdit == k, sample(k, ...) would draw from 1:k.
  n <- counts[sample.int(length(counts), nrow(m), replace = TRUE)]

  # Two-column (row, col) matrix index of the cells to alter.
  rowIdx <- rep(seq_along(n), times = n)
  colIdx <- unlist(lapply(n, function(k) sample.int(ncol(m), k)))
  mIdx <- matrix(c(rowIdx, colIdx), ncol = 2)

  # Step 1 or 2 places along the cycle: a 0/1 offset is added to the current
  # position, which is then wrapped and shifted by one. The result therefore
  # always differs from the current value.
  offset <- sample(0:1, sum(n), replace = TRUE)
  newVals <- names(vals)[((vals[m[mIdx]] + offset) %% length(vals)) + 1L]

  df[, -1] <- replace(m, mIdx, newVals)
  df
}
> alter(df, 0L, 5L)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
1 chr1:10150 Y N N N - Y N N N N
2 chr1:10219 Y N Y Y - N N Y Y N
3 chr1:10230 Y N - Y - N - N N Y
4 chr1:10427 N Y - Y N - - N N Y
5 chr1:10439 - N N - - N Y - - N
6 chr1:10440 N N N Y Y Y - N Y -
7 chr1:10443 Y N Y Y Y Y Y Y - N
8 chr1:13459 N Y - N - N N Y Y -
9 chr1:14397 - - N Y Y Y N Y - -
10 chr1:15219 N N N N N N Y - N Y
If you want the substituted value to be deterministic rather than random, it would be something like this:
# valsFrom gives each current value's position in valsTo, so the fixed
# substitution is "Y" -> "-", "N" -> "Y", "-" -> "N".
valsFrom <- c("Y" = 1, "N" = 2, "-" = 3)
valsTo <- c("-", "Y", "N")

#' Alter a random number of cells in each row using a fixed substitution
#'
#' For every row, a number of cells between `minEdit` and `maxEdit` is drawn,
#' that many distinct columns (excluding the first) are picked at random, and
#' each picked cell is replaced according to `valsFrom` / `valsTo`.
#'
#' @param df A data frame whose first column is an identifier and whose
#'   remaining columns contain only the values named in `valsFrom`.
#' @param minEdit,maxEdit Bounds (inclusive) on the number of cells edited per
#'   row; neither may be negative or exceed `ncol(df) - 1`.
#' @return `df` with the selected cells replaced; the first column is untouched.
alter <- function(df, minEdit, maxEdit) {
  # drop = FALSE keeps a one-column selection as a data frame, so its column
  # name survives the conversion to a matrix.
  m <- as.matrix(df[, -1, drop = FALSE])

  counts <- minEdit:maxEdit
  stopifnot(all(counts >= 0), all(counts <= ncol(m)))

  # Draw positions into `counts` instead of calling sample(counts, ...):
  # when minEdit == maxEdit == k, sample(k, ...) would draw from 1:k.
  n <- counts[sample.int(length(counts), nrow(m), replace = TRUE)]

  # Two-column (row, col) matrix index of the cells to alter.
  rowIdx <- rep(seq_along(n), times = n)
  colIdx <- unlist(lapply(n, function(k) sample.int(ncol(m), k)))
  mIdx <- matrix(c(rowIdx, colIdx), ncol = 2)

  df[, -1] <- replace(m, mIdx, valsTo[valsFrom[m[mIdx]]])
  df
}
CodePudding user response:
Here is another possible (inelegant) solution.
It involves storing code in a column though, which is not advisable.
library(tidyverse)
library(purrrlyr)
# Substitution applied to an edited cell; the trailing "*" marks the cell as
# changed in the output.
replacer <- function(x) {
  case_when(x == "Y" ~ "-*", x == "N" ~ "Y*", x == "-" ~ "Y*")
}

# Every column except the first (the location column) is a candidate for edits.
cols <- names(df)[-1]

# For each row, pick how many columns to edit (between 1 and length(cols)) and
# which ones, then store the choice as the text of a `c(...)` call so it can
# be parsed back into a column selection below.
tagged <-
  df %>%
  rowwise() %>%
  mutate(how_many_edits = sample(seq_along(cols), size = 1, replace = FALSE)) %>%
  mutate(
    cols_to_edit = list(sample(cols, size = how_many_edits, replace = FALSE)),
    cols_to_edit = paste0(cols_to_edit, collapse = ","),
    cols_to_edit = paste0("c(", cols_to_edit, ")")
  )

modified <-
  tagged %>%
  by_row(
    ..f = function(df) {
      # str2lang() turns the stored "c(V2,V3)" text into a call that across()
      # uses as its column selection.
      mutate(df, across(.cols = !!str2lang(df$cols_to_edit), .fns = replacer))
    },
    .collate = "rows"
  ) %>% ## apply the replacer function to each row separately
  select(`.row`:ncol(.)) ## output of by_row is duplicated; keep second half
Output:
modified
#> # A tibble: 10 × 8
#> .row V1 V2 V3 V4 V5 how_many_edits cols_to_edit
#> <int> <chr> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 1 chr1:10150 -* Y* Y* Y 3 c(V2,V3,V4)
#> 2 2 chr1:10219 -* Y* -* -* 4 c(V4,V5,V2,V3)
#> 3 3 chr1:10230 -* Y* Y* Y 3 c(V3,V4,V2)
#> 4 4 chr1:10427 Y* N Y* -* 3 c(V4,V2,V5)
#> 5 5 chr1:10439 - Y* Y* Y 2 c(V4,V3)
#> 6 6 chr1:10440 N N N -* 1 c(V5)
#> 7 7 chr1:10443 Y Y* Y -* 2 c(V3,V5)
#> 8 8 chr1:13459 Y* Y* Y* Y* 4 c(V5,V2,V3,V4)
#> 9 9 chr1:14397 -* -* Y* -* 4 c(V4,V3,V2,V5)
#> 10 10 chr1:15219 Y* Y* N Y* 3 c(V5,V3,V2)
Created on 2021-10-25 by the reprex package (v2.0.1)