R: Conditionally Deleting Parts of a Row-CodePudding

I am working with the R programming language.

I have the following data:

# Reproducible toy data: 100 ids, each with a weight and a state recorded
# at 10 time points.
set.seed(123)
id <- 1:100

# Weights: 100 normal draws (mean 100, sd 10) per time point.
weight_time_1 <- rnorm(100, mean = 100, sd = 10)
weight_time_2 <- rnorm(100, mean = 100, sd = 10)
weight_time_3 <- rnorm(100, mean = 100, sd = 10)
weight_time_4 <- rnorm(100, mean = 100, sd = 10)
weight_time_5 <- rnorm(100, mean = 100, sd = 10)
weight_time_6 <- rnorm(100, mean = 100, sd = 10)
weight_time_7 <- rnorm(100, mean = 100, sd = 10)
weight_time_8 <- rnorm(100, mean = 100, sd = 10)
weight_time_9 <- rnorm(100, mean = 100, sd = 10)
weight_time_10 <- rnorm(100, mean = 100, sd = 10)

# States: 100 integers drawn with replacement from 1..5 per time point.
state_time_1 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_2 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_3 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_4 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_5 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_6 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_7 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_8 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_9 <- sample.int(n = 5, size = 100, replace = TRUE)
state_time_10 <- sample.int(n = 5, size = 100, replace = TRUE)

# Column layout: id first, then (weight, state) pairs in time order, so
# every even column is a weight and every odd column after 1 is a state.
my_data <- data.frame(
  id,
  weight_time_1, state_time_1,
  weight_time_2, state_time_2,
  weight_time_3, state_time_3,
  weight_time_4, state_time_4,
  weight_time_5, state_time_5,
  weight_time_6, state_time_6,
  weight_time_7, state_time_7,
  weight_time_8, state_time_8,
  weight_time_9, state_time_9,
  weight_time_10, state_time_10
)

The data looks something like this:

  id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1  1      94.39524            1      92.89593            5     121.98810            3      92.84758            4      99.26444            2      93.98107            2     110.74012            3      92.71781            2
2  2      97.69823            1     102.56884            4     113.12413            3      92.47311            2      88.31349            2      90.06301            1      99.72653            1      84.59558            3
3  3     115.58708            2      97.53308            1      97.34855            4      90.61461            2      93.65252            3     110.26785            3      99.66670            3      93.06905            1
4  4     100.70508            2      96.52457            3     105.43194            1      89.47487            4      99.71158            5     107.51061            5      84.83932            5     101.18849            4
5  5     101.29288            1      90.48381            3      95.85660            5      95.62840            2     106.70696            5      84.90833            5     107.90385            2      86.35291            4
6  6     117.15065            3      99.54972            5      95.23753            3     103.31179            2      83.49453            4      99.04853            3      97.89266            3     105.89983            2
  weight_time_9 state_time_9 weight_time_10 state_time_10
1      103.5628            1       89.85886             4
2       93.4199            1       92.08686             2
3      108.5520            4      102.99594             2
4      111.5294            3      116.39052             1
5      102.7627            2      110.84617             1
6      101.4410            4       93.75433             3

I would like to randomly select some rows from this dataset - then, for these randomly selected rows: I would like to randomly choose a pair of (weight_time, state_time) and delete everything from the right of this randomly chosen pair. This would look something like this:

I recently found some code that shows how to delete everything from the left of this randomly chosen pair:

# Step 1: Randomly select ids that are eligible for deletion
# (1 = delete, 2 = no delete)
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For ids that were selected for deletion, randomly pick how many
# columns to delete, not counting the "id" column (2 = first 2 columns,
# i.e. the first (weight, state) pair; 4 = first 4 columns; etc.)
col_delete <- c(2, 4, 6, 8, 10, 12, 14, 16, 18)
col_delete <- sample(col_delete, 100, replace = TRUE)
deleted_ids$col_delete <- col_delete
# Mixing numbers with the "NONE" sentinel makes this column character, so
# the counts are stored as strings ("2", "4", ...).
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1,
  deleted_ids$col_delete,
  "NONE"
)
deleted_ids$col_delete <- NULL
deleted_ids$delete_or_not_delete <- NULL

# Step 3 (Not My Code): Delete everything from the left side
for (i in seq_len(nrow(my_data))) {
  n_delete <- deleted_ids$final_number_of_col_delete[i]
  if (n_delete != "NONE") {
    # Column 1 is the id, so deleting the first n data columns means
    # blanking columns 2..(n + 1). Stopping at column n would blank only
    # n - 1 columns and split the last (weight, state) pair.
    my_data[i, 2:(as.integer(n_delete) + 1)] <- NA
  }
  # Rows not selected for deletion are left untouched.
}

Now, I am trying to adapt this code so that it deletes everything from the right instead of deleting everything from the left.

Can someone please show me how to do this?

Thanks!

Note: "Delete" here means replace with NA

CodePudding user response：

A tidyverse solution

# Requires dplyr (filter, group_by, group_modify, mutate, across, bind_rows,
# arrange and the %>% pipe).
set.seed(1)
# Number of (weight, state) pairs: every column except id, two per pair.
n_grp <- (ncol(my_data) - 1) / 2
# Pick how many rows to modify, then which ids those are.
n_rm <- sample(1:nrow(my_data), 1)
id_rm <- sample(1:nrow(my_data), n_rm, replace = FALSE) %>% sort()
my_data %>% 
    filter(id %in% id_rm) %>% 
    group_by(id) %>% 
    group_modify(function(df, ...) {
        # `df` does not contain the grouping column `id`, so pair k starts
        # at column 2 * (k - 1) + 1 of `df` (1, 3, 5, ...).
        col_rm_n <- 2 * (sample(1:n_grp, 1) - 1) + 1
        # Blank the chosen pair and everything to its right.
        df %>% 
            mutate(across(col_rm_n:ncol(df), ~ NA))
    }) %>% 
    ungroup() %>%
    # Add back the rows that were not selected, then restore id order.
    bind_rows(
        my_data %>% filter(!id %in% id_rm)
    ) %>% 
    arrange(id)

CodePudding user response：

I think this will help :

# Step 1: Randomly select ids that are eligible for deletion
# (1 = delete, 2 = no delete)
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For ids that were selected for deletion, randomly pick the column
# at which deletion starts. Column 1 is the id and the (weight, state) pairs
# follow, so every even column is a weight column: 4 = delete from
# weight_time_2 onwards, ..., 20 = delete only the last pair.
col_delete <- c(4, 6, 8, 10, 12, 14, 16, 18, 20)
col_delete <- sample(col_delete, 100, replace = TRUE)
deleted_ids$col_delete <- col_delete
# Mixing numbers with the "NONE" sentinel makes this column character, so
# the start columns are stored as strings ("4", "6", ...).
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1,
  deleted_ids$col_delete,
  "NONE"
)
deleted_ids$col_delete <- NULL
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete everything from the right side
for (i in seq_len(nrow(my_data))) {
  start_col <- deleted_ids$final_number_of_col_delete[i]
  if (start_col != "NONE") {
    # Replace the chosen start column through the last column with NA;
    # the id and every pair to the left of the start column are kept.
    my_data[i, as.integer(start_col):ncol(my_data)] <- NA
  }
  # Rows not selected for deletion are left untouched.
}

CodePudding user response：

Another option:

# Requires dplyr and tidyr.
# Sample between 1 and 10 rows, reshape them to long form, keep only the
# first 2, 4, ..., or 20 measurements of each id, and reshape back to wide
# form; the measurements that were cut off come back as NA.
my_data |>
  slice_sample(n = sample(1:10, 1)) |>
  pivot_longer(cols = -id) |>
  group_by(id) |>
  group_map(
    \(rows, key) head(rows, sample(seq(2, 20, by = 2), 1)),
    .keep = TRUE
  ) |>
  bind_rows() |>
  pivot_wider(names_from = name, values_from = value)

CodePudding user response：

A simplified base R solution:

# generate `data`: a 100 x 21 numeric matrix with an id column followed by
# interleaved (weight_time_k, state_time_k) pairs for k = 1..10
set.seed(123)
wt <- replicate(10, rnorm(100, 100, 10))
dimnames(wt) <- list(1:100, paste0("weight_time_", 1:10))
st <- replicate(10, sample.int(5, 100, replace = TRUE))
dimnames(st) <- list(1:100, paste0("state_time_", 1:10))
# cbind(wt, st) holds weights in columns 1..10 and states in columns 11..20.
# The index 1, 11, 2, 12, ..., 10, 20 interleaves them pair by pair:
# (0:1) * 10 is recycled along rep(1:10, each = 2).
data <- cbind(wt, st)[, rep(1:10, each = 2) + (0:1) * 10]
data <- cbind(id = 1:100, data)

# pick rows and start columns to set to NA
# (even columns 2, 4, ..., 18 are the weight columns of pairs 1..9)
del <- sample(c(TRUE, FALSE), 100, replace = TRUE)
from_col <- sample(seq(2, 18, 2), 100, replace = TRUE)

# set selected indices to NA: for every selected row, blank the start
# column and everything to its right
for (i in which(del)) {
  data[i, from_col[i]:ncol(data)] <- NA
}

^{Created on 2022-05-28 by the reprex package (v2.0.1)}