Home > Software engineering >  R: Conditionally Deleting Parts of a Row
R: Conditionally Deleting Parts of a Row


I am working with the R programming language.

I have the following data:

# Simulate a wide panel: one row per id, ten (weight, state) measurement pairs.
n_id <- 100L   # number of rows (subjects)
n_time <- 10L  # number of time points, i.e. (weight, state) pairs per row

id <- seq_len(n_id)

# Draw every weight vector first and every state vector second, so that a
# given seed yields the same values as ten rnorm() calls followed by ten
# sample.int() calls.
weights <- replicate(n_time, rnorm(n_id, 100, 10), simplify = FALSE)
states <- replicate(
  n_time, sample.int(5, n_id, replace = TRUE), simplify = FALSE
)
names(weights) <- paste0("weight_time_", seq_len(n_time))
names(states) <- paste0("state_time_", seq_len(n_time))

# Interleave the columns: weight_time_1, state_time_1, weight_time_2, ...
pair_order <- as.vector(rbind(names(weights), names(states)))
my_data <- data.frame(id = id, c(weights, states)[pair_order])

The data looks something like this:

  id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1  1      94.39524            1      92.89593            5     121.98810            3      92.84758            4      99.26444            2      93.98107            2     110.74012            3      92.71781            2
2  2      97.69823            1     102.56884            4     113.12413            3      92.47311            2      88.31349            2      90.06301            1      99.72653            1      84.59558            3
3  3     115.58708            2      97.53308            1      97.34855            4      90.61461            2      93.65252            3     110.26785            3      99.66670            3      93.06905            1
4  4     100.70508            2      96.52457            3     105.43194            1      89.47487            4      99.71158            5     107.51061            5      84.83932            5     101.18849            4
5  5     101.29288            1      90.48381            3      95.85660            5      95.62840            2     106.70696            5      84.90833            5     107.90385            2      86.35291            4
6  6     117.15065            3      99.54972            5      95.23753            3     103.31179            2      83.49453            4      99.04853            3      97.89266            3     105.89983            2
  weight_time_9 state_time_9 weight_time_10 state_time_10
1      103.5628            1       89.85886             4
2       93.4199            1       92.08686             2
3      108.5520            4      102.99594             2
4      111.5294            3      116.39052             1
5      102.7627            2      110.84617             1
6      101.4410            4       93.75433             3

I would like to randomly select some rows from this dataset - then, for these randomly selected rows: I would like to randomly choose a pair of (weight_time, state_time) and delete everything from the right of this randomly chosen pair. This would look something like this:

enter image description here

I recently found some code that shows how to delete everything from the left of this randomly chosen pair:

# Step 1: Randomly select ids that are eligible for deletion
# 1 = delete, 2 = no delete
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For ids that were selected, randomly pick how many columns to delete,
# not counting the "id" column (2 = first pair deleted, 4 = first two pairs
# deleted, etc.). Rows that are left alone get the sentinel "NONE", which makes
# the whole column character.
col_delete <- c(2, 4, 6, 8, 10, 12, 14, 16, 18)
col_delete <- sample(col_delete, 100, replace = TRUE)
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1, col_delete, "NONE"
)
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete everything from the left side
for (i in seq_len(nrow(my_data))) {
  n_del <- deleted_ids$final_number_of_col_delete[i]
  # Skip the rows marked "NONE"; they keep all of their values.
  if (n_del != "NONE") {
    # Start at column 2 to retain the id column. Deleting n_del columns after
    # it means ending at column n_del + 1, so no (weight, state) pair is split.
    my_data[i, 2:(as.numeric(n_del) + 1)] <- NA
  }
}

Now, I am trying to adapt this code so that it deletes everything from the right instead of deleting everything from the left.

Can someone please show me how to do this?


Note: "Delete" here means "replace with NA".

CodePudding user response:

A tidyverse solution

library(dplyr)

# Number of (weight, state) pairs per row: every column except `id` is part
# of one pair.
n_grp <- (ncol(my_data) - 1) / 2

# Pick how many rows to truncate, then which ones. The drawn row numbers are
# matched against `id` below, so this assumes `id` equals the row number, as it
# does in the example data.
n_rm <- sample(seq_len(nrow(my_data)), 1)
id_rm <- sort(sample(seq_len(nrow(my_data)), n_rm, replace = FALSE))

my_data %>%
  filter(id %in% id_rm) %>%
  group_by(id) %>%
  group_modify(function(df, ...) {
    # `df` does not contain the grouping column `id`, so the weight column of
    # pair g sits at position 2 * (g - 1) + 1.
    col_rm_n <- 2 * (sample(seq_len(n_grp), 1) - 1) + 1
    # Blank out the chosen pair and everything to its right.
    df %>%
      mutate(across(all_of(col_rm_n:ncol(df)), ~ NA))
  }) %>%
  ungroup() %>%
  # Put the untouched rows back and restore the original row order.
  bind_rows(
    my_data %>% filter(!id %in% id_rm)
  ) %>%
  arrange(id)

CodePudding user response:

I think this will help:

# Step 1: Randomly select ids that are eligible for deletion
# 1 = delete, 2 = no delete
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For ids that were selected, randomly pick the column at which
# deletion starts. Column 1 is "id" and every even column is a weight column,
# so 4 = delete from weight_time_2 onwards, 6 = from weight_time_3 onwards,
# etc. Rows that are left alone get the sentinel "NONE", which makes the whole
# column character.
col_delete <- c(4, 6, 8, 10, 12, 14, 16, 18, 20)
col_delete <- sample(col_delete, 100, replace = TRUE)
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1, col_delete, "NONE"
)
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete everything from the chosen column to the right edge
for (i in seq_len(nrow(my_data))) {
  from_col <- deleted_ids$final_number_of_col_delete[i]
  # Skip the rows marked "NONE"; they keep all of their values.
  if (from_col != "NONE") {
    my_data[i, as.numeric(from_col):ncol(my_data)] <- NA
  }
}

CodePudding user response:

Another option:

# Reshape to long form, keep only the first k name/value rows of each id
# (k drawn from 2, 4, ..., 20), then reshape back to wide form.
# Only the rows drawn by slice_sample() appear in the result.
local({
  n_rows <- sample(1:10, 1)
  long <- pivot_longer(slice_sample(my_data, n = n_rows), -id)
  kept <- group_map(
    group_by(long, id),
    function(rows, key) head(rows, sample(seq(2, 20, 2), 1)),
    .keep = TRUE  # keep `id` in each piece so pivot_wider() can use it
  )
  pivot_wider(bind_rows(kept), names_from = "name", values_from = "value")
})

CodePudding user response:

A simplified base R solution:

# generate `data`: a numeric matrix with an id column followed by ten
# interleaved (weight, state) pairs
wt <- replicate(10, rnorm(100, 100, 10))
dimnames(wt) <- list(1:100, paste0("weight_time_", 1:10))
st <- replicate(10, sample.int(5, 100, replace = TRUE))
dimnames(st) <- list(1:100, paste0("state_time_", 1:10))
# cbind(wt, st) holds the weights in columns 1-10 and the states in columns
# 11-20; the index 1, 11, 2, 12, ... interleaves them pair by pair.
data <- cbind(wt, st)[, rep(1:10, each = 2) + (0:1) * 10]
data <- cbind(id = 1:100, data)

# pick rows and start columns to set to NA
del <- sample(c(TRUE, FALSE), 100, replace = TRUE)
from_col <- sample(seq(2, 18, 2), 100, replace = TRUE)

# set selected indices to NA: `del` and `from_col` have one entry per row, so
# they recycle down each column of the mask; a cell is blanked when its row was
# picked and its column is at or right of that row's start column
data[del & col(data) >= from_col] <- NA

Created on 2022-05-28 by the reprex package (v2.0.1)

  • Related