Randomly Deleting Parts of a Row-CodePudding

I have the following data frame:

# Subject identifiers, one per row.
id <- 1:100

# Weight at each of the ten time points: 100 normal draws, mean 100, sd 10.
weight_time_1 <- rnorm(100, mean = 100, sd = 10)
weight_time_2 <- rnorm(100, mean = 100, sd = 10)
weight_time_3 <- rnorm(100, mean = 100, sd = 10)
weight_time_4 <- rnorm(100, mean = 100, sd = 10)
weight_time_5 <- rnorm(100, mean = 100, sd = 10)
weight_time_6 <- rnorm(100, mean = 100, sd = 10)
weight_time_7 <- rnorm(100, mean = 100, sd = 10)
weight_time_8 <- rnorm(100, mean = 100, sd = 10)
weight_time_9 <- rnorm(100, mean = 100, sd = 10)
weight_time_10 <- rnorm(100, mean = 100, sd = 10)

# State at each time point: 100 integers drawn from 1..5 with replacement.
state_time_1 <- sample.int(5, size = 100, replace = TRUE)
state_time_2 <- sample.int(5, size = 100, replace = TRUE)
state_time_3 <- sample.int(5, size = 100, replace = TRUE)
state_time_4 <- sample.int(5, size = 100, replace = TRUE)
state_time_5 <- sample.int(5, size = 100, replace = TRUE)
state_time_6 <- sample.int(5, size = 100, replace = TRUE)
state_time_7 <- sample.int(5, size = 100, replace = TRUE)
state_time_8 <- sample.int(5, size = 100, replace = TRUE)
state_time_9 <- sample.int(5, size = 100, replace = TRUE)
state_time_10 <- sample.int(5, size = 100, replace = TRUE)

# Wide layout: id first, then one (weight, state) pair per time point.
my_data <- data.frame(
  id,
  weight_time_1, state_time_1,
  weight_time_2, state_time_2,
  weight_time_3, state_time_3,
  weight_time_4, state_time_4,
  weight_time_5, state_time_5,
  weight_time_6, state_time_6,
  weight_time_7, state_time_7,
  weight_time_8, state_time_8,
  weight_time_9, state_time_9,
  weight_time_10, state_time_10
)

head(my_data)
  id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1  1      119.3852            2     111.30729            5      99.11912            5      97.06366            1     103.73559            4     100.53940            3      90.98888            2      95.10628            3
2  2      124.5046            3      86.74208            4      96.87224            3      88.84019            2      92.39560            4      96.83324            3     108.60610            1      90.24227            3
3  3       98.3621            2     114.60002            1      91.61257            3     121.88707            2     103.78418            2      96.77586            2     103.58945            3     102.08050            3
4  4      102.8222            3      95.72920            5      92.51412            4     107.94097            4     105.07041            3     116.22625            1     100.52621            5     102.88718            1
5  5      114.0140            5      94.04442            2     112.10150            2     111.40825            4      90.93852            4      83.81637            3     118.08578            5      84.64170            3
6  6      113.0468            2      96.90621            1     102.99961            4      89.28867            1     107.19814            2      99.29141            1      79.91099            1     106.01940            1
  weight_time_9 state_time_9 weight_time_10 state_time_10
1     105.34245            5      106.61219             4
2      93.87486            4       95.14339             1
3      99.22730            1      108.46509             4
4      88.78866            1      114.68032             5
5      93.28602            5       91.50742             1
6     104.14194            4       98.67597             2

I want to randomly delete "parts of each row" from the left up until some column - this should look something like this ("red line" refers to deleted entries, e.g. replace with NA):

I thought of the following way to do this:

Step 1: First, randomly select which id's will be eligible to have deletions

 # Flag every id at random: 1 = delete, 2 = no delete
 id <- seq_len(100)
 delete_or_not_delete <- sample.int(n = 2, size = 100, replace = TRUE)
 deleted_ids <- data.frame(
   id = id,
   delete_or_not_delete = delete_or_not_delete
 )

Step 2: For id's that were selected to be deleted, randomly pick how many columns to be deleted (e.g. excluding the "id" column, 2 = first 2 columns deleted, 4 = first 4 columns deleted, etc.)

# Draw, for every id, how many leading columns (2, 4, ..., 18) would be removed.
col_delete <- sample(c(2, 4, 6, 8, 10, 12, 14, 16, 18), size = 100, replace = TRUE)

# Keep the drawn count only for ids flagged 1 (= delete); everyone else gets
# the text "NONE". Mixing numbers with text makes this a character column.
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == "1",
  col_delete,
  "NONE"
)

# The flag column is no longer needed once the final column exists.
deleted_ids$delete_or_not_delete <- NULL

In the end, I have something like this:

  id final_number_of_col_delete
1  1                       NONE
2  2                       NONE
3  3                         14
4  4                         14
5  5                         12
6  6                       NONE

Based on this file (deleted_ids), from "my_data" I would like to:

delete nothing from the row corresponding to id = 1
delete nothing from the row corresponding to id = 2
delete the first 14 columns (excluding the id column) from the row corresponding to id = 3
delete the first 14 columns (excluding the id column) from the row corresponding to id = 4
delete the first 12 columns (excluding the id column) from the row corresponding to id = 5
delete nothing from the row corresponding to id = 6
etc.

Can someone please show me how to do this?

Note: "Delete" here means "replace entries with NA".

CodePudding user response：

   # Blank out a random-length prefix of each row, never touching `id`.
   # Work on a copy of the question's data frame.
   df <- my_data

   # Number of leading data columns to blank per row (not counting `id`).
   # The NA entries mean "delete nothing for this row".
   opts <- c(NA, NA, NA, NA, 2, 4, 6, 8, 10, 12, 14, 16, 18)

   # replace = TRUE is required: there are more rows than options, so
   # sampling without replacement would stop with an error.
   del_to <- sample(opts, nrow(df), replace = TRUE)

   for (i in seq_len(nrow(df))) {
     if (!is.na(del_to[i])) {
       # Shift by one so column 1 (`id`) is kept and exactly del_to[i]
       # data columns, starting at column 2, become NA.
       df[i, 1 + seq_len(del_to[i])] <- NA
     }
   }

CodePudding user response：

Since every weight column is generated the same way, and likewise every state column, you can draw all the rnorm and sample.int values at once and store them in a single matrix.

Then use mapply for the random deletion, passing the TRUE/FALSE choices, the consecutive row IDs, and the vector of candidate columns as inputs. The <<- operator modifies the matrix M in the global environment from inside the anonymous function.

Finally, convert the matrix to a data frame with as.data.frame and name its columns with setNames.

n <- 100
set.seed(42)

# One row per id. Column 1 is the id, columns 2-11 hold the ten weight draws
# and columns 12-21 the ten state draws (the matrix is filled column-wise).
M <- matrix(c(seq_len(n), rnorm(10 * n, 100, 10),
              sample.int(5, 10 * n, replace = TRUE)),
            n, 1 + 10 + 10)

# For each row y: draw TRUE/FALSE from x; on TRUE pick one column index s from
# z and set both column s (a weight) and column s + 10 (the state of the same
# time point) to NA. `<<-` writes into M outside the anonymous function.
invisible(mapply(
  \(x, y, z) if (sample(x, 1)) {
    s <- sample(z, 1)
    M[y, c(s, s + 10)] <<- NA
  },
  list(c(TRUE, FALSE)), seq_len(n), list(2:10)
))

dat <- as.data.frame(M) |>
  setNames(c("id", paste0("weight_time_", 1:10), paste0("state_time_", 1:10)))

Looks like this:

head(dat)
#   id weight_time_1 weight_time_2 weight_time_3 weight_time_4 weight_time_5 weight_time_6 weight_time_7
# 1  1     113.70958     112.00965      79.99071      99.95379     113.34913     110.29141      97.51517
# 2  2      94.35302     110.44751     103.33777     107.60242      91.30728     109.14775            NA
# 3  3     103.63128      89.96791     111.71325     100.38991            NA      99.97544     109.87653
# 4  4     106.32863     118.48482     120.59539     107.35072            NA     101.36010     108.35568
# 5  5     104.04268      93.33227      86.23138      98.53527      94.21644      92.79846      93.39478
# 6  6      98.93875     101.05514      88.49144            NA      90.01261      98.01876     115.64069
#   weight_time_8 weight_time_9 weight_time_10 state_time_1 state_time_2 state_time_3 state_time_4 state_time_5
# 1     102.94692     106.88808      109.41924            2            4            5            3            2
# 2     103.92741     107.25083       97.51386            5            5            3            5            4
# 3      89.99156     102.17380      100.96479            3            5            4            5           NA
# 4      96.74273      97.98343       95.66069            4            2            1            1           NA
# 5      89.91651            NA      121.78668            4            3            4            1            1
# 6      93.64569      96.91062       70.41220            4            1            5           NA            4
#   state_time_6 state_time_7 state_time_8 state_time_9 state_time_10
# 1            5            4            2            3             5
# 2            5           NA            2            3             4
# 3            4            1            5            1             1
# 4            5            4            5            3             4
# 5            4            4            4           NA             3
# 6            4            2            5            4             1