Randomly sampling iteratively from a data frame

I have the following data frame, which simulates a panel data set (i.e., multiple waves per unit).

dat <- structure(list(x = c(-0.32, -0.26, 0.05, -0.37, -0.37, -0.08, 
-0.01, 0.05, 0.19, -0.48, 0.37, 0.05, -0.58, -0.18, -0.04, -0.28, 
-0.44, -0.48, 1.05, 0.62, 0.85, 0.42, 0.7, 0.64, -0.19, -0.11, 
-0.65, -0.01, 0.39, -0.02, -0.23, -0.6, -0.1, 0.39, 0.33, 0.39, 
-0.09, -0.16, 0.26, -0.62, -0.44, -0.6, -0.17, -0.27, -0.12, 
-0.53, -0.38, -0.33, -0.17, -0.11, -0.25, -0.92, -0.6, -0.81, 
0.75, 0.52, 0.57, 1.32, 1.21, 1.21), y = c(-0.42, -2.01, -1.19, 
0.7, 1.28, 1.37, 0.52, 2.04, 2.34, -1.45, 2.84, 0.1, -3.12, 0.22, 
-0.06, -1.65, -0.9, -1.5, -0.98, -0.69, 0.15, 1.7, 1.47, 0.15, 
0.26, 0.84, 0.35, 0.86, -1.23, -0.74, -1.79, -0.56, -2.15, 2.11, 
2.34, 0.57, 0.38, 0.57, 0.97, 0.32, -1.71, -0.8, 1.45, -0.12, 
1.93, 2.76, 0.08, -2.8, -0.06, 1.09, -0.4, 0.41, 0.02, -1.61, 
1.75, 1.6, -0.19, 0.13, -0.89, -1.1), unit = c(1, 1, 1, 2, 2, 
2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 
9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 
20, 20), wave = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(unit = c(1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20), .rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 
    19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42, 43:45, 
    46:48, 49:51, 52:54, 55:57, 58:60), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))

I now want to simulate attrition in this data set: some units drop out with a certain probability in wave 2; some of those who remain drop out in wave 3; and so on up to wave n. The probability is the same at each step. Note that the approach should be flexible with respect to the number of waves.
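
For illustration, with 20 units and a retention of 2/3 per wave, the panel should shrink to roughly 13 and then 9 units (these numbers just restate the setup below):

round(20 * (2/3)^(0:2))
#> [1] 20 13  9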

Here's what I came up with. It works, but it feels slow to me, and because the number of waves can vary, I am not sure how to avoid the loop.

library(dplyr)

# number of units and number of observations per unit:
n <- 20
n_perunit <- 3

# define attrition probability (as used below, this is actually the
# share of units that *remains* at each wave):
attrition <- 2/3

# Start with a vector of all units
remaining <- 1:n

# loop through waves beginning with 2
system.time(for (i in 2:n_perunit) {
  n_remaining <- round(length(remaining)*attrition)
  remaining <- sample(remaining, n_remaining)
  dat <- dat %>% 
    mutate(drop = ifelse(
      wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
    filter(drop == FALSE) %>%
    mutate(drop = NULL)
})

Efficiency:

   user  system elapsed 
  0.016   0.000   0.016 

Any ideas how to improve this?

EDIT:

Based on @jpsmith's answer (which, as far as I can see, does not work for groups in which nobody drops out, because min(which(dropout == "Yes")) returns Inf for those), I came up with the following:

set.seed(1234)
system.time(if (!is.null(attrition)) {
  # assign a 1 or 0 indicating dropout
  dat <- dat %>%
    mutate(dropout = ifelse(
      wave > 1, sample(
        0:1, n(), prob = c(attrition, 1-attrition), replace = TRUE), 0))
  # first get the first (minimum) dropout in each unit...
  dat <- dat %>%
    group_by(unit) %>%
    mutate(min = ifelse(
      length(which(dropout == 1)) > 0, min(which(dropout == 1)), n_perunit)) %>%
    # ... then slice out rows up to that row 
    slice(1:min) %>%
    # as this also includes the first dropout rows, drop that one
    filter(dropout == 0)
})

Efficiency:

user  system elapsed
0.01    0.00    0.01 

However, slice() produces some annoying warnings - any idea why?
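
Presumably the problem is that min is the whole length-3 column within each group, so 1:min uses only its first element, which is exactly what R warns about ("numerical expression has 3 elements: only the first used"). A minimal sketch of the slicing step that sidesteps the warning, reusing the min column computed above:

dat %>%
  group_by(unit) %>%
  # first(min) is a single value per group, so the sequence is built
  # from a scalar and no "only the first element is used" warning appears
  slice(seq_len(first(min))) %>%
  filter(dropout == 0)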

CodePudding user response:

Perhaps I'm wrong, but in effect the attrition is iid after the first wave: each subsequent wave has the same probability of dropout, so if you made it to wave 3, that probability is not conditioned on anything (akin to the probability of flipping a third head when the first two were heads). If I am reading this correctly, you could assign the dropout simultaneously across all waves > 1 and then drop all observations after the first dropout. This would vectorize everything and be faster.

Code

set.seed(123)
attrition <- 2/3

# Assign "dropout" position
dat$dropout <- ifelse(dat$wave > 1,
                      sample(c("Yes", "No"), nrow(dat), replace = TRUE,
                             prob = c(attrition, 1 - attrition)),
                      "No")

# Drop all observations after first dropout recorded
dat %>% group_by(unit) %>% slice(seq_len(min(which(dropout == "Yes") - 1)))

Output:

# Groups:   unit [20]
#       x     y  unit  wave dropout
# <dbl> <dbl> <dbl> <dbl> <chr>  
# 1 -0.32 -0.42     1     1 No     
# 2 -0.26 -2.01     1     2 No     
# 3 -0.37  0.7      2     1 No     
# 4 -0.01  0.52     3     1 No     
# 5  0.05  2.04     3     2 No     
# 6 -0.48 -1.45     4     1 No     
# 7 -0.58 -3.12     5     1 No     
# 8 -0.18  0.22     5     2 No     
# 9 -0.28 -1.65     6     1 No     
# 10  1.05 -0.98     7     1 No     
# # … with 20 more rows

Since you didn't set a seed or provide a desired output data set, I can't compare, but I'm happy to test this if you provide one.

   user  system elapsed 
  0.008   0.001   0.009 

CodePudding user response:

Since the number of units remaining after each wave is deterministic, we can do the sampling all in one go.

library(dplyr)
set.seed(5)
n <- 20
n_perunit <- 3

# define attrition probability:
attrition <- 2/3

# Start with a vector of all units
remaining <- 1:n

# loop through waves beginning with 2

fOriginal <- function(df, remaining) {
  for (i in 2:n_perunit) {
    n_remaining <- round(length(remaining)*attrition)
    remaining <- sample(remaining, n_remaining)
    df <- df %>% 
      mutate(drop = ifelse(
        wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
      filter(drop == FALSE) %>%
      mutate(drop = NULL)
  }
  df
}

fNew <- function(df) {
  # units still present at each wave; the extra final slot stays 0 so that
  # diff() also yields the number of units whose last wave is the final one
  nleft <- numeric(n_perunit + 1)
  nleft[1] <- n
  for (i in 2:n_perunit) nleft[i] <- round(nleft[i - 1]*attrition)
  # -diff(nleft) counts the units whose last observed wave is 1, 2, ...;
  # sample() shuffles these "last wave" labels across units, and each row
  # survives only if its wave does not exceed its unit's label
  df[df$wave <= sample(rep.int(1:n_perunit, -diff(nleft)))[df$unit],]
}

dfOrig <- fOriginal(dat, remaining)
dfNew <- fNew(dat)
# the resulting data.frames are not identical due to different random sampling
# methods, but they both have the same number of rows and same wave counts
identical(tabulate(dfOrig$wave), tabulate(dfNew$wave))
#> [1] TRUE
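
As an additional sanity check (a hypothetical add-on, not part of the original benchmark), each unit in the attrited data should keep a contiguous block of waves starting at wave 1:

dfNew %>%
  group_by(unit) %>%
  # each surviving unit must hold waves 1, 2, ..., n() with no gaps
  summarise(contiguous = all(wave == seq_len(n()))) %>%
  summarise(all_contiguous = all(contiguous))

The same check applies to dfOrig; both should return TRUE.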

microbenchmark::microbenchmark(fOriginal = fOriginal(dat, remaining),
                               fNew = fNew(dat))
#> Unit: milliseconds
#>       expr     min       lq     mean   median      uq     max neval
#>  fOriginal 12.0433 13.24815 14.52889 14.02410 15.0525 23.5338   100
#>       fNew  1.2956  1.41915  1.73176  1.56935  1.7398  5.0738   100