Randomly sampling iteratively from a data frame

I have the following data frame, which simulates a panel data set (i.e., multiple waves per unit).

dat <- structure(list(x = c(-0.32, -0.26, 0.05, -0.37, -0.37, -0.08, 
-0.01, 0.05, 0.19, -0.48, 0.37, 0.05, -0.58, -0.18, -0.04, -0.28, 
-0.44, -0.48, 1.05, 0.62, 0.85, 0.42, 0.7, 0.64, -0.19, -0.11, 
-0.65, -0.01, 0.39, -0.02, -0.23, -0.6, -0.1, 0.39, 0.33, 0.39, 
-0.09, -0.16, 0.26, -0.62, -0.44, -0.6, -0.17, -0.27, -0.12, 
-0.53, -0.38, -0.33, -0.17, -0.11, -0.25, -0.92, -0.6, -0.81, 
0.75, 0.52, 0.57, 1.32, 1.21, 1.21), y = c(-0.42, -2.01, -1.19, 
0.7, 1.28, 1.37, 0.52, 2.04, 2.34, -1.45, 2.84, 0.1, -3.12, 0.22, 
-0.06, -1.65, -0.9, -1.5, -0.98, -0.69, 0.15, 1.7, 1.47, 0.15, 
0.26, 0.84, 0.35, 0.86, -1.23, -0.74, -1.79, -0.56, -2.15, 2.11, 
2.34, 0.57, 0.38, 0.57, 0.97, 0.32, -1.71, -0.8, 1.45, -0.12, 
1.93, 2.76, 0.08, -2.8, -0.06, 1.09, -0.4, 0.41, 0.02, -1.61, 
1.75, 1.6, -0.19, 0.13, -0.89, -1.1), unit = c(1, 1, 1, 2, 2, 
2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 
9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 
20, 20), wave = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(unit = c(1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20), .rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 
    19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42, 43:45, 
    46:48, 49:51, 52:54, 55:57, 58:60), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))

I now want to simulate attrition in this data set: some units drop out with a certain probability in wave 2; some of those who remain drop out in wave 3; and so on up to wave n. The probability is the same at each step. Note that the approach should be flexible with respect to the number of waves.
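
For illustration, with 20 units and a retention of 2/3 per wave, the panel should shrink to roughly 13 and then 9 units (these numbers just restate the setup below):

round(20 * (2/3)^(0:2))
#> [1] 20 13  9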

Here's what I came up with. It works, but it feels slow to me, and because the number of waves can vary, I am not sure how to avoid the loop.

library(dplyr)

# number of units and number of observations per unit:
n <- 20
n_perunit <- 3

# define attrition probability (as used below, this is actually the
# share of units that *remains* at each wave):
attrition <- 2/3

# Start with a vector of all units
remaining <- 1:n

# loop through waves beginning with 2
system.time(for (i in 2:n_perunit) {
  n_remaining <- round(length(remaining)*attrition)
  remaining <- sample(remaining, n_remaining)
  dat <- dat %>% 
    mutate(drop = ifelse(
      wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
    filter(drop == FALSE) %>%
    mutate(drop = NULL)
})

Efficiency:

   user  system elapsed 
  0.016   0.000   0.016 

Any ideas how to improve this?

EDIT:

Based on @jpsmith's answer (which, as far as I can see, does not work for groups in which nobody drops out, because min(which(dropout == "Yes")) returns Inf for those), I came up with the following:

set.seed(1234)
system.time(if (!is.null(attrition)) {
  # assign a 1 or 0 indicating dropout
  dat <- dat %>%
    mutate(dropout = ifelse(
      wave > 1, sample(
        0:1, n(), prob = c(attrition, 1-attrition), replace = TRUE), 0))
  # first get the first (minimum) dropout in each unit...
  dat <- dat %>%
    group_by(unit) %>%
    mutate(min = ifelse(
      length(which(dropout == 1)) > 0, min(which(dropout == 1)), n_perunit)) %>%
    # ... then slice out rows up to that row 
    slice(1:min) %>%
    # as this also includes the first dropout rows, drop that one
    filter(dropout == 0)
})

Efficiency:

user  system elapsed
0.01    0.00    0.01 

However, slice() produces some annoying warnings - any idea why?
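
Presumably the problem is that min is the whole length-3 column within each group, so 1:min uses only its first element, which is exactly what R warns about ("numerical expression has 3 elements: only the first used"). A minimal sketch of the slicing step that sidesteps the warning, reusing the min column computed above:

dat %>%
  group_by(unit) %>%
  # first(min) is a single value per group, so the sequence is built
  # from a scalar and no "only the first element is used" warning appears
  slice(seq_len(first(min))) %>%
  filter(dropout == 0)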

CodePudding user response:

Perhaps I'm wrong, but in effect the attrition is iid after the first wave: each subsequent wave has the same probability of dropout, so if you made it to wave 3, that probability is not conditioned on anything (akin to the probability of flipping a third head when the first two were heads). If I am reading this correctly, you could assign the dropout simultaneously across all waves > 1 and then drop all observations after the first dropout. This would vectorize everything and be faster.

Code

set.seed(123)
attrition <- 2/3

# Assign "dropout" position
dat$dropout <- ifelse(dat$wave > 1,
                      sample(c("Yes", "No"), nrow(dat), replace = TRUE,
                             prob = c(attrition, 1 - attrition)),
                      "No")

# Drop all observations after first dropout recorded
dat %>% group_by(unit) %>% slice(seq_len(min(which(dropout == "Yes") - 1)))

Output:

# Groups:   unit [20]
#       x     y  unit  wave dropout
# <dbl> <dbl> <dbl> <dbl> <chr>  
# 1 -0.32 -0.42     1     1 No     
# 2 -0.26 -2.01     1     2 No     
# 3 -0.37  0.7      2     1 No     
# 4 -0.01  0.52     3     1 No     
# 5  0.05  2.04     3     2 No     
# 6 -0.48 -1.45     4     1 No     
# 7 -0.58 -3.12     5     1 No     
# 8 -0.18  0.22     5     2 No     
# 9 -0.28 -1.65     6     1 No     
# 10  1.05 -0.98     7     1 No     
# # … with 20 more rows

Since you didn't set a seed or provide a desired output data set, I can't compare, but I'm happy to test this if you provide one.

   user  system elapsed 
  0.008   0.001   0.009 

CodePudding user response:

Since the number of units remaining after each wave is deterministic, we can do the sampling all in one go.

library(dplyr)
set.seed(5)
n <- 20
n_perunit <- 3

# define attrition probability:
attrition <- 2/3

# Start with a vector of all units
remaining <- 1:n

# loop through waves beginning with 2

fOriginal <- function(df, remaining) {
  for (i in 2:n_perunit) {
    n_remaining <- round(length(remaining)*attrition)
    remaining <- sample(remaining, n_remaining)
    df <- df %>% 
      mutate(drop = ifelse(
        wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
      filter(drop == FALSE) %>%
      mutate(drop = NULL)
  }
  df
}

fNew <- function(df) {
  # units still present at each wave; the extra final slot stays 0 so that
  # diff() also yields the number of units whose last wave is the final one
  nleft <- numeric(n_perunit + 1)
  nleft[1] <- n
  for (i in 2:n_perunit) nleft[i] <- round(nleft[i - 1]*attrition)
  # -diff(nleft) counts the units whose last observed wave is 1, 2, ...;
  # sample() shuffles these "last wave" labels across units, and each row
  # survives only if its wave does not exceed its unit's label
  df[df$wave <= sample(rep.int(1:n_perunit, -diff(nleft)))[df$unit],]
}

dfOrig <- fOriginal(dat, remaining)
dfNew <- fNew(dat)
# the resulting data.frames are not identical due to different random sampling
# methods, but they both have the same number of rows and same wave counts
identical(tabulate(dfOrig$wave), tabulate(dfNew$wave))
#> [1] TRUE
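
As an additional sanity check (a hypothetical add-on, not part of the original benchmark), each unit in the attrited data should keep a contiguous block of waves starting at wave 1:

dfNew %>%
  group_by(unit) %>%
  # each surviving unit must hold waves 1, 2, ..., n() with no gaps
  summarise(contiguous = all(wave == seq_len(n()))) %>%
  summarise(all_contiguous = all(contiguous))

The same check applies to dfOrig; both should return TRUE.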

microbenchmark::microbenchmark(fOriginal = fOriginal(dat, remaining),
                               fNew = fNew(dat))
#> Unit: milliseconds
#>       expr     min       lq     mean   median      uq     max neval
#>  fOriginal 12.0433 13.24815 14.52889 14.02410 15.0525 23.5338   100
#>       fNew  1.2956  1.41915  1.73176  1.56935  1.7398  5.0738   100