Home > other >  Sample rows of data for iteratively smaller samples
Sample rows of data for iteratively smaller samples

Time:03-09

I have the following dataset:

group<- c(rep(1, 200), rep(2, 200), rep(3, 200), rep(4, 200), rep(5, 200), rep(6, 200)) 
injection<- c(rep(1, 200), rep(0, 600), rep(1, 400)) 
art_light<- c(rep(1, 400), rep(0, 600), rep(1, 200))
seasonal_light<- c(rep(1, 600), rep(0, 200), rep(1, 200),  rep(1, 200)) 

## generate data frame for species 1-3; where Species B is unaffected by all of these things and A and C are subtly different
## species is a random effect and so results are more generalizable
dat1 <- data.frame(group, injection,art_light,seasonal_light)
dat1$species = "A"
dat2 <- data.frame(group, injection,art_light,seasonal_light)
dat2$species = "B"
dat3 <- data.frame(group, injection,art_light,seasonal_light)
dat3$species = "C"

#################################
# Simulated Response  Variables #
#################################

alpha = 1
beta1 = 10
beta2 = 2
beta3 = 20
beta4 = 10

e1= rnorm(1200, 5, sd=1)
e2 = rlnorm(1200)
e = rcauchy(1200) 
e3 = floor(runif(1200, min = 0, max = 20))
e4 = rpois(1200, lambda = 4)
e5 = rlnorm(1200)

dat1$lh<-alpha   beta1*injection   beta2*art_light   
beta3*seasonal_light   beta4*injection*seasonal_light   e1
dat1$hb<-alpha   beta1*injection   e2

dat2$lh<- e
dat2$hb<- alpha   beta3*injection   e3

dat3$lh<-alpha   beta1*injection   beta2*art_light   
beta3*seasonal_light  e4
dat3$hb<-alpha   beta2*injection   e5
dat <- do.call("rbind", list(dat1, dat2, dat3))

I want to randomly sample rows within each group with no replacement. Sampling will be done iteratively such that in the first iteration of sampling, the number of rows sampled are the number of rows in group (n) and in the next iteration the number of sampled rows would be n-1.

These results should be combined into a data frame. Each subsample will be distinguished by a new variable sample_num that represents the number of rows sampled. Below is an example of the result data frame for the first 4 samples. The actual result should continue for all possible subsample sizes down to sampling of 10 rows.

samp1<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 600)
samp1$sample_num<-600

samp2<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 599)
samp2$sample_num<-599

samp3<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 598)
samp3$sample_num<-598

samp4<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 597)
samp4$sample_num<-597

samp_dat<- rbind(samp1,samp2,samp3,samp4)

CodePudding user response:

You can do:

library(dplyr)
library(purrr)

res <- map_df(set_names(600:10),
              ~ dat %>%
                group_by(group) %>%
                sample_n(size = .x),
              .id = "sample_num")

Giving:

# A tibble: 1,081,530 x 8
# Groups:   group [6]
   sample_num group injection art_light seasonal_light species       lh    hb
   <chr>      <dbl>     <dbl>     <dbl>          <dbl> <chr>      <dbl> <dbl>
 1 600            1         1         1              1 B       -403.    38   
 2 600            1         1         1              1 A         49.3   12.5 
 3 600            1         1         1              1 B          0.961 31   
 4 600            1         1         1              1 A         48.7   12.1 
 5 600            1         1         1              1 B         -0.691 23   
 6 600            1         1         1              1 A         47.7   11.4 
 7 600            1         1         1              1 C         37      3.55
 8 600            1         1         1              1 B         -0.327 22   
 9 600            1         1         1              1 B         -7.71  32   
10 600            1         1         1              1 B          0.153 36   
# ... with 1,081,520 more rows

CodePudding user response:

data.table is incredibly fast at this sort of thing

library(data.table)

rbindlist(lapply(600:10, \(x) setDT(dat)[,.SD[sample(1:.N,x)], by=.(group)][,sample:=x]))

Output:

         group injection art_light seasonal_light species        lh        hb sample
      1:     1         1         1              1       C 35.000000  3.196606    600
      2:     1         1         1              1       A 46.424639 12.210558    600
      3:     1         1         1              1       C 33.000000  5.303823    600
      4:     1         1         1              1       A 47.316622 11.814838    600
      5:     1         1         1              1       C 39.000000  3.769120    600
     ---                                                                            
1081526:     6         1         1              1       A 47.249496 11.360076     10
1081527:     6         1         1              1       B -3.188948 29.000000     10
1081528:     6         1         1              1       A 47.263460 12.062339     10
1081529:     6         1         1              1       C 38.000000  3.307954     10
1081530:     6         1         1              1       B -2.760421 35.000000     10
  • Related