I have the following dataset:
group<- c(rep(1, 200), rep(2, 200), rep(3, 200), rep(4, 200), rep(5, 200), rep(6, 200))
injection<- c(rep(1, 200), rep(0, 600), rep(1, 400))
art_light<- c(rep(1, 400), rep(0, 600), rep(1, 200))
seasonal_light<- c(rep(1, 600), rep(0, 200), rep(1, 200), rep(1, 200))
## generate data frame for species 1-3; where Species B is unaffected by all of these things and A and C are subtly different
## species is a random effect and so results are more generalizable
dat1 <- data.frame(group, injection,art_light,seasonal_light)
dat1$species = "A"
dat2 <- data.frame(group, injection,art_light,seasonal_light)
dat2$species = "B"
dat3 <- data.frame(group, injection,art_light,seasonal_light)
dat3$species = "C"
#################################
# Simulated Response Variables #
#################################
alpha = 1
beta1 = 10
beta2 = 2
beta3 = 20
beta4 = 10
e1= rnorm(1200, 5, sd=1)
e2 = rlnorm(1200)
e = rcauchy(1200)
e3 = floor(runif(1200, min = 0, max = 20))
e4 = rpois(1200, lambda = 4)
e5 = rlnorm(1200)
dat1$lh<-alpha beta1*injection beta2*art_light
beta3*seasonal_light beta4*injection*seasonal_light e1
dat1$hb<-alpha beta1*injection e2
dat2$lh<- e
dat2$hb<- alpha beta3*injection e3
dat3$lh<-alpha beta1*injection beta2*art_light
beta3*seasonal_light e4
dat3$hb<-alpha beta2*injection e5
dat <- do.call("rbind", list(dat1, dat2, dat3))
I want to randomly sample rows within each group
with no replacement. Sampling will be done iteratively such that in the first iteration of sampling, the number of rows sampled are the number of rows in group
(n) and in the next iteration the number of sampled rows would be n-1.
These results should be combined into a data frame. Each subsample will be distinguished by a new variable sample_num
that represents the number of rows sampled. Below is an example of the result data frame for the first 4 samples. The actual result should continue for all possible subsample sizes down to sampling of 10 rows.
samp1<-
dat %>%
group_by(group) %>%
sample_n(size = 600)
samp1$sample_num<-600
samp2<-
dat %>%
group_by(group) %>%
sample_n(size = 599)
samp2$sample_num<-599
samp3<-
dat %>%
group_by(group) %>%
sample_n(size = 598)
samp3$sample_num<-598
samp4<-
dat %>%
group_by(group) %>%
sample_n(size = 597)
samp4$sample_num<-597
samp_dat<- rbind(samp1,samp2,samp3,samp4)
CodePudding user response:
You can do:
library(dplyr)
library(purrr)
res <- map_df(set_names(600:10),
~ dat %>%
group_by(group) %>%
sample_n(size = .x),
.id = "sample_num")
Giving:
# A tibble: 1,081,530 x 8
# Groups: group [6]
sample_num group injection art_light seasonal_light species lh hb
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
1 600 1 1 1 1 B -403. 38
2 600 1 1 1 1 A 49.3 12.5
3 600 1 1 1 1 B 0.961 31
4 600 1 1 1 1 A 48.7 12.1
5 600 1 1 1 1 B -0.691 23
6 600 1 1 1 1 A 47.7 11.4
7 600 1 1 1 1 C 37 3.55
8 600 1 1 1 1 B -0.327 22
9 600 1 1 1 1 B -7.71 32
10 600 1 1 1 1 B 0.153 36
# ... with 1,081,520 more rows
CodePudding user response:
data.table is incredibly fast at this sort of thing
library(data.table)
rbindlist(lapply(600:10, \(x) setDT(dat)[,.SD[sample(1:.N,x)], by=.(group)][,sample:=x]))
Output:
group injection art_light seasonal_light species lh hb sample
1: 1 1 1 1 C 35.000000 3.196606 600
2: 1 1 1 1 A 46.424639 12.210558 600
3: 1 1 1 1 C 33.000000 5.303823 600
4: 1 1 1 1 A 47.316622 11.814838 600
5: 1 1 1 1 C 39.000000 3.769120 600
---
1081526: 6 1 1 1 A 47.249496 11.360076 10
1081527: 6 1 1 1 B -3.188948 29.000000 10
1081528: 6 1 1 1 A 47.263460 12.062339 10
1081529: 6 1 1 1 C 38.000000 3.307954 10
1081530: 6 1 1 1 B -2.760421 35.000000 10