Home > OS >  How to sample across a dataset with two factors in it?
How to sample across a dataset with two factors in it?

Time:06-13

I have a data frame of 100 rows containing two species, A and B, and two variables, a and b, recorded for each row.

I want to create a sampler such that each set consists of 6 randomly picked rows (reps) from the df dataset. However, the samples for A must only come from rows of df associated with sp A, and similarly for B. I want to do this 500 times for each of species A and B.

I attempted a for loop and when I ran sampling it shows a single row with 6 columns. I would appreciate any guidance

# Simulate two numeric variables with 100 values each (mean 2, sd 1).
a <- rnorm(100, 2,1)
b <- rnorm(100, 2,1)
# Species label: the first 50 rows are "A", the last 50 are "B".
sp <- rep(c("A","B"), each = 50)  

df <- data.frame(a,b,sp)

# Attempted sampler -- this is the code being asked about and it does not
# produce the expected data frame:
#  - a `for` loop evaluates to NULL, so df.sample ends up NULL;
#  - `sampling` is overwritten on every pass, so only the last draw survives;
#  - df[i, ] is a single row, so sample() picks 6 of its *columns* (with
#    replacement) instead of 6 rows, hence the one-row, six-column output;
#  - nothing restricts the draw to rows of one species.
df.sample <- for(i in 1:1000){
             sampling <- sample(df[i,],6,replace = TRUE)
}

#Output in a single row
a     a.1 sp        b sp.1     a.2
1000 1.68951 1.68951  B 1.395995    B 1.68951

#Expected dataframe
df.sample
set rep a b sp
  1  1  1 9  A
  1  2  3 2  A
  1  3  0 2  A
  1  4  1 2  A
  1  5  1 6  A
  1  6  4 2  A
  2  1  1 2  B
  2  2  5 2  B
  2  3  1 2  B
  2  4  1 6  B
  2  5  1 8  B
  2  6  9 2  B
  ....

CodePudding user response:

You could first split df by species. Random rows within each species can then be drawn with x[sample(nrow(x), 6), ]. Wrapping that expression in replicate() repeats the sampling as many times as you like (3 times in the example below; use 500 for the full run). Here dplyr::bind_rows() is used to combine the samples and to add a new column, set, holding the index of each sampling round.

# For every species, draw 3 sets of 6 rows (without replacement within a set)
# and stack them, tagging each row with the index of the set it came from.
lapply(split(df, df$sp), function(sp_rows) {
  draws <- lapply(seq_len(3), function(k) {
    picked <- sample(nrow(sp_rows), 6)
    sp_rows[picked, ]
  })
  dplyr::bind_rows(draws, .id = "set")
})
Output
$A
   set           a           b sp
1    1  1.52480034  3.41257975  A
2    1  1.82542370  2.08511584  A
3    1  1.80019901  1.39279162  A
4    1  2.20765154  2.11879412  A
5    1  1.61295185  2.04035172  A
6    1  1.92936567  2.90362816  A
7    2  0.88903679  2.46948106  A
8    2  3.19223788  2.81329767  A
9    2  1.28629416  2.69275525  A
10   2  2.61044815  0.82495427  A
11   2  2.30928735  1.67421328  A
12   2 -0.09789704  2.62434719  A
13   3  2.10386603  1.78157862  A
14   3  2.17542841  0.84016203  A
15   3  3.22202227  3.49863423  A
16   3  1.07929909 -0.02032945  A
17   3  2.95271838  2.34460193  A
18   3  1.90414536  1.54089645  A

$B
   set         a          b sp
1    1 3.5130317 -0.4704879  B
2    1 3.0053072  1.6021795  B
3    1 4.1167657  1.1123342  B
4    1 1.5460589  3.2915979  B
5    1 0.8742753  0.9132530  B
6    1 2.0882660  1.5588471  B
7    2 1.2444645  1.8199525  B
8    2 2.7960117  2.6657735  B
9    2 2.5970774  0.9984187  B
10   2 1.1977317  3.7360884  B
11   2 2.2830643  1.0452440  B
12   2 3.1047150  1.5609482  B
13   3 2.9309124  1.5679255  B
14   3 0.8631965  1.3501631  B
15   3 1.5460589  3.2915979  B
16   3 2.7960117  2.6657735  B
17   3 3.1047150  1.5609482  B
18   3 2.8735390  0.6329279  B

CodePudding user response:

Here's how I would do it (using tidyverse):

data:

# Example data: 100 rows, the first 50 of species A and the last 50 of B.
a <- rnorm(100, 2, 1)
b <- rnorm(100, 2, 1)
sp <- rep(c("A", "B"), each = 50)
df <- data.frame(a, b, sp)

library(tidyverse)

n_sets <- 500  # number of sampling rounds
n_reps <- 6    # rows drawn per species in each round

# Split once up front; the per-species subsets do not change between rounds.
df_a <- df %>% filter(sp == "A")
df_b <- df %>% filter(sp == "B")

# Draw one set: n_reps rows of A followed by n_reps rows of B (each sampled
# with replacement), tagged with the set number. `set` is stored as a double
# so the result has the column types a, b <dbl>, sp <chr>, set <dbl>.
draw_set <- function(i) {
  samp_a <- df_a %>% sample_n(n_reps, replace = TRUE)
  samp_b <- df_b %>% sample_n(n_reps, replace = TRUE)
  bind_rows(samp_a, samp_b) %>% mutate(set = as.numeric(i))
}

# NOTE: the seed is set after the data were simulated, so only the sampling
# step is reproducible, not the values of a and b.
set.seed(42)

# Collect every set in a list and bind once at the end, instead of growing a
# tibble row block by row block inside a loop.
output <- lapply(seq_len(n_sets), draw_set) %>%
  bind_rows() %>%
  as_tibble()

# A `for` loop evaluates to NULL, so assigning the loop itself to df.sample
# left it empty; point it at the assembled result instead.
df.sample <- output

Result

> head(output, 20)
# A tibble: 20 × 4
       a     b sp      set
   <dbl> <dbl> <chr> <dbl>
 1 2.59  3.31  A         1
 2 1.84  1.66  A         1
 3 2.35  1.17  A         1
 4 2.33  1.95  A         1
 5 0.418 1.11  A         1
 6 1.19  2.54  A         1
 7 2.35  0.899 B         1
 8 1.19  1.63  B         1
 9 0.901 0.986 B         1
10 3.12  1.75  B         1
11 2.28  2.61  B         1
12 1.37  3.47  B         1
13 2.33  1.95  A         2
14 1.84  1.66  A         2
15 3.76  1.26  A         2
16 2.96  3.10  A         2
17 1.03  1.81  A         2
18 1.42  2.00  A         2
19 0.901 0.986 B         2
20 2.37  1.39  B         2

CodePudding user response:

If I have understood correctly what you want, it can be done with the following code:

# Create the initial data frame: 100 rows, the first 50 of species A and the
# last 50 of species B.
a <- rnorm(100, 2, 1)
b <- rnorm(100, 2, 1)
sp <- rep(c("A", "B"), each = 50)

df <- data.frame(a, b, sp)

# Row indices belonging to each species
row.A <- which(df$sp == "A")
row.B <- which(df$sp == "B")

n_sets <- 500  # number of sets to draw
n_reps <- 6    # rows drawn per species within one set

# "rep" column for one set: 1..n_reps for A followed by 1..n_reps for B
rep1 <- rep(seq_len(n_reps), 2)

# Draw n indices from idx with replacement. Indexing through sample.int()
# gives the same draws as sample(idx, n, replace = TRUE) but avoids sample()'s
# special case in which a length-one numeric x is treated as 1:x.
draw <- function(idx, n) {
  idx[sample.int(length(idx), n, replace = TRUE)]
}

# Build one data frame per set (A rows first, then B rows), with the "set" and
# "rep" columns named at creation time. The pieces are bound once at the end
# rather than growing `sampling` with rbind() on every iteration.
sets <- lapply(seq_len(n_sets), function(i) {
  s.A <- draw(row.A, n_reps)
  s.B <- draw(row.B, n_reps)
  data.frame(set = i, rep = rep1, df[c(s.A, s.B), ], row.names = NULL)
})
sampling <- do.call(rbind, sets)

# Drop the row names so rows are simply numbered 1..nrow(sampling)
row.names(sampling) <- NULL

And the output looks like this:

  set rep        a        b sp
    1   1 3.713663 2.717456  A
    1   2 2.456070 2.803443  A
    1   3 2.166655 1.395556  A
    1   4 1.453738 5.662969  A
    1   5 2.692518 2.971156  A
    1   6 2.699634 3.016791  A
  • Related