With known proportion/percentage and sample size, populate original data in R-CodePudding

I have a dataset where I have a known sample_size and percent "yes" for each year/location/ID combination. I am trying to use sample_size and percent to back-calculate the full dataset, which would be a binary Y_N column (where 1 is yes and 0 is no), corresponding to the percent in the percent column.

Here is an example of the dataset:

table <- "year location ID sample_size percent
1  2000     A   1a    10    40
2  2001     A   1a    10    30
3  2000     B   2a    10    70
4  2001     B   2a    10    90
5  2005     C   1a    20    10
6  2006     C   1a    20    50"

#Create a dataframe with the above table
df <- read.table(text=table, header = TRUE)
df

And here is a subset of what that would look like expanded for the first two year/location/ID combinations:

table <- "year location ID Y_N
1   2000     A   1a   1
2   2000     A   1a   1
3   2000     A   1a   1
4   2000     A   1a   1
5   2000     A   1a   0
6   2000     A   1a   0
7   2000     A   1a   0
8   2000     A   1a   0
9   2000     A   1a   0
10  2000     A   1a   0
11  2001     A   1a   1
12  2001     A   1a   1
13  2001     A   1a   1
14  2001     A   1a   0
15  2001     A   1a   0
16  2001     A   1a   0
17  2001     A   1a   0
18  2001     A   1a   0
19  2001     A   1a   0
20  2001     A   1a   0"

Is there a way to do this, say with dplyr mutate(), so that the expanded dataset can be grouped by each unique year/location/ID combination?

CodePudding user response：

You could first compute the number of yes and no, then reshape to long and afterwards use tidyr::uncount like so:

library(dplyr)
library(tidyr)

df |>
  mutate(n_1 = sample_size * percent / 100, n_0 = sample_size - n_1) |>
  select(-sample_size, -percent) |>
  pivot_longer(c(n_1, n_0), names_to = "Y_N", values_to = "count", names_prefix = "n_") |>
  tidyr::uncount(count)
#> # A tibble: 80 × 4
#>     year location ID    Y_N  
#>    <int> <chr>    <chr> <chr>
#>  1  2000 A        1a    1    
#>  2  2000 A        1a    1    
#>  3  2000 A        1a    1    
#>  4  2000 A        1a    1    
#>  5  2000 A        1a    0    
#>  6  2000 A        1a    0    
#>  7  2000 A        1a    0    
#>  8  2000 A        1a    0    
#>  9  2000 A        1a    0    
#> 10  2000 A        1a    0    
#> # … with 70 more rows

CodePudding user response：

using a bazooka you could also do it with base loops

table <- data.frame(1,2,3,4,5)

table[1,] <- c("2000", "A", "1a", 10, 40)
table[2,] <- c("2001", "A", "1a", 10, 50)

table2 <- table[1,-c(5)]
table3 <- table2[1,]

for (i in 1:nrow(table)){

  nb_row <- as.numeric(table[i,4])
  percentage <- as.numeric(table[i,5])

  table2[1:nb_row,] <- 1

  table2$X1 <- table[i,1]
  table2$X2 <- table[i,2]
  table2$X3 <- table[i,3]

  table2[c(1:percentage/10),4] <- 1
  table2[c(((percentage/10) 1):nb_row),4] <- 0

  table3 <- rbind(table3, table2)

}

table3 <- table3[-c(1),]    


> table3
     X1 X2 X3 X4
2  2000  A 1a  1
3  2000  A 1a  1
4  2000  A 1a  1
5  2000  A 1a  1
6  2000  A 1a  0
7  2000  A 1a  0
8  2000  A 1a  0
9  2000  A 1a  0
10 2000  A 1a  0
11 2000  A 1a  0
12 2001  A 1a  1
13 2001  A 1a  1
14 2001  A 1a  1
15 2001  A 1a  1
16 2001  A 1a  1
17 2001  A 1a  0
18 2001  A 1a  0
19 2001  A 1a  0
20 2001  A 1a  0
21 2001  A 1a  0

CodePudding user response：

You can use rbinom rowwise:

library(dplyr)
library(purrr)
df %>% 
  mutate(Y_N = pmap(select(., c(sample_size, percent)), ~ rbinom(..1, 1, prob = ..2 / 100))) %>% 
  unnest(Y_N) %>% 
  select(-c(sample_size, percent))

# A tibble: 80 × 4
    year location ID      Y_N
   <int> <chr>    <chr> <int>
 1  2000 A        1a        0
 2  2000 A        1a        0
 3  2000 A        1a        0
 4  2000 A        1a        1
 5  2000 A        1a        0
 6  2000 A        1a        1
 7  2000 A        1a        0
 8  2000 A        1a        0
 9  2000 A        1a        1
10  2000 A        1a        0
# … with 70 more rows
# ℹ Use `print(n = ...)` to see more rows

Or tabulate if you want exact count:

df %>% 
  mutate(Y_N = pmap(select(., c(sample_size, percent)), ~ tabulate(seq(..2 / 10), nbins = ..1))) %>% 
  unnest("Y_N") %>% 
  select(-c(sample_size, percent))