Simulating by ID variable in dataset-CodePudding

I'm not sure why I'm struggling with this, but I'm trying to create a dataset where each subject ("id" in this case) has an individual IQ score. Each subject must also read 20 letters, each letter having its own fixed score attached to it ("value"). What I want is for each of the 300 people in this dataset to "read" every letter, with a constant IQ per person and a constant value per letter. For example, Subject 1 should have read letters A to T, with a single IQ score drawn from a normal distribution. So far this is what I have:

# Starting attempt: 300 subjects, each with one IQ score (mean 120, sd 15).
id <- seq_len(300)
iq <- rnorm(n = 300, mean = 120, sd = 15)
# The 20 letters "a" to "t" and their scores, each repeated 15 times so that
# both vectors have 300 entries and line up with `id` row by row.
letter <- rep(letters[1:20], times = 15)
value <- rep(
  c(2, 2, 1, 2, 2, 2, 2, 2, 3, 2,
    3, 1, 3, 2, 1, 2, 2, 2, 1, 2),
  times = 15
)
# One row per subject; the id column is then stored as character.
df <- data.frame(id, iq, letter, value)
df$id <- as.character(id)

This of course isn't what I want. If I look at the head of the data frame:

# Print the first rows of df to inspect its structure.
head(df)

You can see that each person has a unique IQ score, but only reads one letter, not all of them:

  id        iq letter value
1  1 126.35025      a     2
2  2 150.08165      b     2
3  3 105.88712      c     1
4  4 106.86652      d     2
5  5  97.86159      e     2
6  6 116.39497      f     2

What I want is something more like this:

# Hand-built target layout for a single subject reading four letters.
letter2 <- letters[1:4]
value2 <- c(2, 2, 1, 2)
# Subject id repeated once per letter; the single IQ value is recycled by
# data.frame() across all rows.
id2 <- rep(1, times = length(letter2))
iq2 <- 120
df2 <- data.frame(id2, iq2, letter2, value2)

Which gives this data frame for one person who "reads" 4 letters:

  id2 iq2 letter2 value2
1   1 120       a      2
2   1 120       b      2
3   1 120       c      1
4   1 120       d      2

How do I solve this problem?

CodePudding user response：

A solution using tidyr::crossing() and inner_join():

library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.1
#> Warning: package 'tibble' was built under R version 4.2.1

# Score for each of the 20 letters, in the same order as letters[1:20].
value <- c(2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 3, 2, 1, 2, 2, 2, 1, 2)

# One row per subject with that subject's IQ, joined to every
# (id, letter, value) combination. Each letter is paired with its score in a
# tibble before crossing, so the pairing does not depend on the row order of
# the joined result. `by = "id"` makes the join key explicit, which also
# means no "Joining, by" message is printed.
df_merged <- tibble(id = 1:300,
                    iq = rnorm(n = 300, mean = 120, sd = 15)) |>
  inner_join(crossing(id = 1:300,
                      tibble(letter = letters[1:20], value = value)),
             by = "id")

# Inspect a single, arbitrarily chosen subject (id 5): all of its rows
# should share one IQ value and cover every letter.
df_merged |> 
  filter(id == 5)
#> # A tibble: 20 × 4
#>       id    iq letter value
#>    <int> <dbl> <chr>  <dbl>
#>  1     5  116. a          2
#>  2     5  116. b          2
#>  3     5  116. c          1
#>  4     5  116. d          2
#>  5     5  116. e          2
#>  6     5  116. f          2
#>  7     5  116. g          2
#>  8     5  116. h          2
#>  9     5  116. i          3
#> 10     5  116. j          2
#> 11     5  116. k          3
#> 12     5  116. l          1
#> 13     5  116. m          3
#> 14     5  116. n          2
#> 15     5  116. o          1
#> 16     5  116. p          2
#> 17     5  116. q          2
#> 18     5  116. r          2
#> 19     5  116. s          1
#> 20     5  116. t          2

Created on 2022-07-29 by the reprex package (v2.0.1)

CodePudding user response：

If I understood correctly, you need to complete the combinations of id with each letter/value pair. One way to do this is with tidyr's complete(), for example:

library(dplyr)
library(tidyr)

# `df` is the data frame built in the question (one row per id).
df %>%
   # Add a row for every combination of id and each (letter, value) pair that
   # occurs in the data; nesting() keeps each letter tied to its own value.
   complete(id, nesting(letter, value)) %>%
   # The added rows have no iq yet, so copy each subject's known iq into that
   # subject's missing rows (filling in both directions within the group).
   # NOTE: the result is still grouped by id; add ungroup() if not wanted.
   group_by(id) %>%
   fill(iq, .direction = 'updown')

# A tibble: 6,000 x 4
# Groups:   id [300]
   id    letter value    iq
   <chr> <chr>  <dbl> <dbl>
 1 1     a          2  122.
 2 1     b          2  122.
 3 1     c          1  122.
 4 1     d          2  122.
 5 1     e          2  122.
 6 1     f          2  122.
 7 1     g          2  122.
 8 1     h          2  122.
 9 1     i          3  122.
10 1     j          2  122.
# ... with 5,990 more rows

CodePudding user response：

How about

library(tidyverse)

set.seed(123)

# One row per subject with a normally distributed IQ (mean 120, sd 15).
tibble(
  ID=1:300,
  IQ=rnorm(300, 120, 15)
) %>% 
# Repeat each (ID, IQ) pair 20 times; Idx is only a placeholder index.
expand(nesting(ID, IQ), Idx=1:20) %>% 
# Draw one letter per row at random, with replacement, from all 26 lowercase
# letters, so a subject may get a letter more than once or not at all.
mutate(Letter=sample(letters, nrow(.), replace=TRUE)) %>% 
# Drop the placeholder index.
select(-Idx)
# A tibble: 6,000 × 3
      ID    IQ Letter
   <int> <dbl> <chr> 
 1     1  112. p     
 2     1  112. i     
 3     1  112. k     
 4     1  112. k     
 5     1  112. s     
 6     1  112. j     
 7     1  112. w     
 8     1  112. b     
 9     1  112. y     
10     1  112. f     
# … with 5,990 more rows

The set.seed() call is for reproducibility. Initially, the tibble/data frame contains only the subject IDs and IQs. Then expand() generates 20 placeholder rows for each subject. I can't generate the letters directly inside expand(), because it would replicate one constant vector for every subject. mutate() then generates the letters themselves and, finally (and optionally), select() drops the placeholder column.

Edit

I missed OP's request to include a value column. Here's one way to do it.

# Same simulation as above, followed by a lookup join that attaches a value
# to every sampled letter.
tibble(
  ID=1:300,
  IQ=rnorm(300, 120, 15)
) %>% 
expand(nesting(ID, IQ), Idx=1:20) %>% 
mutate(Letter=sample(letters, nrow(.), replace=TRUE)) %>% 
select(-Idx) %>% 
left_join(
  # Lookup table: one value per lowercase letter. The first 20 entries are
  # the scores for "a" to "t" from the question; the last six (4 to 9) cover
  # "u" to "z" and are presumably arbitrary placeholders, since the question
  # defines no scores for those letters.
  tibble(
    Letter=letters,
    value= c(2,2,1,2,2,2,2,2,3,2,3,1,3,2,1,2,2,2,1,2,    
             4,5,6,7,8,9)
  ),
  by="Letter"
)
# A tibble: 6,000 × 4
      ID    IQ Letter value
   <int> <dbl> <chr>  <dbl>
 1     1  112. p          2
 2     1  112. i          3
 3     1  112. k          3
 4     1  112. k          3
 5     1  112. s          1
 6     1  112. j          2
 7     1  112. w          6
 8     1  112. b          2
 9     1  112. y          8
10     1  112. f          2
# … with 5,990 more rows

CodePudding user response：

A one-liner using tidyr::expand_grid(), reusing id and iq from the question:

# The 20 letters "a" to "t" (the first 20 entries of the built-in `letters`).
letter <- letters[1:20]
# Score attached to each letter, in the same order as `letter`.
value <- c(2, 2, 1, 2, 2, 2, 2, 2, 3, 2,
           3, 1, 3, 2, 1, 2, 2, 2, 1, 2)

# Pair every (id, iq) row with every letter; `id` and `iq` are the vectors
# defined in the question.
tidyr::expand_grid(data.frame(id,iq), letter)

# A tibble: 6,000 × 3
      id    iq letter
   <int> <dbl> <chr> 
 1     1  108. a     
 2     1  108. b     
 3     1  108. c     
 4     1  108. d     
 5     1  108. e     
 6     1  108. f     
 7     1  108. g     
 8     1  108. h     
 9     1  108. i     
10     1  108. j     
# … with 5,990 more rows

With value

# Cross the (id, iq) rows with the (letter, value) rows, so each letter
# carries its own value into every subject's block of rows.
tidyr::expand_grid(data.frame(id,iq), data.frame(letter,value))

# A tibble: 6,000 × 4
      id    iq letter value
   <int> <dbl> <chr>  <dbl>
 1     1  108. a          2
 2     1  108. b          2
 3     1  108. c          1
 4     1  108. d          2
 5     1  108. e          2
 6     1  108. f          2
 7     1  108. g          2
 8     1  108. h          2
 9     1  108. i          3
10     1  108. j          2