Home > database >  Sample Multiple Columns Without Repeats R dplyr
Sample Multiple Columns Without Repeats R dplyr

Time:08-01

I am trying to solve a problem where I take a random sample based on probability with 5 observations per row where I observe a color out of a possible set of colors, exclude the observed color from the next observation and repeat. Colors can repeat in any given column, but not in the same row.

Here is how I have approached the problem:

library(tidyverse)

data <- tibble(obsId = 1:100)

colors <- tibble(color = c('red', 'blue', 'white', 'yellow', 'green', 'orange', 
                           'gray', 'brown', 'purple', 'black', 'pink', 'navy', 
                           'maroon'), 
                  prob = c(0.85, 0.85, 0.75, 0.75, 0.65, 0.5, 0.5, 0.5, 0.4, 
                           0.4, 0.25, 0.15, 0.15))

data <- data %>% 
      mutate(color1 = sample(x = colors$color, size = n(), 
                          prob  = colors$prob, replace = T),
             color2 = sample(x = colors$color, size = n(), 
                          prob  = colors$prob, replace = T),
             color3 = sample(x = colors$color, size = n(), 
                          prob  = colors$prob, replace = T),
             color4 = sample(x = colors$color, size = n(), 
                          prob  = colors$prob, replace = T),
             color5 = sample(x = colors$color, size = n(), 
                          prob  = colors$prob, replace = T)

The issue I have is that color 2 will be equal to color 1 (and so forth) in certain rows. Is there any easy way to resolve this?

CodePudding user response:

How about replicating (nrow(data) times) the sampling of 5 items:

setNames(
  cbind(
    data,
    t(replicate(nrow(data), sample(colors$color, 5, prob=colors$prob)))
  ), c("obsID", paste0("color",1:5))
)

And here is a data.table version:

library(data.table)

f <- function(c, p,n=5) setNames(as.list(sample(c,n,prob=p)), paste0("color",1:n))

setDT(data)[, f(colors$color, colors$prob), obsId]

Output:

    obsID color1 color2 color3 color4 color5
1       1 yellow   gray  white    red  green
2       2   blue  white    red  black orange
3       3  white   blue    red purple yellow
4       4 orange   blue maroon    red yellow
5       5   blue yellow   gray  green  brown
6       6  green  black   gray orange  white
7       7 yellow  white    red   blue  green
8       8 yellow  black  green   pink  white
9       9 orange purple    red  white   blue
10     10   pink yellow  white orange   blue
11     11  white    red   blue orange purple
12     12    red  green  brown yellow purple
13     13    red   blue  brown  green  white
14     14    red orange  green   blue   gray
15     15  white   blue    red   pink yellow
16     16    red   blue  white purple  green
17     17   blue orange  brown  white  green
18     18  black   gray  brown yellow purple
19     19   pink   blue  green orange   gray
20     20 purple   pink yellow  brown    red
21     21  green  black  white   blue   pink
22     22  black   blue   pink   gray maroon
23     23   blue    red  brown orange yellow
24     24   gray    red purple  brown orange
25     25 purple  brown  green orange yellow
26     26   blue  white orange  green    red
27     27    red   blue orange  white   gray
28     28  white  black yellow   navy    red
29     29 orange   blue purple  brown  green
30     30 orange   blue purple  green  white
31     31   blue    red purple yellow  white
32     32  white   navy orange purple  brown
33     33 orange   blue purple   gray yellow
34     34    red   pink   blue yellow  green
35     35  brown   blue yellow   gray  white
36     36   gray  green yellow purple    red
37     37  green orange yellow   navy  white
38     38  brown  green purple  black orange
39     39   gray  black  green   pink  white
40     40  white  green yellow   blue    red
41     41 orange  black   gray maroon  white
42     42   blue  white   pink  brown    red
43     43 yellow purple   gray   navy  green
44     44  white    red  green yellow   pink
45     45    red  green   pink yellow  black
46     46 orange  white    red   gray yellow
47     47  brown purple   pink  black    red
48     48   pink   blue  green  brown  white
49     49   blue  green  brown orange yellow
50     50    red  black  green orange   blue
51     51   gray    red  green  white yellow
52     52   pink  black yellow    red  brown
53     53  white  green   navy  black   blue
54     54    red orange yellow  white  green
55     55  brown   gray purple yellow    red
56     56  brown yellow  black purple orange
57     57   pink  black   gray  white orange
58     58   blue  brown  black purple  green
59     59    red  green   gray  white   blue
60     60 orange maroon yellow  green   gray
61     61   pink  white   blue orange   gray
62     62  brown    red  white   gray   blue
63     63  black    red   blue yellow   navy
64     64  green maroon    red  black   blue
65     65  brown yellow    red purple  green
66     66   gray  brown   blue  green yellow
67     67   blue yellow maroon purple orange
68     68    red   gray   blue  black  white
69     69   gray yellow  white  brown orange
70     70  brown    red  white yellow maroon
71     71  black    red  white orange yellow
72     72   navy   blue  green   gray  black
73     73   gray orange  brown   blue    red
74     74  black   pink    red yellow   blue
75     75   gray  black   blue  green yellow
76     76   blue yellow  green   pink    red
77     77 yellow   blue    red  green orange
78     78  brown  white   gray   navy orange
79     79  brown   blue  white  green   navy
80     80  white   gray yellow  brown  green
81     81   gray  brown  white yellow    red
82     82 orange yellow   blue  white  green
83     83   gray   blue  brown  white  green
84     84 orange   blue  green  brown   gray
85     85   pink orange   blue  brown    red
86     86    red orange  green yellow   blue
87     87  black  brown   navy  white yellow
88     88  white  black  green purple   blue
89     89 yellow  green    red   gray maroon
90     90 purple maroon  brown   gray yellow
91     91  white   pink   blue    red  black
92     92   blue  black    red   pink  white
93     93    red  white  brown  black  green
94     94    red   pink   gray   blue purple
95     95 orange  green  white   gray    red
96     96    red   gray  green yellow purple
97     97  white  brown purple yellow   blue
98     98 purple    red orange yellow  green
99     99  black  white   gray yellow    red
100   100 yellow  white   blue purple   gray

CodePudding user response:

Here is a base/dplyr solution.

library(tidyverse)

data <- tibble(obsId = 1:100)

colors <- tibble(color = c('red', 'blue', 'white', 'yellow', 'green', 'orange', 
                           'gray', 'brown', 'purple', 'black', 'pink', 'navy', 
                           'maroon'), 
                 prob = c(0.85, 0.85, 0.75, 0.75, 0.65, 0.5, 0.5, 0.5, 0.4, 
                          0.4, 0.25, 0.15, 0.15))

sample_fun <- function(X, size) {
  sample(X$color, size = size, prob = X$prob)
}

data <- data %>% 
  bind_cols(
    t(replicate(nrow(data), sample_fun(colors, 5)))
  ) 
#> New names:
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
names(data)[-1] <- paste0("color", 1:5)
head(data)
#> # A tibble: 6 × 6
#>   obsId color1 color2 color3 color4 color5
#>   <int> <chr>  <chr>  <chr>  <chr>  <chr> 
#> 1     1 blue   white  orange green  yellow
#> 2     2 yellow maroon brown  red    pink  
#> 3     3 brown  white  maroon orange gray  
#> 4     4 brown  gray   blue   purple pink  
#> 5     5 green  navy   black  gray   yellow
#> 6     6 red    yellow black  green  blue

Created on 2022-07-31 by the reprex package (v2.0.1)

CodePudding user response:

Maybe, building the dataframe by row may be easier, but I don't know if the distribution of the colors are good

# IS IT CORRECT WITH RESPECT TO THE DISTRIBUTION OF THE COLORS VECTOR
data <- data.frame(color1 = character(nrow(data)), color2 = character(nrow(data)), color3 = character(nrow(data)), color4 = character(nrow(data)), color5 = character(nrow(data)))
for(i in 1:nrow(data)){
  data[i, ] <- sample(x = colors$color, size = 5, prob = colors$prob, replace = F)
}
  • Related