I know the wording is a bit complicated. However, the image below should make it clear. Basically wherever exp_stat = 1
we split those ids and allocate ids which had exp_stat = 0
(Grouped by subset
).
Here is my data:
df <- structure(list(id = c("24GMHA78TGM7D", "6E1T23X96Y35R", "9N6GXF4DZYQR9",
"0007HGS378G8W", "7DHCHR11GCZ6C", "BG81E8NJN8M3R", "0007HGS378G8Z",
"7DHCHR11GCZ6D", "BG81E8NJN8M3Q"), exp_stat = c(1L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L), subset = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L)), class = "data.frame", row.names = c(NA, -9L))
Here is the expected output:
structure(list(id = c("24GMHA78TGM7D", "24GMHA78TGM7D", "6E1T23X96Y35R",
"6E1T23X96Y35R", "0007HGS378G8Z", "0007HGS378G8Z"), id_1 = c("9N6GXF4DZYQR9",
"0007HGS378G8W", "7DHCHR11GCZ6C", "BG81E8NJN8M3R", "7DHCHR11GCZ6D",
"BG81E8NJN8M3Q"), subset = c(1L, 1L, 1L, 1L, 2L, 2L)), row.names = c(NA,
-6L), class = "data.frame")
I tried using pivot_wider
but to no avail. My attempt:
library(tidyverse)
library(data.table)
df %>%
group_by(subset) %>%
mutate(no1= rleid(subset,exp_stat)) %>%
mutate(no=row_number()) %>%
pivot_wider(names_from="exp_stat",
values_from="no1")
Entire data:
structure(list(id = c("24GMHA78TGM7D", "6E1T23X96Y35R", "9N6GXF4DZYQR9",
"0007HGS378G8W", "7DHCHR11GCZ6C", "BG81E8NJN8M3R", "03WFCSQP8BEJV",
"1C3EZWWH9AKVH", "FMX97PFVXHG9F", "2D788PS159YQA", "FA9E1X97QDBET",
"FGF0BD4F92CB6", "0011BE039X4TN", "50YEZB08NC0PX", "07RGHNR8YZTV0",
"612HPTSCD97DK", "3D91MHAVJQTBH", "3KJF3YYVC1X2Y", "1ATYE2M0BRV29",
"1FW25TMBNNMKZ", "F433HAXGJ0B1F", "DS5NPYY3W9HFA", "0GP2FRPGMBS0X",
"76AKE060S7P7A", "3TPVX8DDTX2Y2", "ES3DMV2BEWMSJ", "17EYCTGD2B5TM",
"49M2ATYWJKR0E", "ANASVHK91GZS2", "EJHGHVHD776QD", "F2RMZFK564M2N",
"BJF6JY5VA4MCH", "BZ1V1FFYCGD45", "3XAKGTF09T65M", "ADZC3Q7JX48V1",
"FWX6RN0RQB5CZ", "CZ72XDBVPZYRQ", "D96NC82778FDP", "5PYVE3R8XB8BV",
"6TAJC1HEXNDQP", "5VEQP8DHNSQB8", "A21B8H4TQ497T", "D1W9FTQ614ECH",
"0C5H4CRY9PTNV", "16Z61HABMQAGF", "5Y7Y2FHQKBVQT", "8KQQ2GH11RCM5",
"B52DEW8X6BNAN", "E5G6F2DAS9VKA", "2CHCK9D83QP3E", "1S9TXXSVGVJBV",
"AGSJ33ZCCPZEA", "7DHPJF99FP34W", "90P5BBEJZM45D", "249QNK0HM8DN2",
"54PGWA5EDSMZH", "1EJF0P0EVR88X", "1GD5KYBVF4ZMV", "DNBJC9ZTR8F5E",
"8BKV7YE188FBS", "FKRVE51TTRYDE", "603RR2KTXKVA6", "F2DXD2E64HEXE",
"09WF6M77F8CRB", "4GYRAF4HM97T0", "ARXJSKRX5VN8A"), exp_stat = c(1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), subset = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-66L))
CodePudding user response:
Here is one solution:
I first create a dataframe only containing the subset exp_stat = 1
.
Then I join to the subset exp_stat = 0
:
library(tidyverse)
df1 <- df %>% as_tibble() %>% filter(exp_stat == 1) %>%
group_by(subset) %>%
mutate(join_col = 1:n())
df %>% group_by(subset) %>%
mutate(n = sum(exp_stat == 1)) %>%
filter(exp_stat == 0) %>%
rename(id_1 = id) %>%
mutate(join_col = rep(1:first(n), each = n()/ first(n))[1:n()]) %>%
left_join(df1, by = c("subset", "join_col")) %>%
select(id, id_1, subset)
# A tibble: 6 x 3
# Groups: subset [2]
id id_1 subset
<chr> <chr> <int>
1 24GMHA78TGM7D 9N6GXF4DZYQR9 1
2 24GMHA78TGM7D 0007HGS378G8W 1
3 6E1T23X96Y35R 7DHCHR11GCZ6C 1
4 6E1T23X96Y35R BG81E8NJN8M3R 1
5 0007HGS378G8Z 7DHCHR11GCZ6D 2
6 0007HGS378G8Z BG81E8NJN8M3Q 2