I need to impute the NA with the date associated with conversion 1, grouped by each user (USERID). For the date without a conversion (1) linked with it, impute NULL to it.
Here is the table to show the sample dataset:
USERID | date | conversion | purchaseTime |
---|---|---|---|
1 | 2022-02-05 | 0 | NA |
1 | 2022-02-05 | 0 | NA |
1 | 2022-02-05 | 0 | NA |
1 | 2022-02-05 | 0 | NA |
1 | 2022-02-05 | 1 | 2022-02-05 |
1 | 2022-02-06 | 0 | NA |
1 | 2022-02-06 | 0 | NA |
1 | 2022-02-06 | 0 | NA |
2 | 2022-03-11 | 0 | NA |
2 | 2022-03-11 | 0 | NA |
2 | 2022-03-11 | 1 | 2022-03-11 |
2 | 2022-03-24 | 0 | NA |
2 | 2022-03-24 | 0 | NA |
2 | 2022-03-24 | 1 | 2022-03-24 |
Sample code:
df <- data.frame(list(USERID = (rep(1, 8) )))
df <- df %>% add_row(USERID = rep(2, 6))
randomDate <- c("2022-2-5", "2022-2-5", "2022-2-5", "2022-2-5", "2022-2-5",
"2022-2-6", "2022-2-6", "2022-2-6", "2022-3-11", "2022-3-11", "2022-3-11",
"2022-3-24", "2022-3-24", "2022-3-24")
df <- df %>% group_by(USERID) %>%
mutate( purchaseTime = ifelse( conversion == "1", as.Date(date) , NA) )
df$purchaseTime <- as.Date(df$purchaseTime ,origin="1970-01-01")
df <- df %>% mutate(date = as.Date(randomDate))
df <- df %>% mutate(conversion = rep(0, 14))
df <- df %>% mutate(PurchaseID = rep(NA, 14))
df[5,3] <- 1
df[14,3] <- 1
df[11,3] <- 1
df[5,4] <- 4
df[14,4] <- 8
TARGET RESULT:
USERID | date | conversion | purchaseTime |
---|---|---|---|
1 | 2022-02-05 | 0 | 2022-02-05 |
1 | 2022-02-05 | 0 | 2022-02-05 |
1 | 2022-02-05 | 0 | 2022-02-05 |
1 | 2022-02-05 | 0 | 2022-02-05 |
1 | 2022-02-05 | 1 | 2022-02-05 |
1 | 2022-02-06 | 0 | NULL |
1 | 2022-02-06 | 0 | NULL |
1 | 2022-02-06 | 0 | NULL |
2 | 2022-03-11 | 0 | 2022-03-11 |
2 | 2022-03-11 | 0 | 2022-03-11 |
2 | 2022-03-11 | 1 | 2022-03-11 |
2 | 2022-03-24 | 0 | 2022-03-24 |
2 | 2022-03-24 | 0 | 2022-03-24 |
2 | 2022-03-24 | 1 | 2022-03-24 |
Thanks so much!
CodePudding user response:
You could use tidyr's fill
:
library(tidyverse)
df <- tribble(
~USERID, ~date, ~conversion, ~purchaseTime,
1, "2022-02-05", 0, NA,
1, "2022-02-05", 0, NA,
1, "2022-02-05", 0, NA,
1, "2022-02-05", 0, NA,
1, "2022-02-05", 1, "2022-02-05",
1, "2022-02-06", 0, NA,
1, "2022-02-06", 0, NA,
1, "2022-02-06", 0, NA,
2, "2022-03-11", 0, NA,
2, "2022-03-11", 0, NA,
2, "2022-03-11", 1, "2022-03-11",
2, "2022-03-24", 0, NA,
2, "2022-03-24", 0, NA,
2, "2022-03-24", 1, "2022-03-24"
)
df2 <- df |>
group_by(USERID, date) |>
fill(purchaseTime, .direction = "updown") |>
ungroup() # if needed
df2
#> # A tibble: 14 × 4
#> USERID date conversion purchaseTime
#> <dbl> <chr> <dbl> <chr>
#> 1 1 2022-02-05 0 2022-02-05
#> 2 1 2022-02-05 0 2022-02-05
#> 3 1 2022-02-05 0 2022-02-05
#> 4 1 2022-02-05 0 2022-02-05
#> 5 1 2022-02-05 1 2022-02-05
#> 6 1 2022-02-06 0 <NA>
#> 7 1 2022-02-06 0 <NA>
#> 8 1 2022-02-06 0 <NA>
#> 9 2 2022-03-11 0 2022-03-11
#> 10 2 2022-03-11 0 2022-03-11
#> 11 2 2022-03-11 1 2022-03-11
#> 12 2 2022-03-24 0 2022-03-24
#> 13 2 2022-03-24 0 2022-03-24
#> 14 2 2022-03-24 1 2022-03-24
Created on 2022-07-03 by the reprex package (v2.0.1)