I have dateframe with multiple measuring dates for each subjects in each row, and another dataframe with multiple visit dates for the same subject in each row (also including some NA's).
What I want is to extract the measuring dates that match the visit dates for a certain subject, and tag the measuring dates that do not comply a visit date (e.g, with a 'FALSE' or -99), and keep the NA's as is.
set.seed(1)
# Dataframe with measure dates
df1 <- rbind.data.frame(sort(sample(seq(as.Date("2018-01-01"), as.Date("2019-01-01"), by = "day"), 10)),
c(sort(sample(seq(as.Date("2018-06-01"), as.Date("2019-06-01"), by = "day"), 8)), NA, NA),
c(sort(sample(seq(as.Date("2019-06-01"), as.Date("2020-06-01"), by = "day"), 6)), rep(NA, 4)))
names(df1) <- paste("MEASUREDATE", 1:10, sep = "")
myfun <- function(x) as.Date(x, format = "%Y-%m-%d", origin = "1970-01-01")
df1 <- data.frame(lapply(df1, myfun))
df1
# Dataframe with visit dates
df2 <- rbind.data.frame(as.numeric(df1[1, 2:7]), as.numeric(c(df1[2, 4:6], NA, NA, NA)), as.numeric(c(df1[3, 1:2], rep(NA, 4))))
df2 <- data.frame(lapply(df2, myfun))
names(df2) <- paste("VISIT", 1:6, sep = "")
df2
So the fist row of the new dataframe would be like this:
# New dataframe
df3 <- df1[1, ]
df3[1] <- FALSE
df3[8:10] <- FALSE
df3
Do you know how to tackle this problem? Any help is very much appreciated.
CodePudding user response:
One possibility is to work with both dataframes in long format. Here, I pivot df1
long, then I left_join
to df2
(also after converting it to a long format). For dates that have a match, the name from df2
will be present (while others will be NA
), then we can use this information to convert the date data to NA
if there is no match. Then, I drop the column name.y
that had the visit number, and keep only unique values. Then, we can pivot back to the wider format.
library(tidyverse)
df1 %>%
mutate(row = row_number()) %>%
pivot_longer(-row) %>%
left_join(.,
df2 %>% mutate(row = row_number()) %>%
pivot_longer(-row),
by = c("row", "value")) %>%
mutate(value = case_when(is.na(name.y)
~ as.Date(NA),
TRUE ~ value)) %>%
select(-name.y) %>%
distinct() %>%
pivot_wider(names_from = "name.x", values_from = "value") %>%
select(-row)
Output
MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5 MEASUREDATE6 MEASUREDATE7 MEASUREDATE8 MEASUREDATE9 MEASUREDATE10
<date> <date> <date> <date> <date> <date> <date> <date> <date> <date>
1 NA 2018-05-09 2018-06-16 2018-07-06 2018-09-27 2018-10-04 2018-10-26 NA NA NA
2 NA NA NA 2018-11-12 2018-12-30 2019-01-03 NA NA NA NA
3 2019-08-28 2020-03-15 NA NA NA NA NA NA NA NA
Update
If you want to distinguish between FALSE
and NA
, then we will need to convert date
to character
first. Then, we can set some additional conditions in case_when
.
df1 %>%
mutate(row = row_number()) %>%
pivot_longer(-row) %>%
left_join(.,
df2 %>% mutate(row = row_number()) %>%
pivot_longer(-row),
by = c("row", "value")) %>%
mutate(across(everything(), ~as.character(.))) %>%
mutate(value = case_when(is.na(name.y) & !is.na(value) ~ "FALSE",
!is.na(name.y) & !is.na(value) ~ value,
TRUE ~ "NA")) %>%
select(-name.y) %>%
distinct() %>%
pivot_wider(names_from = "name.x", values_from = "value") %>%
select(-row)
Output
MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5 MEASUREDATE6 MEASUREDATE7 MEASUREDATE8 MEASUREDATE9 MEASUREDATE10
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 FALSE 2018-05-09 2018-06-16 2018-07-06 2018-09-27 2018-10-04 2018-10-26 FALSE FALSE FALSE
2 FALSE FALSE FALSE 2018-11-12 2018-12-30 2019-01-03 FALSE FALSE NA NA
3 2019-08-28 2020-03-15 FALSE FALSE FALSE FALSE NA NA NA NA
Data
df1 <- structure(
list(
MEASUREDATE1 = structure(c(17616, 17719, 18136), class = "Date"),
MEASUREDATE2 = structure(c(17660, 17761, 18336), class = "Date"),
MEASUREDATE3 = structure(c(17698, 17787, 18337), class = "Date"),
MEASUREDATE4 = structure(c(17718, 17847, 18373), class = "Date"),
MEASUREDATE5 = structure(c(17801, 17895, 18387), class = "Date"),
MEASUREDATE6 = structure(c(17808, 17899, 18409), class = "Date"),
MEASUREDATE7 = structure(c(17830, 17945, NA), class = "Date"),
MEASUREDATE8 = structure(c(17838, 18011, NA), class = "Date"),
MEASUREDATE9 = structure(c(17855, NA, NA), class = "Date"),
MEASUREDATE10 = structure(c(17861, NA, NA), class = "Date")
),
class = "data.frame",
row.names = c(NA,-3L)
)
df2 <-
structure(
list(
VISIT1 = structure(c(17660, 17847, 18136), class = "Date"),
VISIT2 = structure(c(17698, 17895, 18336), class = "Date"),
VISIT3 = structure(c(17718, 17899, NA), class = "Date"),
VISIT4 = structure(c(17801, NA, NA), class = "Date"),
VISIT5 = structure(c(17808, NA, NA), class = "Date"),
VISIT6 = structure(c(17830, NA, NA), class = "Date")
),
class = "data.frame",
row.names = c(NA,-3L)
)
CodePudding user response:
I think the cleanest way is to take the nice long route that @Andrew Gillreath-Brown's answer provides. However, if you desire, we can also just simply apply across the rows of the data frames (if nrow(df1) == nrow(df2)
).
dfl <- lapply(
1:nrow(df1),
\(i) {
measures <- as.Date(unlist(df1[i,]), origin = "1970-01-01")
visits <- as.Date(unlist(df2[i,]), origin = "1970-01-01")
measures[!(measures %in% visits)] <- NA
measures
}
)
dfl
#> [[1]]
#> MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5
#> NA "2018-05-09" "2018-06-16" "2018-07-06" "2018-09-27"
#> MEASUREDATE6 MEASUREDATE7 MEASUREDATE8 MEASUREDATE9 MEASUREDATE10
#> "2018-10-04" "2018-10-26" NA NA NA
#>
#> [[2]]
#> MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5
#> NA NA NA "2018-11-12" "2018-12-30"
#> MEASUREDATE6 MEASUREDATE7 MEASUREDATE8 MEASUREDATE9 MEASUREDATE10
#> "2019-01-03" NA NA NA NA
#>
#> [[3]]
#> MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5
#> "2019-08-28" "2020-03-15" NA NA NA
#> MEASUREDATE6 MEASUREDATE7 MEASUREDATE8 MEASUREDATE9 MEASUREDATE10
#> NA NA NA NA NA
Then for convenience can just bind together to get your df3
(or just use purrr::map_dfr
above).
dplyr::bind_rows(dfl)
#> # A tibble: 3 × 10
#> MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5 MEASUREDATE6
#> <date> <date> <date> <date> <date> <date>
#> 1 NA 2018-05-09 2018-06-16 2018-07-06 2018-09-27 2018-10-04
#> 2 NA NA NA 2018-11-12 2018-12-30 2019-01-03
#> 3 2019-08-28 2020-03-15 NA NA NA NA
#> # … with 4 more variables: MEASUREDATE7 <date>, MEASUREDATE8 <date>,
#> # MEASUREDATE9 <date>, MEASUREDATE10 <date>
Update
@Andrew Gillreath-Brown noted you want to keep FALSE
and NA
separate. If you want to keep the FALSE
and NA
values separate, then just convert the strings to characters first with this method.
dfl2 <- lapply(
1:nrow(df1),
\(i) {
measures <- as.character(as.Date(unlist(df1[i,]), origin = "1970-01-01"))
visits <- as.character(as.Date(unlist(df2[i,]), origin = "1970-01-01"))
measures[!(measures %in% visits)] <- "FALSE"
measures
}
)
dplyr::bind_rows(dfl2)
#> # A tibble: 3 × 10
#> MEASUREDATE1 MEASUREDATE2 MEASUREDATE3 MEASUREDATE4 MEASUREDATE5 MEASUREDATE6
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 FALSE 2018-05-09 2018-06-16 2018-07-06 2018-09-27 2018-10-04
#> 2 FALSE FALSE FALSE 2018-11-12 2018-12-30 2019-01-03
#> 3 2019-08-28 2020-03-15 FALSE FALSE FALSE FALSE
#> # … with 4 more variables: MEASUREDATE7 <chr>, MEASUREDATE8 <chr>,
#> # MEASUREDATE9 <chr>, MEASUREDATE10 <chr>