I have a data in following format:
ID time0 obs_num recorded_dt 1 2009-01-01 A 2009-01-01 1 2009-01-01 D 2009-01-31 1 2009-01-01 B 2009-01-05 2 2005-02-02 B 2005-02-03
I want to calculate columns based on days difference for up to a month in reference to time0 and add respective values from the obs_num column. In the end, the data should look like this:
ID time0 obs_num recorded_dt day0 day1 day2 day3 day4 day5 ... day31 1 2009-01-01 A 2009-01-01 A NULL NULL NULL NULL NULL ... NULL 1 2009-01-01 D 2009-01-31 NULL NULL NULL NULL NULL NULL ... D 1 2009-01-01 B 2009-01-05 NULL NULL NULL NULL NULL B ... NULL 2 2005-02-02 B 2005-02-03 NULL B NULL NULL NULL NULL ... NULL
CodePudding user response:
I don't know why do you want every value in a different column. That is inefficient. You can just use lubridate
to work with dates:
example = data.frame(ID = c(1,1,1,2),
time0 = c('2009-01-01','2009-01-01','2009-01-01','2005-02-02'),
obs_num = c('A','D','B','B'),
recorded_dt = c('2009-01-01 ','2009-01-31','2009-01-05','2005-02-03')
)
library(tidyverse)
library(lubridate)
example$time0 = ymd(example$time0)
example$recorded_dt = ymd(example$recorded_dt)
example %>%
mutate(difs_days = floor(difftime(recorded_dt, time0, units="days")))
Output:
ID time0 obs_num recorded_dt difs_days
1 1 2009-01-01 A 2009-01-01 0 days
2 1 2009-01-01 D 2009-01-31 30 days
3 1 2009-01-01 B 2009-01-05 4 days
4 2 2005-02-02 B 2005-02-03 1 days
However, to get the result you asked for, we can use pivot_wider()
and the names_prefix=
property as follows:
example %>%
mutate(difs_days = floor(difftime(recorded_dt, time0, units="days"))) %>%
arrange(difs_days) %>%
pivot_wider(names_from = difs_days, values_from = obs_num, names_prefix = 'day') %>%
arrange(ID, recorded_dt)
Output:
# A tibble: 4 x 7
ID time0 recorded_dt day0 day1 day4 day30
<dbl> <date> <date> <chr> <chr> <chr> <chr>
1 1 2009-01-01 2009-01-01 A NA NA NA
2 1 2009-01-01 2009-01-05 NA NA B NA
3 1 2009-01-01 2009-01-31 NA NA NA D
4 2 2005-02-02 2005-02-03 NA B NA NA
Note if you are pretending to create columns for everyday in a large dataframe, your RAM memory will be highly wasted.
CodePudding user response:
library(dplyr)
dat %>%
mutate(
day = recorded_dt - time0,
day = if_else(day > 30, NA_character_, as.character(day))
) %>%
bind_cols(model.matrix(~ day - 1, .)) %>%
select(-day) %>%
mutate(across(starts_with("day"), ~ if_else(. > 0, obs_num, obs_num[NA])))
# ID time0 obs_num recorded_dt day0 day1 day30 day4
# 1 1 2009-01-01 A 2009-01-01 A <NA> <NA> <NA>
# 2 1 2009-01-01 D 2009-01-31 <NA> <NA> D <NA>
# 3 1 2009-01-01 B 2009-01-05 <NA> <NA> <NA> B
# 4 2 2005-02-02 B 2005-02-03 <NA> B <NA> <NA>
Data, dates already converted to Date
-class.
dat <- structure(list(ID = c(1L, 1L, 1L, 2L), time0 = structure(c(14245, 14245, 14245, 12816), class = "Date"), obs_num = c("A", "D", "B", "B"), recorded_dt = structure(c(14245, 14275, 14249, 12817), class = "Date")), row.names = c(NA, -4L), class = "data.frame")
CodePudding user response:
You can use pivot_wider()
from tidyr
.
library(dplyr)
library(tidyr)
df %>%
mutate(diff = recorded_dt - time0) %>%
pivot_wider(names_from = diff, names_prefix = "day", names_sort = TRUE,
values_from = obs_num)
# # A tibble: 4 × 7
# ID time0 recorded_dt day0 day1 day4 day30
# <int> <date> <date> <chr> <chr> <chr> <chr>
# 1 1 2009-01-01 2009-01-01 A NA NA NA
# 2 1 2009-01-01 2009-01-31 NA NA NA D
# 3 1 2009-01-01 2009-01-05 NA NA B NA
# 4 2 2005-02-02 2005-02-03 NA B NA NA
If you want to show all day differences, use complete()
before pivot_wider()
.
df %>%
mutate(diff = as.numeric(recorded_dt - time0)) %>%
complete(nesting(ID, time0, recorded_dt), diff = 0:30) %>%
pivot_wider(names_from = diff, names_prefix = "day", names_sort = TRUE,
values_from = obs_num)
# # A tibble: 4 × 34
# ID time0 recorded_dt day0 day1 day2 day3 day4 day5 ...... day30
# <int> <date> <date> <chr> <chr> <chr> <chr> <chr> <chr> ...... <chr>
# 1 1 2009-01-01 2009-01-01 A NA NA NA NA NA ...... NA
# 2 1 2009-01-01 2009-01-05 NA NA NA NA B NA ...... D
# 3 1 2009-01-01 2009-01-31 NA NA NA NA NA NA ...... NA
# 4 2 2005-02-02 2005-02-03 NA B NA NA NA NA ...... NA
Data
df <- structure(list(ID = c(1L, 1L, 1L, 2L), time0 = structure(c(14245,
14245, 14245, 12816), class = "Date"), obs_num = c("A", "D",
"B", "B"), recorded_dt = structure(c(14245, 14275, 14249, 12817),
class = "Date")), row.names = c(NA, -4L), class = "data.frame")