Given two dataframes df1
and df2
with 4 columns each, I would like to do the following:
- Expand
df1
to have a date range between1948-2016
. For example:
seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"),
as.Date("2016-12-31"),
by="day"))
The expanded rows should have the columns filled with NA
- Replace all
NAs
indf1
with corresponding cell-specific values fromdf2
.
I tried the following but it did not work:
seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"),
as.Date("2016-12-31"),
by="day"))
data1=full_join(df1, seq_df) %>% arrange(Date)
Sample Data
df1=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808,
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819,
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830,
3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1,
17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7,
10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data. Frame")
df2=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808,
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819,
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830,
3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877,
15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396,
15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226,
12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263,
13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001,
10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004,
0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966,
-0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983,
2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158,
-9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999,
2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995,
-2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002,
-2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data. Frame")
CodePudding user response:
I think you want to use all of df1
's data where present, fill in df2
where df1
is missing, then fill out the Date
for the rest of the timespan.
The first step is to fill in df1
with data from df2
, coalesce (fill NA
) where needed, and remove the duplicate columns.
full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
select(-ends_with(".y"))
# Date Pr Tmax Tmin
# 1 1980-06-01 0.000000 16.7579 2.3898
# 2 1980-06-02 0.000000 16.4250 3.7317
# 3 1980-06-03 0.000000 16.8751 2.3744
# 4 1980-06-04 0.000000 15.9877 0.7444
# 5 1980-06-05 0.000000 15.3752 1.3443
# 6 1980-06-06 0.000000 15.4138 0.9142
# 7 1980-06-07 0.000000 15.6038 -0.1077
# 8 1980-06-08 0.000000 14.1940 -0.1110
# 9 1980-06-09 0.000000 15.9599 1.2925
# 10 1980-06-10 0.000000 17.0566 2.4449
# 11 1980-06-11 0.000000 13.6396 -0.0419
# 12 1980-06-12 0.000000 15.8548 2.4895
# 13 1980-06-13 0.000000 15.9682 2.7414
# 14 1980-06-14 0.000000 17.4073 2.8424
# 15 1980-06-15 0.000000 17.9238 0.5135
# 16 1980-06-16 0.000000 10.1966 -5.2158
# 17 1980-06-17 0.000000 10.7029 -9.0718
# 18 1980-06-18 0.000000 13.5000 0.4438
# 19 1980-06-19 0.000000 13.0000 10.5000
# 20 1980-06-20 0.000000 12.0000 11.0000
# 21 1980-06-21 0.000000 11.2000 12.0000
# 22 1980-06-22 0.000000 8.5000 12.5000
# 23 1980-06-23 0.000000 12.0000 7.0000
# 24 1980-06-24 0.000000 13.0000 10.0000
# 25 1980-06-25 0.000000 17.5000 11.5000
# 26 1980-06-26 0.000000 17.1000 7.0000
# 27 1980-06-27 0.000000 17.1000 5.0000
# 28 1980-06-28 1.985656 9.5000 4.5000
# 29 1980-06-29 0.000000 6.5000 1.0000
# 30 1980-06-30 0.000000 10.3664 2.0000
From here, filling out the timespan is just another join:
result <- full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
select(-ends_with(".y")) %>%
full_join(seq_df, by = "Date")
(optionally add %>% arrange(Date)
).
--
Data (with the correct class=
):
df1 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7, 10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data.frame")
df2 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877, 15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396, 15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226, 12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263, 13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004, 0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966, -0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983, 2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158, -9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999, 2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995, -2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002, -2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data.frame")
# data_frame is deprecated, use tibble
seq_df <- tibble(Date = seq.Date(as.Date("1948-01-01"), as.Date("2016-12-31"), by="day"))