Home > OS >  Fill missing values in a dataframe based on values from another dataframe R
Fill missing values in a dataframe based on values from another dataframe R

Time:12-14

Given two dataframes df1 and df2 with 4 columns each, I would like to do the following:

  1. Expand df1 to have a date range between 1948-2016. For example:
seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"), 
                                     as.Date("2016-12-31"), 
                                     by="day"))

The expanded rows should have the columns filled with NA

  1. Replace all NAs in df1 with corresponding cell-specific values from df2.

I tried the following but it did not work:

seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"), 
                                     as.Date("2016-12-31"), 
                                     by="day"))


data1=full_join(df1, seq_df) %>% arrange(Date)

Sample Data

df1=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 
3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 
17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7, 
10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data. Frame")
df2=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 
3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877, 
15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396, 
15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226, 
12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263, 
13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 
10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004, 
0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966, 
-0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983, 
2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158, 
-9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999, 
2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995, 
-2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002, 
-2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data. Frame")

CodePudding user response:

I think you want to use all of df1's data where present, fill in df2 where df1 is missing, then fill out the Date for the rest of the timespan.

The first step is to fill in df1 with data from df2, coalesce (fill NA) where needed, and remove the duplicate columns.

full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
  mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
  select(-ends_with(".y"))
#          Date       Pr    Tmax    Tmin
# 1  1980-06-01 0.000000 16.7579  2.3898
# 2  1980-06-02 0.000000 16.4250  3.7317
# 3  1980-06-03 0.000000 16.8751  2.3744
# 4  1980-06-04 0.000000 15.9877  0.7444
# 5  1980-06-05 0.000000 15.3752  1.3443
# 6  1980-06-06 0.000000 15.4138  0.9142
# 7  1980-06-07 0.000000 15.6038 -0.1077
# 8  1980-06-08 0.000000 14.1940 -0.1110
# 9  1980-06-09 0.000000 15.9599  1.2925
# 10 1980-06-10 0.000000 17.0566  2.4449
# 11 1980-06-11 0.000000 13.6396 -0.0419
# 12 1980-06-12 0.000000 15.8548  2.4895
# 13 1980-06-13 0.000000 15.9682  2.7414
# 14 1980-06-14 0.000000 17.4073  2.8424
# 15 1980-06-15 0.000000 17.9238  0.5135
# 16 1980-06-16 0.000000 10.1966 -5.2158
# 17 1980-06-17 0.000000 10.7029 -9.0718
# 18 1980-06-18 0.000000 13.5000  0.4438
# 19 1980-06-19 0.000000 13.0000 10.5000
# 20 1980-06-20 0.000000 12.0000 11.0000
# 21 1980-06-21 0.000000 11.2000 12.0000
# 22 1980-06-22 0.000000  8.5000 12.5000
# 23 1980-06-23 0.000000 12.0000  7.0000
# 24 1980-06-24 0.000000 13.0000 10.0000
# 25 1980-06-25 0.000000 17.5000 11.5000
# 26 1980-06-26 0.000000 17.1000  7.0000
# 27 1980-06-27 0.000000 17.1000  5.0000
# 28 1980-06-28 1.985656  9.5000  4.5000
# 29 1980-06-29 0.000000  6.5000  1.0000
# 30 1980-06-30 0.000000 10.3664  2.0000

From here, filling out the timespan is just another join:

result <- full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
  mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
  select(-ends_with(".y")) %>%
  full_join(seq_df, by = "Date")

(optionally add %>% arrange(Date)).

--

Data (with the correct class=):

df1 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,  NA_real_, NA_real_, NA_real_), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7, 10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data.frame")
df2 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877, 15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396, 15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226,  12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263, 13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004, 0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966, -0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983, 2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158, -9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999, 2.36690000000004, 0.0375000000000227,  -3.572, -2.93179999999995, -2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002, -2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data.frame")
# data_frame is deprecated, use tibble
seq_df <- tibble(Date = seq.Date(as.Date("1948-01-01"), as.Date("2016-12-31"), by="day"))
  • Related