I have three data sets, which one can generate as follows:
library(dplyr)
library(tidyr)
# Simulate x data
countries = LETTERS[1:3]
mat_x = matrix(runif(27, 0, 100), nrow = 3)
colnames(mat_x) = 2012:2020
df_x = bind_cols(country = countries, mat_x)
df_x
# # A tibble: 3 × 10
# country `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A 36.2 61.5 15.8 8.52 4.35 24.4 1.74 86.8 15.4
# 2 B 48.3 70.2 96.9 73.1 64.6 74.0 16.0 86.9 80.8
# 3 C 52.7 50.3 70.1 93.7 10.1 44.1 85.1 40.6 54.0
mat_y = matrix(runif(27, 0, 100), nrow = 3)
colnames(mat_y) <- paste0("Y", 2012:2020)
df_y = bind_cols(country = countries, mat_y)
df_y
# # A tibble: 3 × 10
# country Y2012 Y2013 Y2014 Y2015 Y2016 Y2017 Y2018 Y2019 Y2020
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A 41.0 86.6 30.6 97.7 41.0 40.1 8.42 76.4 98.0
# 2 B 39.1 12.9 67.9 26.8 0.616 15.6 56.9 49.7 25.8
# 3 C 27.1 82.3 35.0 43.4 24.3 45.9 82.1 82.3 63.3
mat_z = matrix(runif(27, 0, 100), nrow = 3)
colnames(mat_z) <- paste0("Y", 2021:2029)
df_z = bind_cols(country = countries, mat_z)
df_z
# # A tibble: 3 × 10
# country Y2021 Y2022 Y2023 Y2024 Y2025 Y2026 Y2027 Y2028 Y2029
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A 31.0 15.0 87.2 50.9 21.0 94.2 90.7 3.32 65.5
# 2 B 42.0 36.2 12.1 63.1 73.8 49.2 40.7 73.9 96.8
# 3 C 96.8 38.0 9.00 67.3 63.7 95.2 51.0 81.7 74.4
df_x
is a response variable, df_y
is a regressor, and df_z
are future values of the regressor.
I tried to build nested linear regression models between each row in df_x
and its corresponding row in df_y
, as follows:
# convert df_x to long format
df_x_long <-
df_x %>%
pivot_longer(-country, names_to = "year", values_to = "x") %>%
mutate(across(year, as.numeric))
head(df_x_long)
# # A tibble: 6 × 3
# country year x
# <chr> <dbl> <dbl>
# 1 A 2012 36.2
# 2 A 2013 61.5
# 3 A 2014 15.8
# 4 A 2015 8.52
# 5 A 2016 4.35
# 6 A 2017 24.4
# convert df_y to long format
df_y_long <-
df_y %>%
select(country, starts_with("Y")) %>%
pivot_longer(-country, names_to = "year", values_to = "y") %>%
mutate(year = as.numeric(gsub("^Y", "", year)))
# join
df_xy_long <- inner_join(df_x_long, df_y_long, by = c("country", "year"))
head(df_xy_long)
# # A tibble: 6 × 4
# country year x y
# <chr> <dbl> <dbl> <dbl>
# 1 A 2012 88.9 21.2
# 2 A 2013 3.76 24.5
# 3 A 2014 96.2 40.0
# 4 A 2015 26.9 44.6
# 5 A 2016 22.1 27.4
# 6 A 2017 88.7 24.8
# nest and model
df_xy_nested <-
df_xy_long %>%
group_by(country) %>%
nest() %>%
rowwise() %>%
mutate(model = list(lm(y ~ x, data = data))) %>%
ungroup()
head(df_xy_nested)
# # A tibble: 3 × 3
# country data model
# <chr> <list> <list>
# 1 A <tibble [9 × 3]> <lm>
# 2 B <tibble [9 × 3]> <lm>
# 3 C <tibble [9 × 3]> <lm>
Now I would like to add predictions for each row (country
), based on the data in df_z
, where it gives predictions based on model lm
for the years 2021
to 2029
, in a way similar to this.
Please help me, any hint or suggestion would be very welcome.
CodePudding user response:
You can do basically the same thing. You convert df_z
to long just like you did with df_x
, nest it and join it to the data, and then use purrr::map2
(or base mapply
) to call predict
on the model and prediction data in each row:
library(purrr)
df_z %>%
select(country, starts_with("Y")) %>%
pivot_longer(-country, names_to = "year", values_to = "x") %>%
mutate(year = as.numeric(gsub("^Y", "", year))) %>%
group_by(country) %>%
nest() %>%
rename(pred_data = data) %>%
left_join(df_xy_nested) %>%
mutate(
prediction = map2(.x = model, .y = pred_data, predict)
)
# Joining, by = "country"
# # A tibble: 3 × 5
# # Groups: country [3]
# country pred_data data model prediction
# <chr> <list> <list> <list> <list>
# 1 A <tibble [9 × 2]> <tibble [9 × 3]> <lm> <dbl [9]>
# 2 B <tibble [9 × 2]> <tibble [9 × 3]> <lm> <dbl [9]>
# 3 C <tibble [9 × 2]> <tibble [9 × 3]> <lm> <dbl [9]>