Context
We would like to know how subtract columns, two by two. Specifically, we want to subtract the columns of the dataframe as follow:
- the
u_2018
-u_2019
. - the
u_2019
-u_2020
. - the
u_2020
-u_2021
. - the
u_2021
-u_2022
.
We looked for a optimal solution in the several stackoverflow question posts, like this (How to subtract two columns using tidyverse mutate with columns named by external variables), but the only undesired approach achieved is datiled in the R code used section. The R session version is 4.2.0
and the dplyr
package version is 1.0.9
.
Input data in R session
> dat
# A tibble: 6 × 5
u_2018 u_2019 u_2020 u_2021 u_2022
<int> <int> <int> <int> <int>
1 90035 88015 76135 50725 16517
2 20 NA NA 13792 12793
3 555 620 15032 19309 6479
4 11171 11782 10281 8974 3901
5 NA 116896 40169 13191 3610
R code used
dat %>%
mutate(
diff_2018_2019 = u_2018 - u_2019,
diff_2019_2020 = u_2019 - u_2020,
diff_2020_2021 = u_2020 - u_2021,
diff_2021_2022 = u_2021 - u_2022)
Question
We would like to know the optimal solution to subtract two by two columns. Maybe one approach includes mutate_at()
or across
to obtain the differences between and save the subtraction in new columns.
Thanks in advance
CodePudding user response:
A possible solution:
library(tidyverse)
df %>%
mutate(map2_dfc(names(.)[-ncol(.)], names(.)[-1], ~ df[.x] - df[.y]) %>%
set_names(str_c("new", 1:(ncol(df)-1))))
#> u_2018 u_2019 u_2020 u_2021 u_2022 new1 new2 new3 new4
#> 1 90035 88015 76135 50725 16517 2020 11880 25410 34208
#> 2 20 NA NA 13792 12793 NA NA NA 999
#> 3 555 620 15032 19309 6479 -65 -14412 -4277 12830
#> 4 11171 11782 10281 8974 3901 -611 1501 1307 5073
#> 5 NA 116896 40169 13191 3610 NA 76727 26978 9581
Or in base R
:
cbind(df, setNames(df[-ncol(df)] - df[-1], paste0("new", 1:(ncol(df)-1))))
#> u_2018 u_2019 u_2020 u_2021 u_2022 new1 new2 new3 new4
#> 1 90035 88015 76135 50725 16517 2020 11880 25410 34208
#> 2 20 NA NA 13792 12793 NA NA NA 999
#> 3 555 620 15032 19309 6479 -65 -14412 -4277 12830
#> 4 11171 11782 10281 8974 3901 -611 1501 1307 5073
#> 5 NA 116896 40169 13191 3610 NA 76727 26978 9581
CodePudding user response:
You could use pmap_dfr
diff
:
library(purrr)
pmap_dfr(df, ~ -diff(c(...))) %>%
set_names(~ as.integer(sub('u_', '', .x)) %>% paste('dif', . - 1, ., sep = '_'))
# # A tibble: 5 × 4
# dif_2018_2019 dif_2019_2020 dif_2020_2021 dif_2021_2022
# <int> <int> <int> <int>
# 1 2020 11880 25410 34208
# 2 NA NA NA 999
# 3 -65 -14412 -4277 12830
# 4 -611 1501 1307 5073
# 5 NA 76727 26978 9581
Note that diff()
defaults to subtract the former from the latter, so you need to invert the sign with -diff()
.
Data
df <- structure(list(u_2018 = c(90035L, 20L, 555L, 11171L, NA), u_2019 = c(88015L,
NA, 620L, 11782L, 116896L), u_2020 = c(76135L, NA, 15032L, 10281L, 40169L),
u_2021 = c(50725L, 13792L, 19309L, 8974L, 13191L), u_2022 = c(16517L,
12793L, 6479L, 3901L, 3610L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
CodePudding user response:
df <- structure(list(u_2018 = c(90035L, 20L, 555L, 11171L, NA), u_2019 = c(88015L,
NA, 620L, 11782L, 116896L), u_2020 = c(76135L, NA, 15032L, 10281L, 40169L),
u_2021 = c(50725L, 13792L, 19309L, 8974L, 13191L), u_2022 = c(16517L,
12793L, 6479L, 3901L, 3610L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
library(dplyr, warn.conflicts = FALSE)
library(stringr)
df %>%
mutate(rename_with(df[-ncol(df)] - df[-1], ~ str_replace(., 'u', 'diff')))
#> u_2018 u_2019 u_2020 u_2021 u_2022 diff_2018 diff_2019 diff_2020 diff_2021
#> 1 90035 88015 76135 50725 16517 2020 11880 25410 34208
#> 2 20 NA NA 13792 12793 NA NA NA 999
#> 3 555 620 15032 19309 6479 -65 -14412 -4277 12830
#> 4 11171 11782 10281 8974 3901 -611 1501 1307 5073
#> 5 NA 116896 40169 13191 3610 NA 76727 26978 9581
Created on 2022-07-28 by the reprex package (v2.0.1)
CodePudding user response:
We could use across2
from dplyover
library(dplyover)
library(stringr)
df1 %>%
mutate(across2(-last_col(), -1, `-`,
.names_fn = ~str_replace(.x, "u", "dif")))
-output
u_2018 u_2019 u_2020 u_2021 u_2022 dif_2018_u_2019 dif_2019_u_2020 dif_2020_u_2021 dif_2021_u_2022
1 90035 88015 76135 50725 16517 2020 11880 25410 34208
2 20 NA NA 13792 12793 NA NA NA 999
3 555 620 15032 19309 6479 -65 -14412 -4277 12830
4 11171 11782 10281 8974 3901 -611 1501 1307 5073
5 NA 116896 40169 13191 3610 NA 76727 26978 9581
data
df1 <- structure(list(u_2018 = c(90035L, 20L, 555L, 11171L, NA), u_2019 = c(88015L,
NA, 620L, 11782L, 116896L), u_2020 = c(76135L, NA, 15032L, 10281L,
40169L), u_2021 = c(50725L, 13792L, 19309L, 8974L, 13191L), u_2022 = c(16517L,
12793L, 6479L, 3901L, 3610L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))