I have two vectors, each containing names of columns in my data.frame, data
, that is in wide form, e.g.
NSW_cols <- c("NSW_1", "NSW_2") # Columns for two variables from New South Wales
AUS_cols <- c("AUS_1", "AUS_2") # Columns for two variables from Australia
head(data)
date NSW_1 AUS_1 NSW_2 AUS_2
1 2000-01-01 -0.38358623 -1.78824221 0.995984590 1.0744594
2 2001-01-01 -1.95910318 2.03124252 -1.695764903 0.2605978
3 2002-01-01 -0.84170506 -0.70314433 -0.533372143 -0.3142720
4 2003-01-01 1.90354747 0.15816476 -1.372269451 -0.7496301
5 2004-01-01 0.62249393 0.50623480 -2.207919779 -0.8621983
6 2005-01-01 1.99092044 -0.81999511 1.822122519 2.0480403
I need to compute share_i := NSW_i / AUS_i
(i.e. row-wise, or by date
) for i = 1,2
, while referencing my two name vectors above.
For example, using data.table
, I can do something like:
setDT(data)
for (i in 1:2) {
data[, paste0("share_", i) := get(NSW_cols[i]) / get(AUS_cols[i])]
}
This seems silly for only a few columns, but in reality my application has several thousand columns. Moreover, in my actual application, I need to do more than simply divide one column by the other. Therefore, I am trying to find a solution that will work for any function accepting two numeric vectors as inputs.
The data comes to me in a somewhat random form, so the convenient ordering of the columns in this example is also not something I can use.
Q: What is the best way to compute on many pairs of columns using vectors of names?
Suspicion: Maybe there is a clever way to do this by converting the data to long form? I haven't been able to make it work.
Example data:
structure(list(date = structure(c(10957, 11323, 11688, 12053,
12418, 12784, 13149, 13514, 13879, 14245, 14610, 14975, 15340,
15706, 16071, 16436, 16801, 17167, 17532, 17897), class = "Date"),
NSW_1 = c(1.24119982707737, 0.138858419103515, 1.71063158823657,
-0.430640974722993, -1.04422958092706, 0.537579524580529,
-0.669585987150229, 0.638805611438309, -1.72398983449257,
-1.74243008034091, 0.689804173282994, 0.330963177173467,
0.871067708948055, -2.01624558221344, 1.21257910351036, 1.20049469882194,
1.03206832593544, 0.786410256177216, 2.11007351377927, -1.45380984681329
), AUS_1 = c(-0.58310384813065, 0.409723982550305, -0.806981635414238,
0.0855504408545073, 0.746243168639741, -0.653673061331084,
0.657105983301959, 0.549909235009709, -0.806729358432671,
-0.997379717276235, 0.97589063842626, -0.169423180700716,
0.72219177943747, -0.844418606912503, 1.27729368500115, -1.34311054918022,
0.765340668860696, 0.464202569980373, 0.267993278040529,
0.667522687135242), NSW_2 = c(0.398467283863779, -0.638071030930068,
-0.267712904023511, 0.359879564885712, -1.31286609394071,
-0.883969609668706, 2.07709479458465, -2.09922563220098,
-1.23850596532828, 0.990433089664036, 1.08866186319808, 0.839852254188259,
0.0568586386833875, 0.32387805118027, -0.904668667590953,
-0.65218384837012, -0.262454637960949, -0.934662840905703,
0.821161212634175, -1.62425917345994), AUS_2 = c(-1.03040366948029,
-1.261929311973, 0.39218462589648, -1.13143826165372, 0.544144484008162,
1.17660893473457, 0.0252285692390373, 0.515133169692034,
-0.654109760017422, 0.503641990827566, -1.2721192223284,
-0.0767711536986575, -1.34531937567941, -0.266317560246187,
1.08756299501947, 0.700567795415207, -0.4427595146404, -0.78851996588786,
-0.856775709774438, -0.746419014623393)), class = "data.frame", row.names = c(NA,
-20L))
CodePudding user response:
Here is a way.
Use lapply
not on the columns names vectors but on an index to them.
data <-
structure(list(
date = structure(c(10957, 11323, 11688, 12053,
12418, 12784, 13149, 13514, 13879, 14245, 14610, 14975, 15340,
15706, 16071, 16436, 16801, 17167, 17532, 17897), class = "Date"),
NSW_1 = c(1.24119982707737, 0.138858419103515, 1.71063158823657,
-0.430640974722993, -1.04422958092706, 0.537579524580529,
-0.669585987150229, 0.638805611438309, -1.72398983449257,
-1.74243008034091, 0.689804173282994, 0.330963177173467,
0.871067708948055, -2.01624558221344, 1.21257910351036, 1.20049469882194,
1.03206832593544, 0.786410256177216, 2.11007351377927, -1.45380984681329
), AUS_1 = c(-0.58310384813065, 0.409723982550305, -0.806981635414238,
0.0855504408545073, 0.746243168639741, -0.653673061331084,
0.657105983301959, 0.549909235009709, -0.806729358432671,
-0.997379717276235, 0.97589063842626, -0.169423180700716,
0.72219177943747, -0.844418606912503, 1.27729368500115, -1.34311054918022,
0.765340668860696, 0.464202569980373, 0.267993278040529,
0.667522687135242),
NSW_2 = c(0.398467283863779, -0.638071030930068,
-0.267712904023511, 0.359879564885712, -1.31286609394071,
-0.883969609668706, 2.07709479458465, -2.09922563220098,
-1.23850596532828, 0.990433089664036, 1.08866186319808, 0.839852254188259,
0.0568586386833875, 0.32387805118027, -0.904668667590953,
-0.65218384837012, -0.262454637960949, -0.934662840905703,
0.821161212634175, -1.62425917345994),
AUS_2 = c(-1.03040366948029,
-1.261929311973, 0.39218462589648, -1.13143826165372, 0.544144484008162,
1.17660893473457, 0.0252285692390373, 0.515133169692034,
-0.654109760017422, 0.503641990827566, -1.2721192223284,
-0.0767711536986575, -1.34531937567941, -0.266317560246187,
1.08756299501947, 0.700567795415207, -0.4427595146404, -0.78851996588786,
-0.856775709774438, -0.746419014623393)),
class = "data.frame", row.names = c(NA, -20L))
library(data.table)
NSW_cols <- c("NSW_1", "NSW_2") # Columns for two variables from New South Wales
AUS_cols <- c("AUS_1", "AUS_2") # Columns for two variables from Australia
i <- sub(".*(\\d $)", "\\1", NSW_cols)
shares <- paste("share", i, sep = "_")
setDT(data)
data[, (shares) := lapply(seq_along(NSW_cols), \(i) get(NSW_cols[i])/get(AUS_cols[i]))]
data
#> date NSW_1 AUS_1 NSW_2 AUS_2 share_1 share_2
#> 1: 2000-01-01 1.2411998 -0.58310385 0.39846728 -1.03040367 -2.1286085 -0.38670988
#> 2: 2001-01-01 0.1388584 0.40972398 -0.63807103 -1.26192931 0.3389072 0.50563136
#> 3: 2002-01-01 1.7106316 -0.80698164 -0.26771290 0.39218463 -2.1197900 -0.68261958
#> 4: 2003-01-01 -0.4306410 0.08555044 0.35987956 -1.13143826 -5.0337669 -0.31807265
#> 5: 2004-01-01 -1.0442296 0.74624317 -1.31286609 0.54414448 -1.3993154 -2.41271598
#> 6: 2005-01-01 0.5375795 -0.65367306 -0.88396961 1.17660893 -0.8223982 -0.75128582
#> 7: 2006-01-01 -0.6695860 0.65710598 2.07709479 0.02522857 -1.0189924 82.33105789
#> 8: 2007-01-01 0.6388056 0.54990924 -2.09922563 0.51513317 1.1616565 -4.07511253
#> 9: 2008-01-01 -1.7239898 -0.80672936 -1.23850597 -0.65410976 2.1370114 1.89342224
#> 10: 2009-01-01 -1.7424301 -0.99737972 0.99043309 0.50364199 1.7470077 1.96654192
#> 11: 2010-01-01 0.6898042 0.97589064 1.08866186 -1.27211922 0.7068458 -0.85578603
#> 12: 2011-01-01 0.3309632 -0.16942318 0.83985225 -0.07677115 -1.9534705 -10.93968520
#> 13: 2012-01-01 0.8710677 0.72219178 0.05685864 -1.34531938 1.2061446 -0.04226405
#> 14: 2013-01-01 -2.0162456 -0.84441861 0.32387805 -0.26631756 2.3877323 -1.21613479
#> 15: 2014-01-01 1.2125791 1.27729369 -0.90466867 1.08756300 0.9493346 -0.83183105
#> 16: 2015-01-01 1.2004947 -1.34311055 -0.65218385 0.70056780 -0.8938167 -0.93093610
#> 17: 2016-01-01 1.0320683 0.76534067 -0.26245464 -0.44275951 1.3485084 0.59277018
#> 18: 2017-01-01 0.7864103 0.46420257 -0.93466284 -0.78851997 1.6941101 1.18533820
#> 19: 2018-01-01 2.1100735 0.26799328 0.82116121 -0.85677571 7.8736061 -0.95843195
#> 20: 2019-01-01 -1.4538098 0.66752269 -1.62425917 -0.74641901 -2.1779183 2.17606886
Created on 2022-11-18 with reprex v2.0.2
CodePudding user response:
Using tidyverse
I'd do something like this:
library(dplyr)
library(purrr)
library(magrittr)
map_dfc(
seq_along(NSW_cols),
~ {
(data[[NSW_cols[.x]]] / data[[AUS_cols[.x]]]) %>%
list() %>%
set_names(paste0("share_", .x))
}
) %>%
bind_cols(data, .)