I have a dataset that looks something like this:
"" | "region" | "region_a_price_raw" | "region_b_price_raw" | "region_c_price_raw" | "region_a_adjusted" | "region_b_adjusted" | "region_c_adjusted" | "region_a_pct_chng" | "region_b_pct_chng" | "region_c_pct_chng" |
---|---|---|---|---|---|---|---|---|---|---|
"1" | "C" | 0.691277900885566 | -1.12168419402904 | -1.80708124084338 | -0.823054962637259 | -1.56205680347623 | 2.39150423647063 | 94 | 43 | 100 |
"2" | "B" | -0.917718229751991 | 0.35628937645658 | 0.587525813366388 | 0.839040270582852 | 0.240455566072964 | -0.281641015285604 | 27 | 48 | 21 |
"3" | "B" | 1.2846493277039 | 0.13190349180679 | 1.26024317859471 | -0.971360861843787 | 0.257888869705433 | -0.979961536031851 | 92 | 64 | 82 |
What I need to do is create a new variable that has the price variable for each region, for the raw
, adjusted
and pct_chng
variables.
I know how to do this manually. However, there are a lot of regions (far more than the three in the example), as well as multiple percent change variables (I only included one here for sake of brevity).
So what I'm hoping is that, since each relevant price variable includes the region name in it's own variable name, there is some way to do this where I can write a function that automatically detects the region in the variable name, since it's in the named there. I unfortunately don't know how to do this elegantly at present.
library(dplyr)
#creating sample data
df1 <- data.frame(region = sample(LETTERS[1:3],15,replace = TRUE), region_a_price_raw = rnorm(15),region_b_price_raw=rnorm(15),region_c_price_raw=rnorm(15))
df2 <- data.frame(region_a_adjusted=rnorm(15),region_b_adjusted=rnorm(15),region_c_adjusted=rnorm(15))
df3 <- data.frame(region_a_pct_chng=sample(1:100,15,replace = TRUE),region_b_pct_chng=sample(1:100,15,replace = TRUE),region_c_pct_chng=sample(1:100,15,replace = TRUE))
sample <- cbind(df1,df2,df3)
#here's how it would work manually. this would take forever in the actual dataset though
sample <- sample %>%
mutate(price_raw=case_when(region=="A"~region_a_price_raw,
region=="B"~region_b_price_raw,
region=="C"~region_c_price_raw)) %>%
mutate(price_adjusted=case_when(region=="A"~region_a_adjusted,
region=="B"~region_b_adjusted,
region=="C"~region_c_adjusted)) %>%
mutate(pct_chng=case_when(region=="A"~region_a_pct_chng,
region=="B"~region_b_pct_chng,
region=="C"~region_c_pct_chng))
I'm hoping someone has a way to do this that won't have me manually doing this across each region and price variable.
CodePudding user response:
(I think there's a more direct way than this for combining the last three lines into one using a little regex...)
library(dplyr); library(tidyr)
sample %>%
mutate(row = row_number()) %>%
pivot_longer(-c(row, region)) %>%
separate(name, c("drop", "region", "type"), sep = "_", extra = "merge") %>%
pivot_wider(names_from = type, values_from = value)
Result
# A tibble: 45 × 6
row drop region price_raw adjusted pct_chng
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 1 region a 0.222 -0.869 92
2 1 region b 0.149 -0.972 19
3 1 region c 1.04 0.116 94
4 2 region a -0.844 -0.755 13
5 2 region b -0.963 -0.547 81
6 2 region c 0.198 1.38 61
7 3 region a 0.444 -0.130 48
8 3 region b -0.0665 -1.69 13
9 3 region c -1.63 0.574 56
10 4 region a 0.0558 -1.00 7
# … with 35 more rows
CodePudding user response:
You never gave a seed to your data. So will use the data with 3 rows above:
sample %>%
pivot_longer(-c(rn, region), names_to = c('grp', '.value'),
names_pattern = 'region_([^_ ])_(. )$') %>%
filter(tolower(region) == grp)
region grp price_raw adjusted pct_chng
<chr> <chr> <dbl> <dbl> <int>
1 C c -1.81 2.39 100
2 B b 0.356 0.240 48
3 B b 0.132 0.258 64
Data
sample <- structure(list(region = c("C", "B", "B"), region_a_price_raw = c(0.691277900885566,
-0.917718229751991, 1.2846493277039), region_b_price_raw = c(-1.12168419402904,
0.35628937645658, 0.13190349180679), region_c_price_raw = c(-1.80708124084338,
0.587525813366388, 1.26024317859471), region_a_adjusted = c(-0.823054962637259,
0.839040270582852, -0.971360861843787), region_b_adjusted = c(-1.56205680347623,
0.240455566072964, 0.257888869705433), region_c_adjusted = c(2.39150423647063,
-0.281641015285604, -0.979961536031851), region_a_pct_chng = c(94L,
27L, 92L), region_b_pct_chng = c(43L, 48L, 64L), region_c_pct_chng = c(100L,
21L, 82L)), class = "data.frame", row.names = c(NA, 3L))