I have a dataset that looks something like this:

""	"region"	"region_a_price_raw"	"region_b_price_raw"	"region_c_price_raw"	"region_a_adjusted"	"region_b_adjusted"	"region_c_adjusted"	"region_a_pct_chng"	"region_b_pct_chng"	"region_c_pct_chng"
"1"	"C"	0.691277900885566	-1.12168419402904	-1.80708124084338	-0.823054962637259	-1.56205680347623	2.39150423647063	94	43	100
"2"	"B"	-0.917718229751991	0.35628937645658	0.587525813366388	0.839040270582852	0.240455566072964	-0.281641015285604	27	48	21
"3"	"B"	1.2846493277039	0.13190349180679	1.26024317859471	-0.971360861843787	0.257888869705433	-0.979961536031851	92	64	82

What I need to do is create a new variable that has the price variable for each region, for the raw, adjusted and pct_chng variables.

I know how to do this manually. However, there are a lot of regions (far more than the three in the example), as well as multiple percent change variables (I only included one here for sake of brevity).

So what I'm hoping is that, since each relevant price variable includes the region name in it's own variable name, there is some way to do this where I can write a function that automatically detects the region in the variable name, since it's in the named there. I unfortunately don't know how to do this elegantly at present.

library(dplyr)
#creating sample data
df1 <- data.frame(region = sample(LETTERS[1:3],15,replace = TRUE), region_a_price_raw = rnorm(15),region_b_price_raw=rnorm(15),region_c_price_raw=rnorm(15)) 
df2 <- data.frame(region_a_adjusted=rnorm(15),region_b_adjusted=rnorm(15),region_c_adjusted=rnorm(15))
df3 <- data.frame(region_a_pct_chng=sample(1:100,15,replace = TRUE),region_b_pct_chng=sample(1:100,15,replace = TRUE),region_c_pct_chng=sample(1:100,15,replace = TRUE))
sample <- cbind(df1,df2,df3)

#here's how it would work manually. this would take forever in the actual dataset though
sample <- sample %>%
  mutate(price_raw=case_when(region=="A"~region_a_price_raw,
                             region=="B"~region_b_price_raw,
                             region=="C"~region_c_price_raw)) %>% 
  mutate(price_adjusted=case_when(region=="A"~region_a_adjusted,
                                  region=="B"~region_b_adjusted,
                                  region=="C"~region_c_adjusted)) %>% 
  mutate(pct_chng=case_when(region=="A"~region_a_pct_chng,
                            region=="B"~region_b_pct_chng,
                            region=="C"~region_c_pct_chng))

I'm hoping someone has a way to do this that won't have me manually doing this across each region and price variable.

CodePudding user response：

(I think there's a more direct way than this for combining the last three lines into one using a little regex...)

library(dplyr); library(tidyr)
sample %>%
  mutate(row = row_number()) %>%
  pivot_longer(-c(row, region)) %>%
  separate(name, c("drop", "region", "type"), sep = "_", extra = "merge") %>%
  pivot_wider(names_from = type, values_from = value)

Result

# A tibble: 45 × 6
     row drop   region price_raw adjusted pct_chng
   <int> <chr>  <chr>      <dbl>    <dbl>    <dbl>
 1     1 region a         0.222    -0.869       92
 2     1 region b         0.149    -0.972       19
 3     1 region c         1.04      0.116       94
 4     2 region a        -0.844    -0.755       13
 5     2 region b        -0.963    -0.547       81
 6     2 region c         0.198     1.38        61
 7     3 region a         0.444    -0.130       48
 8     3 region b        -0.0665   -1.69        13
 9     3 region c        -1.63      0.574       56
10     4 region a         0.0558   -1.00         7
# … with 35 more rows

CodePudding user response：

You never gave a seed to your data. So will use the data with 3 rows above:

sample %>%
   pivot_longer(-c(rn, region), names_to = c('grp', '.value'),
                names_pattern = 'region_([^_ ])_(. )$') %>%
   filter(tolower(region) == grp)

region grp   price_raw adjusted pct_chng
  <chr>  <chr>     <dbl>    <dbl>    <int>
1 C      c        -1.81     2.39       100
2 B      b         0.356    0.240       48
3 B      b         0.132    0.258       64

Data

sample <- structure(list(region = c("C", "B", "B"), region_a_price_raw = c(0.691277900885566, 
-0.917718229751991, 1.2846493277039), region_b_price_raw = c(-1.12168419402904, 
0.35628937645658, 0.13190349180679), region_c_price_raw = c(-1.80708124084338, 
0.587525813366388, 1.26024317859471), region_a_adjusted = c(-0.823054962637259, 
0.839040270582852, -0.971360861843787), region_b_adjusted = c(-1.56205680347623, 
0.240455566072964, 0.257888869705433), region_c_adjusted = c(2.39150423647063, 
-0.281641015285604, -0.979961536031851), region_a_pct_chng = c(94L, 
27L, 92L), region_b_pct_chng = c(43L, 48L, 64L), region_c_pct_chng = c(100L, 
21L, 82L)), class = "data.frame", row.names = c(NA, 3L))