Home > database >  Pivot only some values with distinct string characteristics from a field to a wider format R
Pivot only some values with distinct string characteristics from a field to a wider format R

Time:02-19

I have a data frame that is structured like this:

   country      variable  value
1      ARG    variable_1 0.2340
2      ARG variable_1_se 0.0063
3      ARG variable_1_cv 0.0008
4      ARG    variable_2 0.5320
5      ARG variable_2_se 0.0023
6      ARG variable_2_cv 0.0004
7      BOL    variable_1 0.3240
8      BOL variable_1_se 0.0013
9      BOL variable_1_cv 0.0004
10     BOL    variable_2 0.6380
11     BOL variable_2_se 0.0053
12     BOL variable_2_cv 0.0009

I would like to pull the se and cv values out to a wider format so it looks like this:

  country   variable value     se    cv
1     ARG variable_1 0.234 0.0063 8e-04
2     ARG variable_2 0.532 0.0023 4e-04
3     BOL variable_1 0.324 0.0013 4e-04
4     BOL variable_2 0.638 0.0053 9e-04

I am using pivot_wider() from tidyverse, but I am struggling with a few things:

  1. I am having trouble pulling out only the SE and CV values while leaving the variables in place.
  2. I have a lot of variables with associated SE and CV values, so I would like something that allows me to just specify ends_with("_se") or "_cv".

Here is the code to reproduce the dfs:

  # Long-format input: for each country, every variable has a base row plus
  # "_se" and "_cv" rows holding its standard error and coefficient of variation.
  df_original <- data.frame(
    country = rep(c("ARG", "BOL"), each = 6L),
    variable = rep(
      c(
        "variable_1", "variable_1_se", "variable_1_cv",
        "variable_2", "variable_2_se", "variable_2_cv"
      ),
      times = 2L
    ),
    value = c(
      0.234, 0.0063, 0.0008, 0.532, 0.0023, 0.0004,
      0.324, 0.0013, 0.0004, 0.638, 0.0053, 0.0009
    ),
    stringsAsFactors = FALSE
  )

  # Target shape: one row per country/variable, with se and cv as columns.
  df_desired <- data.frame(
    country = rep(c("ARG", "BOL"), each = 2L),
    variable = rep(c("variable_1", "variable_2"), times = 2L),
    value = c(0.234, 0.532, 0.324, 0.638),
    se = c(0.0063, 0.0023, 0.0013, 0.0053),
    cv = c(0.0008, 0.0004, 0.0004, 0.0009),
    stringsAsFactors = FALSE
  )
  

CodePudding user response:

We may need to extract the substring and do a pivot_wider

library(dplyr)
library(stringr)
library(tidyr)
# Tag every row with its base variable name, copy the base value onto the
# suffixed rows, then spread the suffixes (se, cv) into their own columns.
df_original %>%
  # Base name is the leading "<word>_<digits>" part, e.g. "variable_1" for
  # "variable_1_se". The `+` quantifiers are required for the pattern to match.
  mutate(colnm = str_extract(variable, "^\\w+_\\d+")) %>%
  group_by(country, colnm) %>%
  # The row whose full name equals the base name carries the plain value.
  mutate(value2 = value[variable == colnm]) %>%
  # Drop the base row by name rather than by position, so the result does not
  # depend on the base row being first within its group.
  filter(variable != colnm) %>%
  ungroup() %>%
  # Strip the base-name prefix so only the suffix ("se", "cv") remains.
  mutate(variable = str_remove(variable, "^\\w+_\\d+_")) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  rename(variable = colnm, value = value2)

-output

# A tibble: 4 × 5
  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009

Or use extract from tidyr

# Split `variable` into the base name and an optional lowercase suffix with a
# single regex, then spread the suffixed values into columns.
df_original %>%
  tidyr::extract(
    variable,
    into = c("variable", "colnm"),
    # Group 1: "<word>_<digits>"; group 2: optional suffix such as "se" or "cv"
    # (an empty string for the base row). The `+` quantifiers are required.
    regex = "^(\\w+_\\d+)_?([a-z]*)$"
  ) %>%
  group_by(country, variable) %>%
  # Keep each row's own value in value2 and broadcast the base row's value
  # (the row with an empty suffix) to the whole group.
  mutate(value2 = value, value = value[!nzchar(colnm)]) %>%
  ungroup() %>%
  # Only suffixed rows are pivoted; the base value already sits in `value`.
  filter(nzchar(colnm)) %>%
  pivot_wider(names_from = colnm, values_from = value2)

CodePudding user response:

Another solution:

# Give the base rows an explicit "_value" suffix so every name has three
# "_"-separated parts, split them, pivot on the last part, then rebuild the
# variable name from the first two parts.
df_original %>%
  mutate(
    variable = ifelse(
      # `\\d+` (not `\\d`) so that multi-digit indices such as "variable_10"
      # are also recognised as base rows.
      grepl("variable_\\d+$", variable),
      paste(variable, "value", sep = "_"),
      variable
    )
  ) %>%
  separate(variable, c("variable", "num", "measure"), sep = "_") %>%
  pivot_wider(names_from = measure, values_from = value) %>%
  mutate(variable = paste(variable, num, sep = "_")) %>%
  select(-num)

  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009

CodePudding user response:

Packages

library(dplyr)
library(tidyr)

Solution

# Split the name on "_" into prefix, index and optional suffix; rows without a
# suffix are the plain values. Pivot the suffix into columns.
df_original %>%
  separate(
    variable,
    sep = "_",
    into = c("name", "variable", "se_cv"),
    # Base rows have only two pieces; fill the missing suffix with NA on the
    # right without emitting a "missing pieces" warning.
    fill = "right"
  ) %>%
  mutate(
    variable = paste0(name, "_", variable),
    se_cv = case_when(
      is.na(se_cv) ~ "value",
      # TRUE rather than T: T is an ordinary variable and can be reassigned.
      TRUE ~ se_cv
    )
  ) %>%
  select(-name) %>%
  pivot_wider(
    names_from = se_cv,
    values_from = value
  )

Output

# A tibble: 4 x 5
  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009

CodePudding user response:

Interesting question: Here is an alternative approach:

  1. We create a df_variable by filtering only the variable column with variable_1...

  2. Create an identifier row here

  3. Next we define and filter se and cv with case_when within df

  4. group and add identifier row

  5. pivot_wider

  6. and finally right_join

library(tidyverse)

# Base rows only (names ending in "_<digits>"), keyed by the variable's number.
# Reads from df_original, the data frame defined in the question; a bare `df`
# would resolve to the base R function stats::df.
df_variable <- df_original %>%
  filter(str_detect(variable, "\\w+_\\d+$")) %>%
  mutate(row = parse_number(variable))

# Collapse the suffixed names to "se" / "cv", number them within each country,
# widen, and join back onto the base rows.
# NOTE(review): the join key assumes the n-th se/cv row within a country belongs
# to variable number n, i.e. rows are ordered by variable index - confirm for
# real data.
df_original %>%
  mutate(variable = case_when(str_detect(variable, "_se") ~ "se",
                              str_detect(variable, "_cv") ~ "cv",
                              TRUE ~ variable)) %>%
  # Exact match instead of a substring test, so base names that happen to
  # contain "se" or "cv" are not kept by accident.
  filter(variable %in% c("se", "cv")) %>%
  group_by(country, variable) %>%
  mutate(row = row_number()) %>%
  # Drop the grouping before reshaping; `variable` stops being a column below.
  ungroup() %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  right_join(df_variable, by = c("country", "row")) %>%
  select(-row)
  country     se     cv variable   value
  <chr>    <dbl>  <dbl> <chr>      <dbl>
1 ARG     0.0063 0.0008 variable_1 0.234
2 ARG     0.0023 0.0004 variable_2 0.532
3 BOL     0.0013 0.0004 variable_1 0.324
4 BOL     0.0053 0.0009 variable_2 0.638

CodePudding user response:

# Split each name at the underscore that follows a digit, label the base rows
# "value", and widen on that label.
df_original %>%
  separate(
    variable,
    into = c("variable", "name"),
    # Lookbehind: only the "_" preceded by a digit is a split point, so
    # "variable_1_se" becomes "variable_1" and "se".
    sep = "(?<=\\d)_",
    fill = "right"
  ) %>%
  # Base rows have no suffix, so `name` is NA for them.
  mutate(name = replace_na(name, "value")) %>%
  # Spelled-out form of pivot_wider()'s defaults.
  pivot_wider(names_from = name, values_from = value)

# A tibble: 4 x 5
  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009
  • Related