I have a data frame that is structured like this:
country variable value
1 ARG variable_1 0.2340
2 ARG variable_1_se 0.0063
3 ARG variable_1_cv 0.0008
4 ARG variable_2 0.5320
5 ARG variable_2_se 0.0023
6 ARG variable_2_cv 0.0004
7 BOL variable_1 0.3240
8 BOL variable_1_se 0.0013
9 BOL variable_1_cv 0.0004
10 BOL variable_2 0.6380
11 BOL variable_2_se 0.0053
12 BOL variable_2_cv 0.0009
I would like to pull the se and cv values out to a wider format so it looks like this:
country variable value se cv
1 ARG variable_1 0.234 0.0063 8e-04
2 ARG variable_2 0.532 0.0023 4e-04
3 BOL variable_1 0.324 0.0013 4e-04
4 BOL variable_2 0.638 0.0053 9e-04
I am using pivot_wider()
from tidyverse, but I am struggling with a few things:
- I am having trouble pulling out only the SE and CV values while leaving the variables in place.
- I have a lot of variables with associated SE and CV values, so I would like something that allows me to just specify ends_with("_se") or "_cv".
Here is the code to reproduce the dfs:
# Long-format input: for each country, every variable has a base row plus
# "_se" and "_cv" rows holding the associated values.
df_original <- data.frame(
  country = rep(c("ARG", "BOL"), each = 6L),
  variable = rep(
    c(
      "variable_1", "variable_1_se", "variable_1_cv",
      "variable_2", "variable_2_se", "variable_2_cv"
    ),
    times = 2L
  ),
  value = c(
    0.234, 0.0063, 0.0008, 0.532, 0.0023, 0.0004,
    0.324, 0.0013, 0.0004, 0.638, 0.0053, 0.0009
  ),
  stringsAsFactors = FALSE
)
# Target wide format: one row per country/variable with the value, se and cv
# side by side.
df_desired <- data.frame(
  country = rep(c("ARG", "BOL"), each = 2L),
  variable = rep(c("variable_1", "variable_2"), times = 2L),
  value = c(0.234, 0.532, 0.324, 0.638),
  se = c(0.0063, 0.0023, 0.0013, 0.0053),
  cv = c(0.0008, 0.0004, 0.0004, 0.0009),
  stringsAsFactors = FALSE
)
CodePudding user response:
We may need to extract
the substring and do a pivot_wider
library(dplyr)
library(stringr)
library(tidyr)
# Reshape so each country/variable row carries its value, se and cv.
df_original %>%
  # Base name of each row, e.g. "variable_1" for "variable_1_se".
  # The `+` quantifiers are required; without them nothing matches.
  mutate(colnm = str_extract(variable, "^\\w+_\\d+")) %>%
  group_by(country, colnm) %>%
  # Copy the base row's value onto every row of its group.
  mutate(value2 = value[variable == colnm]) %>%
  # Drop the base row itself, regardless of where it sits in the group.
  filter(variable != colnm) %>%
  ungroup() %>%
  # Keep only the suffix ("se" / "cv"); it becomes the new column name.
  mutate(variable = str_remove(variable, "^\\w+_\\d+_")) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  rename(variable = colnm, value = value2)
-output
# A tibble: 4 × 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
Or use extract
from tidyr
# Split `variable` into its base name and an optional suffix, then widen.
df_original %>%
  # Group 1 is the base name ("variable_1"); group 2 is the suffix ("se",
  # "cv"), which the code below expects to be "" on the base row itself.
  # The `+` quantifiers are required; without them no row matches.
  tidyr::extract(
    variable,
    into = c("variable", "colnm"),
    regex = "^(\\w+_\\d+)_?([a-z]*)$"
  ) %>%
  group_by(country, variable) %>%
  # Keep each row's own number in `value2`, and overwrite `value` with the
  # base row's number (the row whose suffix is empty).
  mutate(value2 = value, value = value[!nzchar(colnm)]) %>%
  ungroup() %>%
  # The base rows have served their purpose; only se/cv rows are pivoted.
  filter(nzchar(colnm)) %>%
  pivot_wider(names_from = colnm, values_from = value2)
CodePudding user response:
Another solution:
# Tag base rows with a "_value" suffix so every name has three pieces, split
# on "_", widen on the third piece, then glue the first two back together.
df_original %>%
  mutate(
    variable = ifelse(
      grepl("variable_\\d$", variable),
      paste0(variable, "_", "value"),
      variable
    )
  ) %>%
  separate(
    variable,
    into = c("variable", "num", "measure"),
    sep = "_"
  ) %>%
  pivot_wider(
    names_from = measure,
    values_from = value
  ) %>%
  mutate(variable = paste0(variable, "_", num)) %>%
  select(-num)
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
CodePudding user response:
Packages
library(dplyr)
library(tidyr)
Solution
# Split `variable` on "_" into prefix, number and optional suffix, then widen
# on the suffix so value / se / cv become columns.
df_original %>%
  # Base rows have only two pieces; fill = "right" pads the missing suffix
  # with NA without emitting a "missing pieces" warning.
  separate(
    variable,
    sep = "_",
    into = c("name", "variable", "se_cv"),
    fill = "right"
  ) %>%
  mutate(
    variable = paste0(name, "_", variable),
    # Rows without a suffix hold the main value.
    se_cv = case_when(
      is.na(se_cv) ~ "value",
      TRUE ~ se_cv
    )
  ) %>%
  select(-name) %>%
  pivot_wider(
    names_from = se_cv,
    values_from = value
  )
Output
# A tibble: 4 x 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
CodePudding user response:
Interesting question: Here is an alternative approach:
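1. We create `df_variable` by keeping only the rows whose `variable` column is a base name such as `variable_1`, and add an identifier column `row`.
2. Next, within the data frame, we recode the `_se` and `_cv` rows to `se` and `cv` with `case_when()` and keep only those rows.
3. We group and add the same identifier `row`,
4. reshape with `pivot_wider()`,
5. and finally `right_join()` the result to `df_variable`.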
library(tidyverse)
# Base rows only (e.g. "variable_1"), keyed by the number in the name.
df_variable <- df_original %>%
  filter(str_detect(variable, "\\w+_\\d+$")) %>%
  mutate(row = parse_number(variable))

# se / cv rows, widened and joined back onto the base rows.
df_original %>%
  # Collapse "<name>_se" / "<name>_cv" to the bare suffix; anchoring with `$`
  # avoids matching names that merely contain "_se" or "_cv".
  mutate(
    variable = case_when(
      str_detect(variable, "_se$") ~ "se",
      str_detect(variable, "_cv$") ~ "cv",
      TRUE ~ variable
    )
  ) %>%
  filter(variable %in% c("se", "cv")) %>%
  group_by(country, variable) %>%
  # NOTE(review): this key is the position of the se/cv row within its
  # country, while df_variable$row is the number parsed from the name. The
  # join only lines up when variables are numbered 1, 2, ... in row order.
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  right_join(df_variable, by = c("country", "row")) %>%
  select(-row)
country se cv variable value
<chr> <dbl> <dbl> <chr> <dbl>
1 ARG 0.0063 0.0008 variable_1 0.234
2 ARG 0.0023 0.0004 variable_2 0.532
3 BOL 0.0013 0.0004 variable_1 0.324
4 BOL 0.0053 0.0009 variable_2 0.638
CodePudding user response:
# Split at the underscore that follows a digit: the left part is the variable
# name, the right part is the measure, with NA on base rows filled as "value".
df_original %>%
  separate(
    variable,
    into = c("variable", "name"),
    sep = "(?<=\\d)_",
    fill = "right"
  ) %>%
  mutate(name = replace_na(name, "value")) %>%
  # names_from = name and values_from = value are pivot_wider()'s defaults,
  # written out here for readability.
  pivot_wider(names_from = name, values_from = value)
# A tibble: 4 x 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009