R: harder pivot_longer arguments-CodePudding

I have a dataframe with these names:


df <- tribble( ~ state, ~ county, ~ kfr_asian_pooled_p25, ~ kfr_asian_pooled_p75, 
               ~ kfr_black_pooled_p25, ~ kfr_black_pooled_p75, 
               ~ kfr_pooled_pooled_p25, ~ kfr_pooled_pooled_p75, ~ kfr_white_pooled_p25, 
               ~ kfr_white_pooled_p75, ~ kids_count, ~ kfr_asian_pooled_p25_se, 
               ~ kfr_asian_pooled_p75_se, ~ kfr_black_pooled_p25_se, 
               ~ kfr_black_pooled_p75_se, ~ kfr_white_pooled_p25_se, 
               ~ kfr_white_pooled_p75_se,
               1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)

Of course the dataset is giant, but what I want to achieve is to pivot longer using its maximum potential. I mean grabbing several variables at the same time and pivot them and their names.

df <- tribble(~ state, ~ county, ~ kids_count,   ~ race, ~ percentile,   ~ se, ~ value,
                     1,       2,            3,  "asian",        "p25",   TRUE,   value,
                     2,       3,            4,  "black",        "p25",   TRUE,   value,
                     3,       4,            5,  "white",        "p25",   TRUE,   value,
                     1,       2,            3,  "asian",        "p75",   TRUE,   value,
                     2,       3,            4,  "black",        "p75",   TRUE,   value,
                     3,       4,            5,  "white",        "p75",   TRUE,   value,
                     1,       2,            3,  "asian",        "p25",  FALSE,   value,
                     2,       3,            4,  "black",        "p25",  FALSE,   value,
                     3,       4,            5,  "white",        "p25",  FALSE,   value,
                     1,       2,            3,  "asian",        "p75",  FALSE,   value,
                     2,       3,            4,  "black",        "p75",  FALSE,   value,
                     3,       4,            5,  "white",        "p75",  FALSE,   value)

I would really appreciate any help!

CodePudding user response：

You can do something like this, if I understand what you want

(First, a quick explanation of the pivot_longer() options that I use)

cols selects the columns you want to pivot
names_to with multiple elements allows us to pivot into multiple columns
names_prefix allows us to remove a common prefix across the columns that we want to pivot
name_sep allows us to indicate how the multiple names in names_to are broken up in the original columns

pivot_longer(
  df,
  cols = starts_with("kfr"),
  names_to=c("race", "pooled", "percentile", "se"), 
  names_prefix="kfr_", 
  names_sep="_"
) %>% 
  select(!pooled) %>% 
  mutate(se=!is.na(se))


# A tibble: 14 x 7
   state county kids_count race   percentile se    value
   <dbl>  <dbl>      <dbl> <chr>  <chr>      <lgl> <dbl>
 1     1      2         11 asian  p25        FALSE     3
 2     1      2         11 asian  p75        FALSE     4
 3     1      2         11 black  p25        FALSE     5
 4     1      2         11 black  p75        FALSE     6
 5     1      2         11 pooled p25        FALSE     7
 6     1      2         11 pooled p75        FALSE     8
 7     1      2         11 white  p25        FALSE     9
 8     1      2         11 white  p75        FALSE    10
 9     1      2         11 asian  p25        TRUE     12
10     1      2         11 asian  p75        TRUE     13
11     1      2         11 black  p25        TRUE     14
12     1      2         11 black  p75        TRUE     15
13     1      2         11 white  p25        TRUE     16
14     1      2         11 white  p75        TRUE     17

CodePudding user response：

It looks like you are trying to pivot_longer just the percentile columns and then split the names to create separate columns if I'm understanding correctly. The tidyselect functions such as matches are very useful for selecting similarly named columns. I’ve also used str_split from the stringr package to help create the new character columns. You could always reorganize the columns with select or relocate if the order is important to you.

library(tidyverse)
library(stringr)

df_long <-
  df %>%
  pivot_longer(cols = matches("[p25]|[p75]"), names_to = "percentile") %>%
  mutate(se = grepl("se", percentile),
         race = stringr::str_split(percentile, "_", simplify = TRUE)[,2],
         percentile = stringr::str_split(percentile, "_", simplify = TRUE)[,4])

df_long
#> # A tibble: 14 x 7
#>    state county kids_count percentile value se    race  
#>    <dbl>  <dbl>      <dbl> <chr>      <dbl> <lgl> <chr> 
#>  1     1      2         11 p25            3 FALSE asian 
#>  2     1      2         11 p75            4 FALSE asian 
#>  3     1      2         11 p25            5 FALSE black 
#>  4     1      2         11 p75            6 FALSE black 
#>  5     1      2         11 p25            7 FALSE pooled
#>  6     1      2         11 p75            8 FALSE pooled
#>  7     1      2         11 p25            9 FALSE white 
#>  8     1      2         11 p75           10 FALSE white 
#>  9     1      2         11 p25           12 TRUE  asian 
#> 10     1      2         11 p75           13 TRUE  asian 
#> 11     1      2         11 p25           14 TRUE  black 
#> 12     1      2         11 p75           15 TRUE  black 
#> 13     1      2         11 p25           16 TRUE  white 
#> 14     1      2         11 p75           17 TRUE  white

^{Created on 2022-02-18 by the reprex package (v2.0.1)}

CodePudding user response：

An option using separate().

library(tidyverse)

df %>%
  pivot_longer(-c('state', 'county', 'kids_count')) %>%
  separate(name, into = c(NA, 'race', NA, 'percentile', 'se'), sep = '_') %>%
  mutate(se = !is.na(se))

# # A tibble: 14 x 7
#   state county kids_count race   percentile se    value
#    <dbl>  <dbl>      <dbl> <chr>  <chr>      <lgl> <dbl>
#  1     1      2         11 asian  p25        FALSE     3
#  2     1      2         11 asian  p75        FALSE     4
#  3     1      2         11 black  p25        FALSE     5
#  4     1      2         11 black  p75        FALSE     6
#  5     1      2         11 pooled p25        FALSE     7
#  6     1      2         11 pooled p75        FALSE     8
#  7     1      2         11 white  p25        FALSE     9
#  8     1      2         11 white  p75        FALSE    10
#  9     1      2         11 asian  p25        TRUE     12
# 10     1      2         11 asian  p75        TRUE     13
# 11     1      2         11 black  p25        TRUE     14
# 12     1      2         11 black  p75        TRUE     15
# 13     1      2         11 white  p25        TRUE     16
# 14     1      2         11 white  p75        TRUE     17

CodePudding user response：

We could combine pivot_longer from tidyr and cSplit from süöotstaclshape

library(splitstackshape)
library(dplyr)
library(tidyr)
df %>% 
  pivot_longer(
    -c(state, county, kids_count)
    ) %>%  
  cSplit("name", "_") %>% 
  select(state, county, kids_count, race=name_2, percentile=name_4, se=name_5, value)

    state county kids_count   race percentile   se value
 1:     1      2         11  asian        p25 <NA>     3
 2:     1      2         11  asian        p75 <NA>     4
 3:     1      2         11  black        p25 <NA>     5
 4:     1      2         11  black        p75 <NA>     6
 5:     1      2         11 pooled        p25 <NA>     7
 6:     1      2         11 pooled        p75 <NA>     8
 7:     1      2         11  white        p25 <NA>     9
 8:     1      2         11  white        p75 <NA>    10
 9:     1      2         11  asian        p25   se    12
10:     1      2         11  asian        p75   se    13
11:     1      2         11  black        p25   se    14
12:     1      2         11  black        p75   se    15
13:     1      2         11  white        p25   se    16
14:     1      2         11  white        p75   se    17