Restrict `pivot_wider` to rows matching a pattern-CodePudding

I want to pivot a column wider based not on all values in the column but only those that match a pattern.

Some toy data:

df <- data.frame(utterance = c("A and stuff", 
                               "X and something", 
                               "A and some more", 
                               "B etc.", 
                               "B", 
                               "x yz and so on", 
                               "BBB"),
                     timestamp = c("00:05:31.736 - 00:05:35.263", "00:05:31.829 - 00:05:36.449", 
                                   "00:05:31.829 - 00:05:36.449", "00:05:31.829 - 00:05:36.449", 
                                   "00:05:31.842 - 00:05:35.302", "00:05:35.088 - 00:05:36.134", 
                                   "00:05:35.263 - 00:05:53.052"))

I want to pivot wider only those rows in utterance that start with A or B. I can only pivot wider on all rows in utterance:

library(tidyr)
df %>%
  group_by(timestamp) %>%
  pivot_wider(-utterance, 
              names_from = utterance, 
              values_from = utterance)

# A tibble: 5 x 8
# Groups:   timestamp [5]
  timestamp                   `A and stuff` `X and something` `A and some more` `B etc.` B     `x yz and so on` BBB  
  <chr>                       <chr>         <chr>             <chr>             <chr>    <chr> <chr>            <chr>
1 00:05:31.736 - 00:05:35.263 A and stuff   NA                NA                NA       NA    NA               NA   
2 00:05:31.829 - 00:05:36.449 NA            X and something   A and some more   B etc.   NA    NA               NA   
3 00:05:31.842 - 00:05:35.302 NA            NA                NA                NA       B     NA               NA   
4 00:05:35.088 - 00:05:36.134 NA            NA                NA                NA       NA    x yz and so on   NA   
5 00:05:35.263 - 00:05:53.052 NA            NA                NA                NA       NA    NA               BBB

I've tried to subset utterance on the pattern, but got an error:

df %>%
  group_by(timestamp) %>%
  pivot_wider(names_from = utterance[grepl("^(A|B)", utterance)], 
              values_from = utterance[grepl("^(A|B)", utterance)])
Error: object 'utterance' not found

How can I pivot only on the matching rows?

Expected:

# timestamp                      `A`              utterance         `B`   
# <chr>                          <chr>            <chr>             <chr> 
#  00:05:31.736 - 00:05:35.263   A and stuff      NA                NA    
#  00:05:31.829 - 00:05:36.449   A and some more  X and something   B etc.
#  00:05:31.842 - 00:05:35.302   NA               NA                B     
#  00:05:35.088 - 00:05:36.134   NA               x yz and so on    NA    
#  00:05:35.263 - 00:05:53.052   NA               NA                BBB

CodePudding user response：

You could create a new names column:

library(stringr)
library(dplyr)
library(tidyr)

df %>% 
  mutate(pvt = case_when(str_detect(utterance, "^A") ~ "A",
                         str_detect(utterance, "^B") ~ "B",
                         TRUE ~ "utterance")) %>% 
  pivot_wider(names_from = pvt,
              values_from = utterance)

This returns

# A tibble: 5 x 4
  timestamp                   A               utterance       B     
  <chr>                       <chr>           <chr>           <chr> 
1 00:05:31.736 - 00:05:35.263 A and stuff     NA              NA    
2 00:05:31.829 - 00:05:36.449 A and some more X and something B etc.
3 00:05:31.842 - 00:05:35.302 NA              NA              B     
4 00:05:35.088 - 00:05:36.134 NA              x yz and so on  NA    
5 00:05:35.263 - 00:05:53.052 NA              NA              BBB

CodePudding user response：

A solution without pivot_wider:

library(tidyverse)

df <- data.frame(utterance = c("A and stuff", 
                               "X and something", 
                               "A and some more", 
                               "B etc.", 
                               "B", 
                               "x yz and so on", 
                               "BBB"),
                 timestamp = c("00:05:31.736 - 00:05:35.263", "00:05:31.829 - 00:05:36.449", 
                               "00:05:31.829 - 00:05:36.449", "00:05:31.829 - 00:05:36.449", 
                               "00:05:31.842 - 00:05:35.302", "00:05:35.088 - 00:05:36.134", 
                               "00:05:35.263 - 00:05:53.052"))

df %>% 
  mutate(A = ifelse(str_detect(utterance,"^A"),utterance,NA),
         B = ifelse(str_detect(utterance,"^B"),utterance,NA),
         utterance = ifelse(str_detect(utterance,"^A|^B"),NA, utterance)) %>% 
  relocate(utterance,.before="B") %>% 
  group_by(timestamp) %>% 
  fill(everything(),.direction = "downup") %>% 
  ungroup() %>% 
  distinct() 
#> # A tibble: 5 × 4
#>   timestamp                   A               utterance       B     
#>   <chr>                       <chr>           <chr>           <chr> 
#> 1 00:05:31.736 - 00:05:35.263 A and stuff     <NA>            <NA>  
#> 2 00:05:31.829 - 00:05:36.449 A and some more X and something B etc.
#> 3 00:05:31.842 - 00:05:35.302 <NA>            <NA>            B     
#> 4 00:05:35.088 - 00:05:36.134 <NA>            x yz and so on  <NA>  
#> 5 00:05:35.263 - 00:05:53.052 <NA>            <NA>            BBB