Rolling paste strings across columns-CodePudding

I have this type of data:

df <- data.frame(
  w1 = c("A", "B", "C", "E", "F", "G"),
  w2 = c("B", "G", "C", "D", "E", "V"),
  w3 = c("D", "S", "O", "F", NA, "N"),
  w4 = c("E", "U", NA, "T", NA, NA),
  w5 = c("C", NA, NA, NA, NA, NA)
)

I need to iterate through column pairs to rolling-paste the separate strings into bigrams. Note that in the actual data the strings are of variable character length and character type. I've tried this but it fails:

df[, paste0("bigr_", 1:4, "_", 2:5)] <- lapply(df[, 1:5], 
                                               function(x) paste(x[i], x[i 1], sep = " "))

The expected output is:

  w1 w2   w3   w4   w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
1  A  B    D    E    C      A B      B D      D E      E C
2  B  G    S    U <NA>      B G      G S      S U     <NA>
3  C  C    O <NA> <NA>      C C      C O     <NA>     <NA>
4  E  D    F    T <NA>      E D      D F      F T     <NA>
5  F  E <NA> <NA> <NA>      F E     <NA>     <NA>     <NA>
6  G  V    N <NA> <NA>      G V      V N     <NA>     <NA>

I'd be most interested in a dplyr solution but am open and grateful for other solutions as well.

CodePudding user response：

As you said you're most interested in a dplyr solution, this can be achieved using mutate() and across(). You can alter the function applied to each column if this doesn't achieve the exact desired output.

df %>%
  mutate(
    across(
      # For the first four columns (i.e. has number 1-4 in column name)
      matches("[1-4]"),
      
      # Apply custom function
      function(col) {
        
        # Paste together
        paste(
          col, # the data in the current column
          cur_data()[[which(names(cur_data()) == cur_column()) 1]], # and the data in the next column along
          sep = " "
        )
      },
      .names = "{gsub(pattern = 'w', replacement = 'bigr_', {col})}" # alter name of new cols (replace 'w' with 'bigr_')
    )
  ) %>%

  # EDIT: added to rename columns to match desired output
  rename_with(.cols = matches("bigr"),
              .fn = function(colname) {
                paste0(colname, "_", as.numeric(gsub(pattern = "bigr_", replacement = "", colname)) 1)
              })

CodePudding user response：

df <- data.frame(
  w1 = c("A", "B", "C", "E", "F", "G"),
  w2 = c("B", "G", "C", "D", "E", "V"),
  w3 = c("D", "S", "O", "F", NA, "N"),
  w4 = c("E", "U", NA, "T", NA, NA),
  w5 = c("C", NA, NA, NA, NA, NA)
)

library(tidyverse)
library(janitor)

df %>% 
  mutate(rn = row_number()) %>% 
  pivot_longer(-rn, values_drop_na = TRUE) %>% 
  group_by(rn) %>% 
  mutate(bigr = paste0(value, "_", lead(value))) %>% 
  mutate(bigr = if_else(str_detect(bigr, "_NA"), NA_character_, bigr)) %>% 
  pivot_wider(rn, names_from = c(name), values_from = c(value, bigr)) %>% 
  remove_empty("cols") %>% 
  ungroup() %>% 
  select(-rn) %>% 
  rename_with(~str_remove(string = ., "value_")) %>% 
  rename_with(~str_replace(., "(_w)(\\d )", "_\\2"))

#> # A tibble: 6 × 9
#>   w1    w2    w3    w4    w5    bigr_1 bigr_2 bigr_3 bigr_4
#>   <chr> <chr> <chr> <chr> <chr> <chr>  <chr>  <chr>  <chr> 
#> 1 A     B     D     E     C     A_B    B_D    D_E    E_C   
#> 2 B     G     S     U     <NA>  B_G    G_S    S_U    <NA>  
#> 3 C     C     O     <NA>  <NA>  C_C    C_O    <NA>   <NA>  
#> 4 E     D     F     T     <NA>  E_D    D_F    F_T    <NA>  
#> 5 F     E     <NA>  <NA>  <NA>  F_E    <NA>   <NA>   <NA>  
#> 6 G     V     N     <NA>  <NA>  G_V    V_N    <NA>   <NA>

^{Created on 2022-04-26 by the reprex package (v2.0.1)}

CodePudding user response：

As long as you don't have a string that is NA, you could try:

df %>%
    mutate(across(c(everything(), - 1), 
                  ~ paste(get(paste0("w", match(cur_column(), names(cur_data())) - 1)), .),
                  .names = 'bigr_{paste0("w", match(.col, names(cur_data())) - 1)}_{.col}')) %>%
    mutate(across(starts_with("bigr"), 
                  ~ if_else(str_count(., "NA") != 0, NA_character_, .)))

  w1 w2   w3   w4   w5 bigr_w1_w2 bigr_w2_w3 bigr_w3_w4 bigr_w4_w5
1  A  B    D    E    C        A B        B D        D E        E C
2  B  G    S    U <NA>        B G        G S        S U       <NA>
3  C  C    O <NA> <NA>        C C        C O       <NA>       <NA>
4  E  D    F    T <NA>        E D        D F        F T       <NA>
5  F  E <NA> <NA> <NA>        F E       <NA>       <NA>       <NA>
6  G  V    N <NA> <NA>        G V        V N       <NA>       <NA>

CodePudding user response：

As you are open to non-dplyr solutions, we can do it in base R by modifying your original code:

df[, paste0("bigr_", 1:4, "_", 2:5)] <- mapply(paste, df[, 1:4], df[, 2:5])

# as NA is coerced to character, we need to find those positions and correct
x <- which(is.na(df[, 1:4]) | is.na(df[, 2:5]), arr.ind = TRUE)
x[, 2] <- x[, 2]   5
df[x] <- NA
df

#   w1 w2   w3   w4   w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
# 1  A  B    D    E    C      A B      B D      D E      E C
# 2  B  G    S    U <NA>      B G      G S      S U     <NA>
# 3  C  C    O <NA> <NA>      C C      C O     <NA>     <NA>
# 4  E  D    F    T <NA>      E D      D F      F T     <NA>
# 5  F  E <NA> <NA> <NA>      F E     <NA>     <NA>     <NA>
# 6  G  V    N <NA> <NA>      G V      V N     <NA>     <NA>

CodePudding user response：

We can use the tidytext package as follows:

df %>%
  rowid_to_column() %>%
  unite(col, -rowid, sep = ' ') %>%
  tidytext::unnest_ngrams(value, 'col', 2, to_lower = FALSE) %>%
  group_by(rowid) %>%
  mutate(name = row_number()) %>%
  pivot_wider(rowid, names_prefix = 'bgram_')

# A tibble: 6 x 5
# Groups:   rowid [6]
  rowid bgram_1 bgram_2 bgram_3 bgram_4
  <int> <chr>   <chr>   <chr>   <chr>  
1     1 A B     B D     D E     E C    
2     2 B G     G S     S U     U NA   
3     3 C C     C O     O NA    NA NA  
4     4 E D     D F     F T     T NA   
5     5 F E     E NA    NA NA   NA NA  
6     6 G V     V N     N NA    NA NA

CodePudding user response：

using data.table

df[, (paste("bigr", 1:4, 2:5, sep = "_")) := Map(function(x, y) ifelse(is.na(x) | is.na(y), NA, paste(x, y)), .SD[, 1:4], .SD[, 2:5])]