I have this type of data:
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
I need to iterate through column pairs to rolling-paste the separate strings into bigrams. Note that in the actual data the strings are of variable character length and character type. I've tried this but it fails:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- lapply(df[, 1:5],
function(x) paste(x[i], x[i 1], sep = " "))
The expected output is:
w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
I'd be most interested in a dplyr
solution but am open and grateful for other solutions as well.
CodePudding user response:
As you said you're most interested in a dplyr solution, this can be achieved using mutate()
and across()
. You can alter the function applied to each column if this doesn't achieve the exact desired output.
df %>%
mutate(
across(
# For the first four columns (i.e. has number 1-4 in column name)
matches("[1-4]"),
# Apply custom function
function(col) {
# Paste together
paste(
col, # the data in the current column
cur_data()[[which(names(cur_data()) == cur_column()) 1]], # and the data in the next column along
sep = " "
)
},
.names = "{gsub(pattern = 'w', replacement = 'bigr_', {col})}" # alter name of new cols (replace 'w' with 'bigr_')
)
) %>%
# EDIT: added to rename columns to match desired output
rename_with(.cols = matches("bigr"),
.fn = function(colname) {
paste0(colname, "_", as.numeric(gsub(pattern = "bigr_", replacement = "", colname)) 1)
})
CodePudding user response:
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
library(tidyverse)
library(janitor)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(-rn, values_drop_na = TRUE) %>%
group_by(rn) %>%
mutate(bigr = paste0(value, "_", lead(value))) %>%
mutate(bigr = if_else(str_detect(bigr, "_NA"), NA_character_, bigr)) %>%
pivot_wider(rn, names_from = c(name), values_from = c(value, bigr)) %>%
remove_empty("cols") %>%
ungroup() %>%
select(-rn) %>%
rename_with(~str_remove(string = ., "value_")) %>%
rename_with(~str_replace(., "(_w)(\\d )", "_\\2"))
#> # A tibble: 6 × 9
#> w1 w2 w3 w4 w5 bigr_1 bigr_2 bigr_3 bigr_4
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A B D E C A_B B_D D_E E_C
#> 2 B G S U <NA> B_G G_S S_U <NA>
#> 3 C C O <NA> <NA> C_C C_O <NA> <NA>
#> 4 E D F T <NA> E_D D_F F_T <NA>
#> 5 F E <NA> <NA> <NA> F_E <NA> <NA> <NA>
#> 6 G V N <NA> <NA> G_V V_N <NA> <NA>
Created on 2022-04-26 by the reprex package (v2.0.1)
CodePudding user response:
As long as you don't have a string that is NA, you could try:
df %>%
mutate(across(c(everything(), - 1),
~ paste(get(paste0("w", match(cur_column(), names(cur_data())) - 1)), .),
.names = 'bigr_{paste0("w", match(.col, names(cur_data())) - 1)}_{.col}')) %>%
mutate(across(starts_with("bigr"),
~ if_else(str_count(., "NA") != 0, NA_character_, .)))
w1 w2 w3 w4 w5 bigr_w1_w2 bigr_w2_w3 bigr_w3_w4 bigr_w4_w5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
CodePudding user response:
As you are open to non-dplyr solutions, we can do it in base R by modifying your original code:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- mapply(paste, df[, 1:4], df[, 2:5])
# as NA is coerced to character, we need to find those positions and correct
x <- which(is.na(df[, 1:4]) | is.na(df[, 2:5]), arr.ind = TRUE)
x[, 2] <- x[, 2] 5
df[x] <- NA
df
# w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
# 1 A B D E C A B B D D E E C
# 2 B G S U <NA> B G G S S U <NA>
# 3 C C O <NA> <NA> C C C O <NA> <NA>
# 4 E D F T <NA> E D D F F T <NA>
# 5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
# 6 G V N <NA> <NA> G V V N <NA> <NA>
CodePudding user response:
We can use the tidytext
package as follows:
df %>%
rowid_to_column() %>%
unite(col, -rowid, sep = ' ') %>%
tidytext::unnest_ngrams(value, 'col', 2, to_lower = FALSE) %>%
group_by(rowid) %>%
mutate(name = row_number()) %>%
pivot_wider(rowid, names_prefix = 'bgram_')
# A tibble: 6 x 5
# Groups: rowid [6]
rowid bgram_1 bgram_2 bgram_3 bgram_4
<int> <chr> <chr> <chr> <chr>
1 1 A B B D D E E C
2 2 B G G S S U U NA
3 3 C C C O O NA NA NA
4 4 E D D F F T T NA
5 5 F E E NA NA NA NA NA
6 6 G V V N N NA NA NA
CodePudding user response:
using data.table
df[, (paste("bigr", 1:4, 2:5, sep = "_")) := Map(function(x, y) ifelse(is.na(x) | is.na(y), NA, paste(x, y)), .SD[, 1:4], .SD[, 2:5])]