Extracting each letter and putting it in a separate column in R-CodePudding

I have data set like this:

# Example input: five IDs, each with two identical two-letter code columns.
df <- data.frame(
  ID = 1:5,
  cloumn1 = c("AA", "GG", "AG", "AA", "AT"),
  cloumn2 = c("AA", "GG", "AG", "AA", "AT"),
  stringsAsFactors = FALSE
)
df
ID cloumn1 cloumn2
 1      AA      AA
 2      GG      GG
 3      AG      AG
 4      AA      AA
 5      AT      AT

I want to split each two-letter column into two single-letter columns, so the output will look something like this:

ID cloumn1.A cloumn1.B cloumn2.A cloumn2.B
 1         A         A         A         A
 2         G         G         G         G
 3         A         G         A         G
 4         A         A         A         A
 5         A         T         A         T

Can you help me please?

CodePudding user response：

library(tidyverse)

# Split every two-letter value into one column per letter
# (cloumn1.A, cloumn1.B, cloumn2.A, ...).
df %>%
  # Long format: one row per (ID, original column).
  pivot_longer(-ID) %>%
  # Splitting on the empty pattern yields one string per character.
  mutate(tmp = str_split(value, pattern = "")) %>%
  unnest(tmp) %>%
  # Label the letters within each (ID, column) pair as A, B, ...
  group_by(ID, name) %>%
  mutate(id_row = LETTERS[row_number()]) %>%
  # Drop the grouping before reshaping so the pivot sees a plain tibble.
  ungroup() %>%
  # `name` is consumed by names_from, so it must not also appear in id_cols;
  # ID alone identifies an output row.
  pivot_wider(
    id_cols = ID,
    names_from = c(name, id_row),
    values_from = tmp,
    names_sep = "."
  )

#> # A tibble: 5 x 5
#>      ID cloumn1.A cloumn1.B cloumn2.A cloumn2.B
#>   <int> <chr>     <chr>     <chr>     <chr>    
#> 1     1 A         A         A         A        
#> 2     2 G         G         G         G        
#> 3     3 A         G         A         G        
#> 4     4 A         A         A         A        
#> 5     5 A         T         A         T

data

# Input data used by the examples above: an ID plus two columns of
# two-letter codes, kept as character vectors rather than factors.
df <- data.frame(
  ID = 1:5,
  cloumn1 = c("AA", "GG", "AG", "AA", "AT"),
  cloumn2 = c("AA", "GG", "AG", "AA", "AT"),
  stringsAsFactors = FALSE
)

Created on 2021-11-05 by the reprex package (v2.0.1)

data.table

library(data.table)

# Same reshaping with data.table: melt to long, split into letters, cast back.
# setDT() converts `df` to a data.table by reference.
setDT(df)

melt(data = df, id.vars = "ID") %>%
  # One output row per character of each value.
  .[, list(value = unlist(strsplit(value, split = ""))), by = list(ID, variable)] %>%
  # Label the characters within each (ID, variable) pair as A, B, ...
  .[, id_row := LETTERS[rowid(ID, variable)]] %>%
  # Both right-hand-side variables must be joined with `+` in the formula;
  # the resulting columns are named <variable>_<id_row>.
  dcast(formula = ID ~ variable + id_row, value.var = "value")

   ID cloumn1_A cloumn1_B cloumn2_A cloumn2_B
1:  1         A         A         A         A
2:  2         G         G         G         G
3:  3         A         G         A         G
4:  4         A         A         A         A
5:  5         A         T         A         T

CodePudding user response：

Using strsplit.

# Base R: turn each non-ID column into a character matrix with one column
# per letter, then bind those matrices next to the ID column.
cbind(
  df[1],
  do.call(
    cbind.data.frame,
    lapply(df[-1], function(column) {
      # strsplit() returns one character vector per row; rbind stacks them.
      do.call(rbind, strsplit(column, split = ""))
    })
  )
)
#   ID cloumn1.1 cloumn1.2 cloumn2.1 cloumn2.2
# 1  1         A         A         A         A
# 2  2         G         G         G         G
# 3  3         A         G         A         G
# 4  4         A         A         A         A
# 5  5         A         T         A         T

CodePudding user response：

Yet another solution, tidyverse-based:

library(tidyverse)

# Example input with two character columns of two-letter codes.
df <- data.frame(
  ID = 1:5,
  column1 = c("AA", "GG", "AG", "AA", "AT"),
  column2 = c("AA", "GG", "AG", "AA", "AT"),
  stringsAsFactors = FALSE
)

# Replace each `column*` variable with a character-matrix column holding its
# letters; data.frame printing shows these as column1_sep.1, column1_sep.2, ...
df %>%
  mutate(
    across(
      starts_with("column"),
      # `.x` is the current column. The lookaround pattern matches the
      # zero-width position between two uppercase letters.
      ~ str_split(.x, "(?<=[A-Z])(?=[A-Z])", simplify = TRUE),
      .names = "{.col}_sep"
    ),
    # Drop the original, unsplit columns.
    column1 = NULL,
    column2 = NULL
  )

#>   ID column1_sep.1 column1_sep.2 column2_sep.1 column2_sep.2
#> 1  1             A             A             A             A
#> 2  2             G             G             G             G
#> 3  3             A             G             A             G
#> 4  4             A             A             A             A
#> 5  5             A             T             A             T

Another possibility, based on a pivot_longer followed by a pivot_wider:

library(tidyverse)

# Example input for the pivot-based variant below.
df <- data.frame(
  ID = 1:5,
  column1 = c("AA", "GG", "AG", "AA", "AT"),
  column2 = c("AA", "GG", "AG", "AA", "AT"),
  stringsAsFactors = FALSE
)


# Long format -> split each value into letters A and B -> wide format again.
df %>%
  pivot_longer(-ID) %>%
  # Split at the zero-width position between two uppercase letters.
  separate(value, into = LETTERS[1:2], sep = "(?<=[A-Z])(?=[A-Z])") %>%
  # id_cols is passed by name; supplying it positionally is deprecated in
  # recent tidyr releases.
  pivot_wider(
    id_cols = ID,
    names_from = "name",
    values_from = c(A, B),
    names_glue = "{name}.{.value}"
  ) %>%
  # pivot_wider emits all A columns before the B columns; move column1.B
  # next to column1.A.
  relocate(column1.B, .before = column2.A)

#> # A tibble: 5 × 5
#>      ID column1.A column1.B column2.A column2.B
#>   <int> <chr>     <chr>     <chr>     <chr>    
#> 1     1 A         A         A         A        
#> 2     2 G         G         G         G        
#> 3     3 A         G         A         G        
#> 4     4 A         A         A         A        
#> 5     5 A         T         A         T