string in multiple columns-CodePudding

I am trying to split a string into multiple columns. Currently I have this list in a data frame. Every string has a different length, which I can't always predict. I would like to split by ",", remove the "(" and ")", and have every value in its own column.

x
(1,2,3,4,5)
(1,2,3,4,5,6)
(1,2,3,4,5,6,7)

I already tried the following, but it does not work:

# Split every string on commas; the result is a list of character vectors
# (the parentheses stay attached to the first and last piece).
y <- strsplit(as.character(df$x), split = ",")

desired output

x   x   x   x   x   x   x
1   2   3   4   5   n/a n/a
1   2   3   4   5   6   n/a
1   2   3   4   5   6   7

CodePudding user response：

You can do:

# Example input: one parenthesised, comma-separated string per observation.
x <- c("(1,2,3,4,5)", "(1,2,3,4,5,6)", "(1,2,3,4,5,6,7)")

# Strip the parentheses, split on commas and convert the pieces to numeric.
parts <- lapply(strsplit(gsub("\\(|\\)", "", x), ",", fixed = TRUE), as.numeric)

# Pad every vector to the length of the longest one. `length<-` fills the
# added positions with NA. The target width is computed once, up front.
n_col <- max(lengths(parts))
padded <- lapply(parts, `length<-`, n_col)

# Bind row-wise so that each input string becomes one row (the layout asked
# for in the question) and derive the column names from the actual width
# instead of hard-coding them.
res <- as.data.frame(do.call(rbind, padded))
names(res) <- paste0("x", seq_len(n_col))
res
#>   x1 x2 x3 x4 x5 x6 x7
#> 1  1  2  3  4  5 NA NA
#> 2  1  2  3  4  5  6 NA
#> 3  1  2  3  4  5  6  7

Created on 2022-05-28 by the reprex package (v2.0.1)

CodePudding user response：

Here's a start:

library(dplyr) # mutate()
library(tidyr) # separate()

# `x` is the data frame shown under "Data:" below, with a single character
# column `V`.
x %>%
  # remove `(` and `)`:
  mutate(V = gsub("\\(|\\)", "", V)) %>%
  # split `V` into the columns x1..x7; rows with fewer than seven values
  # are padded with NA on the right:
  separate(
    V,
    into = paste0("x", 1:7),
    sep = ",",
    fill = "right",
    remove = TRUE
  )
  x1 x2 x3 x4 x5   x6   x7
1  1  2  3  4  5 <NA> <NA>
2  1  2  3  4  5    6 <NA>
3  1  2  3  4  5    6    7

Data:

# Example data: one column `V` holding the bracketed, comma-separated strings.
x <- data.frame(V = c(
  "(1,2,3,4,5)",
  "(1,2,3,4,5,6)",
  "(1,2,3,4,5,6,7)"
))

EDIT:

If the number of values, and hence columns, is unknown, you can do this:

# Needs dplyr (mutate, select) and tidyr (separate) to be attached.
x_new <- x %>%
  # remove `(` and `)`:
  mutate(V = gsub("\\(|\\)", "", V)) %>%
  # count the comma-separated values per row; counting split pieces rather
  # than digit characters keeps multi-digit numbers from being over-counted:
  mutate(N = lengths(strsplit(V, ",", fixed = TRUE)))

x_new %>%
  # split `V` into as many columns as the longest row needs; the width is
  # read from `x_new`, which is the object that actually carries `N`:
  separate(
    V,
    into = paste0("x", seq_len(max(x_new$N, na.rm = TRUE))),
    sep = ",",
    fill = "right",
    remove = TRUE
  ) %>%
  select(-N)

CodePudding user response：

Split on non-digit characters, drop the first (empty) element of each result, pad all vectors to the same length, and bind them row-wise into a data frame.

# Split on every non-digit character, then drop the first piece of each
# result (the empty string produced by the leading "(").
strsplit(dat$V1, "\\D") |>
  lapply(\(pieces) pieces[-1]) |>
  # Stretch every vector to the longest length; new slots become NA.
  (\(rows) lapply(rows, `length<-`, max(lengths(rows))))() |>
  # Stack the equal-length vectors as rows and convert to a data frame.
  do.call(what = rbind) |>
  as.data.frame()
#   V1 V2 V3 V4 V5   V6   V7
# 1  1  2  3  4  5 <NA> <NA>
# 2  1  2  3  4  5    6 <NA>
# 3  1  2  3  4  5    6    7

Data:

# Example data: one character column `V1` with three bracketed strings.
dat <- data.frame(
  V1 = c("(1,2,3,4,5)", "(1,2,3,4,5,6)", "(1,2,3,4,5,6,7)"),
  stringsAsFactors = FALSE
)