I would like to create new columns from a column containing an id sequence, say ID1-5.
Say, we have the following tibble:
# A tibble: 3 x 2
group id_seq
<chr> <chr>
1 A ID_Nr61-63
2 A ID_Nr67-69
3 B ID_Nr73-75
My desired output is:
# A tibble: 3 x 6
group id_seq id1 id2 id3 id4
<chr> <chr> <chr> <chr> <chr> <chr>
1 A ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2 A ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69 NA
3 B ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75 NA
Appreciate your help! Thanks.
CodePudding user response:
We may extract the digits from the id_seq, by rowwise
,get the sequence (:
) to expand the data and reshape from 'long' to 'wide' with pivot_wider
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
df1 %>%
mutate(s1 = as.numeric(str_extract(id_seq, "\\d ")),
s2 = as.numeric(str_extract(id_seq, "\\d $"))) %>%
mutate(rn = row_number()) %>%
rowwise %>%
summarise(rn, group, id_seq, se = (s1:s2)) %>%
mutate(id_seq2 = str_replace(id_seq, "\\d -\\d $", as.character(se)),
rn2 = str_c("id", rowid(rn)), se = NULL) %>%
pivot_wider(names_from = rn2, values_from = id_seq2) %>%
select(-rn)
-output
# A tibble: 3 × 6
group id_seq id1 id2 id3 id4
<chr> <chr> <chr> <chr> <chr> <chr>
1 A ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2 A ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69 <NA>
3 B ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75 <NA>
Or using base R
lst1 <- lapply(sub("-", ":", sub("ID_Nr", "", df1$id_seq)),
function(x) paste0("id_seq", eval(parse(text = x))))
mx <- max(lengths(lst1))
m1 <- do.call(rbind, lapply(lst1, `length<-`, mx))
df1[paste0("id", seq_len(ncol(m1)))] <- m1
df1
group id_seq id1 id2 id3 id4
1 A ID_Nr61-64 id_seq61 id_seq62 id_seq63 id_seq64
2 A ID_Nr67-69 id_seq67 id_seq68 id_seq69 <NA>
3 B ID_Nr73-75 id_seq73 id_seq74 id_seq75 <NA>
data
df1 <- structure(list(group = c("A", "A", "B"), id_seq = c("ID_Nr61-64",
"ID_Nr67-69", "ID_Nr73-75")), class = "data.frame", row.names = c("1",
"2", "3"))
CodePudding user response:
here's an alternative:
df <- data.frame(group = c("A", "A", "B"),
id_seq = c("ID_Nr61-63", "ID_Nr67-69", "ID_Nr73-75"))
library(tidyverse)
df %>%
mutate(id_new = str_remove(id_seq, "ID_Nr")) %>%
separate(id_new, into = c("start", "end"), sep = "-") %>%
pivot_longer(cols = c(start, end)) %>%
mutate(value = as.numeric(value)) %>%
group_by(id_seq) %>%
expand(full_ids = full_seq(value, 1)) %>%
full_join(df) %>%
mutate(id_pivot = 1:n()) %>%
ungroup() %>%
pivot_wider(values_from = full_ids,
names_from = id_pivot,
names_prefix = "id") %>%
mutate(across(starts_with("id"), ~paste0("ID_Nr", .)))
which gives:
# A tibble: 3 x 5
id_seq group id1 id2 id3
<chr> <chr> <chr> <chr> <chr>
1 ID_NrID_Nr61-63 A ID_Nr61 ID_Nr62 ID_Nr63
2 ID_NrID_Nr67-69 A ID_Nr67 ID_Nr68 ID_Nr69
3 ID_NrID_Nr73-75 B ID_Nr73 ID_Nr74 ID_Nr75
CodePudding user response:
You can do this (somewhat close to @akrun's answer):
ll <- sapply(str_extract_all(dat$id_seq, "\\(?[0-9,.] \\)?"),
function(x) sprintf("ID_Nr%d", x[1]:x[2]))
mat <- t(sapply(ll, "length<-", max(lengths(ll))))
dat[paste0("id", seq(ncol(mat)))] <- mat
group id_seq id1 id2 id3 id4
1 A ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2 A ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69 <NA>
3 B ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75 <NA>
CodePudding user response:
data.table option
func <- function(s,e) c(sapply(seq_along(s),function(x) paste0("ID_Nr",seq(s[x],e[x]))))
dcast(
setDT(dat)[, `:=`(s=str_extract(id_seq,"(?<=r)\\d.?"), e=str_extract(id_seq, "\\d.?$"))][
,func(s,e), by=.(group,id_seq)][,id:=paste0("id",1:.N), by=.(group,id_seq)],
group id_seq~id,value.var="V1"
)
group id_seq id1 id2 id3 id4
<char> <char> <char> <char> <char> <char>
1: A ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2: A ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69 <NA>
3: B ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75 <NA>
If more helpful in individual steps (much more readable, :)
# function to get id numbers
func <- function(s,e) c(sapply(seq_along(s),function(x) paste0("ID_Nr",seq(s[x],e[x]))))
# get start and end
setDT(dat)[, `:=`(s=str_extract(id_seq,"(?<=r)\\d.?"), e=str_extract(id_seq, "\\d.?$"))]
# create each id number from start and end
dat <- dat[,func(s,e), by=.(group,id_seq)]
# add id column
dat[, id:=paste0("id",1:.N), by=.(group,id_seq)]
# swing to wide
dcast(dat,group id_seq~id, value.var="V1")
Input:
dat <- tibble(
group=c("A","A","B"), id_seq=c("ID_Nr61-64", "ID_Nr67-69", "ID_Nr73-75")
)