Home > Mobile >  Expand columns based on sequence of ids
Expand columns based on sequence of ids

Time:02-22

I would like to create new columns from a column containing an id sequence, say ID1-5.

Say, we have the following tibble:

# A tibble: 3 x 2
  group   id_seq   
  <chr>   <chr>       
1 A       ID_Nr61-63
2 A       ID_Nr67-69
3 B       ID_Nr73-75

My desired output is:

# A tibble: 3 x 6
  group   id_seq      id1      id2      id3      id4
  <chr>   <chr>       <chr>    <chr>    <chr>    <chr>       
1 A       ID_Nr61-64  ID_Nr61  ID_Nr62  ID_Nr63  ID_Nr64    
2 A       ID_Nr67-69  ID_Nr67  ID_Nr68  ID_Nr69  NA
3 B       ID_Nr73-75  ID_Nr73  ID_Nr74  ID_Nr75  NA

Appreciate your help! Thanks.

CodePudding user response:

We may extract the digits from the id_seq, by rowwise,get the sequence (:) to expand the data and reshape from 'long' to 'wide' with pivot_wider

library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
df1 %>% 
   mutate(s1 = as.numeric(str_extract(id_seq, "\\d ")), 
    s2 = as.numeric(str_extract(id_seq, "\\d $"))) %>%
   mutate(rn = row_number()) %>% 
  rowwise %>% 
  summarise(rn, group, id_seq, se = (s1:s2)) %>% 
  mutate(id_seq2 = str_replace(id_seq, "\\d -\\d $", as.character(se)), 
   rn2 = str_c("id", rowid(rn)), se = NULL) %>%
  pivot_wider(names_from = rn2, values_from = id_seq2) %>%
  select(-rn)

-output

# A tibble: 3 × 6
  group id_seq     id1     id2     id3     id4    
  <chr> <chr>      <chr>   <chr>   <chr>   <chr>  
1 A     ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2 A     ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69 <NA>   
3 B     ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75 <NA>   

Or using base R

lst1 <- lapply(sub("-", ":", sub("ID_Nr", "", df1$id_seq)), 
    function(x) paste0("id_seq", eval(parse(text = x))))
 mx <- max(lengths(lst1))
m1 <- do.call(rbind, lapply(lst1, `length<-`, mx))
df1[paste0("id", seq_len(ncol(m1)))] <- m1
df1
  group     id_seq      id1      id2      id3      id4
1     A ID_Nr61-64 id_seq61 id_seq62 id_seq63 id_seq64
2     A ID_Nr67-69 id_seq67 id_seq68 id_seq69     <NA>
3     B ID_Nr73-75 id_seq73 id_seq74 id_seq75     <NA>

data

df1 <- structure(list(group = c("A", "A", "B"), id_seq = c("ID_Nr61-64", 
"ID_Nr67-69", "ID_Nr73-75")), class = "data.frame", row.names = c("1", 
"2", "3"))

CodePudding user response:

here's an alternative:

df <- data.frame(group = c("A", "A", "B"),
             id_seq = c("ID_Nr61-63", "ID_Nr67-69", "ID_Nr73-75"))

library(tidyverse)
df %>%
  mutate(id_new = str_remove(id_seq, "ID_Nr")) %>%
  separate(id_new, into = c("start", "end"), sep = "-") %>%
  pivot_longer(cols = c(start, end)) %>%
  mutate(value = as.numeric(value)) %>%
  group_by(id_seq) %>%
  expand(full_ids = full_seq(value, 1)) %>%
  full_join(df) %>%
  mutate(id_pivot = 1:n()) %>%
  ungroup() %>%
  pivot_wider(values_from = full_ids,
              names_from = id_pivot,
              names_prefix = "id") %>%
  mutate(across(starts_with("id"), ~paste0("ID_Nr", .)))

which gives:

# A tibble: 3 x 5
  id_seq          group id1     id2     id3    
  <chr>           <chr> <chr>   <chr>   <chr>  
1 ID_NrID_Nr61-63 A     ID_Nr61 ID_Nr62 ID_Nr63
2 ID_NrID_Nr67-69 A     ID_Nr67 ID_Nr68 ID_Nr69
3 ID_NrID_Nr73-75 B     ID_Nr73 ID_Nr74 ID_Nr75

CodePudding user response:

You can do this (somewhat close to @akrun's answer):

ll <- sapply(str_extract_all(dat$id_seq, "\\(?[0-9,.] \\)?"), 
                  function(x) sprintf("ID_Nr%d", x[1]:x[2]))
mat <- t(sapply(ll, "length<-", max(lengths(ll))))
dat[paste0("id", seq(ncol(mat)))] <- mat

  group     id_seq     id1     id2     id3     id4
1     A ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2     A ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69    <NA>
3     B ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75    <NA>

CodePudding user response:

data.table option

func <- function(s,e) c(sapply(seq_along(s),function(x) paste0("ID_Nr",seq(s[x],e[x]))))
dcast(
  setDT(dat)[, `:=`(s=str_extract(id_seq,"(?<=r)\\d.?"), e=str_extract(id_seq, "\\d.?$"))][
    ,func(s,e), by=.(group,id_seq)][,id:=paste0("id",1:.N), by=.(group,id_seq)],
  group id_seq~id,value.var="V1"
)

    group     id_seq     id1     id2     id3     id4
   <char>     <char>  <char>  <char>  <char>  <char>
1:      A ID_Nr61-64 ID_Nr61 ID_Nr62 ID_Nr63 ID_Nr64
2:      A ID_Nr67-69 ID_Nr67 ID_Nr68 ID_Nr69    <NA>
3:      B ID_Nr73-75 ID_Nr73 ID_Nr74 ID_Nr75    <NA>

If more helpful in individual steps (much more readable, :)

# function to get id numbers
func <- function(s,e) c(sapply(seq_along(s),function(x) paste0("ID_Nr",seq(s[x],e[x]))))

# get start and end
setDT(dat)[, `:=`(s=str_extract(id_seq,"(?<=r)\\d.?"), e=str_extract(id_seq, "\\d.?$"))]

# create each id number from start and end
dat <- dat[,func(s,e), by=.(group,id_seq)]

# add id column
dat[, id:=paste0("id",1:.N), by=.(group,id_seq)]

# swing to wide
dcast(dat,group id_seq~id, value.var="V1")

Input:

dat <- tibble(
  group=c("A","A","B"), id_seq=c("ID_Nr61-64", "ID_Nr67-69", "ID_Nr73-75")
)
  • Related