Home > Blockchain >  R: How to insert and repeat columns in between existing columns and name them accordingly
R: How to insert and repeat columns in between existing columns and name them accordingly

Time:10-20

I have a dataframe that looks like this:

Col_1  Col_X_1  Col_2  Col_X_2 ...
ABC    890      AJF    341
JFH    183      DFJ    132
...

After each block of columns (e.g. Col_1 & Col_X_1) that belong together according to the number at the end, I want to insert two more (empty) columns with the names Col_Y_n and Col_Z_n, with n being the same number as the block of columns before. The final dataframe should look like this:

Col_1  Col_X_1  Col_Y_1  Col_Z_1  Col_2  Col_X_2   Col_Y_2  Col_Z_2  ...
ABC    890                        AJF    341
JFH    183                        DFJ    132
...

How can I accomplish this?

Here my dput output of my real data:

structure(list(Company = c("CompanyA", "CompanyB"), 
    Team_1 = c("NameA", "NameB"), Team_Desc_1 = c("Founder & Co-CEO", 
    "Senior Blockchain Engineer"), Team_URL_1 = c("https://www.linkedin.com/in/NameA/", 
    NA), Team_Ver_1 = c("unverified", NA), Team_2 = c("NameC", 
    "NameD"), Team_Desc_2 = c("Chairman", "Senior Software Engineer"
    ), Team_URL_2 = c("https://www.linkedin.com/in/NameC/", 
    NA), Team_Ver_2 = c("unverified", NA), Team_3 = c("NameE", 
    "NameF")), class = c("grouped_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -2L), groups = structure(list(
    Company = c("CompanyB", "CompanyA"), .rows = structure(list(
        2L, 1L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))

CodePudding user response:

In base R you can try something like this; the idea is to split() the data.frame in n data.frames in one list, one for each number in colnames, and then apply all the wrangling you need to each part of the list.

# transpose your data
df_t <- data.frame(t(df[,-1]))

# add a variabile made of the number in colnames
df_t$var <- gsub("[^0-9]", "", rownames(df_t))

# split into a list
df_t_list <- split(df_t, df_t$var)

Now for each element of the list, we do some operations: we use lapply() function and a last for loop:

# remove the useless column used to split
df_t_list <- lapply(df_t_list, function(x) { x["var"] <- NULL; data.frame(t(x)) })

# add the columns you need
df_t_list <- lapply(df_t_list, function(x) { x$col_Y_ <- NA; x$col_Z_ <- NA;x })

We have columns with the last number on (old columns) and the new columns without it. Let's remove it from all the columns then add it to all.

# remove
df_t_list <- lapply(df_t_list, function(x) {colnames(x) <- gsub("[0-9]", "", colnames(x));x})

# add
for(i in seq_along(df_t_list)) {colnames(df_t_list[[i]]) <- paste0(colnames(df_t_list[[i]]),names(df_t_list)[i])}

Let's get everything together:

do.call(cbind, unname(df_t_list))



   Team_1                Team_Desc_1                         Team_URL_1 Team_Ver_1 col_Y_1 col_Z_1 Team_2              Team_Desc_2                         Team_URL_2 Team_Ver_2
X1  NameA           Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified      NA      NA  NameC                 Chairman https://www.linkedin.com/in/NameC/ unverified
X2  NameB Senior Blockchain Engineer                               <NA>       <NA>      NA      NA  NameD Senior Software Engineer                               <NA>       <NA>
   col_Y_2 col_Z_2 Team_3 col_Y_3 col_Z_3
X1      NA      NA  NameE      NA      NA
X2      NA      NA  NameF      NA      NA

CodePudding user response:

Here's a way to do what you want using data.table and stringr.

library(data.table)
library(stringr)

group_ids <- str_extract(names(data), "[0-9]*$")

group_lvls <- factor(group_ids, levels=unique(group_ids))
groups <- split(colnames(data), group_lvls)

add_empty <- function(x){
  
  fun_id <- stringr::str_extract(x[1], "[0-9]*$")
  
  x <- c(x, paste0("Col_Y_", fun_id), paste0("Col_Z_", fun_id) )
  
  return(x)
  
}


fnl_groups <- lapply(groups, add_empty)

struckt1 <- lapply(unlist(fnl_groups, use.names = FALSE), function(x) {df <- data.frame(col = NA); names(df) <- x; return(df)})

struckt2 <- do.call(cbind, struckt1)

res <- rbindlist(list(struckt2, data), use.names = TRUE, fill = TRUE)[-1,]

setcolorder(res, neworder = "Company")

Output:

    Company Team_1                Team_Desc_1                         Team_URL_1 Team_Ver_1 Col_Y_1 Col_Z_1 Team_2              Team_Desc_2
1: CompanyA  NameA           Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified      NA      NA  NameC                 Chairman
2: CompanyB  NameB Senior Blockchain Engineer                               <NA>       <NA>      NA      NA  NameD Senior Software Engineer
                           Team_URL_2 Team_Ver_2 Col_Y_2 Col_Z_2 Team_3 Col_Y_3 Col_Z_3
1: https://www.linkedin.com/in/NameC/ unverified      NA      NA  NameE      NA      NA
2:                               <NA>       <NA>      NA      NA  NameF      NA      NA

Edit: Addressing the comment below, the script doesn't seem to affect the ordering of other columns.

Altered data (replaced Team 3 with Team 13)

structure(
  list(
    Company = c("CompanyA", "CompanyB"),
    Team_1 = c("NameA", "NameB"),
    Team_Desc_1 = c("Founder & Co-CEO",
                    "Senior Blockchain Engineer"),
    Team_URL_1 = c("https://www.linkedin.com/in/NameA/",
                   NA),
    Team_Ver_1 = c("unverified", NA),
    Team_2 = c("NameC",
               "NameD"),
    Team_Desc_2 = c("Chairman", "Senior Software Engineer"),
    Team_URL_2 = c("https://www.linkedin.com/in/NameC/",
                   NA),
    Team_Ver_2 = c("unverified", NA),
    Team_13 = c("NameE",
               "NameF")
  ),
  class = c("grouped_df", "tbl_df", "tbl",
            "data.frame"),
  row.names = c(NA,-2L),
  groups = structure(
    list(
      Company = c("CompanyB", "CompanyA"),
      .rows = structure(
        list(2L, 1L),
        ptype = integer(0),
        class = c("vctrs_list_of",
                  "vctrs_vctr", "list")
      )
    ),
    class = c("tbl_df", "tbl", "data.frame"),
    row.names = c(NA,-2L),
    .drop = TRUE
  )
) -> data

Output:

> res
    Company Col_Y_ Col_Z_ Team_1                Team_Desc_1                         Team_URL_1 Team_Ver_1 Col_Y_1 Col_Z_1 Team_2
1: CompanyA     NA     NA  NameA           Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified      NA      NA  NameC
2: CompanyB     NA     NA  NameB Senior Blockchain Engineer                               <NA>       <NA>      NA      NA  NameD
                Team_Desc_2                         Team_URL_2 Team_Ver_2 Col_Y_2 Col_Z_2 Team_13 Col_Y_13 Col_Z_13
1:                 Chairman https://www.linkedin.com/in/NameC/ unverified      NA      NA   NameE       NA       NA
2: Senior Software Engineer                               <NA>       <NA>      NA      NA   NameF       NA       NA
  • Related