I have a dataframe that looks like this:
Col_1 Col_X_1 Col_2 Col_X_2 ...
ABC 890 AJF 341
JFH 183 DFJ 132
...
After each block of columns (e.g. Col_1 & Col_X_1) that belong together according to the number at the end, I want to insert two more (empty) columns with the names Col_Y_n and Col_Z_n, with n being the same number as the block of columns before. The final dataframe should look like this:
Col_1 Col_X_1 Col_Y_1 Col_Z_1 Col_2 Col_X_2 Col_Y_2 Col_Z_2 ...
ABC 890 AJF 341
JFH 183 DFJ 132
...
How can I accomplish this?
Here my dput
output of my real data:
structure(list(Company = c("CompanyA", "CompanyB"),
Team_1 = c("NameA", "NameB"), Team_Desc_1 = c("Founder & Co-CEO",
"Senior Blockchain Engineer"), Team_URL_1 = c("https://www.linkedin.com/in/NameA/",
NA), Team_Ver_1 = c("unverified", NA), Team_2 = c("NameC",
"NameD"), Team_Desc_2 = c("Chairman", "Senior Software Engineer"
), Team_URL_2 = c("https://www.linkedin.com/in/NameC/",
NA), Team_Ver_2 = c("unverified", NA), Team_3 = c("NameE",
"NameF")), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -2L), groups = structure(list(
Company = c("CompanyB", "CompanyA"), .rows = structure(list(
2L, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
CodePudding user response:
In base R you can try something like this; the idea is to split()
the data.frame
in n data.frame
s in one list, one for each number in colnames
, and then apply all the wrangling you need to each part of the list.
# transpose your data
df_t <- data.frame(t(df[,-1]))
# add a variabile made of the number in colnames
df_t$var <- gsub("[^0-9]", "", rownames(df_t))
# split into a list
df_t_list <- split(df_t, df_t$var)
Now for each element of the list, we do some operations: we use lapply()
function and a last for
loop:
# remove the useless column used to split
df_t_list <- lapply(df_t_list, function(x) { x["var"] <- NULL; data.frame(t(x)) })
# add the columns you need
df_t_list <- lapply(df_t_list, function(x) { x$col_Y_ <- NA; x$col_Z_ <- NA;x })
We have columns with the last number on (old columns) and the new columns without it. Let's remove it from all the columns then add it to all.
# remove
df_t_list <- lapply(df_t_list, function(x) {colnames(x) <- gsub("[0-9]", "", colnames(x));x})
# add
for(i in seq_along(df_t_list)) {colnames(df_t_list[[i]]) <- paste0(colnames(df_t_list[[i]]),names(df_t_list)[i])}
Let's get everything together:
do.call(cbind, unname(df_t_list))
Team_1 Team_Desc_1 Team_URL_1 Team_Ver_1 col_Y_1 col_Z_1 Team_2 Team_Desc_2 Team_URL_2 Team_Ver_2
X1 NameA Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified NA NA NameC Chairman https://www.linkedin.com/in/NameC/ unverified
X2 NameB Senior Blockchain Engineer <NA> <NA> NA NA NameD Senior Software Engineer <NA> <NA>
col_Y_2 col_Z_2 Team_3 col_Y_3 col_Z_3
X1 NA NA NameE NA NA
X2 NA NA NameF NA NA
CodePudding user response:
Here's a way to do what you want using data.table
and stringr
.
library(data.table)
library(stringr)
group_ids <- str_extract(names(data), "[0-9]*$")
group_lvls <- factor(group_ids, levels=unique(group_ids))
groups <- split(colnames(data), group_lvls)
add_empty <- function(x){
fun_id <- stringr::str_extract(x[1], "[0-9]*$")
x <- c(x, paste0("Col_Y_", fun_id), paste0("Col_Z_", fun_id) )
return(x)
}
fnl_groups <- lapply(groups, add_empty)
struckt1 <- lapply(unlist(fnl_groups, use.names = FALSE), function(x) {df <- data.frame(col = NA); names(df) <- x; return(df)})
struckt2 <- do.call(cbind, struckt1)
res <- rbindlist(list(struckt2, data), use.names = TRUE, fill = TRUE)[-1,]
setcolorder(res, neworder = "Company")
Output:
Company Team_1 Team_Desc_1 Team_URL_1 Team_Ver_1 Col_Y_1 Col_Z_1 Team_2 Team_Desc_2
1: CompanyA NameA Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified NA NA NameC Chairman
2: CompanyB NameB Senior Blockchain Engineer <NA> <NA> NA NA NameD Senior Software Engineer
Team_URL_2 Team_Ver_2 Col_Y_2 Col_Z_2 Team_3 Col_Y_3 Col_Z_3
1: https://www.linkedin.com/in/NameC/ unverified NA NA NameE NA NA
2: <NA> <NA> NA NA NameF NA NA
Edit: Addressing the comment below, the script doesn't seem to affect the ordering of other columns.
Altered data (replaced Team 3 with Team 13)
structure(
list(
Company = c("CompanyA", "CompanyB"),
Team_1 = c("NameA", "NameB"),
Team_Desc_1 = c("Founder & Co-CEO",
"Senior Blockchain Engineer"),
Team_URL_1 = c("https://www.linkedin.com/in/NameA/",
NA),
Team_Ver_1 = c("unverified", NA),
Team_2 = c("NameC",
"NameD"),
Team_Desc_2 = c("Chairman", "Senior Software Engineer"),
Team_URL_2 = c("https://www.linkedin.com/in/NameC/",
NA),
Team_Ver_2 = c("unverified", NA),
Team_13 = c("NameE",
"NameF")
),
class = c("grouped_df", "tbl_df", "tbl",
"data.frame"),
row.names = c(NA,-2L),
groups = structure(
list(
Company = c("CompanyB", "CompanyA"),
.rows = structure(
list(2L, 1L),
ptype = integer(0),
class = c("vctrs_list_of",
"vctrs_vctr", "list")
)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA,-2L),
.drop = TRUE
)
) -> data
Output:
> res
Company Col_Y_ Col_Z_ Team_1 Team_Desc_1 Team_URL_1 Team_Ver_1 Col_Y_1 Col_Z_1 Team_2
1: CompanyA NA NA NameA Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified NA NA NameC
2: CompanyB NA NA NameB Senior Blockchain Engineer <NA> <NA> NA NA NameD
Team_Desc_2 Team_URL_2 Team_Ver_2 Col_Y_2 Col_Z_2 Team_13 Col_Y_13 Col_Z_13
1: Chairman https://www.linkedin.com/in/NameC/ unverified NA NA NameE NA NA
2: Senior Software Engineer <NA> <NA> NA NA NameF NA NA