I have a dataframe df
df <- structure(list(GENE = c("TNFRSF4", "TNFRSF4", "VWA1", "VWA1",
"PEX10", "CEP104"), KEY.varID = c("chr1:1213738:G:A", "chr1:1232280:T:C",
"chr1:1435798:T:TGGCGCGGAGC", "chr1:1437401:C:G", "chr1:2406791:C:CT",
"chr1:3844977:G:A")), row.names = c(NA, -6L), class = "data.frame")
Code I tried:
library(dplyr)
df %>% group_by(GENE) %>%
mutate(all_variants = paste(KEY.varID, collapse = ","))
Result I want:
GENE KEY.varID
TNFRSF4 chr1:1213738:G:A, chr1:1232280:T:C
VWA1 chr1:1435798:T:TGGCGCGGAGC, chr1:1437401:C:G
PEX10 chr1:2406791:C:CT
CEP104 chr1:3844977:G:A
CodePudding user response:
Your data is already a data.table; you can simply paste/collapse, and use by
library(data.table)
df[, .(KEY.varID = paste(KEY.varID, collapse = ",")), by=GENE]
Output:
GENE KEY.varID
1: TNFRSF4 chr1:1213738:G:A,chr1:1232280:T:C
2: VWA1 chr1:1435798:T:TGGCGCGGAGC,chr1:1437401:C:G
3: PEX10 chr1:2406791:C:CT
4: CEP104 chr1:3844977:G:A
CodePudding user response:
Or using dplyr
:
library(tidyverse)
library(data.table)
df %>%
group_by(GENE) %>%
summarise(KEY.varID = str_c(KEY.varID, collapse = ", ")) %>%
as.data.table
#> GENE KEY.varID
#> 1: CEP104 chr1:3844977:G:A
#> 2: PEX10 chr1:2406791:C:CT
#> 3: TNFRSF4 chr1:1213738:G:A, chr1:1232280:T:C
#> 4: VWA1 chr1:1435798:T:TGGCGCGGAGC, chr1:1437401:C:G
CodePudding user response:
A base R solution is
tapply(df$KEY.varID, df$GENE, paste, collapse = ",") |>
(\(x) data.frame(GENE = names(x), KEY.varID = unname(x)))()
#R> GENE KEY.varID
#R> 1 CEP104 chr1:3844977:G:A
#R> 2 PEX10 chr1:2406791:C:CT
#R> 3 TNFRSF4 chr1:1213738:G:A,chr1:1232280:T:C
#R> 4 VWA1 chr1:1435798:T:TGGCGCGGAGC,chr1:1437401:C:G