To illustrate the problem see this example using dplyr/tidyr, answers appreciated using any other packages:
#data
df1 <- data.frame("SNP" = c("rs11807834", "rs3729986"),
"Symbols" = c("GRIN1,SETD1A", "MADD,STAC3,SPI1"))
# SNP Symbols
# 1 rs11807834 GRIN1,SETD1A
# 2 rs3729986 MADD,STAC3,SPI1
As expected Symbols column with multiple genes is gone:
df1 %>%
separate_rows(Symbols, sep = ",")
# # A tibble: 5 x 2
# SNP Symbols
# <chr> <chr>
# 1 rs11807834 GRIN1
# 2 rs11807834 SETD1A
# 3 rs3729986 MADD
# 4 rs3729986 STAC3
# 5 rs3729986 SPI1
I could join the original data to get the values
df1 %>%
separate_rows(Symbols, sep = ",") %>%
left_join(df1, by = "SNP")
# # A tibble: 5 x 3
# SNP Symbols.x Symbols.y
# <chr> <chr> <chr>
# 1 rs11807834 GRIN1 GRIN1,SETD1A
# 2 rs11807834 SETD1A GRIN1,SETD1A
# 3 rs3729986 MADD MADD,STAC3,SPI1
# 4 rs3729986 STAC3 MADD,STAC3,SPI1
# 5 rs3729986 SPI1 MADD,STAC3,SPI1
Or I could paste them back again using group/ungroup:
df1 %>%
separate_rows(Symbols, sep = ",") %>%
group_by(SNP) %>%
mutate(Genes = paste(Symbols, collapse = ",")) %>%
ungroup()
# # A tibble: 5 x 3
# SNP Symbols Genes
# <chr> <chr> <chr>
# 1 rs11807834 GRIN1 GRIN1,SETD1A
# 2 rs11807834 SETD1A GRIN1,SETD1A
# 3 rs3729986 MADD MADD,STAC3,SPI1
# 4 rs3729986 STAC3 MADD,STAC3,SPI1
# 5 rs3729986 SPI1 MADD,STAC3,SPI1
Am I missing something obvious? I'd hoped there would be something like:
# doesn't have such argument
df1 %>%
separate_rows(Symbols, sep = ",", keep = TRUE)
CodePudding user response:
Yes, the obvious solution is to just mutate
a copy of Symbols
into a column called Genes
before separating its rows.
df1 %>%
mutate(Genes = Symbols) %>%
separate_rows(Symbols)
#> # A tibble: 5 x 3
#> SNP Symbols Genes
#> <chr> <chr> <chr>
#> 1 rs11807834 GRIN1 GRIN1,SETD1A
#> 2 rs11807834 SETD1A GRIN1,SETD1A
#> 3 rs3729986 MADD MADD,STAC3,SPI1
#> 4 rs3729986 STAC3 MADD,STAC3,SPI1
#> 5 rs3729986 SPI1 MADD,STAC3,SPI1
Created on 2022-11-17 with reprex v2.0.2
CodePudding user response:
You can also use cSplit
, but it would also require creating a copy of the column. cSplit
does have a parameter for drop
; however, it can only be used on "wide" datasets (see SO answer).
library(splitstackshape)
library(data.table)
cSplit(data.table(df1)[, Symbols2 := Symbols], "Symbols", ",", "long", type.convert = "as.is")
Output
SNP Symbols Symbols2
1: rs11807834 GRIN1 GRIN1,SETD1A
2: rs11807834 SETD1A GRIN1,SETD1A
3: rs3729986 MADD MADD,STAC3,SPI1
4: rs3729986 STAC3 MADD,STAC3,SPI1
5: rs3729986 SPI1 MADD,STAC3,SPI1