I have a list of sequences
class(myseq)
[1] "list"
Inside of each element of the list are stores multiples elements withe the format of the Biostrings package. In this example the list myseq contain 6 samples
names(myseq)
[1] "Sample-01" "Sample-02" "Sample-03" "Sample-04" "Sample-05" "Sample-06"
and each sample have a format of the Biostrings package
class(myseq[["Sample-01"]])
[1] "AAStringSet"
attr(,"package")
[1] "Biostrings"
myseqs[["Sample-01"]]
AAStringSet object of length 143:
width seq names
[1] 453 MISIIKRFLGKRQPRQSAEHHYEFLPAHLALAQKPPSPFARLTAITLSIGVLAVLLWAY...VFPAQVQLNKNNIVIDGQTVELTPGMSVVAEIKTDKRRVIDYLLSPIREYQAEALRER IMEHDJCA_00190
[2] 701 MSNNEGLTLICIHFYLSIISGNREQFKENANLTKTNNYKEELKKIQKANKVRITTKHSQ...VITIAHRLSTVRDCNRIIVLHQGAIVEQGSHQQLLTHGKQYKQLWQLQQELKQEETTA IMEHDJCA_00191
... ... ...
[142] 275 MTAITISDQEYRDFSRFLESQCGIVLGDSKQYLVRSRLSPLVTKFKLASLSDLLRDVVT...RNVLIYFSPDMKSKVLNQMANSLNPGGYLLLGASESLTGLTDRFEMVRCNPGIIYKLK IMEHDJCA_03929
[143] 172 MPLLDSFTVDHTRMHAPAVRVAKTMQTPKGDTITVFDLRFTAPNKDILSEKGIHTLEHL...ESQNKIPELNEYQCGTAAMHSLDEAKQIAQNILAVGISVNRNDELALPEAMLKELKVD IMEHDJCA_04048
so I want to extract an specif gene of each sample using a data.frame
head(df)
qseqid samples gene
1 IMEHDJCA_02683 Sample-01 pilB
2 DIBHEKPI_01114 Sample-02 pilB
3 LLMDBGDK_00899 Sample-03 pilB
4 EBMGLGMO_01529 Sample-04 pilB
5 ILCJGNBA_00973 Sample-05 pilB
6 JAGNDBFC_01143 Sample-06 pilB
Extract the genes of each Sample using the qseqid column with a for loop
genes <- c()
for(i in 1:nrow(df)){
sample <- df[i,2]
qseq_id <- df[i,1]
seq <- myseqs[[sample]]
genes[[i]] <- Biostrings::AAStringSet(seq[qseq_id])
}
my problem is that the gene variable is a list
[[1]]
AAStringSet object of length 1:
width seq names
[1] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFDEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL IMEHDJCA_02683
[[2]]
AAStringSet object of length 1:
width seq names
[1] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFDEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL DIBHEKPI_01114
[[3]]
AAStringSet object of length 1:
width seq names
[1] 561 MTNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEQLSAIFGLPC...YEVMPFDEQLAEAIVKGASVQSLEMLAQQKGMMTLKDSGLEKLKQGITSLEELQRVLYL LLMDBGDK_00899
[[4]]
AAStringSet object of length 1:
width seq names
[1] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFNEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL EBMGLGMO_01529
[[5]]
AAStringSet object of length 1:
width seq names
[1] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFNEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL ILCJGNBA_00973
[[6]]
AAStringSet object of length 1:
width seq names
[1] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFNEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL JAGNDBFC_01143
And I want just a single Biostrings object that contain the genes of the 6 samples, something like :
genes
AAStringSet object of length 6:
width seq names
[1] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFDEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL IMEHDJCA_02683
[2] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFDEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL DIBHEKPI_01114
[3] 561 MTNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEQLSAIFGLPC...YEVMPFDEQLAEAIVKGASVQSLEMLAQQKGMMTLKDSGLEKLKQGITSLEELQRVLYL LLMDBGDK_00899
[4] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFNEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL EBMGLGMO_01529
[5] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFNEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL ILCJGNBA_00973
[6] 562 MQSNLATILRQANQLSLTQEQACRETIQASGVTAPEALLQLGFFQPDELTEKLSAIFGLP...YEVMPFNEQLAEAIVKGASVQSLEMLARQKGMMTLKDSGLEKLKQGITSLEELQRVLYL JAGNDBFC_01143
I have tried to store the gene variable without [[i]]:
genes <- Biostrings::AAStringSet(seq[qseq_id])
but it only keep the last sequence.
If I do it manually, will be like:
S01 <- Biostrings::AAStringSet(myseqs[["Sample-01"]]["IMEHDJCA_02683"])
S02 <- Biostrings::AAStringSet(myseqs[["Sample-02"]]["DIBHEKPI_01114"])
and so on for Sample-03 to Sample-n .... and then generate the Biostrings object
genes <- c(S01, S02, S03, S04, S05, S06)
Does any body know how to do it ???
Thanks So Much !!!
CodePudding user response:
You can unlist the AAStringSet object, e.g.
library(Biostrings)
a <- AAString("MISIIKRFLGKRQPRQSAEHHYEFLPAHLALAQKPPSPFARLTAITLSIGVLAVLLWAYVFPAQVQLNKNNIVIDGQTVELTPGMSVVAEIKTDKRRVIDYLLSPIREYQAEALRER")
b <- AAString("MTAITISDQEYRDFSRFLESQCGIVLGDSKQYLVRSRLSPLVTKFKLASLSDLLRDVVTRNVLIYFSPDMKSKVLNQMANSLNPGGYLLLGASESLTGLTDRFEMVRCNPGIIYKLK")
myseqs <- list("Sample-01" = AAStringSet(c("IMEHDJCA_02683" = a)),
"Sample-02" = AAStringSet(c("DIBHEKPI_01114" = b)))
class(myseqs)
#> [1] "list"
names(myseqs)
#> [1] "Sample-01" "Sample-02"
myseqs[["Sample-01"]]
#> AAStringSet object of length 1:
#> width seq names
#> [1] 117 MISIIKRFLGKRQPRQSAEHHYE...KRRVIDYLLSPIREYQAEALRER IMEHDJCA_02683
class(myseqs[["Sample-01"]])
#> [1] "AAStringSet"
#> attr(,"package")
#> [1] "Biostrings"
df <- read.table(text = " qseqid samples gene
1 IMEHDJCA_02683 Sample-01 pilB
2 DIBHEKPI_01114 Sample-02 pilB", header = TRUE)
df
#> qseqid samples gene
#> 1 IMEHDJCA_02683 Sample-01 pilB
#> 2 DIBHEKPI_01114 Sample-02 pilB
genes <- c()
for(i in 1:nrow(df)){
sample <- df[i,2]
qseq_id <- df[i,1]
seq <- myseqs[[sample]]
genes[[i]] <- unlist(AAStringSet(seq[qseq_id]))
}
genes_object <- AAStringSet(genes)
genes_object
#> AAStringSet object of length 2:
#> width seq
#> [1] 117 MISIIKRFLGKRQPRQSAEHHYEFLPAHLALAQK...MSVVAEIKTDKRRVIDYLLSPIREYQAEALRER
#> [2] 117 MTAITISDQEYRDFSRFLESQCGIVLGDSKQYLV...GGYLLLGASESLTGLTDRFEMVRCNPGIIYKLK
Created on 2022-08-09 by the reprex package (v2.0.1)
But, unfortunately, a side effect is that you lose the @ranges@NAMES attribute (i.e. the "names" column disappears).
I think you can add the names to the new object, but you would need to check whether it works for your use-case (i.e. check that the result is correct):
library(Biostrings)
a <- AAString("MISIIKRFLGKRQPRQSAEHHYEFLPAHLALAQKPPSPFARLTAITLSIGVLAVLLWAYVFPAQVQLNKNNIVIDGQTVELTPGMSVVAEIKTDKRRVIDYLLSPIREYQAEALRER")
b <- AAString("MTAITISDQEYRDFSRFLESQCGIVLGDSKQYLVRSRLSPLVTKFKLASLSDLLRDVVTRNVLIYFSPDMKSKVLNQMANSLNPGGYLLLGASESLTGLTDRFEMVRCNPGIIYKLK")
myseqs <- list("Sample-01" = AAStringSet(c("IMEHDJCA_02683" = a)),
"Sample-02" = AAStringSet(c("DIBHEKPI_01114" = b)))
class(myseqs)
#> [1] "list"
names(myseqs)
#> [1] "Sample-01" "Sample-02"
myseqs[["Sample-01"]]
#> AAStringSet object of length 1:
#> width seq names
#> [1] 117 MISIIKRFLGKRQPRQSAEHHYE...KRRVIDYLLSPIREYQAEALRER IMEHDJCA_02683
class(myseqs[["Sample-01"]])
#> [1] "AAStringSet"
#> attr(,"package")
#> [1] "Biostrings"
df <- read.table(text = " qseqid samples gene
1 IMEHDJCA_02683 Sample-01 pilB
2 DIBHEKPI_01114 Sample-02 pilB", header = TRUE)
df
#> qseqid samples gene
#> 1 IMEHDJCA_02683 Sample-01 pilB
#> 2 DIBHEKPI_01114 Sample-02 pilB
genes <- c()
qseqids <- c()
for(i in 1:nrow(df)){
sample <- df[i,2]
qseq_id <- df[i,1]
seq <- myseqs[[sample]]
genes[[i]] <- unlist(AAStringSet(seq[qseq_id]))
qseqids <- c(qseqids, qseq_id)
}
names(genes) <- qseqids
genes_object <- AAStringSet(genes)
genes_object
#> AAStringSet object of length 2:
#> width seq names
#> [1] 117 MISIIKRFLGKRQPRQSAEHHYE...KRRVIDYLLSPIREYQAEALRER IMEHDJCA_02683
#> [2] 117 MTAITISDQEYRDFSRFLESQCG...SLTGLTDRFEMVRCNPGIIYKLK DIBHEKPI_01114
Created on 2022-08-09 by the reprex package (v2.0.1)