I have the following strings and would like to filter them by position-specific patterns: for example, A or T in the first position, followed by C or T in the second position, and so on.
# Example data: a one-column data frame (column `barcode.Universe.sub`) with
# 100 rows of barcode strings, tagged with the tibble classes
# "tbl_df"/"tbl"/"data.frame". Presumably dput() output; note the value is not
# assigned to a name here.
structure(list(barcode.Universe.sub = c("TCCGCTGGTGCG", "GTCGCGGACTGC",
"CTTCCAAAATAG", "CCCCTTTCGTGG", "TTTCAGCGTCAG", "TACGAGCCTGGT",
"AGCAGTATCAAC", "TAACCGTATAGA", "GGCTAATTCCGC", "CTCGGTTGGGCG",
"TCCCGTGCGCCC", "TCTCCCAAACGA", "ATCATGGTCAAC", "ACCGCACTGAAC",
"AAGTTTTCCTCA", "CTAGAGACCGAT", "CATTCTCATGGA", "ATGCGCGGCGAA",
"GAGTGCCCTAGG", "ATGAGTAGTAGG", "TCTAGCGTCATT", "TCTCAACTTCTC",
"GAGAACGCTCCT", "TACAGGACACAC", "GATTCTCACGAT", "GAGACGGAGGCC",
"GCGGCTCCTCTA", "TGGATACTCTGC", "GCACCGCGTTCA", "GAATTGATAAGT",
"GGGCAGCAGATA", "GGCATATACGGC", "TCATCACTATGA", "CGCGGCTGGGAT",
"AGTCGCCACGCT", "TTGGCCGAAATT", "CGAGACCCACCG", "TGGTGCCTCACT",
"CTGATACTGGGT", "AGGTGGCGTCTA", "GGCAAGGAGTAC", "GCGACTGAAATA",
"AGCTTCGGATTA", "GTTGCCAGACTC", "AGCTGTCGCACG", "AGGGTTCGCTGT",
"GTGCGTACCGCG", "CGCTTACACATG", "ACAACGCCATGT", "ATATACTAAGCC",
"ACCGGAATAGCT", "CTACGAACGACT", "TAACCTTATGCT", "GCCTGCAGATGA",
"CGAGTGCGGTGG", "TGGAGGTGTACT", "CTTCAATATTGA", "AACGACATAAAC",
"TACAGTGGATGC", "ATCTAAGCTGTT", "GCCTGGCATCTT", "CATGGGGAACCT",
"GCCCGAGCTAAG", "GGGTGCTCCGAC", "TAATTAGGACGC", "TACCTAAGCGAT",
"TAGTCTGTAGGC", "CGTTAACTCCGC", "GCACGAAGTCAC", "GAGCGTCCAGCT",
"CCGACTTACAAA", "CCGGTTCAGATG", "TGCAGCTGTGTG", "AATCTATTTCTT",
"TCGTATAAGGTA", "AACTGGATGCCC", "ACGAAGAACGCT", "AGTGCTCTTCTG",
"CTACAGTGTACA", "AGGGCCATACTC", "AACAACCGCTTA", "CTAGACGGCAAT",
"GGGTTGAAGAAG", "GGAGCATACTAA", "CGAACCCCGGTA", "TGTACCATGGAA",
"TTCGAGGTTGAT", "GTGCTTAGGATC", "CAACCGTATGAC", "GACGTCCTTCAG",
"TGGTAATGGACA", "ACGAGCGCTATG", "GCGGAGCCACCC", "TCAGAGGCTGGA",
"GGCCTTACGCAA", "TACGACCCATAG", "CCATTCAGCATG", "CCTAAGGGCCTT",
"GGTCTATCGCAT", "CAGTACATGTCG")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -100L))
I have tried stringr and also packages specific to DNA sequences (such as ape), but I have not gotten good results yet.
Help would be greatly appreciated.
CodePudding user response:
You can use `grep` with a regular expression:
# Filter barcodes by a position-specific pattern: "^" anchors at the start of
# the string, "[AT]" allows A or T in position 1, "[CT]" allows C or T in
# position 2.
#
# The column is extracted with `s[[1]]` rather than `s[, 1]`: `s` carries the
# "tbl_df" class, so once the tibble package is attached `s[, 1]` stays a
# one-column tibble instead of dropping to a character vector, and grep()
# would no longer be matching the individual strings. `[[` returns the plain
# character vector either way.

# Row indices of the matching barcodes.
grep("^[AT][CT]", s[[1]])
# [1] 1 5 11 12 13 14 18 20 21 22 33 36 49 50 51 60 75 77 87 92 94

# The matching barcodes themselves (value = TRUE returns the strings).
grep("^[AT][CT]", s[[1]], value = TRUE)
# [1] "TCCGCTGGTGCG" "TTTCAGCGTCAG" "TCCCGTGCGCCC" "TCTCCCAAACGA" "ATCATGGTCAAC"
# [6] "ACCGCACTGAAC" "ATGCGCGGCGAA" "ATGAGTAGTAGG" "TCTAGCGTCATT" "TCTCAACTTCTC"
#[11] "TCATCACTATGA" "TTGGCCGAAATT" "ACAACGCCATGT" "ATATACTAAGCC" "ACCGGAATAGCT"
#[16] "ATCTAAGCTGTT" "TCGTATAAGGTA" "ACGAAGAACGCT" "TTCGAGGTTGAT" "ACGAGCGCTATG"
#[21] "TCAGAGGCTGGA"
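Here `^` anchors the match to the start of the string, `[AT]` matches `A` or `T` in the first position, and `[CT]` matches `C` or `T` in the second position. Further positions can be constrained by appending more character classes in the same way.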
Data
# Example data used above: `s` is a one-column data frame (column
# `barcode.Universe.sub`) with 100 rows of barcode strings, tagged with the
# tibble classes "tbl_df"/"tbl"/"data.frame". Presumably dput() output.
s <- structure(list(barcode.Universe.sub = c("TCCGCTGGTGCG", "GTCGCGGACTGC",
"CTTCCAAAATAG", "CCCCTTTCGTGG", "TTTCAGCGTCAG", "TACGAGCCTGGT",
"AGCAGTATCAAC", "TAACCGTATAGA", "GGCTAATTCCGC", "CTCGGTTGGGCG",
"TCCCGTGCGCCC", "TCTCCCAAACGA", "ATCATGGTCAAC", "ACCGCACTGAAC",
"AAGTTTTCCTCA", "CTAGAGACCGAT", "CATTCTCATGGA", "ATGCGCGGCGAA",
"GAGTGCCCTAGG", "ATGAGTAGTAGG", "TCTAGCGTCATT", "TCTCAACTTCTC",
"GAGAACGCTCCT", "TACAGGACACAC", "GATTCTCACGAT", "GAGACGGAGGCC",
"GCGGCTCCTCTA", "TGGATACTCTGC", "GCACCGCGTTCA", "GAATTGATAAGT",
"GGGCAGCAGATA", "GGCATATACGGC", "TCATCACTATGA", "CGCGGCTGGGAT",
"AGTCGCCACGCT", "TTGGCCGAAATT", "CGAGACCCACCG", "TGGTGCCTCACT",
"CTGATACTGGGT", "AGGTGGCGTCTA", "GGCAAGGAGTAC", "GCGACTGAAATA",
"AGCTTCGGATTA", "GTTGCCAGACTC", "AGCTGTCGCACG", "AGGGTTCGCTGT",
"GTGCGTACCGCG", "CGCTTACACATG", "ACAACGCCATGT", "ATATACTAAGCC",
"ACCGGAATAGCT", "CTACGAACGACT", "TAACCTTATGCT", "GCCTGCAGATGA",
"CGAGTGCGGTGG", "TGGAGGTGTACT", "CTTCAATATTGA", "AACGACATAAAC",
"TACAGTGGATGC", "ATCTAAGCTGTT", "GCCTGGCATCTT", "CATGGGGAACCT",
"GCCCGAGCTAAG", "GGGTGCTCCGAC", "TAATTAGGACGC", "TACCTAAGCGAT",
"TAGTCTGTAGGC", "CGTTAACTCCGC", "GCACGAAGTCAC", "GAGCGTCCAGCT",
"CCGACTTACAAA", "CCGGTTCAGATG", "TGCAGCTGTGTG", "AATCTATTTCTT",
"TCGTATAAGGTA", "AACTGGATGCCC", "ACGAAGAACGCT", "AGTGCTCTTCTG",
"CTACAGTGTACA", "AGGGCCATACTC", "AACAACCGCTTA", "CTAGACGGCAAT",
"GGGTTGAAGAAG", "GGAGCATACTAA", "CGAACCCCGGTA", "TGTACCATGGAA",
"TTCGAGGTTGAT", "GTGCTTAGGATC", "CAACCGTATGAC", "GACGTCCTTCAG",
"TGGTAATGGACA", "ACGAGCGCTATG", "GCGGAGCCACCC", "TCAGAGGCTGGA",
"GGCCTTACGCAA", "TACGACCCATAG", "CCATTCAGCATG", "CCTAAGGGCCTT",
"GGTCTATCGCAT", "CAGTACATGTCG")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -100L))