Here is my reproducible example:
# Reproducible example data (dput() output): a named list of three data
# frames -- hepg2, k562 and hoel -- each holding three rows with the columns
# seqnames, start, end, width, strand, name, score, annotation, geneChr,
# geneStart, geneEnd, geneLength, geneStrand, geneId and distanceToTSS.
# The `annotation` column is the free-text field the flagging code inspects.
# NOTE(review): the strand factor levels are " ", "-", "*"; the first level
# is presumably "+" that was mangled when the snippet was pasted -- confirm
# against the original data before relying on strand.
grange_list <- list(hepg2 = structure(list(seqnames = structure(c(7L, 15L, 1L
), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6",
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21",
"chr22", "chrX"), class = "factor"), start = c(158126281L, 69110138L,
2205071L), end = c(158126380L, 69110237L, 2205170L), width = c(100L,
100L, 100L), strand = structure(c(2L, 2L, 1L), .Label = c(" ",
"-", "*"), class = "factor"), name = c("FUS_HepG2_IDR", "FUS_HepG2_IDR",
"FUS_HepG2_IDR"), score = c(1000L, 1000L, 1000L), annotation = c("Intron (uc011kwa.2/5799, intron 2 of 22)",
"Intron (uc002arl.3/8125, intron 1 of 6)", "Intron (uc001aja.4/6497, intron 1 of 6)"
), geneChr = c(7L, 15L, 1L), geneStart = c(157331750L, 69070875L,
2160134L), geneEnd = c(158380482L, 69113261L, 2241652L), geneLength = c(1048733L,
42387L, 81519L), geneStrand = c(2L, 2L, 1L), geneId = c("5799",
"8125", "6497"), distanceToTSS = c(254102, 3024, 44937)), row.names = c(NA,
3L), class = "data.frame"), k562 = structure(list(seqnames = structure(c(10L,
22L, 11L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5",
"chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13",
"chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
"chr21", "chr22", "chrX"), class = "factor"), start = c(72508428L,
49992192L, 3072043L), end = c(72508527L, 49992291L, 3072142L),
width = c(100L, 100L, 100L), strand = structure(c(1L, 2L,
2L), .Label = c(" ", "-", "*"), class = "factor"), name = c("FUS_K562_IDR",
"FUS_K562_IDR", "FUS_K562_IDR"), score = c(1000L, 1000L,
1000L), annotation = c("Intron (uc001jrg.3/140766, intron 15 of 21)",
"Intron (uc003biq.3/uc003biq.3, intron 1 of 4)", "Intron (uc001lxe.3/833, intron 1 of 22)"
), geneChr = c(10L, 22L, 11L), geneStart = c(72432559L, 50013290L,
3022152L), geneEnd = c(72522195L, 50051190L, 3078681L), geneLength = c(89637L,
37901L, 56530L), geneStrand = c(1L, 2L, 2L), geneId = c("140766",
"348645", "833"), distanceToTSS = c(75869, 58998, 6539)), row.names = c(NA,
3L), class = "data.frame"), hoel = structure(list(seqnames = structure(c(1L,
1L, 1L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6",
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21",
"chr22", "chrX", "chrY"), class = "factor"), start = c(557045L,
870107L, 936673L), end = c(557144L, 870206L, 936772L), width = c(100L,
100L, 100L), strand = structure(c(1L, 1L, 1L), .Label = c(" ",
"-", "*"), class = "factor"), name = c("FUS", "FUS", "FUS"),
score = c(1000L, 1000L, 1000L), annotation = c("Distal Intergenic",
"Intron (uc001abv.1/148398, intron 4 of 4)", "Distal Intergenic"
), geneChr = c(1L, 1L, 1L), geneStart = c(762971L, 860530L,
948847L), geneEnd = c(794826L, 879961L, 949919L), geneLength = c(31856L,
19432L, 1073L), geneStrand = c(1L, 1L, 1L), geneId = c("643837",
"148398", "9636"), distanceToTSS = c(-205827, 9577, -12075
)), row.names = c(NA, 3L), class = "data.frame"))
This is a list of data frames; it looks like this:
$hepg2
seqnames start end width strand name score annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1 chr7 158126281 158126380 100 - FUS_HepG2_IDR 1000 Intron (uc011kwa.2/5799, intron 2 of 22) 7 157331750 158380482 1048733 2 5799 254102
2 chr15 69110138 69110237 100 - FUS_HepG2_IDR 1000 Intron (uc002arl.3/8125, intron 1 of 6) 15 69070875 69113261 42387 2 8125 3024
3 chr1 2205071 2205170 100 FUS_HepG2_IDR 1000 Intron (uc001aja.4/6497, intron 1 of 6) 1 2160134 2241652 81519 1 6497 44937
$k562
seqnames start end width strand name score annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1 chr10 72508428 72508527 100 FUS_K562_IDR 1000 Intron (uc001jrg.3/140766, intron 15 of 21) 10 72432559 72522195 89637 1 140766 75869
2 chr22 49992192 49992291 100 - FUS_K562_IDR 1000 Intron (uc003biq.3/uc003biq.3, intron 1 of 4) 22 50013290 50051190 37901 2 348645 58998
3 chr11 3072043 3072142 100 - FUS_K562_IDR 1000 Intron (uc001lxe.3/833, intron 1 of 22) 11 3022152 3078681 56530 2 833 6539
$hoel
seqnames start end width strand name score annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1 chr1 557045 557144 100 FUS 1000 Distal Intergenic 1 762971 794826 31856 1 643837 -205827
2 chr1 870107 870206 100 FUS 1000 Intron (uc001abv.1/148398, intron 4 of 4) 1 860530 879961 19432 1 148398 9577
3 chr1 936673 936772 100 FUS 1000 Distal Intergenic 1 948847 949919 1073 1 9636 -12075
I created a function that looks for particular patterns in the `annotation` column:
#' Flag annotations that lie inside a gene (UTR, intron or exon)
#'
#' @param annotation Character vector (or factor) of annotation strings,
#'   e.g. "Intron (uc001aja.4/6497, intron 1 of 6)" or "Distal Intergenic".
#' @return Numeric vector the same length as `annotation`: 1 where the
#'   string contains "UTR", "Intron" or "Exon" (case-insensitive), 0 otherwise.
flag_annot <- function(annotation) {
  # grepl() yields one TRUE/FALSE per element. The previous
  # length(grep(...)) test collapsed the whole vector into a single flag,
  # so a column mixing genic and intergenic rows was flagged 1 throughout.
  as.numeric(grepl("UTR|Intron|Exon", annotation, ignore.case = TRUE))
}
The goal is to create another column called `intragenic`, which is 1 or 0 depending on the value in the `annotation` column.
I'm aware I can subset the annotation column like so:
# Pull the annotation column (as a plain vector) out of every data frame.
lapply(grange_list, function(df) df[, "annotation"])
I was looking for a neat one-liner, perhaps using `mapply`, that can combine the `flag_annot` function with the kind of subset I just did above. Thanks.
CodePudding user response:
To simplify, I would change the function to use `grepl` and combine the patterns with `|` instead of writing them separately.
#' Flag annotations mentioning a UTR, intron or exon
#'
#' @param annotation Character vector of annotation strings.
#' @return Integer vector the same length as `annotation`: 1L where the
#'   string matches "UTR", "Intron" or "Exon" (case-insensitive), else 0L.
flag_annot <- function(annotation) {
  is_genic <- grepl("UTR|Intron|Exon", annotation, ignore.case = TRUE)
  as.integer(is_genic)
}
and then use `lapply` as follows:
# Add the intragenic flag column to each data frame in the list.
lapply(grange_list, function(peaks) {
  transform(peaks, intragenic = flag_annot(annotation))
})
CodePudding user response:
A `tidyverse` approach:
library(tidyverse)
#' Add an `intragenic` flag column to a data frame of annotated peaks
#'
#' @param x A data frame with a character `annotation` column.
#' @return `x` with an extra numeric column `intragenic`: 1 where
#'   `annotation` contains "UTR", "Intron" or "Exon" (case-insensitive),
#'   0 otherwise.
flag_annot <- function(x) {
  # str_detect() is case-sensitive by default; wrap the pattern in regex()
  # so that e.g. "intron" and "Intron" are both flagged.
  genic_pattern <- regex("UTR|Intron|Exon", ignore_case = TRUE)
  x$intragenic <- if_else(str_detect(x$annotation, genic_pattern), 1, 0)
  x
}
# Apply flag_annot() to every data frame in the list.
grange_list %>% map(flag_annot)