Home > Enterprise >  create a column in a list of dataframes based on a function
create a column in a list of dataframes based on a function

Time:10-13

Here is my reproducible example:

# Reproducible example data: a named list of three data frames
# (hepg2, k562, hoel), each with three rows and the same fifteen columns,
# including the character column `annotation` used below.
# NOTE(review): the first `strand` factor level is " " (a single space) —
# presumably a "+" that was lost when this text was copied; confirm against
# the original data before relying on strand values.
grange_list <- list(hepg2 = structure(list(seqnames = structure(c(7L, 15L, 1L
), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", 
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", 
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", 
"chr22", "chrX"), class = "factor"), start = c(158126281L, 69110138L, 
2205071L), end = c(158126380L, 69110237L, 2205170L), width = c(100L, 
100L, 100L), strand = structure(c(2L, 2L, 1L), .Label = c(" ", 
"-", "*"), class = "factor"), name = c("FUS_HepG2_IDR", "FUS_HepG2_IDR", 
"FUS_HepG2_IDR"), score = c(1000L, 1000L, 1000L), annotation = c("Intron (uc011kwa.2/5799, intron 2 of 22)", 
"Intron (uc002arl.3/8125, intron 1 of 6)", "Intron (uc001aja.4/6497, intron 1 of 6)"
), geneChr = c(7L, 15L, 1L), geneStart = c(157331750L, 69070875L, 
2160134L), geneEnd = c(158380482L, 69113261L, 2241652L), geneLength = c(1048733L, 
42387L, 81519L), geneStrand = c(2L, 2L, 1L), geneId = c("5799", 
"8125", "6497"), distanceToTSS = c(254102, 3024, 44937)), row.names = c(NA, 
3L), class = "data.frame"), k562 = structure(list(seqnames = structure(c(10L, 
22L, 11L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", 
"chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", 
"chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", 
"chr21", "chr22", "chrX"), class = "factor"), start = c(72508428L, 
49992192L, 3072043L), end = c(72508527L, 49992291L, 3072142L), 
    width = c(100L, 100L, 100L), strand = structure(c(1L, 2L, 
    2L), .Label = c(" ", "-", "*"), class = "factor"), name = c("FUS_K562_IDR", 
    "FUS_K562_IDR", "FUS_K562_IDR"), score = c(1000L, 1000L, 
    1000L), annotation = c("Intron (uc001jrg.3/140766, intron 15 of 21)", 
    "Intron (uc003biq.3/uc003biq.3, intron 1 of 4)", "Intron (uc001lxe.3/833, intron 1 of 22)"
    ), geneChr = c(10L, 22L, 11L), geneStart = c(72432559L, 50013290L, 
    3022152L), geneEnd = c(72522195L, 50051190L, 3078681L), geneLength = c(89637L, 
    37901L, 56530L), geneStrand = c(1L, 2L, 2L), geneId = c("140766", 
    "348645", "833"), distanceToTSS = c(75869, 58998, 6539)), row.names = c(NA, 
3L), class = "data.frame"), hoel = structure(list(seqnames = structure(c(1L, 
1L, 1L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", 
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", 
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", 
"chr22", "chrX", "chrY"), class = "factor"), start = c(557045L, 
870107L, 936673L), end = c(557144L, 870206L, 936772L), width = c(100L, 
100L, 100L), strand = structure(c(1L, 1L, 1L), .Label = c(" ", 
"-", "*"), class = "factor"), name = c("FUS", "FUS", "FUS"), 
    score = c(1000L, 1000L, 1000L), annotation = c("Distal Intergenic", 
    "Intron (uc001abv.1/148398, intron 4 of 4)", "Distal Intergenic"
    ), geneChr = c(1L, 1L, 1L), geneStart = c(762971L, 860530L, 
    948847L), geneEnd = c(794826L, 879961L, 949919L), geneLength = c(31856L, 
    19432L, 1073L), geneStrand = c(1L, 1L, 1L), geneId = c("643837", 
    "148398", "9636"), distanceToTSS = c(-205827, 9577, -12075
    )), row.names = c(NA, 3L), class = "data.frame"))

This is a list of data frames; it looks like this:

$hepg2
  seqnames     start       end width strand          name score                               annotation geneChr geneStart   geneEnd geneLength geneStrand geneId distanceToTSS
1     chr7 158126281 158126380   100      - FUS_HepG2_IDR  1000 Intron (uc011kwa.2/5799, intron 2 of 22)       7 157331750 158380482    1048733          2   5799        254102
2    chr15  69110138  69110237   100      - FUS_HepG2_IDR  1000  Intron (uc002arl.3/8125, intron 1 of 6)      15  69070875  69113261      42387          2   8125          3024
3     chr1   2205071   2205170   100        FUS_HepG2_IDR  1000  Intron (uc001aja.4/6497, intron 1 of 6)       1   2160134   2241652      81519          1   6497         44937

$k562
  seqnames    start      end width strand         name score                                    annotation geneChr geneStart  geneEnd geneLength geneStrand geneId distanceToTSS
1    chr10 72508428 72508527   100        FUS_K562_IDR  1000   Intron (uc001jrg.3/140766, intron 15 of 21)      10  72432559 72522195      89637          1 140766         75869
2    chr22 49992192 49992291   100      - FUS_K562_IDR  1000 Intron (uc003biq.3/uc003biq.3, intron 1 of 4)      22  50013290 50051190      37901          2 348645         58998
3    chr11  3072043  3072142   100      - FUS_K562_IDR  1000       Intron (uc001lxe.3/833, intron 1 of 22)      11   3022152  3078681      56530          2    833          6539

$hoel
  seqnames  start    end width strand name score                                annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1     chr1 557045 557144   100         FUS  1000                         Distal Intergenic       1    762971  794826      31856          1 643837       -205827
2     chr1 870107 870206   100         FUS  1000 Intron (uc001abv.1/148398, intron 4 of 4)       1    860530  879961      19432          1 148398          9577
3     chr1 936673 936772   100         FUS  1000                         Distal Intergenic       1    948847  949919       1073          1   9636        -12075

I created a function that looks for a particular pattern in the annotation column:

#' Flag annotations that mention a UTR, intron or exon
#'
#' @param annotation Character vector of annotation strings.
#' @return Numeric vector the same length as `annotation`: 1 where the string
#'   contains "UTR", "Intron" or "Exon" (case-insensitive), 0 otherwise.
flag_annot <- function(annotation) {
  # grepl() tests every element, so a whole column yields one flag per row.
  # length(grep(...)) would instead collapse the column into a single
  # "any element matched" flag that gets recycled across all rows.
  as.numeric(grepl("UTR|Intron|Exon", annotation, ignore.case = TRUE))
}

The goal is to add a column called intragenic to each data frame, set to 1 or 0 for each row depending on whether that row's annotation matches one of these patterns.

I'm aware I can subset the annotation column like so:

# Pull the annotation column out of every data frame in the list.
lapply(grange_list, function(df) df[, "annotation"])

I am looking for a neat one-liner, perhaps using mapply, that combines the flag_annot function with the kind of subsetting I just did above. Thanks.

CodePudding user response:

To simplify, I would change the function to use grepl, which returns one TRUE/FALSE per element of the column, and combine the patterns with | instead of testing them separately.

#' Flag annotations that mention a UTR, intron or exon
#'
#' @param annotation Character vector of annotation strings.
#' @return Integer vector the same length as `annotation`: 1L where the
#'   string matches (case-insensitive), 0L otherwise.
flag_annot <- function(annotation) {
  is_hit <- grepl('UTR|Intron|Exon', annotation, ignore.case = TRUE)
  as.integer(is_hit)
}

and then use lapply as follows:

# Append an `intragenic` column, computed from `annotation`, to every
# data frame in the list.
lapply(
  grange_list,
  function(df) transform(df, intragenic = flag_annot(annotation))
)

CodePudding user response:

A tidyverse approach

library(tidyverse)

#' Add an `intragenic` flag column to one data frame
#'
#' @param x A data frame with a character column `annotation`.
#' @return `x` with an extra numeric column `intragenic`: 1 where the
#'   annotation contains "UTR", "Intron" or "Exon", 0 otherwise. A missing
#'   annotation gives NA, because `if_else()` propagates a missing condition.
flag_annot <- function(x) {
  # str_detect() is case-sensitive for a plain string pattern; wrap it in
  # regex(ignore_case = TRUE) to match the ignore.case = TRUE behaviour of the
  # grep()/grepl() versions of this function.
  pattern <- regex("UTR|Intron|Exon", ignore_case = TRUE)
  x$intragenic <- if_else(str_detect(x$annotation, pattern), 1, 0)
  x
}

# Apply the flagging function to every data frame in the list.
map(grange_list, flag_annot)
  •  Tags:  
  • r
  • Related