A reproducible example:
# Six chr1 intervals: integer start/end/width plus a character strand column.
example <- data.frame(
  seqnames = rep("chr1", 6L),
  start = c(14660L, 661861L, 662360L, 700216L, 703359L, 713320L),
  end = c(14736L, 661929L, 662414L, 700326L, 703430L, 713395L),
  width = c(77L, 69L, 55L, 111L, 72L, 76L),
  strand = c(" ", " ", "-", "-", "-", "-"),
  stringsAsFactors = FALSE
)
It looks like this:
seqnames start end width strand
1 chr1 14660 14736 77
2 chr1 661861 661929 69
3 chr1 662360 662414 55 -
4 chr1 700216 700326 111 -
5 chr1 703359 703430 72 -
6 chr1 713320 713395 76 -
My function takes `start` and adds 100 if the row has the `+` sign as its strand (the `+` appears as a blank in this listing); for the `-` strand, `end` is diminished by 100.
# Shift one coordinate of a single peak by 100, depending on its strand.
#
# peak_df: one peak, indexable by "start", "end" and "strand". The coordinates
#   must be numeric for the arithmetic below to work.
# Value: the newly computed coordinate (the value of the last assignment,
#   returned invisibly), or NULL when the strand matches neither branch. The
#   modified `peak_df` is local to the function and is not returned.
extension <- function(peak_df) {
  if (peak_df['strand'] == ' ') {
    # ' ' strand: end becomes start + 100
    peak_df['end'] <- peak_df['start'] + 100
  } else if (peak_df['strand'] == '-') {
    # '-' strand: start becomes end - 100
    peak_df['start'] <- peak_df['end'] - 100
  }
}
Then I want to take this function and loop through the rows of `example`. If I use `apply`, I get the following error:
> apply(pk.df,1, extension)
Error in peak_df["end"] - 100 : non-numeric argument to binary operator
If I use sapply
then I get a different error:
> sapply(pk.df,extension)
Error in if (peak_df["strand"] == " ") { :
missing value where TRUE/FALSE needed
Is this because my function is not vectorized ?
CodePudding user response:
Using dplyr:
library(dplyr)
# Rows whose strand is ' ' get end = start + 100; rows whose strand is '-'
# get start = end - 100. Every other value is kept.
# NOTE(review): the `+ 0` in the fallback branches presumably promotes the
# integer columns to double so all case_when() branches share one type;
# confirm against the dplyr version in use.
example %>%
  mutate(
    end = case_when(strand == ' ' ~ start + 100, TRUE ~ end + 0),
    start = case_when(strand == '-' ~ end - 100, TRUE ~ start + 0)
  )
seqnames start end width strand
1 chr1 14660 14760 77
2 chr1 661861 661961 69
3 chr1 662314 662414 55 -
4 chr1 700226 700326 111 -
5 chr1 703330 703430 72 -
6 chr1 713295 713395 76 -
CodePudding user response:
Check the `str` of each row that `apply` passes to the function:
# Print the structure of every row as apply() hands it to FUN.
apply(example, MARGIN = 1, FUN = function(row) str(row))
Named chr [1:5] "chr1" " 14660" " 14736" " 77" " "
- attr(*, "names")= chr [1:5] "seqnames" "start" "end" "width" ...
Named chr [1:5] "chr1" "661861" "661929" " 69" " "
- attr(*, "names")= chr [1:5] "seqnames" "start" "end" "width" ...
Named chr [1:5] "chr1" "662360" "662414" " 55" "-"
- attr(*, "names")= chr [1:5] "seqnames" "start" "end" "width" ...
Named chr [1:5] "chr1" "700216" "700326" "111" "-"
- attr(*, "names")= chr [1:5] "seqnames" "start" "end" "width" ...
Named chr [1:5] "chr1" "703359" "703430" " 72" "-"
- attr(*, "names")= chr [1:5] "seqnames" "start" "end" "width" ...
Named chr [1:5] "chr1" "713320" "713395" " 76" "-"
- attr(*, "names")= chr [1:5] "seqnames" "start" "end" "width" ...
NULL
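Those values are character inside `apply` (it coerces the data frame to a matrix, and with a character column present every value becomes a string), so add `as.numeric`.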
# Row-wise strand extension for use with apply(), where each row arrives as a
# named character vector; coordinates are converted back with as.numeric().
#
# peak_df: one row, indexable by "start", "end" and "strand".
# Value: the newly computed coordinate as a number (the value of the last
#   assignment, returned invisibly), or NULL when the strand matches neither
#   branch. The modified `peak_df` itself is not returned.
extension <- function(peak_df) {
  if (peak_df['strand'] == ' ') {
    # ' ' strand: end becomes start + 100
    peak_df['end'] <- as.numeric(peak_df['start']) + 100
  } else if (peak_df['strand'] == '-') {
    # '-' strand: start becomes end - 100
    peak_df['start'] <- as.numeric(peak_df['end']) - 100
  }
}
# Call extension() once per row of example (MARGIN = 1 iterates over rows).
apply(example, MARGIN = 1, FUN = extension)
[1] 14760 661961 662314 700226 703330 713295
It works now.
Also, `sapply` works column-wise, so your function fails with that error message. Take a look at the result of sapply(example, function(x){print(x)})
[1] "chr1" "chr1" "chr1" "chr1" "chr1" "chr1"
[1] 14660 661861 662360 700216 703359 713320
[1] 14736 661929 662414 700326 703430 713395
[1] 77 69 55 111 72 76
[1] " " " " "-" "-" "-" "-"
CodePudding user response:
You should write a vectorized version of the `extension` function.
# Vectorized strand extension over a whole data frame of peaks.
#
# peak_df: data frame with numeric `start` and `end` columns and a `strand`
#   column.
# Value: `peak_df` with `end` set to `start + 100` on rows whose strand is
#   " ", and `start` set to `end - 100` on rows whose strand is "-". Rows with
#   any other (or missing) strand are returned unchanged, matching the
#   row-wise versions that only act on those two values.
extension <- function(peak_df) {
  # which() drops NA comparisons, which a logical mask would turn into an
  # error in the subscripted assignments below.
  plus_rows <- which(peak_df$strand == " ")
  minus_rows <- which(peak_df$strand == "-")
  peak_df$end[plus_rows] <- peak_df$start[plus_rows] + 100
  peak_df$start[minus_rows] <- peak_df$end[minus_rows] - 100
  peak_df
}
# Apply the vectorized extension to the whole example data frame.
extension(peak_df = example)
# seqnames start end width strand
#1 chr1 14660 14760 77
#2 chr1 661861 661961 69
#3 chr1 662314 662414 55 -
#4 chr1 700226 700326 111 -
#5 chr1 703330 703430 72 -
#6 chr1 713295 713395 76 -