My idea is to filter my rows, keeping those whose values lie within the first two standard deviations, or beyond if such values exist.
This is my dummy dataset:
# Dummy dataset: a matrix assembled from a 3 x 3 grid of normally distributed
# blocks (sd = 0.5 everywhere, block means of 0, 0.5 or 1), whose rows and
# columns are then shuffled and labelled.

# Row and column counts of the three block groups, plus their totals.
nr1 <- 4; nr2 <- 8; nr3 <- 6
nr <- nr1 + nr2 + nr3
nc1 <- 6; nc2 <- 8; nc3 <- 10
nc <- nc1 + nc2 + nc3

# Each rbind() stacks the three row groups for one column group; cbind() then
# places the three column groups side by side. `nrow` is spelled out in full
# rather than relying on partial argument matching.
mat <- cbind(
  rbind(
    matrix(rnorm(nr1 * nc1, mean = 1, sd = 0.5), nrow = nr1),
    matrix(rnorm(nr2 * nc1, mean = 0, sd = 0.5), nrow = nr2),
    matrix(rnorm(nr3 * nc1, mean = 0, sd = 0.5), nrow = nr3)
  ),
  rbind(
    matrix(rnorm(nr1 * nc2, mean = 0, sd = 0.5), nrow = nr1),
    matrix(rnorm(nr2 * nc2, mean = 1, sd = 0.5), nrow = nr2),
    matrix(rnorm(nr3 * nc2, mean = 0, sd = 0.5), nrow = nr3)
  ),
  rbind(
    matrix(rnorm(nr1 * nc3, mean = 0.5, sd = 0.5), nrow = nr1),
    matrix(rnorm(nr2 * nc3, mean = 0.5, sd = 0.5), nrow = nr2),
    matrix(rnorm(nr3 * nc3, mean = 1, sd = 0.5), nrow = nr3)
  )
)

mat <- mat[sample(nr, nr), sample(nc, nc)] # random shuffle rows and columns
rownames(mat) <- paste0("row", seq_len(nr))
colnames(mat) <- paste0("column", seq_len(nc))
To calculate the row-wise standard deviation, I used this:
# Row-wise standard deviation of the matrix `mat`.
# Requires dplyr (%>%, rowwise, mutate, c_across, ungroup) and
# tibble (rownames_to_column) to be attached.

# Move the row names into an explicit `gene` column so they survive the
# conversion to a data frame.
rld2 <- as.data.frame(mat) %>%
  rownames_to_column("gene")

# For every row, take the standard deviation across all columns except `gene`
# and store it in `rsds`. c_across() replaces the superseded do() idiom;
# ungroup() drops the row-wise grouping so later verbs work on whole columns.
bb <- rld2 %>%
  rowwise() %>%
  mutate(rsds = sd(c_across(-gene))) %>%
  ungroup()
Now my idea is to filter out rows which are within the first, second, and third standard deviations.
How do I implement that?
Any help or suggestions would be really appreciated.
CodePudding user response:
One option is to transpose the data frame and create a longer table. Then the filtering operations become fairly easy, see below:
library(tidyr)
library(dplyr)

# Build a long table with one row per matrix cell: transposing first turns the
# original rows into columns, so pivot_longer() stores each original row name
# in `gene` and the cell in `value`. Each row then carries the mean and
# standard deviation of its own gene; the result stays grouped by `gene`.
df <- mat %>%
  t() %>%
  as.data.frame() %>%
  pivot_longer(cols = everything(), names_to = "gene", values_to = "value") %>%
  group_by(gene) %>%
  mutate(
    meanval = mean(value),
    stdev = sd(value)
  )
Now you can filter easily:
# Keep only the observations lying more than two standard deviations away
# from the mean of their gene.
filter(df, abs(value - meanval) / stdev > 2)
or make different groups:
# Label every observation by its absolute distance from the gene mean,
# measured in standard deviations (`fact`): below 1, from 1 to 2 inclusive,
# or above 2. Missing distances get a missing label.
mutate(
  df,
  fact = abs(value - meanval) / stdev,
  stgroup = case_when(
    is.na(fact) ~ NA_character_,
    fact > 2 ~ ">2",
    fact >= 1 ~ "1-2",
    TRUE ~ "<1"
  )
)