Say I have a matrix like the following, with marker
values per id
, 10 events per id
(in this example):
set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
> head(mymat)
a b c d e f g h i j
id1 -0.56047565 0.4264642 0.3796395 0.9935039 0.1176466 0.7877388 -1.0633261 0.1192452 -0.7886220 0.8450130
id1 -0.23017749 -0.2950715 -0.5023235 0.5483970 -0.9474746 0.7690422 1.2631852 0.2436874 -0.5021987 0.9625280
id1 1.55870831 0.8951257 -0.3332074 0.2387317 -0.4905574 0.3322026 -0.3496504 1.2324759 1.4960607 0.6843094
id1 0.07050839 0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1 0.12928774 0.8215811 -1.0717912 1.3606524 1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516 0.8496430
id1 1.71506499 0.6886403 0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759 1.6756969 1.9023618 -0.4465572
And an associated data frame of cutoff values (a min and a max cutoff per id
and marker
), like this one:
cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
> head(cutoff_df)
id marker min max
1 id1 a 0.4744594 6.518271
2 id1 b 1.3729807 6.689669
3 id1 c 0.4516368 5.915843
4 id1 d 0.6369892 6.459263
5 id1 e 0.3479676 5.208157
6 id1 f 1.6028592 5.439966
What I want to do here, is calculate a frequency table, so that I record the percentage of events per id
and marker
that fall into the cutoffs for that id
and marker
.
This is my attempt using some ugly nested loops... Wondering if there is a nicer and cleaner way to do this, ideally with base functions or data.table or tidyr...
My ugly code:
freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
mk_freq <- NULL
for (id in unique(rownames(mymat))){
data <- mymat[rownames(mymat)==id,mk]
min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
ins <- length(data[data>=min & data<=max])
freq <- ins/length(data)*100
mk_freq <- c(mk_freq, freq)
}
mk_freq <- as.data.frame(mk_freq)
names(mk_freq) <- mk
freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
freq_mat a b c d e f g h i j
id1 NA 20 0 20 40 10 0 30 10 20 30
id2 NA 10 30 30 0 20 10 10 0 0 70
id3 NA 0 0 0 0 30 10 30 10 30 60
CodePudding user response:
Something like this? Here, the sum of all cells is 100.
library(tidyverse)
set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
id = paste0("id", rep(1:3, each = 10)),
marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)
mymat %>%
as_tibble(rownames = "id") %>%
pivot_longer(-id, names_to = "marker") %>%
left_join(cutoff_df) %>%
filter(value <= max & value >= min) %>%
count(id, marker) %>%
# group_by(marker) %>% # e.g. to make sum of 100 per marker
mutate(n = n / sum(n) * 100) %>%
pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#> id a c d e g h i j b f
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1 3.77 3.77 7.55 1.89 5.66 1.89 3.77 5.66 0 0
#> 2 id2 1.89 5.66 0 3.77 1.89 0 0 13.2 5.66 1.89
#> 3 id3 0 0 0 5.66 5.66 1.89 5.66 11.3 0 1.89
Created on 2022-03-30 by the reprex package (v2.0.0)
CodePudding user response:
Here is a solution based on the purrr
package. I'm not sure that it is cleaner but it is shorter.
library(purrr)
asplit(mymat,2) |>
imap(~{
with(filter(cutoff_df, marker == .y),
outer(.x, min, ">=") &
outer(.x, max, "<") &
outer(names(.x), id, "=="))
}) |>
map(rowSums) |>
map_dfr(~tapply(.x, names(.x), FUN = sum),
.id = "marker")
##> # A tibble: 10 × 4
##> marker id1 id2 id3
##> <chr> <dbl> <dbl> <dbl>
##> 1 a 2 1 0
##> 2 b 0 3 0
##> 3 c 2 3 0
##> 4 d 4 0 0
##> 5 e 1 2 3
##> 6 f 0 1 1
##> 7 g 3 1 3
##> 8 h 1 0 1
##> 9 i 2 0 3
##> 10 j 3 7 6