Home > OS >  R: calculate frequency table of a matrix given a data frame of cutoff values
R: calculate frequency table of a matrix given a data frame of cutoff values

Time:03-30

Say I have a matrix like the following, with marker values per id, 10 events per id (in this example):

set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
>   head(mymat)
              a          b          c          d          e          f          g          h          i          j
id1 -0.56047565  0.4264642  0.3796395  0.9935039  0.1176466  0.7877388 -1.0633261  0.1192452 -0.7886220  0.8450130
id1 -0.23017749 -0.2950715 -0.5023235  0.5483970 -0.9474746  0.7690422  1.2631852  0.2436874 -0.5021987  0.9625280
id1  1.55870831  0.8951257 -0.3332074  0.2387317 -0.4905574  0.3322026 -0.3496504  1.2324759  1.4960607  0.6843094
id1  0.07050839  0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1  0.12928774  0.8215811 -1.0717912  1.3606524  1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516  0.8496430
id1  1.71506499  0.6886403  0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759  1.6756969  1.9023618 -0.4465572

And an associated data frame of cutoff values (a min and a max cutoff per id and marker), like this one:

cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
>   head(cutoff_df)
   id marker       min      max
1 id1      a 0.4744594 6.518271
2 id1      b 1.3729807 6.689669
3 id1      c 0.4516368 5.915843
4 id1      d 0.6369892 6.459263
5 id1      e 0.3479676 5.208157
6 id1      f 1.6028592 5.439966

What I want to do here, is calculate a frequency table, so that I record the percentage of events per id and marker that fall into the cutoffs for that id and marker.

This is my attempt using some ugly nested loops... Wondering if there is a nicer and cleaner way to do this, ideally with base functions or data.table or tidyr...

My ugly code:

freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
  mk_freq <- NULL
  for (id in unique(rownames(mymat))){
    data <- mymat[rownames(mymat)==id,mk]
    min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
    max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
    ins <- length(data[data>=min & data<=max])
    freq <- ins/length(data)*100
    mk_freq <- c(mk_freq, freq)
  }
  mk_freq <- as.data.frame(mk_freq)
  names(mk_freq) <- mk
  freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
    freq_mat  a  b  c  d  e  f  g  h  i  j
id1       NA 20  0 20 40 10  0 30 10 20 30
id2       NA 10 30 30  0 20 10 10  0  0 70
id3       NA  0  0  0  0 30 10 30 10 30 60

CodePudding user response:

Something like this? Here, the sum of all cells is 100.

library(tidyverse)

set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
  id = paste0("id", rep(1:3, each = 10)),
  marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)

mymat %>%
  as_tibble(rownames = "id") %>%
  pivot_longer(-id, names_to = "marker") %>%
  left_join(cutoff_df) %>%
  filter(value <= max & value >= min) %>%
  count(id, marker) %>%
  # group_by(marker) %>% # e.g. to make sum of 100 per marker
  mutate(n = n / sum(n) * 100) %>%
  pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#>   id        a     c     d     e     g     h     i     j     b     f
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  3.77  7.55  1.89  5.66  1.89  3.77  5.66  0     0   
#> 2 id2    1.89  5.66  0     3.77  1.89  0     0    13.2   5.66  1.89
#> 3 id3    0     0     0     5.66  5.66  1.89  5.66 11.3   0     1.89

Created on 2022-03-30 by the reprex package (v2.0.0)

CodePudding user response:

Here is a solution based on the purrr package. I'm not sure that it is cleaner but it is shorter.

library(purrr)
asplit(mymat,2) |>
  imap(~{
    with(filter(cutoff_df, marker == .y),
         outer(.x, min, ">=") &
         outer(.x, max, "<") &
         outer(names(.x), id, "=="))
  }) |>
  map(rowSums) |>
  map_dfr(~tapply(.x, names(.x), FUN = sum),
          .id = "marker")

##>   # A tibble: 10 × 4
##>    marker   id1   id2   id3
##>    <chr>  <dbl> <dbl> <dbl>
##>  1 a          2     1     0
##>  2 b          0     3     0
##>  3 c          2     3     0
##>  4 d          4     0     0
##>  5 e          1     2     3
##>  6 f          0     1     1
##>  7 g          3     1     3
##>  8 h          1     0     1
##>  9 i          2     0     3
##> 10 j          3     7     6
  • Related