Home > Enterprise >  return the name of the column with highest and 2nd highest and 3rd highest value in R
return the name of the column with highest and 2nd highest and 3rd highest value in R

Time:12-19

Struggling to get this right and hoping someone could help ...

I have a dataset in which I am trying to get the row_max to be the highest value between columns A:N, and row_min should be the 2nd largest (not zeros).

I tried the following to get row_max:

tmp$row_max = colnames(tmp[,5:9])[apply(tmp[,5:9],1,which.max)]

which.min returns a column that contains a 0

           contig  pos ref cov  A  T  C  G N row_max 
1: NW_017095466.1  130   N  41 39  2  0  0 0       A 
2: NW_017095466.1  166   N  48  0 46  2  0 0       T 
3: NW_017095466.1  427   N  52 50  0  0  2 0       A 
4: NW_017095466.1 1736   N  54 44  0 10  0 0       A 
5: NW_017095466.1 1918   N  46  0  0  3 43 0       G 
6: NW_017095466.1 2688   N  52  5  0 47  0 0       C 

I can sort the columns and select the 2nd highest, but this gives me the value, struggling to get the column_name to be returned:

apply(tmp[,5:9], 1, FUN = function(x) sort(x)[4])

Is there a neat tidyverse solution to this?

dput(tmp)
structure(list(contig = c("NW_017095466.1", "NW_017095466.1", 
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", 
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", 
"A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, 
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7f94c30168e0>)

CodePudding user response:

You can easily do this with order(), which will give you the the position of the elements if they were sorted. Use this to select the names in the right order and you will get a matrix of names sorted by the size of the elements.

library(tidyverse)

tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", "N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", "A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, -6L), class = c("data.table", "data.frame"))

tmp[5:9] %>% 
  apply(1,\(row) rev(names(.)[order(row)])) %>%
  t()
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,] "A"  "T"  "N"  "G"  "C" 
#> [2,] "T"  "C"  "N"  "G"  "A" 
#> [3,] "A"  "G"  "N"  "C"  "T" 
#> [4,] "A"  "C"  "N"  "G"  "T" 
#> [5,] "G"  "C"  "N"  "T"  "A" 
#> [6,] "C"  "A"  "N"  "G"  "T"

CodePudding user response:

Here's an approach excluding 0 for the min part. Also keep in mind that apply converts to character if the data has a mix of numeric and character, so an explicit conversion to numeric avoids odd spaces.

data.frame(tmp, t(apply(tmp, 1, function(x){ nuc_num <- as.numeric(x[5:8])
  c(max = colnames(tmp)[5:8][which.max(nuc_num)], 
    min = colnames(tmp)[5:8][which(nuc_num > 0)[which.min(nuc_num[nuc_num > 0])]]) })))
          contig  pos ref cov  A  T  C  G N max min
1 NW_017095466.1  130   N  41 39  2  0  0 0   A   T
2 NW_017095466.1  166   N  48  0 46  2  0 0   T   C
3 NW_017095466.1  427   N  52 50  0  0  2 0   A   G
4 NW_017095466.1 1736   N  54 44  0 10  0 0   A   C
5 NW_017095466.1 1918   N  46  0  0  3 43 0   G   C
6 NW_017095466.1 2688   N  52  5  0 47  0 0   C   A

With dplyr

library(dplyr)

tmp %>% 
  rowwise() %>% 
  mutate(nuc = list(names(across(A:G))), 
         max = nuc[which.max(across(A:G))], 
         nuc_nzero = list(which(across(A:G) > 0)), 
         min = nuc[nuc_nzero[which.min(across(A:G)[nuc_nzero])]], 
         nuc = NULL, nuc_nzero = NULL) %>% 
  ungroup()
# A tibble: 6 × 11
  contig           pos ref     cov     A     T     C     G     N max   min
  <chr>          <int> <chr> <int> <int> <int> <int> <int> <int> <chr> <chr>
1 NW_017095466.1   130 N        41    39     2     0     0     0 A     T
2 NW_017095466.1   166 N        48     0    46     2     0     0 T     C
3 NW_017095466.1   427 N        52    50     0     0     2     0 A     G
4 NW_017095466.1  1736 N        54    44     0    10     0     0 A     C
5 NW_017095466.1  1918 N        46     0     0     3    43     0 G     C
6 NW_017095466.1  2688 N        52     5     0    47     0     0 C     A

Data

tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1",
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N",
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L,
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L,
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")

CodePudding user response:

Here is another solution:


library(tidyverse)

tmp |> 
  mutate(row = row_number()) |> 
  group_by(row) |> 
  pivot_longer(cols = A:N) |> 
  mutate(max = name[value == max(value)]) |> 
  mutate(value = ifelse(value == 0, NA, value)) |> 
  mutate(tm = min(value, na.rm = T)) |> 
  mutate(value = ifelse(is.na(value), 0, value)) |> 
  mutate(min = name[value == tm]) |> 
  pivot_wider(names_from = name, values_from = value) |> 
  ungroup() |> 
  select(-row, -tm) |> 
  relocate(c(min, max), .after = last_col())
#> # A tibble: 6 × 11
#>   contig           pos ref     cov     A     T     C     G     N min   max  
#>   <chr>          <int> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 NW_017095466.1   130 N        41    39     2     0     0     0 T     A    
#> 2 NW_017095466.1   166 N        48     0    46     2     0     0 C     T    
#> 3 NW_017095466.1   427 N        52    50     0     0     2     0 G     A    
#> 4 NW_017095466.1  1736 N        54    44     0    10     0     0 C     A    
#> 5 NW_017095466.1  1918 N        46     0     0     3    43     0 C     G    
#> 6 NW_017095466.1  2688 N        52     5     0    47     0     0 A     C
  • Related