Struggling to get this right and hoping someone could help ...
I have a dataset in which I am trying to get the row_max to be the highest value between columns A:N, and row_min should be the 2nd largest (not zeros).
I tried the following to get row_max:
tmp$row_max = colnames(tmp[,5:9])[apply(tmp[,5:9],1,which.max)]
which.min
returns a column that contains a 0
contig pos ref cov A T C G N row_max
1: NW_017095466.1 130 N 41 39 2 0 0 0 A
2: NW_017095466.1 166 N 48 0 46 2 0 0 T
3: NW_017095466.1 427 N 52 50 0 0 2 0 A
4: NW_017095466.1 1736 N 54 44 0 10 0 0 A
5: NW_017095466.1 1918 N 46 0 0 3 43 0 G
6: NW_017095466.1 2688 N 52 5 0 47 0 0 C
I can sort the columns and select the 2nd highest, but this gives me the value, struggling to get the column_name to be returned:
apply(tmp[,5:9], 1, FUN = function(x) sort(x)[4])
Is there a neat tidyverse solution to this?
dput(tmp)
structure(list(contig = c("NW_017095466.1", "NW_017095466.1",
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N",
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L,
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L,
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A",
"A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA,
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7f94c30168e0>)
CodePudding user response:
You can easily do this with order()
, which will give you the the position of the elements if they were sorted. Use this to select the names in the right order and you will get a matrix of names sorted by the size of the elements.
library(tidyverse)
tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", "N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", "A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, -6L), class = c("data.table", "data.frame"))
tmp[5:9] %>%
apply(1,\(row) rev(names(.)[order(row)])) %>%
t()
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] "A" "T" "N" "G" "C"
#> [2,] "T" "C" "N" "G" "A"
#> [3,] "A" "G" "N" "C" "T"
#> [4,] "A" "C" "N" "G" "T"
#> [5,] "G" "C" "N" "T" "A"
#> [6,] "C" "A" "N" "G" "T"
CodePudding user response:
Here's an approach excluding 0
for the min
part. Also keep in mind that apply
converts to character
if the data has a mix of numeric
and character
, so an explicit conversion to numeric avoids odd spaces.
data.frame(tmp, t(apply(tmp, 1, function(x){ nuc_num <- as.numeric(x[5:8])
c(max = colnames(tmp)[5:8][which.max(nuc_num)],
min = colnames(tmp)[5:8][which(nuc_num > 0)[which.min(nuc_num[nuc_num > 0])]]) })))
contig pos ref cov A T C G N max min
1 NW_017095466.1 130 N 41 39 2 0 0 0 A T
2 NW_017095466.1 166 N 48 0 46 2 0 0 T C
3 NW_017095466.1 427 N 52 50 0 0 2 0 A G
4 NW_017095466.1 1736 N 54 44 0 10 0 0 A C
5 NW_017095466.1 1918 N 46 0 0 3 43 0 G C
6 NW_017095466.1 2688 N 52 5 0 47 0 0 C A
With dplyr
library(dplyr)
tmp %>%
rowwise() %>%
mutate(nuc = list(names(across(A:G))),
max = nuc[which.max(across(A:G))],
nuc_nzero = list(which(across(A:G) > 0)),
min = nuc[nuc_nzero[which.min(across(A:G)[nuc_nzero])]],
nuc = NULL, nuc_nzero = NULL) %>%
ungroup()
# A tibble: 6 × 11
contig pos ref cov A T C G N max min
<chr> <int> <chr> <int> <int> <int> <int> <int> <int> <chr> <chr>
1 NW_017095466.1 130 N 41 39 2 0 0 0 A T
2 NW_017095466.1 166 N 48 0 46 2 0 0 T C
3 NW_017095466.1 427 N 52 50 0 0 2 0 A G
4 NW_017095466.1 1736 N 54 44 0 10 0 0 A C
5 NW_017095466.1 1918 N 46 0 0 3 43 0 G C
6 NW_017095466.1 2688 N 52 5 0 47 0 0 C A
Data
tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1",
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N",
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L,
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L,
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")
CodePudding user response:
Here is another solution:
library(tidyverse)
tmp |>
mutate(row = row_number()) |>
group_by(row) |>
pivot_longer(cols = A:N) |>
mutate(max = name[value == max(value)]) |>
mutate(value = ifelse(value == 0, NA, value)) |>
mutate(tm = min(value, na.rm = T)) |>
mutate(value = ifelse(is.na(value), 0, value)) |>
mutate(min = name[value == tm]) |>
pivot_wider(names_from = name, values_from = value) |>
ungroup() |>
select(-row, -tm) |>
relocate(c(min, max), .after = last_col())
#> # A tibble: 6 × 11
#> contig pos ref cov A T C G N min max
#> <chr> <int> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 NW_017095466.1 130 N 41 39 2 0 0 0 T A
#> 2 NW_017095466.1 166 N 48 0 46 2 0 0 C T
#> 3 NW_017095466.1 427 N 52 50 0 0 2 0 G A
#> 4 NW_017095466.1 1736 N 54 44 0 10 0 0 C A
#> 5 NW_017095466.1 1918 N 46 0 0 3 43 0 C G
#> 6 NW_017095466.1 2688 N 52 5 0 47 0 0 A C