Return column header if rows values are above certain threshold-CodePudding

I have a dataframe called tt. I want to create a new column called Ethnicity where I want to have a column header for every row value that is more than 80%. If no row has one value that is greater than 80%, then I want to have string 'MIX' in that row.

tt <- structure(list(INDIVIDUAL = c("SJL0253301", "SJL1073801", "SJL1066401", 
"SJL1762813"), EUR = c(0.974378, 0.496489, 1e-05, 1e-05), EAS = c(0.010592, 
0.438799, 0.99996, 1e-05), AMR = c(0.004699, 1e-05, 1e-05, 0.99996
), SAS = c(1e-05, 0.053618, 1e-05, 1e-05), AFR = c(0.010321, 
0.011084, 1e-05, 1e-05)), row.names = c(1L, 44L, 19L, 911L), class = "data.frame")

Result I want:

INDIVIDUAL      EUR      EAS      AMR      SAS      AFR Ethnicity
SJL0253301 0.974378 0.010592 0.004699 0.000010 0.010321 EUR
SJL1073801 0.496489 0.438799 0.000010 0.053618 0.011084 MIX
SJL1066401 0.000010 0.999960 0.000010 0.000010 0.000010 EAS
SJL1762813 0.000010 0.000010 0.999960 0.000010 0.000010 AMR

CodePudding user response：

We can use max.col to return the first column index (for each row) that shows value greater than 0.8, then assign MIX for those cases where there are none to "MIX"

tt$Ethnicity <- names(tt)[-1][max.col(tt[-1] > 0.8, "first")]
tt$Ethnicity[!rowSums(tt[2:6] > 0.8)] <- "MIX"

-output

> tt
    INDIVIDUAL      EUR      EAS      AMR      SAS      AFR Ethnicity
1   SJL0253301 0.974378 0.010592 0.004699 0.000010 0.010321       EUR
44  SJL1073801 0.496489 0.438799 0.000010 0.053618 0.011084       MIX
19  SJL1066401 0.000010 0.999960 0.000010 0.000010 0.000010       EAS
911 SJL1762813 0.000010 0.000010 0.999960 0.000010 0.000010       AMR

CodePudding user response：

Another possible solution:

cbind(tt, Ethnicity = apply(tt[-1] > 0.8, 1, \(x) if (any(x)) names(x)[x] else "MIX"))

#>     INDIVIDUAL      EUR      EAS      AMR      SAS      AFR Ethnicity
#> 1   SJL0253301 0.974378 0.010592 0.004699 0.000010 0.010321       EUR
#> 44  SJL1073801 0.496489 0.438799 0.000010 0.053618 0.011084       MIX
#> 19  SJL1066401 0.000010 0.999960 0.000010 0.000010 0.000010       EAS
#> 911 SJL1762813 0.000010 0.000010 0.999960 0.000010 0.000010       AMR

CodePudding user response：

Here is a tidyverse approach:

library(dplyr)
library(tidyr)

tt %>% 
  mutate(across(-INDIVIDUAL, ~case_when(. > 0.8 ~ cur_column()), .names = "new_{.col}")) %>% 
  unite(Ethnicity, starts_with('new'), na.rm = TRUE, sep = ' ') %>% 
  mutate(Ethnicity = ifelse(Ethnicity== "", "MIX", Ethnicity))

   INDIVIDUAL      EUR      EAS      AMR      SAS      AFR Ethnicity
1   SJL0253301 0.974378 0.010592 0.004699 0.000010 0.010321       EUR
44  SJL1073801 0.496489 0.438799 0.000010 0.053618 0.011084       MIX
19  SJL1066401 0.000010 0.999960 0.000010 0.000010 0.000010       EAS
911 SJL1762813 0.000010 0.000010 0.999960 0.000010 0.000010       AMR

CodePudding user response：

Here is another option:

library(dplyr)

tt %>% 
  rowwise() %>% 
  mutate(Ethnicity = ifelse(all(c_across(-INDIVIDUAL) < 0.8), "MIX", names(which.max(across(-INDIVIDUAL))))) %>% 
  ungroup()

Output

   INDIVIDUAL      EUR      EAS      AMR      SAS      AFR Ethnicity
1   SJL0253301 0.974378 0.010592 0.004699 0.000010 0.010321       EUR
44  SJL1073801 0.496489 0.438799 0.000010 0.053618 0.011084       MIX
19  SJL1066401 0.000010 0.999960 0.000010 0.000010 0.000010       EAS
911 SJL1762813 0.000010 0.000010 0.999960 0.000010 0.000010       AMR