Home > database >  Reshaping dataframe in R and extracting the column name of the largest element in a row
Reshaping dataframe in R and extracting the column name of the largest element in a row

Time:07-26

I have the following data frame in R

        n1  n2  n3  n4  n5  n6  n7  n8  n9  n10 n11 n12 n13 n14
3557    10  5   13  3   2   8   1   9   4   7   6   12  11  14
3558    9   10  11  1   12  7   2   14  5   8   13  4   3   6
3559    6   7   2   14  11  10  12  5   13  8   1   9   3   4
3560    8   11  13  1   12  3   4   2   10  9   14  7   5   6
3561    3   9   7   8   12  5   10  2   1   6   14  11  4   13
3562    14  2   3   10  6   12  5   13  8   9   7   4   1   11
3563    13  6   5   9   7   8   10  11  2   3   4   1   12  14
3564    10  11  3   5   12  7   14  4   9   8   2   13  1   6
3565    4   8   11  7   10  3   13  14  12  2   5   9   1   6
3566    12  9   2   11  1   14  6   10  8   3   7   5   4   13
3567    11  12  6   7   8   14  10  13  3   4   9   2   1   5
3568    14  7   5   11  8   12  4   3   1   9   6   13  10  2
3569    1   11  13  2   14  6   9   7   4   12  3   5   8   10
3570    4   5   10  8   13  7   11  3   2   14  9   12  6   1
3571    6   9   13  14  12  2   3   7   10  8   1   5   4   11
3572    2   6   11  5   9   13  8   10  3   4   14  7   1   12
3573    10  13  12  9   5   14  11  6   1   3   4   8   7   2
3574    2   10  4   7   13  5   9   8   12  11  14  3   1   6

and I want to extract the column name with which it has the largest number in that row. i.e.

        Choice
3557    n14
3558    n8
3559    n4
3560    n11
3561    n11
3562    n1
3563    n14
3564    n7
3565    n8
3566    n6
3567    n6
3568    n1
3569    n5
3570    n10
3571    n4
3572    n11
3573    n6
3574    n11

Is there any quick and efficient way to do it? Thanks in advance.

CodePudding user response:

An efficient option would be max.col to find the column index for each row where the value is max and then use the index to return the column names

out <- data.frame(Choice = names(df1)[max.col(df1, "first")])
row.names(out) <- row.names(df1)

-output

> out
     Choice
3557    n14
3558     n8
3559     n4
3560    n11
3561    n11
3562     n1
3563    n14
3564     n7
3565     n8
3566     n6
3567     n6
3568     n1
3569     n5
3570    n10
3571     n4
3572    n11
3573     n6
3574    n11

data

df1 <- structure(list(n1 = c(10L, 9L, 6L, 8L, 3L, 14L, 13L, 10L, 4L, 
12L, 11L, 14L, 1L, 4L, 6L, 2L, 10L, 2L), n2 = c(5L, 10L, 7L, 
11L, 9L, 2L, 6L, 11L, 8L, 9L, 12L, 7L, 11L, 5L, 9L, 6L, 13L, 
10L), n3 = c(13L, 11L, 2L, 13L, 7L, 3L, 5L, 3L, 11L, 2L, 6L, 
5L, 13L, 10L, 13L, 11L, 12L, 4L), n4 = c(3L, 1L, 14L, 1L, 8L, 
10L, 9L, 5L, 7L, 11L, 7L, 11L, 2L, 8L, 14L, 5L, 9L, 7L), n5 = c(2L, 
12L, 11L, 12L, 12L, 6L, 7L, 12L, 10L, 1L, 8L, 8L, 14L, 13L, 12L, 
9L, 5L, 13L), n6 = c(8L, 7L, 10L, 3L, 5L, 12L, 8L, 7L, 3L, 14L, 
14L, 12L, 6L, 7L, 2L, 13L, 14L, 5L), n7 = c(1L, 2L, 12L, 4L, 
10L, 5L, 10L, 14L, 13L, 6L, 10L, 4L, 9L, 11L, 3L, 8L, 11L, 9L
), n8 = c(9L, 14L, 5L, 2L, 2L, 13L, 11L, 4L, 14L, 10L, 13L, 3L, 
7L, 3L, 7L, 10L, 6L, 8L), n9 = c(4L, 5L, 13L, 10L, 1L, 8L, 2L, 
9L, 12L, 8L, 3L, 1L, 4L, 2L, 10L, 3L, 1L, 12L), n10 = c(7L, 8L, 
8L, 9L, 6L, 9L, 3L, 8L, 2L, 3L, 4L, 9L, 12L, 14L, 8L, 4L, 3L, 
11L), n11 = c(6L, 13L, 1L, 14L, 14L, 7L, 4L, 2L, 5L, 7L, 9L, 
6L, 3L, 9L, 1L, 14L, 4L, 14L), n12 = c(12L, 4L, 9L, 7L, 11L, 
4L, 1L, 13L, 9L, 5L, 2L, 13L, 5L, 12L, 5L, 7L, 8L, 3L), n13 = c(11L, 
3L, 3L, 5L, 4L, 1L, 12L, 1L, 1L, 4L, 1L, 10L, 8L, 6L, 4L, 1L, 
7L, 1L), n14 = c(14L, 6L, 4L, 6L, 13L, 11L, 14L, 6L, 6L, 13L, 
5L, 2L, 10L, 1L, 11L, 12L, 2L, 6L)), class = "data.frame", row.names = c("3557", 
"3558", "3559", "3560", "3561", "3562", "3563", "3564", "3565", 
"3566", "3567", "3568", "3569", "3570", "3571", "3572", "3573", 
"3574"))

CodePudding user response:

dplyr option where we first reshape the data from wide to long to find which column with name has the highest value per id:

df <- read.table(text = "        n1  n2  n3  n4  n5  n6  n7  n8  n9  n10 n11 n12 n13 n14
3557    10  5   13  3   2   8   1   9   4   7   6   12  11  14
3558    9   10  11  1   12  7   2   14  5   8   13  4   3   6
3559    6   7   2   14  11  10  12  5   13  8   1   9   3   4
3560    8   11  13  1   12  3   4   2   10  9   14  7   5   6
3561    3   9   7   8   12  5   10  2   1   6   14  11  4   13
3562    14  2   3   10  6   12  5   13  8   9   7   4   1   11
3563    13  6   5   9   7   8   10  11  2   3   4   1   12  14
3564    10  11  3   5   12  7   14  4   9   8   2   13  1   6
3565    4   8   11  7   10  3   13  14  12  2   5   9   1   6
3566    12  9   2   11  1   14  6   10  8   3   7   5   4   13
3567    11  12  6   7   8   14  10  13  3   4   9   2   1   5
3568    14  7   5   11  8   12  4   3   1   9   6   13  10  2
3569    1   11  13  2   14  6   9   7   4   12  3   5   8   10
3570    4   5   10  8   13  7   11  3   2   14  9   12  6   1
3571    6   9   13  14  12  2   3   7   10  8   1   5   4   11
3572    2   6   11  5   9   13  8   10  3   4   14  7   1   12
3573    10  13  12  9   5   14  11  6   1   3   4   8   7   2
3574    2   10  4   7   13  5   9   8   12  11  14  3   1   6", header = TRUE)

library(dplyr)
library(tidyr)
library(tibble)
df %>%
  rownames_to_column('id') %>% 
  gather(Choice, highest, n1:n14) %>% 
  group_by(id) %>% 
  slice(which.max(highest)) %>%
  column_to_rownames("id") %>%
  select(-highest)
#>      Choice
#> 3557    n14
#> 3558     n8
#> 3559     n4
#> 3560    n11
#> 3561    n11
#> 3562     n1
#> 3563    n14
#> 3564     n7
#> 3565     n8
#> 3566     n6
#> 3567     n6
#> 3568     n1
#> 3569     n5
#> 3570    n10
#> 3571     n4
#> 3572    n11
#> 3573     n6
#> 3574    n11

Created on 2022-07-25 by the reprex package (v2.0.1)

  • Related