find unique rows in every column and mark them in r-CodePudding

I have a data frame with 1000s of rows with a few columns where I would like to find the unique rows and mark if each column has a unique value or not.

for example, I have a data frame that looks like the following.

data frame: structure(list(X = c("Row1", "Row2", "Row3", "Row4", "Row5", "Row6", "Row7"), Col1 = c(0L, 1L, 1L, 0L, 1L, -1L, 0L), Col2 = c(1L, 0L, 1L, 1L, 1L, 0L, -2L), Col3 = c(-1L, 0L, 1L, 0L, 0L, 1L, 1L )), class = "data.frame", row.names = c(NA, -7L))

      Col1 Col2 Col3
Row1    0   1   -1
Row2    1   0   0
Row3    1   1   1
Row4    0   1   0
Row5    1   1   0
Row6    -1  0   1
Row7    0   -2  1

and I would like to get the following data frame.

        Col1  Col2  Col3    Col1_uni    Col2_uni    Col3_uni
 Row1   0      1    -1      Col1        Col2        Col3
Row2    1      0    0       Col1        NA           NA
Row3    1      1    1       NA          NA           NA
Row4    0      1    0       NA          Col2         NA
Row5    1      1    0       NA          NA           NA
Row6    -1     0    1       Col1       Col2         Col3
Row7    0     -2    1       Col1       Col2         Col3

I have tried df=df[row.names(unique(df[,c("clo_2")])), c("clo_2")] but that doesnt really helps.I hope somebody has a easy solution for this. thanks in advance.

CodePudding user response：

We can use dplyr with rowwise():

library(dplyr)
df %>% rowwise %>%
  mutate(across(starts_with('Col'), ~ sum(c_across(starts_with('Col')) == .x), .names = "{.col}_uni"),
         across(ends_with('uni'), ~ifelse(.x==1, str_remove(deparse(substitute(.x)), '_uni$'), NA)))

# A tibble: 7 × 7
# Rowwise: 
  X      Col1  Col2  Col3 Col1_uni Col2_uni Col3_uni
  <chr> <int> <int> <int> <chr>    <chr>    <chr>   
1 Row1      0     1    -1 Col1     Col2     Col3    
2 Row2      1     0     0 Col1     NA       NA      
3 Row3      1     1     1 NA       NA       NA      
4 Row4      0     1     0 NA       Col2     NA      
5 Row5      1     1     0 NA       NA       Col3    
6 Row6     -1     0     1 Col1     Col2     Col3    
7 Row7      0    -2     1 Col1     Col2     Col3

I would also recommend the simpler "col_names-less" approach suggested by @caldwellst, for which the simplified code in my answer would be:

libarry(dplyr)

df %>% rowwise %>%
  mutate(across(starts_with('Col'),
                ~sum(c_across(starts_with('Col')) == .x) == 1,
                .names = "{.col}_uni"))

# A tibble: 7 × 7
# Rowwise: 
  X      Col1  Col2  Col3 Col1_uni Col2_uni Col3_uni
  <chr> <int> <int> <int> <lgl>    <lgl>    <lgl>   
1 Row1      0     1    -1 TRUE     TRUE     TRUE    
2 Row2      1     0     0 TRUE     FALSE    FALSE   
3 Row3      1     1     1 FALSE    FALSE    FALSE   
4 Row4      0     1     0 FALSE    TRUE     FALSE   
5 Row5      1     1     0 FALSE    FALSE    TRUE    
6 Row6     -1     0     1 TRUE     TRUE     TRUE    
7 Row7      0    -2     1 TRUE     TRUE     TRUE

CodePudding user response：

Here is an example giving logical values and not bothering with column names. You can easily amend to give the names of the vector if necessary. It's simply applying a function to check uniqueness of values in a vector row-wise.

df <- structure(list(Col1 = c(0, 1, 1, 0, 1, -1, 0),
                     Col2 = c(1, 0, 1, 1, 1, 0, -2),
                     Col3 = c(-1, 0, 1, 0, 0, 1, 1)),
                row.names = c(NA, -7L),
                class = c("tbl_df", "tbl", "data.frame"))

chk <- apply(df, 1, function(x) !x %in% x[duplicated(x)])
cbind(df, t(chk))
#>   Col1 Col2 Col3     1     2     3
#> 1    0    1   -1  TRUE  TRUE  TRUE
#> 2    1    0    0  TRUE FALSE FALSE
#> 3    1    1    1 FALSE FALSE FALSE
#> 4    0    1    0 FALSE  TRUE FALSE
#> 5    1    1    0 FALSE FALSE  TRUE
#> 6   -1    0    1  TRUE  TRUE  TRUE
#> 7    0   -2    1  TRUE  TRUE  TRUE

CodePudding user response：

A base R solution with row-wise apply

data.frame( dat, setNames( data.frame( t(apply( dat, 1, function(x){ 
   tf=!duplicated(x, fromLast=F)&!duplicated(x, fromLast=T);
   ifelse( tf,colnames(dat)[tf],NA) } ) )), paste0(colnames(dat),"_uni") ) )

     Col1 Col2 Col3 Col1_uni Col2_uni Col3_uni
Row1    0    1   -1     Col1     Col2     Col3
Row2    1    0    0     Col1     <NA>     <NA>
Row3    1    1    1     <NA>     <NA>     <NA>
Row4    0    1    0     <NA>     Col2     <NA>
Row5    1    1    0     <NA>     <NA>     Col3
Row6   -1    0    1     Col1     Col2     Col3
Row7    0   -2    1     Col1     Col2     Col3

Data

dat <- structure(list(Col1 = c(0L, 1L, 1L, 0L, 1L, -1L, 0L), Col2 = c(1L, 
0L, 1L, 1L, 1L, 0L, -2L), Col3 = c(-1L, 0L, 1L, 0L, 0L, 1L, 1L
)), class = "data.frame", row.names = c("Row1", "Row2", "Row3", 
"Row4", "Row5", "Row6", "Row7"))