Home > other >  Count unique values by group (data.table) doesn't work
Count unique values by group (data.table) doesn't work

Time:04-23

I'm trying to count the unique values from a variable (ruc_pk_informante) by two groups (year and pn_soc_informante) with data.table but I get different values from dplyr (I used tidytable).

data.table

library(data.table)
library(tidytable)

> tt[, .(count = uniqueN(ruc_pk_informante)), by = .(sort(year),pn_soc_informante)]
# A tidytable: 4 x 3
   sort pn_soc_informante count
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      10 --------->????
3  2014 PERSONA NATURAL       8
4  2015 PERSONA NATURAL       1

dplyr and manually

> tt %>% arrange.(year) %>% summarise.(n_distinct.(ruc_pk_informante), .by = c(year, pn_soc_informante))
# A tidytable: 4 x 3
   year pn_soc_informante    V1
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      17 --------> OK!
3  2014 PERSONA NATURAL      12
4  2015 PERSONA NATURAL      13
> t1 = tt %>% filter.(year == 2013)
> length(unique(t1$ruc_pk_informante)) # OK!
[1] 17 

Data

# data ---------------------------------------------------------------------


tt = structure(list(ruc_pk_informante = c("R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R5", "R5", "R7", "R7", "R7", "R7", 
                                     "C9", "C9", "R12", "R12", "R12", "R14", "R16", "R16", "R16", 
                                     "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", 
                                     "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", 
                                     "C21", "R23", "R23", "R23", "R23", "R23", "R23", "R23", "R23", 
                                     "R23", "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", 
                                     "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", 
                                     "R27", "R27", "R27", "R27", "R27", "R27", "R30", "R30", "R30", 
                                     "R30", "R33", "R33", "R33", "R33", "R33", "R33", "R34", "R34", 
                                     "R34", "R34", "R37", "R41", "R41", "R41", "R41", "R41", "R41", 
                                     "R41", "R44", "R44", "R44", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45"
), year = c(2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 
            2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
            2015L, 2015L, 2015L, 2013L, 2015L, 2012L, 2013L, 2014L, 2015L, 
            2013L, 2015L, 2013L, 2014L, 2015L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
            2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2015L, 2015L, 
            2015L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L, 
            2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2013L, 2014L, 2015L, 
            2015L, 2013L, 2013L, 2014L, 2014L, 2015L, 2015L, 2013L, 2013L, 
            2014L, 2015L, 2013L, 2013L, 2013L, 2014L, 2014L, 2015L, 2015L, 
            2015L, 2013L, 2014L, 2015L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L
), pn_soc_informante = c("PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL"
)), row.names = c(NA, -250L), class = c("tidytable", "data.table", 
                                        "data.frame"), index = structure(integer(0), "`__year`" = c(67L, 
                                                                                                                                                       1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
                                                                                                                                                       15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 65L, 68L, 71L, 73L, 76L, 
                                                                                                                                                       77L, 78L, 79L, 80L, 81L, 82L, 98L, 99L, 100L, 101L, 108L, 109L, 
                                                                                                                                                       110L, 111L, 112L, 113L, 114L, 115L, 116L, 131L, 135L, 136L, 141L, 
                                                                                                                                                       142L, 145L, 146L, 147L, 153L, 156L, 157L, 158L, 159L, 160L, 161L, 
                                                                                                                                                       162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L, 
                                                                                                                                                       173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L, 181L, 182L, 183L, 
                                                                                                                                                       184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 193L, 194L, 
                                                                                                                                                       195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L, 204L, 205L, 
                                                                                                                                                       206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 23L, 24L, 25L, 
                                                                                                                                                       26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
                                                                                                                                                       39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 69L, 
                                                                                                                                                       74L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 102L, 103L, 104L, 
                                                                                                                                                       117L, 118L, 119L, 120L, 121L, 132L, 137L, 138L, 143L, 148L, 149L, 
                                                                                                                                                       154L, 214L, 215L, 216L, 217L, 218L, 219L, 220L, 221L, 222L, 223L, 
                                                                                                                                                       224L, 225L, 226L, 227L, 228L, 229L, 230L, 231L, 232L, 233L, 234L, 
                                                                                                                                                       235L, 236L, 237L, 238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L, 
                                                                                                                                                       246L, 247L, 248L, 249L, 250L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 
                                                                                                                                                       58L, 59L, 60L, 61L, 62L, 63L, 64L, 66L, 70L, 72L, 75L, 91L, 92L, 
                                                                                                                                                       93L, 94L, 95L, 96L, 97L, 105L, 106L, 107L, 122L, 123L, 124L, 
                                                                                                                                                       125L, 126L, 127L, 128L, 129L, 130L, 133L, 134L, 139L, 140L, 144L, 
                                                                                                                                                       150L, 151L, 152L, 155L)))


CodePudding user response:

The sort on the by is breaking the order for the rows - i.e. it is rearranging the year separately and this would result in erroneous values from other columns matching with the sorted column of 'year'. It should be either on the i

tt[order(year), .(count = uniqueN(ruc_pk_informante)), 
     by = .(year, pn_soc_informante)]
# A tidytable: 4 × 3
   year pn_soc_informante count
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      17
3  2014 PERSONA NATURAL      12
4  2015 PERSONA NATURAL      13

or do the order later

tt[, .(count = uniqueN(ruc_pk_informante)), 
   by = .(year, pn_soc_informante)][order(year)]
# A tidytable: 4 × 3
   year pn_soc_informante count
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      17
3  2014 PERSONA NATURAL      12
4  2015 PERSONA NATURAL      13

Check the output from the sorted and unsorted

> tt[, .SD, by = .(year = year,pn_soc_informante)][year == 2013]$ruc_pk_informante
  [1] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3" 
 [23] "R5"  "R7"  "C9"  "R12" "R14" "R16" "R16" "R16" "R16" "R16" "R16" "C21" "R23" "R23" "R23" "R27" "R27" "R27" "R27" "R27" "R27" "R27"
 [45] "R27" "R27" "R30" "R33" "R33" "R34" "R34" "R37" "R41" "R41" "R44" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
 [67] "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
 [89] "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
[111] "R45" "R45" "R45"
> tt[, .SD, by = .(year = sort(year),pn_soc_informante)][year == 2013]$ruc_pk_informante
  [1] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3" 
 [23] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3" 
 [45] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R5"  "R5"  "R7" 
 [67] "R7"  "R7"  "R7"  "C9"  "C9"  "R12" "R12" "R12" "R14" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16"
 [89] "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "C21" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R27" "R27" "R27" "R27"
[111] "R27" "R27" "R27"
  • Related