I'm trying to count the unique values from a variable (ruc_pk_informante
) by two groups (year
and pn_soc_informante
) with data.table
but I get different values from dplyr
(I used tidytable
).
data.table
library(data.table)
library(tidytable)
> tt[, .(count = uniqueN(ruc_pk_informante)), by = .(sort(year),pn_soc_informante)]
# A tidytable: 4 x 3
sort pn_soc_informante count
<int> <chr> <int>
1 2012 PERSONA NATURAL 1
2 2013 PERSONA NATURAL 10 --------->????
3 2014 PERSONA NATURAL 8
4 2015 PERSONA NATURAL 1
dplyr and manually
> tt %>% arrange.(year) %>% summarise.(n_distinct.(ruc_pk_informante), .by = c(year, pn_soc_informante))
# A tidytable: 4 x 3
year pn_soc_informante V1
<int> <chr> <int>
1 2012 PERSONA NATURAL 1
2 2013 PERSONA NATURAL 17 --------> OK!
3 2014 PERSONA NATURAL 12
4 2015 PERSONA NATURAL 13
> t1 = tt %>% filter.(year == 2013)
> length(unique(t1$ruc_pk_informante)) # OK!
[1] 17
Data
# data ---------------------------------------------------------------------
tt = structure(list(ruc_pk_informante = c("R3", "R3", "R3", "R3",
"R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3",
"R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3",
"R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3",
"R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3",
"R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3",
"R3", "R3", "R3", "R3", "R3", "R5", "R5", "R7", "R7", "R7", "R7",
"C9", "C9", "R12", "R12", "R12", "R14", "R16", "R16", "R16",
"R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16",
"R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16",
"C21", "R23", "R23", "R23", "R23", "R23", "R23", "R23", "R23",
"R23", "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27",
"R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27",
"R27", "R27", "R27", "R27", "R27", "R27", "R30", "R30", "R30",
"R30", "R33", "R33", "R33", "R33", "R33", "R33", "R34", "R34",
"R34", "R34", "R37", "R41", "R41", "R41", "R41", "R41", "R41",
"R41", "R44", "R44", "R44", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45",
"R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45"
), year = c(2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2013L, 2015L, 2012L, 2013L, 2014L, 2015L,
2013L, 2015L, 2013L, 2014L, 2015L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2015L, 2015L,
2015L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2013L, 2014L, 2015L,
2015L, 2013L, 2013L, 2014L, 2014L, 2015L, 2015L, 2013L, 2013L,
2014L, 2015L, 2013L, 2013L, 2013L, 2014L, 2014L, 2015L, 2015L,
2015L, 2013L, 2014L, 2015L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L
), pn_soc_informante = c("PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL",
"PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL"
)), row.names = c(NA, -250L), class = c("tidytable", "data.table",
"data.frame"), index = structure(integer(0), "`__year`" = c(67L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 65L, 68L, 71L, 73L, 76L,
77L, 78L, 79L, 80L, 81L, 82L, 98L, 99L, 100L, 101L, 108L, 109L,
110L, 111L, 112L, 113L, 114L, 115L, 116L, 131L, 135L, 136L, 141L,
142L, 145L, 146L, 147L, 153L, 156L, 157L, 158L, 159L, 160L, 161L,
162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L,
173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L, 181L, 182L, 183L,
184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 193L, 194L,
195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L, 204L, 205L,
206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 23L, 24L, 25L,
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 69L,
74L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 102L, 103L, 104L,
117L, 118L, 119L, 120L, 121L, 132L, 137L, 138L, 143L, 148L, 149L,
154L, 214L, 215L, 216L, 217L, 218L, 219L, 220L, 221L, 222L, 223L,
224L, 225L, 226L, 227L, 228L, 229L, 230L, 231L, 232L, 233L, 234L,
235L, 236L, 237L, 238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L,
246L, 247L, 248L, 249L, 250L, 51L, 52L, 53L, 54L, 55L, 56L, 57L,
58L, 59L, 60L, 61L, 62L, 63L, 64L, 66L, 70L, 72L, 75L, 91L, 92L,
93L, 94L, 95L, 96L, 97L, 105L, 106L, 107L, 122L, 123L, 124L,
125L, 126L, 127L, 128L, 129L, 130L, 133L, 134L, 139L, 140L, 144L,
150L, 151L, 152L, 155L)))
CodePudding user response:
The sort
on the by
is breaking the order for the rows - i.e. it is rearranging the year
separately and this would result in erroneous values from other columns matching with the sorted column of 'year'. It should be either on the i
tt[order(year), .(count = uniqueN(ruc_pk_informante)),
by = .(year, pn_soc_informante)]
# A tidytable: 4 × 3
year pn_soc_informante count
<int> <chr> <int>
1 2012 PERSONA NATURAL 1
2 2013 PERSONA NATURAL 17
3 2014 PERSONA NATURAL 12
4 2015 PERSONA NATURAL 13
or do the order later
tt[, .(count = uniqueN(ruc_pk_informante)),
by = .(year, pn_soc_informante)][order(year)]
# A tidytable: 4 × 3
year pn_soc_informante count
<int> <chr> <int>
1 2012 PERSONA NATURAL 1
2 2013 PERSONA NATURAL 17
3 2014 PERSONA NATURAL 12
4 2015 PERSONA NATURAL 13
Check the output from the sorted and unsorted
> tt[, .SD, by = .(year = year,pn_soc_informante)][year == 2013]$ruc_pk_informante
[1] "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3"
[23] "R5" "R7" "C9" "R12" "R14" "R16" "R16" "R16" "R16" "R16" "R16" "C21" "R23" "R23" "R23" "R27" "R27" "R27" "R27" "R27" "R27" "R27"
[45] "R27" "R27" "R30" "R33" "R33" "R34" "R34" "R37" "R41" "R41" "R44" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
[67] "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
[89] "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
[111] "R45" "R45" "R45"
> tt[, .SD, by = .(year = sort(year),pn_soc_informante)][year == 2013]$ruc_pk_informante
[1] "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3"
[23] "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3"
[45] "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R3" "R5" "R5" "R7"
[67] "R7" "R7" "R7" "C9" "C9" "R12" "R12" "R12" "R14" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16"
[89] "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "C21" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R27" "R27" "R27" "R27"
[111] "R27" "R27" "R27"