Home > Mobile >  Keep first value of variable in grouped data in R
Keep first value of variable in grouped data in R

Time:10-11

I have the following data:

structure(list(n_historia = c(6815084L, 6815084L, 6815084L, 6815084L, 
6815084L, 6815084L, 6832636L, 6832636L, 6832636L, 6832636L), 
    from = structure(c(1550579100, 1550579160, 1550919120, 1553805900, 
    1555284240, 1555797180, 1554968160, 1558981560, 1560778200, 
    1564667100), tzone = "UTC", class = c("POSIXct", "POSIXt"
    )), to = structure(c(1550579160, 1550919120, 1553805900, 
    1553805900, 1555284240, 1555797180, 1558981560, 1558981560, 
    1561056480, 1564704240), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), cama = c("UCN17", "UCN07", "UCN18", "NEN02", 
    "UCN16", "NEN03", "UCN17", "NEN10", "NEN27", "NEN09"), box = c("UCN-BOX-3", 
    "UCN-BOX-2", "UCN-BOX-3", "NEN-BOX-1", "UCN-BOX-3", "NEN-BOX-1", 
    "UCN-BOX-3", "NEN-BOX-1", "NEN-BOX-3", "NEN-BOX-1"), fecha_primer_pos = structure(c(17935, 
    17935, 17935, 17935, 17935, 17935, 18100, 18100, 18100, 18100
    ), class = "Date"), fecha_ultimo_pos = structure(c(17981, 
    17981, 17981, 17981, 17981, 17981, 18100, 18100, 18100, 18100
    ), class = "Date")), row.names = c(NA, -10L), class = "data.frame")

And I wonder if there is a function to keep the first value in grouped data and set NA in the rest of them, to get something like this:

structure(list(n_historia = c(6815084L, 6815084L, 6815084L, 6815084L, 
6815084L, 6815084L, 6832636L, 6832636L, 6832636L, 6832636L), 
    from = structure(c(1550579100, 1550579160, 1550919120, 1553805900, 
    1555284240, 1555797180, 1554968160, 1558981560, 1560778200, 
    1564667100), tzone = "UTC", class = c("POSIXct", "POSIXt"
    )), to = structure(c(1550579160, 1550919120, 1553805900, 
    1553805900, 1555284240, 1555797180, 1558981560, 1558981560, 
    1561056480, 1564704240), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), cama = c("UCN17", "UCN07", "UCN18", "NEN02", 
    "UCN16", "NEN03", "UCN17", "NEN10", "NEN27", "NEN09"), box = c("UCN-BOX-3", 
    "UCN-BOX-2", "UCN-BOX-3", "NEN-BOX-1", "UCN-BOX-3", "NEN-BOX-1", 
    "UCN-BOX-3", "NEN-BOX-1", "NEN-BOX-3", "NEN-BOX-1"), fecha_primer_pos = structure(c(17935, 
    NA, NA, NA, NA, NA, 18100, NA, NA, NA
    ), class = "Date"), fecha_ultimo_pos = structure(c(17981, 
    NA, NA, NA, NA, NA, 18100, NA, NA, NA
    ), class = "Date")), row.names = c(NA, -10L), class = "data.frame")

CodePudding user response:

Using if_else, check if it is the first row, then update the columns:

library(dplyr)

dd %>% 
  group_by(n_historia) %>% 
  mutate(fecha_primer_pos_new = if_else(row_number() == 1, fecha_primer_pos, as.Date(NA)),
         fecha_ultimo_pos_new = if_else(row_number() == 1, fecha_ultimo_pos, as.Date(NA))) %>% 
  #selecting relevant columns for this Q&A
  select(n_historia, fecha_primer_pos_new, fecha_ultimo_pos_new)

# # A tibble: 10 x 3
# # Groups:   n_historia [2]
#    n_historia fecha_primer_pos_new fecha_ultimo_pos_n~
#         <int> <date>               <date>             
#  1    6815084 2019-02-08           2019-03-26         
#  2    6815084 NA                   NA                 
#  3    6815084 NA                   NA                 
#  4    6815084 NA                   NA                 
#  5    6815084 NA                   NA                 
#  6    6815084 NA                   NA                 
#  7    6832636 2019-07-23           2019-07-23         
#  8    6832636 NA                   NA                 
#  9    6832636 NA                   NA                 
# 10    6832636 NA                   NA                 

CodePudding user response:

You can use duplicated -

df[duplicated(df$n_historia), c('fecha_primer_pos', 'fecha_ultimo_pos')] <- NA
df

#   n_historia                from                  to  cama       box fecha_primer_pos fecha_ultimo_pos
#1     6815084 2019-02-19 12:25:00 2019-02-19 12:26:00 UCN17 UCN-BOX-3       2019-02-08       2019-03-26
#2     6815084 2019-02-19 12:26:00 2019-02-23 10:52:00 UCN07 UCN-BOX-2             <NA>             <NA>
#3     6815084 2019-02-23 10:52:00 2019-03-28 20:45:00 UCN18 UCN-BOX-3             <NA>             <NA>
#4     6815084 2019-03-28 20:45:00 2019-03-28 20:45:00 NEN02 NEN-BOX-1             <NA>             <NA>
#5     6815084 2019-04-14 23:24:00 2019-04-14 23:24:00 UCN16 UCN-BOX-3             <NA>             <NA>
#6     6815084 2019-04-20 21:53:00 2019-04-20 21:53:00 NEN03 NEN-BOX-1             <NA>             <NA>
#7     6832636 2019-04-11 07:36:00 2019-05-27 18:26:00 UCN17 UCN-BOX-3       2019-07-23       2019-07-23
#8     6832636 2019-05-27 18:26:00 2019-05-27 18:26:00 NEN10 NEN-BOX-1             <NA>             <NA>
#9     6832636 2019-06-17 13:30:00 2019-06-20 18:48:00 NEN27 NEN-BOX-3             <NA>             <NA>
#10    6832636 2019-08-01 13:45:00 2019-08-02 00:04:00 NEN09 NEN-BOX-1             <NA>             <NA>

In dplyr you can take help of replace -

library(dplyr)

df %>%
  group_by(n_historia) %>%
  mutate(across(starts_with('fecha'), replace, -1, NA)) %>%
  ungroup
  • Related