How to remove a specific value that is based on a specific date-CodePudding

Below is a portion of my dataframe:

    Year Date        Location Tag Section domsec atDominantLocation
    2017 2017-08-13        a wnd       n      n                  1
    2017 2017-08-14        a wnd       n      n                  1
    2017 2017-08-15        a wnd       n      n                  1
    2017 2017-08-16        a wnd       n      n                  1
    2017 2017-08-17        a wnd       n      n                  1
    2017 2017-08-18        a wnd       n      n                  1
    2017 2017-08-19        a wnd       n      n                  1
    2017 2017-08-20        a wnd       n      n                  1
    2017 2017-08-21        a wnd       b      n                  0
    2017 2017-08-21        a wnd       u      n                  0
    2017 2017-08-23        a wnd       f      n                  0
    2018 2018-08-21        a wnd       f      s                  0
    2018 2018-08-18        a wnd       h      s                  0
    2018 2018-08-19        a wnd       o      s                  0
    2018 2018-08-15        a wnd       s      s                  1
    2018 2018-08-17        a wnd       s      s                  1
    2018 2018-08-14        c wnd      ss      s                  0
    2018 2018-08-16        a wnd       t      s                  0
    2018 2018-08-21        a wnd       t      s                  0
    2018 2018-08-13        c wnd      ww      s                  0
    2018 2018-08-20        a wnd       y      s                  0

The column "atDominantLocation" contains "1"s and "0"s. I want to keep all the "0"s but only keep the "1" with the earliest date. So, there should ONLY be one "1" for each tag and year and that "1" should be the earliest date in that year.

Below is my desired output:

    Year Date        Location Tag Section domsec atDominantLocation
    2017 2017-08-13        a wnd       n      n                  1
    2017 2017-08-21        a wnd       b      n                  0
    2017 2017-08-21        a wnd       u      n                  0
    2017 2017-08-23        a wnd       f      n                  0
    2018 2018-08-21        a wnd       f      s                  0
    2018 2018-08-18        a wnd       h      s                  0
    2018 2018-08-19        a wnd       o      s                  0
    2018 2018-08-15        a wnd       s      s                  1
    2018 2018-08-14        c wnd      ss      s                  0
    2018 2018-08-16        a wnd       t      s                  0
    2018 2018-08-21        a wnd       t      s                  0
    2018 2018-08-13        c wnd      ww      s                  0
    2018 2018-08-20        a wnd       y      s                  0

I've tried both the duplicate and unique functions without any success. Thanks for your help.

CodePudding user response：

We can group by 'Year', 'Tag' and create the logical expression in filter to subset the rows - create separate logical expressions and join them with | (OR) i..e. as we want all 0 values (atDominantLocation == 0) and only the row with minimum 'Date' where the 'atDominantLocation' is 1 (Date == min(Date[atDominantLocation == 1]))

library(dplyr)
df1 %>%
   mutate(Date = as.Date(Date)) %>% 
   group_by(Year, Tag) %>%
   filter(atDominantLocation == 0| 
     ( Date == min(Date[atDominantLocation == 1]))) %>%
   ungroup

-output

# A tibble: 13 × 7
    Year Date       Location Tag   Section domsec atDominantLocation
   <int> <date>     <chr>    <chr> <chr>   <chr>               <int>
 1  2017 2017-08-13 a        wnd   n       n                       1
 2  2017 2017-08-21 a        wnd   b       n                       0
 3  2017 2017-08-21 a        wnd   u       n                       0
 4  2017 2017-08-23 a        wnd   f       n                       0
 5  2018 2018-08-21 a        wnd   f       s                       0
 6  2018 2018-08-18 a        wnd   h       s                       0
 7  2018 2018-08-19 a        wnd   o       s                       0
 8  2018 2018-08-15 a        wnd   s       s                       1
 9  2018 2018-08-14 c        wnd   ss      s                       0
10  2018 2018-08-16 a        wnd   t       s                       0
11  2018 2018-08-21 a        wnd   t       s                       0
12  2018 2018-08-13 c        wnd   ww      s                       0
13  2018 2018-08-20 a        wnd   y       s                       0

data

df1 <- structure(list(Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L), Date = c("2017-08-13", 
"2017-08-14", "2017-08-15", "2017-08-16", "2017-08-17", "2017-08-18", 
"2017-08-19", "2017-08-20", "2017-08-21", "2017-08-21", "2017-08-23", 
"2018-08-21", "2018-08-18", "2018-08-19", "2018-08-15", "2018-08-17", 
"2018-08-14", "2018-08-16", "2018-08-21", "2018-08-13", "2018-08-20"
), Location = c("a", "a", "a", "a", "a", "a", "a", "a", "a", 
"a", "a", "a", "a", "a", "a", "a", "c", "a", "a", "c", "a"), 
    Tag = c("wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
    "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
    "wnd", "wnd", "wnd", "wnd", "wnd"), Section = c("n", "n", 
    "n", "n", "n", "n", "n", "n", "b", "u", "f", "f", "h", "o", 
    "s", "s", "ss", "t", "t", "ww", "y"), domsec = c("n", "n", 
    "n", "n", "n", "n", "n", "n", "n", "n", "n", "s", "s", "s", 
    "s", "s", "s", "s", "s", "s", "s"), atDominantLocation = c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
    0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
-21L))

CodePudding user response：

Another tidyverse option is to filter atDominantLocation to just the 1s. Then, sort and group by Year and Tag, then get the row with the earliest date using slice. Then, we can bind those rows back to the original dataframe, but just to the rows where atDominantLocation is 0.

library(tidyverse)

df %>% 
  filter(atDominantLocation == 1) %>% 
  arrange(Date) %>% 
  group_by(Year, Tag) %>% 
  slice(1) %>% 
  bind_rows(df %>% filter(atDominantLocation == 0)) %>% 
  arrange(Date)

Output

    Year Date       Location Tag   Section domsec atDominantLocation
   <int> <chr>      <chr>    <chr> <chr>   <chr>               <int>
 1  2017 2017-08-13 a        wnd   n       n                       1
 2  2017 2017-08-21 a        wnd   b       n                       0
 3  2017 2017-08-21 a        wnd   u       n                       0
 4  2017 2017-08-23 a        wnd   f       n                       0
 5  2018 2018-08-13 c        wnd   ww      s                       0
 6  2018 2018-08-14 c        wnd   ss      s                       0
 7  2018 2018-08-15 a        wnd   s       s                       1
 8  2018 2018-08-16 a        wnd   t       s                       0
 9  2018 2018-08-18 a        wnd   h       s                       0
10  2018 2018-08-19 a        wnd   o       s                       0
11  2018 2018-08-20 a        wnd   y       s                       0
12  2018 2018-08-21 a        wnd   f       s                       0
13  2018 2018-08-21 a        wnd   t       s                       0

Data

df <- structure(list(Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
                              2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 2018L, 
                              2018L, 2018L, 2018L, 2018L, 2018L, 2018L), 
                     Date = c("2017-08-13", "2017-08-14", "2017-08-15", "2017-08-16", "2017-08-17", "2017-08-18", 
                              "2017-08-19", "2017-08-20", "2017-08-21", "2017-08-21", "2017-08-23", 
                              "2018-08-21", "2018-08-18", "2018-08-19", "2018-08-15", "2018-08-17", 
                               "2018-08-14", "2018-08-16", "2018-08-21", "2018-08-13", "2018-08-20"
                              ), Location = c("a", "a", "a", "a", "a", "a", "a", "a", "a", 
                                              "a", "a", "a", "a", "a", "a", "a", "c", "a", "a", "c", "a"), 
                     Tag = c("wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
                             "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
                             "wnd", "wnd", "wnd", "wnd", "wnd"), 
                     Section = c("n", "n", "n", "n", "n", "n", "n", "n", "b", "u", "f", "f", "h", "o", 
                                 "s", "s", "ss", "t", "t", "ww", "y"), 
                     domsec = c("n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "s", "s", "s", 
                                "s", "s", "s", "s", "s", "s", "s"), 
                     atDominantLocation = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
                                            0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -21L))

CodePudding user response：

library(data.table)
library(magrittr)

setDT(df)

rbindlist(
  list(df[, .SD[atDominantLocation == 1][1], by = list(Year, Tag)],
       df[atDominantLocation == 0,]), use.names=TRUE) %>% 
  .[order(Tag, Date)]

#>     Year Tag       Date Location Section domsec atDominantLocation
#>  1: 2017 wnd 2017-08-13        a       n      n                  1
#>  2: 2017 wnd 2017-08-21        a       b      n                  0
#>  3: 2017 wnd 2017-08-21        a       u      n                  0
#>  4: 2017 wnd 2017-08-23        a       f      n                  0
#>  5: 2018 wnd 2018-08-13        c      ww      s                  0
#>  6: 2018 wnd 2018-08-14        c      ss      s                  0
#>  7: 2018 wnd 2018-08-15        a       s      s                  1
#>  8: 2018 wnd 2018-08-16        a       t      s                  0
#>  9: 2018 wnd 2018-08-18        a       h      s                  0
#> 10: 2018 wnd 2018-08-19        a       o      s                  0
#> 11: 2018 wnd 2018-08-20        a       y      s                  0
#> 12: 2018 wnd 2018-08-21        a       f      s                  0
#> 13: 2018 wnd 2018-08-21        a       t      s                  0

^{Created on 2022-02-09 by the reprex package (v2.0.1)}