Filter by months when overlapping two years in r-CodePudding

I am struggling to find a solution.

I have the following gridded samples datasets (I paste different parts of my dataset)

First one: structure(list(gid = c("117765", "117765", "117765", "117765", 
    "117765", "117765", "117765", "117765", "117765", "117765", "117765", 
    "117765", "117765", "117765", "117765", "117765", "117765", "117765", 
    "117765", "117765", "117765", "117765", "117765", "117765"), 
        country = c("Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
        "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)"
        ), Time = structure(c(3667, 3697, 3727, 3758, 3788, 3819, 
        3849, 3880, 3911, 3941, 3972, 4002, 4033, 4063, 4092, 4123, 
        4153, 4184, 4214, 4245, 4276, 4306, 4337, 4367), class = "Date"), 
        Month = c("01", "02", "03", "04", "05", "06", "07", "08", 
        "09", "10", "11", "12", "01", "02", "03", "04", "05", "06", 
        "07", "08", "09", "10", "11", "12"), SPEI1 = c(-0.702853560447693, 
        2.77506303787231, -1.38380765914917, -0.474617034196854, 
        0.610002398490906, -0.389719426631927, 2.31887650489807, 
        1.47994863986969, 1.66277933120728, 0.399970233440399, -1.47139978408813, 
        -0.435711354017258, -0.510784149169922, -1.15937781333923, 
        0.523077189922333, -0.161062479019165, -0.481528997421265, 
        -1.71726500988007, -1.77663195133209, 0.765306115150452, 
        -0.774405002593994, -0.197176232933998, -1.47615599632263, 
        -0.388415157794952), growstart = c(10, 10, 10, 10, 10, 10, 
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
        10, 10, 10), growend = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), maincrop = c(28, 
        28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 
        28, 28, 28, 28, 28, 28, 28, 28)), row.names = 572161:572184, class = "data.frame")

Second one = structure(list(gid = c("100468", "100468", "100468", "100468", 
"100468", "100468", "100468", "100468", "100468", "100468", "100468", 
"100468", "100468", "100468", "100468", "100468", "100468", "100468", 
"100468", "100468", "100468", "100468", "100468", "100468"), 
    country = c("Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
    "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
    "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
    "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
    "Namibia"), Time = structure(c(3667, 3697, 3727, 3758, 3788, 
    3819, 3849, 3880, 3911, 3941, 3972, 4002, 4033, 4063, 4092, 
    4123, 4153, 4184, 4214, 4245, 4276, 4306, 4337, 4367), class = "Date"), 
    SPEI1 = c(-1.95947802066803, 0.557283878326416, 1.77989518642426, 
    -1.2029390335083, -0.119278997182846, 1.44610369205475, -1.4578732252121, 
    -1.14002466201782, 1.1647777557373, -1.34318947792053, -0.500527501106262, 
    1.50793671607971, -1.45792877674103, -2.00679230690002, -1.51340460777283, 
    -1.9636687040329, -1.40127754211426, -0.182968750596046, 
    0.295145452022552, 0.630711793899536, -0.166128441691399, 
    -0.55840003490448, -2.62139987945557, -1.74482023715973), 
    growstart = c(12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), growend = c(4, 
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
    4, 4, 4, 4), maincrop = c(52, 52, 52, 52, 52, 52, 52, 52, 
    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 
    52), Month = c("01", "02", "03", "04", "05", "06", "07", 
    "08", "09", "10", "11", "12", "01", "02", "03", "04", "05", 
    "06", "07", "08", "09", "10", "11", "12")), row.names = 385:408, class = "data.frame")

Third one: structure(list(gid = c("117770", "117770", "117770", "117770", 
"117770", "117770", "117770", "117770", "117770", "117770", "117770", 
"117770", "117770", "117770", "117770", "117770", "117770", "117770", 
"117770", "117770", "117770", "117770", "117770", "117770"), 
    country = c("Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
    "Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)"
    ), Time = structure(c(3667, 3697, 3727, 3758, 3788, 3819, 
    3849, 3880, 3911, 3941, 3972, 4002, 4033, 4063, 4092, 4123, 
    4153, 4184, 4214, 4245, 4276, 4306, 4337, 4367), class = "Date"), 
    SPEI1 = c(0.649401307106018, 1.423499584198, -2.04273128509521, 
    0.271935135126114, 0.616238355636597, -1.03605198860168, 
    1.6733535528183, 1.78166878223419, 1.87084305286407, 1.10145688056946, 
    -1.23061907291412, -1.64128601551056, -1.00736439228058, 
    -1.91670513153076, 1.09841585159302, 0.464365869760513, 1.01759243011475, 
    -1.08844792842865, -0.508061945438385, -0.196570366621017, 
    -0.805905878543854, 0.117944374680519, -0.862984955310822, 
    -1.63738548755646), growstart = c(5, 5, 5, 5, 5, 5, 5, 5, 
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), growend = c(12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
    12, 12, 12, 12, 12, 12, 12, 12), maincrop = c(37, 37, 37, 
    37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 
    37, 37, 37, 37, 37, 37), Month = c("01", "02", "03", "04", 
    "05", "06", "07", "08", "09", "10", "11", "12", "01", "02", 
    "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"
    )), row.names = 574081:574104, class = "data.frame")

I have a column 'main crop' that indicates the main crop in my cell (gid) and the growing season months (growstart and growend).

For each crop/gid/country/Time, I need to select the rows corresponding to the growing season.

The issue I am struggling with is that some growing season months overlap two years and I don't know how to filter based on this condition?

Example in the first sample (overlapping two years): For crop 28, the starting growing month is 10(October from the previous year) and it ends in 1(January from the following year). So for this crop according to the gid/country/Time, I will get the rows 1980-01-16 for year 1980 (because I don't have the data for 1979) and the rows between 1980-10-16 and 1981-01-16 for 1981.

Example in the second sample (overlapping two years): For crop 52, the starting growing month is 12(December from the previous year) and it ends in 4(April from the following year). So for this crop according to the gid/country/Time, I will only get the rows between 1980-01-16 and 1980-04-16 for year 1980 the rows between 1980-12-16 and 1981-04-16 for 1981.

Example in the second sample (no overlap): For crop 37, the starting growing month is 5(May of the current year) and it ends in 12(December of the current year). I will only get the rows between 1980-05-16 and 1980-05-16 for year 1980 between 1981-05-16 and 1981-12-16 for year 1981.

Please note that I have a dataframe of more than 3 millions observations and different type of crops/growing season. Hence the 3 samples.

I would really appreciate an automated way to get what I want.

Thank you !!

PS: 10 coffees for the helper that would save my sanity.

CodePudding user response：

Combined all into one tibble.

library(tidyverse)
library(lubridate)

crops %>% 
  mutate(Month = month(Time)) %>% 
  mutate(Year = year(Time), Overlap = growend - growstart < 0) %>% 
  rowwise() %>% 
  filter(
    (Overlap & !(Month %in% (growend 1):(growstart-1))) |
    (!Overlap & Month %in% growstart:growend)
  ) %>% 
  select(names(crops))
#> # A tibble: 34 × 8
#> # Rowwise: 
#>    gid    country             Time       Month  SPEI1 growstart growend maincrop
#>    <chr>  <chr>               <date>     <dbl>  <dbl>     <dbl>   <dbl>    <dbl>
#>  1 117765 Congo, Democratic … 1980-01-16     1 -0.703        10       1       28
#>  2 117765 Congo, Democratic … 1980-10-16    10  0.400        10       1       28
#>  3 117765 Congo, Democratic … 1980-11-16    11 -1.47         10       1       28
#>  4 117765 Congo, Democratic … 1980-12-16    12 -0.436        10       1       28
#>  5 117765 Congo, Democratic … 1981-01-16     1 -0.511        10       1       28
#>  6 117765 Congo, Democratic … 1981-10-16    10 -0.197        10       1       28
#>  7 117765 Congo, Democratic … 1981-11-16    11 -1.48         10       1       28
#>  8 117765 Congo, Democratic … 1981-12-16    12 -0.388        10       1       28
#>  9 100468 Namibia             1980-01-16     1 -1.96         12       4       52
#> 10 100468 Namibia             1980-02-15     2  0.557        12       4       52
#> # … with 24 more rows

Data used:

crops <- structure(list(gid = c("117765", "117765", "117765", "117765", 
"117765", "117765", "117765", "117765", "117765", "117765", "117765", 
"117765", "117765", "117765", "117765", "117765", "117765", "117765", 
"117765", "117765", "117765", "117765", "117765", "117765", "100468", 
"100468", "100468", "100468", "100468", "100468", "100468", "100468", 
"100468", "100468", "100468", "100468", "100468", "100468", "100468", 
"100468", "100468", "100468", "100468", "100468", "100468", "100468", 
"100468", "100468", "117770", "117770", "117770", "117770", "117770", 
"117770", "117770", "117770", "117770", "117770", "117770", "117770", 
"117770", "117770", "117770", "117770", "117770", "117770", "117770", 
"117770", "117770", "117770", "117770", "117770"), country = c("Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Namibia", "Namibia", 
"Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
"Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
"Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", 
"Namibia", "Namibia", "Namibia", "Namibia", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)", "Congo, Democratic Republic of (Zaire)", 
"Congo, Democratic Republic of (Zaire)"), Time = structure(c(3667, 
3697, 3727, 3758, 3788, 3819, 3849, 3880, 3911, 3941, 3972, 4002, 
4033, 4063, 4092, 4123, 4153, 4184, 4214, 4245, 4276, 4306, 4337, 
4367, 3667, 3697, 3727, 3758, 3788, 3819, 3849, 3880, 3911, 3941, 
3972, 4002, 4033, 4063, 4092, 4123, 4153, 4184, 4214, 4245, 4276, 
4306, 4337, 4367, 3667, 3697, 3727, 3758, 3788, 3819, 3849, 3880, 
3911, 3941, 3972, 4002, 4033, 4063, 4092, 4123, 4153, 4184, 4214, 
4245, 4276, 4306, 4337, 4367), class = "Date"), Month = c("01", 
"02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", 
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", 
"12", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", 
"11", "12", "01", "02", "03", "04", "05", "06", "07", "08", "09", 
"10", "11", "12", "01", "02", "03", "04", "05", "06", "07", "08", 
"09", "10", "11", "12", "01", "02", "03", "04", "05", "06", "07", 
"08", "09", "10", "11", "12"), SPEI1 = c(-0.702853560447693, 
2.77506303787231, -1.38380765914917, -0.474617034196854, 0.610002398490906, 
-0.389719426631927, 2.31887650489807, 1.47994863986969, 1.66277933120728, 
0.399970233440399, -1.47139978408813, -0.435711354017258, -0.510784149169922, 
-1.15937781333923, 0.523077189922333, -0.161062479019165, -0.481528997421265, 
-1.71726500988007, -1.77663195133209, 0.765306115150452, -0.774405002593994, 
-0.197176232933998, -1.47615599632263, -0.388415157794952, -1.95947802066803, 
0.557283878326416, 1.77989518642426, -1.2029390335083, -0.119278997182846, 
1.44610369205475, -1.4578732252121, -1.14002466201782, 1.1647777557373, 
-1.34318947792053, -0.500527501106262, 1.50793671607971, -1.45792877674103, 
-2.00679230690002, -1.51340460777283, -1.9636687040329, -1.40127754211426, 
-0.182968750596046, 0.295145452022552, 0.630711793899536, -0.166128441691399, 
-0.55840003490448, -2.62139987945557, -1.74482023715973, 0.649401307106018, 
1.423499584198, -2.04273128509521, 0.271935135126114, 0.616238355636597, 
-1.03605198860168, 1.6733535528183, 1.78166878223419, 1.87084305286407, 
1.10145688056946, -1.23061907291412, -1.64128601551056, -1.00736439228058, 
-1.91670513153076, 1.09841585159302, 0.464365869760513, 1.01759243011475, 
-1.08844792842865, -0.508061945438385, -0.196570366621017, -0.805905878543854, 
0.117944374680519, -0.862984955310822, -1.63738548755646), growstart = c(10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5), growend = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
12, 12), maincrop = c(28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 52, 52, 
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 
52, 52, 52, 52, 52, 52, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -72L))

CodePudding user response：

One approach could be to use dplyr, which allows you to group and filter. An important aspect then is the definition of the seasons, which depends on whether it is a season with or without overlap. It is important to note that this approach implicitly assumes that the growing season is constant at the gid, country and maincrop level.

I think one problem with your approach is also that the variable "Month" is stored as a character rather than a numeric variable, which does not allow you to subset the information properly.

In the following code, I have combined your three subsets of data into one data.frame.

library(dplyr)

df <- rbind(one, two, three)
df$Month <- as.numeric(df$Month)

filtered_df <- df %>%
  group_by(gid, country, maincrop) %>%
  filter(Month %in% unique(ifelse(growend <= growstart,
                                  unique(c(seq(growstart[1], 12), seq(1, growend[1]))),
                                  seq(growstart[1], growend[1])))) %>%
  ungroup()

CodePudding user response：

A simple solution could be using some checks between.

If you want to use this on the whole dataset you group by crop.

df |> 
    filter(ifelse(growstart > growend,
                  between(as.numeric(Month), 1, min(growend)) | between(as.numeric(Month), min(growstart), 12),
                  between(as.numeric(Month), 1, min(growend)) & between(as.numeric(Month), min(growstart), 12)))

      gid country       Time      SPEI1 growstart growend maincrop Month
1  100468 Namibia 1980-01-16 -1.9594780        12       4       52    01
2  100468 Namibia 1980-02-15  0.5572839        12       4       52    02
3  100468 Namibia 1980-03-16  1.7798952        12       4       52    03
4  100468 Namibia 1980-04-16 -1.2029390        12       4       52    04
5  100468 Namibia 1980-12-16  1.5079367        12       4       52    12
6  100468 Namibia 1981-01-16 -1.4579288        12       4       52    01
7  100468 Namibia 1981-02-15 -2.0067923        12       4       52    02
8  100468 Namibia 1981-03-16 -1.5134046        12       4       52    03
9  100468 Namibia 1981-04-16 -1.9636687        12       4       52    04
10 100468 Namibia 1981-12-16 -1.7448202        12       4       52    12