Find out if time periods of years overlap-CodePudding

I have a dataframe that lists political mandates with their start and end year. The question I want to answer is "How many people (person_id) have had mandates that overlap in terms of years active?

I've tried sequencing the active years as vectors, and then grouping and summarising by intersecting the vectors of active years.

My input table

id	person_id	start_year	end_year
1	6	1987	1992
2	6	1989	1995

mandates_active <- mandates %>% mutate(active_years = map2(mandate_start_year, mandate_end_year, seq))

I get an additional column with the sequenced active years:

id	person_id	start_year	end_year	active_years
1	6	1987	1992	[1987, 1988, 1989 ...]
2	6	1989	1995	[1989, 1990, 1991 ...]

Then I try to group this bz person_id and summarise by intersecting the active years list, but I need two arguments for the intersect function, hence this doesn't work:

mandates_test <- mandates_active %>% group_by(person_id) %>% summarise(intersect(active_years))

My output would show which person_ids have had multiple mandates during overlapping years.

CodePudding user response：

Maybe this works for you. It shows the person_id and the overlapping years in the final result.

data.frame(df %>%
  rowwise() %>%
  summarize(id, person_id, f = seq(start_year, end_year, 1)) %>%
  group_by(person_id) %>%
  summarize(overlapping_years = list(f[duplicated(f)])))
  person_id      overlapping_years
1         6 1989, 1990, 1991, 1992
2         7
3         8                   1992

If you just want a sum of years

df %>%
  rowwise() %>%
  summarize(id, person_id, f = seq(start_year, end_year, 1)) %>%
  group_by(person_id) %>%
  summarize(overlapping_years = length(f[duplicated(f)]))
# A tibble: 3 × 2
  person_id overlapping_years
      <dbl>             <int>
1         6                 4
2         7                 0
3         8                 1

Extended Data

df <- structure(list(id = c(1L, 2L, 1L, 2L, 1L, 2L), person_id = c(6,
6, 7, 7, 8, 8), start_year = c(1987, 1989, 1987, 1993, 1987,
1992), end_year = c(1992L, 1995L, 1992L, 1995L, 1992L, 1995L)), row.names = c(NA,
6L), class = "data.frame")

CodePudding user response：

library(dplyr)
library(tidyr)

set.seed(123)
#Data example
start_year <- sample(1957:2003, 12, T)
end_year <- start_year   sample(1:4, 12,T) 
data <- data.frame(person_id = 1:12, start_year, end_year)

data
#>    person_id start_year end_year
#> 1          1       1987     1990
#> 2          2       1971     1974
#> 3          3       1970     1971
#> 4          4       1959     1963
#> 5          5       1998     1999
#> 6          6       1999     2000
#> 7          7       1993     1994
#> 8          8       1970     1973
#> 9          9       1981     1985
#> 10        10       1982     1984
#> 11        11       1983     1986
#> 12        12       1961     1963

# Unroll the intervals into registers:
pers_years <- data %>% rowwise() %>% 
  mutate(years = list(start_year:end_year)) %>% 
  unnest(years)

Number of distinct persons by year:

  summarise(persons = n_distinct(person_id))
#> # A tibble: 25 × 2
#>    years persons
#>    <int>   <int>
#>  1  1959       1
#>  2  1960       1
#>  3  1961       2
#>  4  1962       2
#>  5  1963       2
#>  6  1970       2
#>  7  1971       3
#>  8  1972       2
#>  9  1973       2
#> 10  1974       1
#> # … with 15 more rows

Number of distinct persons that have mandates that overlaps with at least other person in the dataset:

pers_years %>% 
  inner_join(
    pers_years %>% group_by(years) %>% 
      summarise(persons = n_distinct(person_id)) %>% 
      filter(persons > 1),
    by = "years") %>% 
  summarise(n_persons = n_distinct(person_id))

#> # A tibble: 1 × 1
#>   n_persons
#>       <int>
#> 1        10