I have the following dataset:
structure(list(ISIN = c("AEA000201011", "AEA000201011", "AEA000201011",
"AEA000201011", "AEA000201011", "AEA000201011", "AEA000201011",
"AEA000201011", "AEA000201011", "AEA000201011", "AEA000801018",
"AEA000801018", "AEA000801018", "AEA000801018", "AEA000801018",
"AEA000801018", "AEA000801018", "AEA000801018", "AEA000801018",
"AEA000801018", "AEA001501013", "AEA001501013", "AEA001501013",
"AEA001501013", "AEA001501013", "AEA001501013", "AEA001501013",
"AEA001501013", "AEA001501013", "AEA001501013"), year = c(2010L,
2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L, 2019L,
2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L,
2019L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L,
2018L, 2019L), refin_esg = c(NA, NA, NA, NA, 48.74, 54.32, 63.81,
67.64, 71.74, 68.17, NA, NA, NA, NA, NA, NA, NA, NA, NA, 30.89,
NA, NA, NA, NA, 13.47, 14.63, 19.68, 17.51, 13.92, 15.15), refin_e = c(NA,
NA, NA, NA, 23.21, 53.02, 60.18, 18.64, 50.25, 52.21, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 20.52, NA, NA, NA, NA, 0, 0, 0, 0,
0, 0), mktcap = c(2723907, 4219977, 4576565, 9207428, 9954352,
9329029, 9765366, 9623839, 11548606, 15002191, 1918568, 2047331,
2047331, 3734126, 4655646, 3399201, 3277575, 3260324, 3856508,
5329892, 641477, 647176, 957332, 2452771, 3505987, 1571026, 1646012,
971968, 775941, 526823)), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -30L), groups = structure(list(
ISIN = c("AEA000201011", "AEA000801018", "AEA001501013"),
.rows = structure(list(1:10, 11:20, 21:30), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
I only want to keep those ISINs (group ID) which have at least 3 consecutive non-NA values. For example, if an ISIN only has two non-NA years, then the whole ISIN gets dropped. Similarly, if an ISIN has 3 non NA years, but they are not consecutive, then that ISIN gets dropped as well.
CodePudding user response:
If we are interested in only the adjacent non-NA rows of length greater than 2 for each ISIN, grouped by ISIN, loop over the refin
columns with if_all
, return a single logical vector (i.e. non-NA - TRUE for all the refin columns for a row or else FALSE), then apply rle
on the vector and check for the lengths
of TRUE value to filter
library(dplyr)
df1 %>%
group_by(ISIN) %>%
filter( with(rle(if_all(starts_with('refin'), complete.cases)),
rep(values & lengths >2, lengths))) %>%
ungroup
Or if we want the whole group having atleast one occurrence of non-NA adjacent elements > 2, wrap with any
over the rle
output
df1 %>%
group_by(ISIN) %>%
filter( any(with(rle(if_all(starts_with('refin'), complete.cases)),
values & lengths >2))) %>%
ungroup
-output
# A tibble: 20 × 5
ISIN year refin_esg refin_e mktcap
<chr> <int> <dbl> <dbl> <dbl>
1 AEA000201011 2010 NA NA 2723907
2 AEA000201011 2011 NA NA 4219977
3 AEA000201011 2012 NA NA 4576565
4 AEA000201011 2013 NA NA 9207428
5 AEA000201011 2014 48.7 23.2 9954352
6 AEA000201011 2015 54.3 53.0 9329029
7 AEA000201011 2016 63.8 60.2 9765366
8 AEA000201011 2017 67.6 18.6 9623839
9 AEA000201011 2018 71.7 50.2 11548606
10 AEA000201011 2019 68.2 52.2 15002191
11 AEA001501013 2010 NA NA 641477
12 AEA001501013 2011 NA NA 647176
13 AEA001501013 2012 NA NA 957332
14 AEA001501013 2013 NA NA 2452771
15 AEA001501013 2014 13.5 0 3505987
16 AEA001501013 2015 14.6 0 1571026
17 AEA001501013 2016 19.7 0 1646012
18 AEA001501013 2017 17.5 0 971968
19 AEA001501013 2018 13.9 0 775941
20 AEA001501013 2019 15.2 0 526823