I trying to get a list of string values by using group_by()
clause in R. Please find a sample data below. Here is what I tried.
result <- data %>%
group_by(station) %>%
summarise(values = list(variable))
measurement_vars <- c("PRCP", "SNOW", "SNWD", "TMAX", "TMIN")
In this case, the values
column will be a list. I want to check if the values
column includes specific strings, such as measurement_vars
, by using %in%
function. The %in%
function does not check all values in the list. Therefore, I tried to unlist()
the values
; however, it did not work. My question is similar to
I want to filter the stations which include all the measurement_vars
.
data <- structure(list(station = c("ACW00011604", "ACW00011604", "ACW00011604",
"ACW00011604", "ACW00011604", "ACW00011604", "ACW00011604", "ACW00011604",
"ACW00011604", "ACW00011604", "ACW00011604", "ACW00011647", "ACW00011647",
"ACW00011647", "ACW00011647", "ACW00011647", "ACW00011647", "ACW00011647",
"AE000041196", "AE000041196", "AE000041196", "AE000041196", "AEM00041194",
"AEM00041194", "AEM00041194", "AEM00041194", "AEM00041217", "AEM00041217",
"AEM00041217", "AEM00041217"), lat = c(17.1167, 17.1167, 17.1167,
17.1167, 17.1167, 17.1167, 17.1167, 17.1167, 17.1167, 17.1167,
17.1167, 17.1333, 17.1333, 17.1333, 17.1333, 17.1333, 17.1333,
17.1333, 25.333, 25.333, 25.333, 25.333, 25.255, 25.255, 25.255,
25.255, 24.433, 24.433, 24.433, 24.433), lon = c(-61.7833, -61.7833,
-61.7833, -61.7833, -61.7833, -61.7833, -61.7833, -61.7833, -61.7833,
-61.7833, -61.7833, -61.7833, -61.7833, -61.7833, -61.7833, -61.7833,
-61.7833, -61.7833, 55.517, 55.517, 55.517, 55.517, 55.364, 55.364,
55.364, 55.364, 54.651, 54.651, 54.651, 54.651), variable = c("TMAX",
"TMIN", "PRCP", "SNOW", "SNWD", "PGTM", "WDFG", "WSFG", "WT03",
"WT08", "WT16", "TMAX", "TMIN", "PRCP", "SNOW", "SNWD", "WT03",
"WT16", "TMAX", "TMIN", "PRCP", "TAVG", "TMAX", "TMIN", "PRCP",
"TAVG", "TMAX", "TMIN", "PRCP", "TAVG"), start = c(1949, 1949,
1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1961, 1961,
1957, 1957, 1957, 1961, 1961, 1944, 1944, 1944, 1944, 1983, 1983,
1983, 1983, 1983, 1983, 1984, 1983), end = c(1949, 1949, 1949,
1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1961, 1961, 1970,
1970, 1970, 1961, 1966, 2022, 2022, 2022, 2022, 2022, 2022, 2022,
2022, 2022, 2022, 2020, 2022)), row.names = c(NA, -30L), class = c("tbl_df",
"tbl", "data.frame"))
CodePudding user response:
We need to loop over the list
. Either use lapply/sapply
or with purrr::map
library(dplyr)
library(purrr)
result %>%
filter(map_lgl(values, ~ all(measurement_vars %in% .x)))
-output
# A tibble: 2 × 2
station values
<chr> <list>
1 ACW00011604 <chr [11]>
2 ACW00011647 <chr [7]>
If we want to filter
the original data, it can be also done
data %>%
group_by(station) %>%
filter(all(measurement_vars %in% variable)) %>%
ungroup
-output
# A tibble: 18 × 6
station lat lon variable start end
<chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 ACW00011604 17.1 -61.8 TMAX 1949 1949
2 ACW00011604 17.1 -61.8 TMIN 1949 1949
3 ACW00011604 17.1 -61.8 PRCP 1949 1949
4 ACW00011604 17.1 -61.8 SNOW 1949 1949
5 ACW00011604 17.1 -61.8 SNWD 1949 1949
6 ACW00011604 17.1 -61.8 PGTM 1949 1949
7 ACW00011604 17.1 -61.8 WDFG 1949 1949
8 ACW00011604 17.1 -61.8 WSFG 1949 1949
9 ACW00011604 17.1 -61.8 WT03 1949 1949
10 ACW00011604 17.1 -61.8 WT08 1949 1949
11 ACW00011604 17.1 -61.8 WT16 1949 1949
12 ACW00011647 17.1 -61.8 TMAX 1961 1961
13 ACW00011647 17.1 -61.8 TMIN 1961 1961
14 ACW00011647 17.1 -61.8 PRCP 1957 1970
15 ACW00011647 17.1 -61.8 SNOW 1957 1970
16 ACW00011647 17.1 -61.8 SNWD 1957 1970
17 ACW00011647 17.1 -61.8 WT03 1961 1961
18 ACW00011647 17.1 -61.8 WT16 1961 1966
Or in base R
Filter(\(x) all(measurement_vars %in% x),
with(data, split(variable, station)))
$ACW00011604
[1] "TMAX" "TMIN" "PRCP" "SNOW" "SNWD" "PGTM" "WDFG" "WSFG" "WT03" "WT08" "WT16"
$ACW00011647
[1] "TMAX" "TMIN" "PRCP" "SNOW" "SNWD" "WT03" "WT16"