Create subset of data using conditions from another data frame-CodePudding

I would like to use something like dplyr to create a subset of data from one data frame using conditions from another data frame. So in one data frame I have a set of data with minimum and maximum years and other sea-level data lsp , and in another frame I have a time series of ocean dynamics. For each row in the lsp dataframe, I would like to extract every year between the minimum and maximum ages in the dynamics data frame and create a sub set of data. I think this will require a for loop. Does anyone have any idea if this is possible?

Desired output using row 1 of LSP as an example:

Row 1 LSP (simplified) is:

Age min	Age max
1997	2007

I want to use this information to create a data frame like this from the dynamics file:

Subset

Year	Dynamics
1997	125
1998	109
1999	152
2000	161
2001	106
2002	120
2003	58
2004	68
2005	110
2006	144
2007	100

Many thanks


## LSP data
structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5, 
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0, 
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31, 
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889, 
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929, 
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033, 
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544, 
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594, 
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044, 
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232, 
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966, 
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898, 
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926, 
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014, 
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824, 
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA, 
-20L))

## Dynamics (only head)

structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732, 
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA, 
6L), class = "data.frame")

CodePudding user response：

As long as your dataset isn't huge, I would take something like the following approach.

Add the (nested) dynamics dataset to each row of your lsp dataset
Unnest the dynamics dataset to get one row per year
Filter out years that aren't relevant

(Optional)

Renest the dynamics columns to you have one row per lsp record with a tibble for all relevant years from the dynamics set.

lsp %>%
    add_column(dynamics %>% nest(data = everything())) %>%
    unnest(data) %>%
    filter(year >= min & year <= max) %>%
    nest(filtered = c(year, value))

CodePudding user response：

Here is a base R option with Map and subset -

Map(function(x, y) subset(dynamics, Year >= x & Year <= y), 
                   LSP$Age.min, LSP$Age.max)

The same logic can be implemented using tidyverse functions as well.

library(dplyr)
library(purrr)

map2(LSP$Age.min, LSP$Age.max, ~dynamics %>% filter(Year >= .x & Year <= .y))

CodePudding user response：

I guess this does what you want to do. First assign names to your input data, so later you know what my codes mean.

lsp <- structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5, 
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0, 
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31, 
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889, 
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929, 
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033, 
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544, 
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594, 
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044, 
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232, 
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966, 
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898, 
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926, 
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014, 
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824, 
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA, 
-20L))

dynamics <- structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732, 
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA, 
6L), class = "data.frame")

Then the actual codes to get the subset.

# first get info of years from the "lsp" dataset
# following your example in your comments
year_min  <- list()
year_max  <- list()
all_years <- list()

for(i in 1:nrow(lsp)){
  year_min[[i]] <- round(lsp$Age.min[[i]])
  year_max[[i]] <- round(lsp$Age.max[[i]])
  all_years[[i]] <- c(year_min[[i]]:year_max[[i]])
  all_years[[i]] <- as.data.frame(all_years[[i]])
  colnames(all_years[[i]]) <- "Year"
}

# now join the info on "Year" from "lsp" data with "dynamics" data to get the subset
library(dplyr)

subset_output <- list()

for (i in 1:length(all_years)){
  subset_output[[i]] <- left_join(dynamics,all_years[[i]])
}