Include in data frame 1 averaged values from several other data frames and based on varying time int-CodePudding

I have a data frame with several variables, and whose first columns look like this:

Place <- c(rep("PlaceA",14),rep("PlaceB",15))
Group_Id <- c(rep("A1",5),rep("A1",6),rep("A2",3),rep("B1",6),rep("B2",4),rep("B2",5))
Time <- as.Date(c("2018-01-15","2018-02-03","2018-02-27","2018-03-10","2018-03-18","2019-02-02","2019-03-01","2019-03-15","2019-03-28","2019-04-05","2019-04-12","2018-02-01",
                  "2018-03-01","2018-04-07","2018-01-17","2018-01-27","2018-02-17","2018-03-03","2018-04-02","2018-04-25","2018-03-03","2018-03-18","2018-04-08","2018-04-20",
                  "2019-01-23","2019-02-09","2019-02-27","2019-03-12","2019-03-30"))
FollowUp <- c("start",paste("week",week(ymd(Time[2:5]))),"start",paste("week",week(ymd(Time[7:11]))),"start",paste("week",week(ymd(Time[13:14]))),"start",paste("week",week(ymd(Time[16:20]))),"start",paste("week",week(ymd(Time[22:24]))),"start",paste("week",week(ymd(Time[26:29]))))
exprmt <- c(rep(1,5),rep(2,6),rep(3,3),rep(4,6),rep(5,4),rep(6,5))

> df1
    Place Group_Id       Time exprmt FollowUp
1  PlaceA       A1 2018-01-15      1    start
2  PlaceA       A1 2018-02-03      1   week 5
3  PlaceA       A1 2018-02-27      1   week 9
4  PlaceA       A1 2018-03-10      1  week 10
5  PlaceA       A1 2018-03-18      1  week 11
6  PlaceA       A1 2019-02-02      2    start
7  PlaceA       A1 2019-03-01      2   week 9
8  PlaceA       A1 2019-03-15      2  week 11
9  PlaceA       A1 2019-03-28      2  week 13
10 PlaceA       A1 2019-04-05      2  week 14
11 PlaceA       A1 2019-04-12      2  week 15
12 PlaceA       A2 2018-02-01      3    start
13 PlaceA       A2 2018-03-01      3   week 9
14 PlaceA       A2 2018-04-07      3  week 14
15 PlaceB       B1 2018-01-17      4    start
16 PlaceB       B1 2018-01-27      4   week 4
17 PlaceB       B1 2018-02-17      4   week 7
18 PlaceB       B1 2018-03-03      4   week 9
19 PlaceB       B1 2018-04-02      4  week 14
20 PlaceB       B1 2018-04-25      4  week 17
21 PlaceB       B2 2018-03-03      5    start
22 PlaceB       B2 2018-03-18      5  week 11
23 PlaceB       B2 2018-04-08      5  week 14
24 PlaceB       B2 2018-04-20      5  week 16
25 PlaceB       B2 2019-01-23      6    start
26 PlaceB       B2 2019-02-09      6   week 6
27 PlaceB       B2 2019-02-27      6   week 9
28 PlaceB       B2 2019-03-12      6  week 11
29 PlaceB       B2 2019-03-30      6  week 13

For each Place (more than 2 in my actual data), I have a separate data frame with temperature records by hours. For example:

set.seed(1032)
t <- c(seq.POSIXt(from = ISOdate(2018,01,01),to = ISOdate(2018,06,01), by = "hour"),seq.POSIXt(from = ISOdate(2019,01,01),to = ISOdate(2019,06,01), by = "hour"))
temp_A <- runif(length(t),min = 5, max = 25)
temp_B <- runif(length(t),min = 3, max = 32)
data_A <- data.frame(t,temp_A)
data_B <- data.frame(t,temp_B)

> head(data_A)
                    t   temp_A
1 2018-01-01 12:00:00 14.24961
2 2018-01-01 13:00:00 21.64925
3 2018-01-01 14:00:00 21.77058
4 2018-01-01 15:00:00 13.31673
5 2018-01-01 16:00:00 16.10350
6 2018-01-01 17:00:00 17.64567

I need to add a column in df1 with average temperature for the time interval by Place, group_Id and exprmt: the first of each group_byshould be a NaN, than I would need the average for each time interval. Knowing that for each Place, the data are also in a separate data frame.

I tried something like this, but it is not working:

df1 <- df1 %>% group_by(Place,Group_Id,exprmt) %>% mutate(
  temp = case_when(FollowUp == "start" & Place == "PlaceA" ~ NA,
                   FollowUp == FollowUp[c(2:n())] & Place == "PlaceA" ~ mean(temp_A[c(which(date(temp_A$t))==lag(Time,1):which(date(temp_A$t))==Time),2]),
                   )
)

I found information on how calculate averages over multiple dataframes (e.g. this or this), but this is not what I am looking for. I would like to do it without a loop. My expected results is (etc stand for and so on..):

> df1
    Place Group_Id       Time exprmt FollowUp                                      expected
1  PlaceA       A1 2018-01-15      1    start                                           NaN
2  PlaceA       A1 2018-02-03      1   week 5 mean temp_A between 2018-01-15 and 2018-02-03
3  PlaceA       A1 2018-02-27      1   week 9 mean temp_A between 2018-02-03 and 2018-02-27
4  PlaceA       A1 2018-03-10      1  week 10 mean temp_A between 2018-02-27 and 2018-03-10
5  PlaceA       A1 2018-03-18      1  week 11 mean temp_A between 2018-03-10 and 2018-03-18
6  PlaceA       A1 2019-02-02      2    start                                           NaN
7  PlaceA       A1 2019-03-01      2   week 9 mean temp_A between 2019-02-02 and 2019-03-01
8  PlaceA       A1 2019-03-15      2  week 11                                           etc
9  PlaceA       A1 2019-03-28      2  week 13                                           etc
10 PlaceA       A1 2019-04-05      2  week 14                                           etc
11 PlaceA       A1 2019-04-12      2  week 15                                           etc
12 PlaceA       A2 2018-02-01      3    start                                           etc
13 PlaceA       A2 2018-03-01      3   week 9                                           etc
14 PlaceA       A2 2018-04-07      3  week 14                                           etc
15 PlaceB       B1 2018-01-17      4    start                                           NaN
16 PlaceB       B1 2018-01-27      4   week 4 mean temp_B between 2018-01-17 and 2018-01-27
17 PlaceB       B1 2018-02-17      4   week 7                                           etc
18 PlaceB       B1 2018-03-03      4   week 9                                           etc
19 PlaceB       B1 2018-04-02      4  week 14                                           etc
20 PlaceB       B1 2018-04-25      4  week 17                                           etc
21 PlaceB       B2 2018-03-03      5    start                                           etc
22 PlaceB       B2 2018-03-18      5  week 11                                           etc
23 PlaceB       B2 2018-04-08      5  week 14                                           etc
24 PlaceB       B2 2018-04-20      5  week 16                                           etc
25 PlaceB       B2 2019-01-23      6    start                                           etc
26 PlaceB       B2 2019-02-09      6   week 6                                           etc
27 PlaceB       B2 2019-02-27      6   week 9                                           etc
28 PlaceB       B2 2019-03-12      6  week 11                                           etc
29 PlaceB       B2 2019-03-30      6  week 13                                           etc

Any help will be appreciated!

CodePudding user response：

Sharing the results with temperature data of 2 places. You can always generalize the same either by joining and creating a single data object (if total places are less) or use an ifelse statement.

library(data.table)
setDT(df1)
setDT(data_A) # converting to data.table
setDT(data_B) # converting to data.table

Merged temperature to have a single data object

data_AB <- merge(data_A, data_B, by = 't')

Create a lag column of Time variable based on Place, Group_Id, exprmt

df1[,':='(LAG_DATE = shift(Time, type = 'lag')), by = .(Place, Group_Id, exprmt)]

Using apply function and user defined function to subset the temperature data based on consecutive time periods and also using data.table functionality along with lapply to get the mean for those subsets

Here I have assumed Place column can somehow be joined/mapped on some condition with the temperature data. Like in the example shared temp_A/temp_B can be formed by concatenating 'temp_' and 6th character of Place column

df1[,':='(EXPECTED = apply(cbind(LAG_DATE, Time, Place), 1, function(x) {
x1 <- as.Date(as.numeric(x[1]), origin = '1970-01-01')
x2 <- as.Date(as.numeric(x[2]), origin = '1970-01-01')
Place <- as.character(x[3])
Mean_Value <- ifelse(is.na(x1), NaN, data_AB[as.Date(t) >= x1 & 
as.Date(t) <= x2, lapply(.SD, mean), .SDcols = paste('temp_', substr(Place, 6, 
6), sep = '')])
return(as.numeric(Mean_Value))
}
))]

CodePudding user response：

I suggest a detailed step-by-step solution (using data.table and lubridate libraries), probably a bit academic, but which tries not to lose the reader. So, please find below a reprex.

Reprex

1. DATA PREPARATION

library(data.table)
library(lubridate)

# Convert the dataframe 'df1' into data.table and add the dummy variable 'StartTime' 
setDT(df1)[, StartTime := shift(Time,1), by = .(Place, Group_Id, exprmt)][]
setcolorder(df1, c("Place", "Group_Id", "FollowUp", "exprmt", "StartTime", "Time"))

# What df1 looks like:
df1
#>      Place Group_Id FollowUp exprmt  StartTime       Time
#>  1: PlaceA       A1    start      1       <NA> 2018-01-15
#>  2: PlaceA       A1   week 5      1 2018-01-15 2018-02-03
#>  3: PlaceA       A1   week 9      1 2018-02-03 2018-02-27
#>  4: PlaceA       A1  week 10      1 2018-02-27 2018-03-10
#>  5: PlaceA       A1  week 11      1 2018-03-10 2018-03-18
#>  6: PlaceA       A1    start      2       <NA> 2019-02-02
#>  7: PlaceA       A1   week 9      2 2019-02-02 2019-03-01
#>  8: PlaceA       ....


# Convert 'StartTime' and 'Time' columns into class 'PosiXct'
sel_cols <- c("StartTime", "Time")
df1[, (sel_cols) := lapply(.SD, as.POSIXct, tz = "GMT"), .SDcols = sel_cols]

# Convert the dataframes 'data_A' and 'data_B' into data.tables
setDT(data_A)
setDT(data_B)

2. JOINS

# Merge 'data_A' and 'data_B' on 't'
data_merge <- merge(data_A, data_B, by = 't')

# Join 'df1' and 'data_merge' with Time > t >= StartTime, and remove unnecessary columns
DF_join_1 <- df1[data_merge, on = .(StartTime <= t,Time > t)
                 ][, `:=` (Place = NULL, Group_Id = NULL, FollowUp = NULL, exprmt = NULL, Time = NULL)
                   ][]

# Join 'DF_join_1' and 'df1' on StartTime, then remove the dummy variable StartTime and reorder columns
DF_join_2 <- DF_join_1[df1, on = .(StartTime)
                       ][, StartTime := NULL
                         ][]

setcolorder(DF_join_2, c("Place", "Group_Id", "Time", "exprmt", "FollowUp", "temp_A", "temp_B"))

3. ADD A COLUMN 'TEMP'

# Create a column 'temp' filled with 'temp_A' values when 'Place == PlaceA' and 'temp_B' values when 'Place == PlaceB'
DF_results <- DF_join_2[, temp := fcase(Place == "PlaceA", temp_A,
                                        Place == "PlaceB", temp_B)
                        ][, `:=` (temp_A = NULL, temp_B = NULL)
                          ][]

4. SUMMARIZE TO GET THE DESIRED OUTPUT

# Summarize DF_results to get the mean of 'temp' by group in the 'expected' variable
DF_results[, .(expected = mean(temp, na.rm = TRUE)), by = .(Place, Group_Id, exprmt, Time, FollowUp)]
#>      Place Group_Id exprmt       Time FollowUp  expected
#>  1: PlaceA       A1      1 2018-01-15    start       NaN
#>  2: PlaceA       A1      1 2018-02-03   week 5 10.618465
#>  3: PlaceA       A1      1 2018-02-27   week 9 15.997990
#>  4: PlaceA       A1      1 2018-03-10  week 10 14.874170
#>  5: PlaceA       A1      1 2018-03-18  week 11  8.005203
#>  6: PlaceA       A1      2 2019-02-02    start       NaN
#>  7: PlaceA       A1      2 2019-03-01   week 9 17.768572
#>  8: PlaceA       A1      2 2019-03-15  week 11  8.525002
#>  9: PlaceA       A1      2 2019-03-28  week 13 20.948760
#> 10: PlaceA       A1      2 2019-04-05  week 14 16.898529
#> 11: PlaceA       A1      2 2019-04-12  week 15  7.172799
#> 12: PlaceA       A2      3 2018-02-01    start       NaN
#> 13: PlaceA       A2      3 2018-03-01   week 9 17.521202
#> 14: PlaceA       A2      3 2018-04-07  week 14 21.653708
#> 15: PlaceB       B1      4 2018-01-17    start       NaN
#> 16: PlaceB       B1      4 2018-01-27   week 4 22.622165
#> 17: PlaceB       B1      4 2018-02-17   week 7 22.462456
#> 18: PlaceB       B1      4 2018-03-03   week 9 10.210829
#> 19: PlaceB       B1      4 2018-04-02  week 14 19.731544
#> 20: PlaceB       B1      4 2018-04-25  week 17 25.700109
#> 21: PlaceB       B2      5 2018-03-03    start       NaN
#> 22: PlaceB       B2      5 2018-03-18  week 11 19.731544
#> 23: PlaceB       B2      5 2018-04-08  week 14 16.757186
#> 24: PlaceB       B2      5 2018-04-20  week 16  5.248006
#> 25: PlaceB       B2      6 2019-01-23    start       NaN
#> 26: PlaceB       B2      6 2019-02-09   week 6  7.720195
#> 27: PlaceB       B2      6 2019-02-27   week 9 13.185666
#> 28: PlaceB       B2      6 2019-03-12  week 11  9.706857
#> 29: PlaceB       B2      6 2019-03-30  week 13 10.022071
#>      Place Group_Id exprmt       Time FollowUp  expected

^{Created on 2021-11-19 by the reprex package (v2.0.1)}