Previously today I posted this question here but I have an additional question that has been raised.
If I had to more variables in original data frame (loc, height):
var = c(rep("A",3),rep("B",2),rep("C",5));var
date = c(as.Date("2022/01/01"),as.Date("2022/02/01"),as.Date("2022/03/01"),
as.Date("2022/01/01"),as.Date("2022/03/01"),
as.Date("2022/01/01"),as.Date("2022/01/01"),as.Date("2022/02/01"),as.Date("2022/02/01"),as.Date("2022/03/01"))
loc = c(rep("london",3),rep("berlin",2),rep("cairo",5))
height =c(13,14,15,13,15,16,12,14,13,15)
data = tibble(var,date,loc,height);data
how can I keep the corresponding values of the loc variable and the total (sum) of height in each month (in addition in the previously answered question) ?
Ideally it must look now like this :
var | Quarter | Month | Condition | loc | height |
---|---|---|---|---|---|
A | 1 | 1 | TRUE | London | 13 |
A | 1 | 2 | TRUE | London | 14 |
A | 1 | 3 | TRUE | London | 15 |
B | 1 | 1 | TRUE | berlin | 13 |
B | 1 | 2 | FALSE | berlin | 0 |
B | 1 | 3 | TRUE | berlin | 15 |
C | 1 | 1 | TRUE | cairo | 28 |
C | 1 | 2 | TRUE | cairo | 27 |
C | 1 | 3 | TRUE | cairo | 15 |
any help ? how I do it in R using dplyr ?
CodePudding user response:
From the previous solution, add .keep_all = TRUE
in distinct
and then fill
the loc
column with the previous non-NA value
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
mutate(month = lubridate::month(date)) %>%
group_by(var, month) %>%
mutate(height = sum(height)) %>%
ungroup %>%
complete(var, month, fill = list(height = 0)) %>%
mutate(Quarter = quarter, Condition = !is.na(date)) %>%
distinct(var, month, Quarter, Condition, .keep_all = TRUE) %>%
fill(loc) %>%
select(-date)
-output
# A tibble: 9 × 6
var month loc height Quarter Condition
<chr> <dbl> <chr> <dbl> <dbl> <lgl>
1 A 1 london 13 1 TRUE
2 A 2 london 14 1 TRUE
3 A 3 london 15 1 TRUE
4 B 1 berlin 13 1 TRUE
5 B 2 berlin 0 1 FALSE
6 B 3 berlin 15 1 TRUE
7 C 1 cairo 28 1 TRUE
8 C 2 cairo 27 1 TRUE
9 C 3 cairo 15 1 TRUE
CodePudding user response:
Building on previous answer:
data <- data %>% mutate(month=month(date),quarter=quarter(month))
left_join(
expand(data, var,month,quarter),
data %>% group_by(var,month, date,loc) %>%
summarize(height=sum(height), .groups="drop") %>%
select(-date) %>%
mutate(condition=TRUE)
) %>%
mutate(condition=!is.na(condition),height=if_else(is.na(height),0,height)) %>%
group_by(var) %>% fill(loc)
Output:
var month quarter loc height condition
<chr> <dbl> <int> <chr> <dbl> <lgl>
1 A 1 1 london 13 TRUE
2 A 2 1 london 14 TRUE
3 A 3 1 london 15 TRUE
4 B 1 1 berlin 13 TRUE
5 B 2 1 berlin 0 FALSE
6 B 3 1 berlin 15 TRUE
7 C 1 1 cairo 28 TRUE
8 C 2 1 cairo 27 TRUE
9 C 3 1 cairo 15 TRUE
CodePudding user response:
Here is dplyr
solution: This part complete(var,Month, fill = list(height = 0))
is from @akrun:
library(dplyr)
data %>%
group_by(var, Quarter = quarter(date), Month = month(date), loc) %>%
summarise(height = sum(height)) %>%
ungroup() %>%
complete(var,Month, fill = list(height = 0)) %>%
fill(c(Quarter, loc), .direction = "down") %>%
mutate(Condition = ifelse(height == 0 , FALSE, TRUE))
var Month Quarter loc height Condition
<chr> <dbl> <int> <chr> <dbl> <lgl>
1 A 1 1 london 13 TRUE
2 A 2 1 london 14 TRUE
3 A 3 1 london 15 TRUE
4 B 1 1 berlin 13 TRUE
5 B 2 1 berlin 0 FALSE
6 B 3 1 berlin 15 TRUE
7 C 1 1 cairo 28 TRUE
8 C 2 1 cairo 27 TRUE
9 C 3 1 cairo 15 TRUE
CodePudding user response:
You can use nesting()
within the complete()
step to get only the combinations you want, and then group by everything you want to keep distinct before summing the heights:
library(dplyr)
library(tidyr)
var = c(rep("A",3),rep("B",2),rep("C",5));var
date = c(as.Date("2022/01/01"),as.Date("2022/02/01"),as.Date("2022/03/01"),
as.Date("2022/01/01"),as.Date("2022/03/01"),
as.Date("2022/01/01"),as.Date("2022/01/01"),as.Date("2022/02/01"),as.Date("2022/02/01"),as.Date("2022/03/01"))
loc = c(rep("london",3),rep("berlin",2),rep("cairo",5))
height =c(13,14,15,13,15,16,12,14,13,15)
data = tibble(var,date,loc,height)
data %>%
mutate(month = lubridate::month(date)) %>%
complete(month, nesting(var, loc)) %>%
mutate(Quarter = lubridate::quarter(month),
Condition = !is.na(date)) %>%
group_by(across(-c(height, date))) %>%
summarise(height = sum(height), .groups = "drop") %>%
arrange(var)
#> # A tibble: 9 × 6
#> month var loc Quarter Condition height
#> <dbl> <chr> <chr> <int> <lgl> <dbl>
#> 1 1 A london 1 TRUE 13
#> 2 2 A london 1 TRUE 14
#> 3 3 A london 1 TRUE 15
#> 4 1 B berlin 1 TRUE 13
#> 5 2 B berlin 1 FALSE NA
#> 6 3 B berlin 1 TRUE 15
#> 7 1 C cairo 1 TRUE 28
#> 8 2 C cairo 1 TRUE 27
#> 9 3 C cairo 1 TRUE 15
Created on 2022-06-01 by the reprex package (v2.0.1)