I have dataset where length of stay of booking going in two or three month i want to create a row for every such bookings where revenue will be divided for every month and remaining information about the booking will remain same. if a booking length is in same month then it will show that as it is.
structure(list(channel = c("109", "109", "Agent"), room_stay_status = c("ENQUIRY",
"ENQUIRY", "CHECKED_OUT"), start_date = structure(c(1637971200,
1640995200, 1640995200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), end_date = structure(c(1643155200, 1642636800, 1641168000
), tzone = "UTC", class = c("POSIXct", "POSIXt")), los = c(60,
19, 2), booker = c("Anuj", "Anuj", "Anuj"), area = c("Goa", "Goa",
"Goa"), property_sku = c("Amna-3b", "Amna-3b", "Amna-3b"), Revenue = c(90223.666,
5979, 7015.9), Booking_ref = c("aed97", "b497h9", "bde65")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
output should look like this
structure(list(channel = c("109", "109", "109", "109", "Agent"
), room_stay_status = c("ENQUIRY", "ENQUIRY", "ENQUIRY", "ENQUIRY",
"CHECKED_OUT"), start_date = structure(c(1637971200, 1638316800,
1640995200, 1640995200, 1640995200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), end_date = structure(c(1638230400, 1640908800, 1643155200,
1642636800, 1641168000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), los = c(4, 31, 25, 19, 2), booker = c("Anuj", "Anuj",
"Anuj", "Anuj", "Anuj"), area = c("Goa", "Goa", "Goa", "Goa",
"Goa"), property_sku = c("Amna-3b", "Amna-3b", "Amna-3b", "Amna-3b",
"Amna-3b"), Revenue = c(6014.91106666667, 46615.5607666667, 37593.1941666667,
5979, 7015.9), Booking_ref = c("aed97", "aed97", "aed97", "b497h9",
"bde65")), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
Many thanks in advance.
CodePudding user response:
Using solution from this post to split date:
df2 <- df %>%
group_by(id = row_number()) %>% # for each row
mutate(seq = list(seq(start_date, end_date, "day")), # create a sequence of dates with 1 day step
month = map(seq, month)) %>% # get the month for each one of those dates in sequence
unnest() %>% # unnest data
group_by(Booking_ref, id, month) %>% # for each group, row and month
summarise(start_date = min(seq), # get minimum date as start
end_date = max(seq)) %>% # get maximum date as end
ungroup() %>% # ungroup
select(-id, - month)%>%
group_by(Booking_ref)%>%
mutate(last_date=max(end_date)) # get last_date to determine los
df3 <- merge(df2,df%>%select(-start_date,-end_date),by=c('Booking_ref'),all.x=T)%>%
mutate(timespam=end_date-start_date)%>%
mutate(los2=as.numeric(case_when(last_date==end_date~timespam,
T~timespam 1)),
Revenue2=Revenue*los2/los)
out_df <- df3%>%
select(-Revenue,-los,-timespam,-last_date)%>%
rename(Revenue=Revenue2,
los=los2)
> out_df
Booking_ref start_date end_date channel room_stay_status booker area property_sku los Revenue
1 aed97 2022-01-01 2022-01-26 109 ENQUIRY Anuj Goa Amna-3b 25 37593.194
2 aed97 2021-11-27 2021-11-30 109 ENQUIRY Anuj Goa Amna-3b 4 6014.911
3 aed97 2021-12-01 2021-12-31 109 ENQUIRY Anuj Goa Amna-3b 31 46615.561
4 b497h9 2022-01-01 2022-01-20 109 ENQUIRY Anuj Goa Amna-3b 19 5979.000
5 bde65 2022-01-01 2022-01-03 Agent CHECKED_OUT Anuj Goa Amna-3b 2 7015.900
CodePudding user response:
An quick attempt here (assuming your data is named df_in
and df_out
) which seems to do the trick:
library("dplyr")
library("tidyr")
library("lubridate")
# Function for creating a vector from start (st) to end (nd) with intermediate
# months inside
cut_months <- function(st, nd) {
repeat {
# Grow vector, keep adding next month
next_month <- ceiling_date(tail(st, 1) seconds(1), "month")
if (next_month > nd) {
st <- append(st, nd)
break
} else {
st <- append(st, next_month)
}
}
return(st)
}
# Let's try it
print(cut_months(df_in$start_date[1], df_in$end_date[2]))
# [1] "2021-11-27 01:00:00 CET" "2021-12-01 01:00:00 CET" "2022-01-01 00:00:00 CET" "2022-01-20 01:00:00 CET"
# Function for expanding months:
expand_months <- function(df) {
expand_rows <-
df %>%
# Expand months and unnest list-column
mutate(key_dates = mapply(cut_months, start_date, end_date)) %>%
select(-start_date, -end_date) %>%
unnest(key_dates) %>%
# Compute needed values
group_by(Booking_ref) %>%
arrange(Booking_ref, key_dates) %>%
mutate(
start_date = key_dates,
end_date = lead(key_dates),
los = as.numeric(as.duration(start_date %--% end_date), "days"), # Ceiling this?
Revenue = Revenue * los / sum(los, na.rm = TRUE)
) %>%
arrange(Booking_ref, start_date) %>%
# Clean-up
filter(!is.na(end_date)) %>%
select(-key_dates)
expand_rows
}
# Print results and compare:
expand_months(df_in)
## A tibble: 5 x 10
## Groups: Booking_ref [3]
#channel room_stay_status los booker area property_~1 Revenue Booki~2 start_date end_date
#<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <dttm> <dttm>
#1 109 ENQUIRY 4 Anuj Goa Amna-3b 6015. aed97 2021-11-27 01:00:00 2021-12-01 01:00:00
#2 109 ENQUIRY 31.0 Anuj Goa Amna-3b 46553. aed97 2021-12-01 01:00:00 2022-01-01 00:00:00
#3 109 ENQUIRY 25.0 Anuj Goa Amna-3b 37656. aed97 2022-01-01 00:00:00 2022-01-26 01:00:00
#4 109 ENQUIRY 19 Anuj Goa Amna-3b 5979 b497h9 2022-01-01 01:00:00 2022-01-20 01:00:00
#5 Agent CHECKED_OUT 2 Anuj Goa Amna-3b 7016. bde65 2022-01-01 01:00:00 2022-01-03 01:00:00
## ... with abbreviated variable names 1: property_sku, 2: Booking_ref
df_out
## A tibble: 5 x 10
#channel room_stay_status start_date end_date los booker area property_~1 Revenue Booki~2
#<chr> <chr> <dttm> <dttm> <dbl> <chr> <chr> <chr> <dbl> <chr>
#1 109 ENQUIRY 2021-11-27 00:00:00 2021-11-30 00:00:00 4 Anuj Goa Amna-3b 6015. aed97
#2 109 ENQUIRY 2021-12-01 00:00:00 2021-12-31 00:00:00 31 Anuj Goa Amna-3b 46616. aed97
#3 109 ENQUIRY 2022-01-01 00:00:00 2022-01-26 00:00:00 25 Anuj Goa Amna-3b 37593. aed97
#4 109 ENQUIRY 2022-01-01 00:00:00 2022-01-20 00:00:00 19 Anuj Goa Amna-3b 5979 b497h9
#5 Agent CHECKED_OUT 2022-01-01 00:00:00 2022-01-03 00:00:00 2 Anuj Goa Amna-3b 7016. bde65
## ... with abbreviated variable names 1: property_sku, 2: Booking_ref
I do not understand entirely how you distribute the Revenue. Consider that left as an exercise to fix :).
Hint: you need a ceiling()
around the computation of the new los
which computes decimal days.