Home > Software engineering >  How to add rows in a dataset based on conditions in R
How to add rows in a dataset based on conditions in R

Time:12-06

I have dataset where length of stay of booking going in two or three month i want to create a row for every such bookings where revenue will be divided for every month and remaining information about the booking will remain same. if a booking length is in same month then it will show that as it is.

structure(list(channel = c("109", "109", "Agent"), room_stay_status = c("ENQUIRY", 
    "ENQUIRY", "CHECKED_OUT"), start_date = structure(c(1637971200, 
    1640995200, 1640995200), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), end_date = structure(c(1643155200, 1642636800, 1641168000
    ), tzone = "UTC", class = c("POSIXct", "POSIXt")), los = c(60, 
    19, 2), booker = c("Anuj", "Anuj", "Anuj"), area = c("Goa", "Goa", 
    "Goa"), property_sku = c("Amna-3b", "Amna-3b", "Amna-3b"), Revenue = c(90223.666, 
    5979, 7015.9), Booking_ref = c("aed97", "b497h9", "bde65")), row.names = c(NA, 
    -3L), class = c("tbl_df", "tbl", "data.frame"))

output should look like this

structure(list(channel = c("109", "109", "109", "109", "Agent"
    ), room_stay_status = c("ENQUIRY", "ENQUIRY", "ENQUIRY", "ENQUIRY", 
    "CHECKED_OUT"), start_date = structure(c(1637971200, 1638316800, 
    1640995200, 1640995200, 1640995200), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), end_date = structure(c(1638230400, 1640908800, 1643155200, 
    1642636800, 1641168000), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), los = c(4, 31, 25, 19, 2), booker = c("Anuj", "Anuj", 
    "Anuj", "Anuj", "Anuj"), area = c("Goa", "Goa", "Goa", "Goa", 
    "Goa"), property_sku = c("Amna-3b", "Amna-3b", "Amna-3b", "Amna-3b", 
    "Amna-3b"), Revenue = c(6014.91106666667, 46615.5607666667, 37593.1941666667, 
    5979, 7015.9), Booking_ref = c("aed97", "aed97", "aed97", "b497h9", 
    "bde65")), row.names = c(NA, -5L), class = c("tbl_df", "tbl", 
    "data.frame"))

Many thanks in advance.

CodePudding user response:

Using solution from this post to split date:

df2 <- df %>%
  group_by(id = row_number()) %>%             # for each row
  mutate(seq = list(seq(start_date, end_date, "day")),  # create a sequence of dates with 1 day step
         month = map(seq, month)) %>%         # get the month for each one of those dates in sequence
  unnest() %>%                                # unnest data
  group_by(Booking_ref, id, month) %>%              # for each group, row and month
  summarise(start_date = min(seq),                 # get minimum date as start
            end_date = max(seq)) %>%               # get maximum date as end
  ungroup() %>%                               # ungroup
  select(-id, - month)%>%
  group_by(Booking_ref)%>%
  mutate(last_date=max(end_date))             # get last_date to determine los

df3 <- merge(df2,df%>%select(-start_date,-end_date),by=c('Booking_ref'),all.x=T)%>%
  mutate(timespam=end_date-start_date)%>%
  mutate(los2=as.numeric(case_when(last_date==end_date~timespam,
                       T~timespam 1)),
         Revenue2=Revenue*los2/los)

out_df <- df3%>%
  select(-Revenue,-los,-timespam,-last_date)%>%
  rename(Revenue=Revenue2,
         los=los2)

> out_df
  Booking_ref start_date   end_date channel room_stay_status booker area property_sku los   Revenue
1       aed97 2022-01-01 2022-01-26     109          ENQUIRY   Anuj  Goa      Amna-3b  25 37593.194
2       aed97 2021-11-27 2021-11-30     109          ENQUIRY   Anuj  Goa      Amna-3b   4  6014.911
3       aed97 2021-12-01 2021-12-31     109          ENQUIRY   Anuj  Goa      Amna-3b  31 46615.561
4      b497h9 2022-01-01 2022-01-20     109          ENQUIRY   Anuj  Goa      Amna-3b  19  5979.000
5       bde65 2022-01-01 2022-01-03   Agent      CHECKED_OUT   Anuj  Goa      Amna-3b   2  7015.900

CodePudding user response:

An quick attempt here (assuming your data is named df_in and df_out) which seems to do the trick:

library("dplyr")
library("tidyr")
library("lubridate")

# Function for creating a vector from start (st) to end (nd) with intermediate 
# months inside
cut_months <- function(st, nd) {
  repeat {
    # Grow vector, keep adding next month
    next_month <- ceiling_date(tail(st, 1)   seconds(1), "month")

    if (next_month > nd) {
      st <- append(st, nd)
      break
    } else {
      st <- append(st, next_month)
    }
  }
  return(st)
}

# Let's try it
print(cut_months(df_in$start_date[1], df_in$end_date[2]))
# [1] "2021-11-27 01:00:00 CET" "2021-12-01 01:00:00 CET" "2022-01-01 00:00:00 CET" "2022-01-20 01:00:00 CET"

# Function for expanding months:
expand_months <- function(df) {
  expand_rows <- 
      df %>% 
      # Expand months and unnest list-column
      mutate(key_dates = mapply(cut_months, start_date, end_date)) %>% 
      select(-start_date, -end_date) %>% 
      unnest(key_dates) %>% 
      # Compute needed values
      group_by(Booking_ref) %>% 
      arrange(Booking_ref, key_dates) %>% 
      mutate(
          start_date = key_dates,
          end_date = lead(key_dates),
          los = as.numeric(as.duration(start_date %--% end_date), "days"),  # Ceiling this?
          Revenue = Revenue * los / sum(los, na.rm = TRUE) 
        ) %>% 
      arrange(Booking_ref, start_date) %>% 
      # Clean-up
      filter(!is.na(end_date)) %>% 
      select(-key_dates)
   
   expand_rows
 }

# Print results and compare:
expand_months(df_in)
## A tibble: 5 x 10
## Groups:   Booking_ref [3]
#channel room_stay_status   los booker area  property_~1 Revenue Booki~2 start_date          end_date           
#<chr>   <chr>            <dbl> <chr>  <chr> <chr>         <dbl> <chr>   <dttm>              <dttm>             
#1 109     ENQUIRY            4   Anuj   Goa   Amna-3b       6015. aed97   2021-11-27 01:00:00 2021-12-01 01:00:00
#2 109     ENQUIRY           31.0 Anuj   Goa   Amna-3b      46553. aed97   2021-12-01 01:00:00 2022-01-01 00:00:00
#3 109     ENQUIRY           25.0 Anuj   Goa   Amna-3b      37656. aed97   2022-01-01 00:00:00 2022-01-26 01:00:00
#4 109     ENQUIRY           19   Anuj   Goa   Amna-3b       5979  b497h9  2022-01-01 01:00:00 2022-01-20 01:00:00
#5 Agent   CHECKED_OUT        2   Anuj   Goa   Amna-3b       7016. bde65   2022-01-01 01:00:00 2022-01-03 01:00:00
## ... with abbreviated variable names 1: property_sku, 2: Booking_ref

df_out
## A tibble: 5 x 10
#channel room_stay_status start_date          end_date              los booker area  property_~1 Revenue Booki~2
#<chr>   <chr>            <dttm>              <dttm>              <dbl> <chr>  <chr> <chr>         <dbl> <chr>  
#1 109     ENQUIRY          2021-11-27 00:00:00 2021-11-30 00:00:00     4 Anuj   Goa   Amna-3b       6015. aed97  
#2 109     ENQUIRY          2021-12-01 00:00:00 2021-12-31 00:00:00    31 Anuj   Goa   Amna-3b      46616. aed97  
#3 109     ENQUIRY          2022-01-01 00:00:00 2022-01-26 00:00:00    25 Anuj   Goa   Amna-3b      37593. aed97  
#4 109     ENQUIRY          2022-01-01 00:00:00 2022-01-20 00:00:00    19 Anuj   Goa   Amna-3b       5979  b497h9 
#5 Agent   CHECKED_OUT      2022-01-01 00:00:00 2022-01-03 00:00:00     2 Anuj   Goa   Amna-3b       7016. bde65  
## ... with abbreviated variable names 1: property_sku, 2: Booking_ref

I do not understand entirely how you distribute the Revenue. Consider that left as an exercise to fix :).
Hint: you need a ceiling() around the computation of the new los which computes decimal days.

  •  Tags:  
  • r
  • Related