Having a dataframe like this:
data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
How is it possible to add new columns which have the time difference in months for every row and whete current insert the "Sept 2022".
Example output:
data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"), time_stamp_1_duration = c(41,43,24,53), time_stamp_2_duration = c(32,12,45,32))
duration is an example only it is not the real, just for example.
CodePudding user response:
This should do the trick. First replace all the "Current" and "Sept" with the R-recognized abbreviation "Sep", then use tidy::separate
and zoo::as.yearmon()
to convert to year-month format, then calculate the intervals (in months (x12) per OP):
library(tidyr)
library(zoo)
df <- data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
# convert current and Sept to "Sep 2022"
df[2:3] <- lapply(df[2:3], function(x) gsub("-Current|-Sept 2022", "-Sep 2022", x))
df %>%
separate(time_stamp_1, into = c("my1a", "my1b"), sep = "-") %>%
separate(time_stamp_2, into = c("my2a", "my2b"), sep = "-") %>%
mutate(across(my1a:my2b, ~ as.yearmon(.x, format = "%b %Y"))) %>%
mutate(interval_1 = (my1b - my1a) * 12,
interval_2 = (my2b - my2a) * 12) %>%
left_join(df) %>% select(names(df), "interval_1", "interval_2")
Output:
id time_stamp_1 time_stamp_2 interval_1 interval_2
1 1 Nov 2016-Sep 2022 Mar 2015-Nov 2016 70 20
2 2 May 2013-Sep 2022 May 2008-May 2013 112 60
3 3 Oct 2015-Sep 2022 Aug 2005-Sep 2022 83 205
4 4 May 2014-Sep 2022 Oct 2014-Jan 2015 100 3
As G. Grothendieck mentions in the comments, we could wrap this in a function:
# thanks to G. Grothendieck
ts2mos <- function(x) {
x <- gsub("-Current|-Sept 2022", "-Sep 2022", x)
12 * (as.yearmon(sub(".*-", "", x)) - as.yearmon(x, "%b %Y"))
}
df %>% mutate(interval_1 = ts2mos(time_stamp_1),
interval_2 = ts2mos(time_stamp_2))
CodePudding user response:
library(stringr)
timespan_to_duration <- function(x) {
x[ x == 'Current' ] <- 'Sep 2022'
x <- str_replace_all(x, '\\s ', ' 01 ')
x <- as.POSIXct(x, format = '%b %d %Y')
((difftime(x[ 2 ], x[ 1 ], units = 'days') |>
as.integer()) / 30) |>
round()
}
df <- data.frame(id = c(1,2,3,4),
time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"),
time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
df$time_stamp_1_duration <- df$time_stamp_1 |>
str_split('-') |>
lapply(timespan_to_duration) |>
unlist()
df$time_stamp_2_duration <- df$time_stamp_2 |>
str_split('-') |>
lapply(timespan_to_duration) |>
unlist()
df
CodePudding user response:
Using tidyverse
library(dplyr)
library(lubridate)
library(stringr)
df1 %>%
mutate(across(starts_with('time_stamp'), ~ {
tmp <- str_replace(.x, "Current", 'Sep 2022') %>%
str_replace("(\\w ) (\\d )-(\\w ) (\\d )", "\\2-\\1-01/\\4-\\3-01") %>%
interval
tmp %/% months(1)}, .names = "{.col}_duration"))
-output
id time_stamp_1 time_stamp_2 time_stamp_1_duration time_stamp_2_duration
1 1 Nov 2016-Current Mar 2015-Nov 2016 70 20
2 2 May 2013-Current May 2008-May 2013 112 60
3 3 Oct 2015-Current Aug 2005-Current 83 205
4 4 May 2014-Current Oct 2014-Jan 2015 100 3
CodePudding user response:
A tidyverse
approach
library(dplyr)
library(lubridate)
library(stringr)
df %>%
mutate(across(starts_with("time_stamp"), ~ str_replace(.x, "Current", "Sep 2022")),
time_stamp_1_duration = sapply(str_split(time_stamp_1, "-"), function(x)
interval(my(x[1]), my(x[2])) %/% months(1)),
time_stamp_2_duration = sapply(str_split(time_stamp_2, "-"), function(x)
interval(my(x[1]), my(x[2])) %/% months(1)),
across(starts_with("time_stamp"), ~ str_replace(.x, "Sep 2022", "Current")))
id time_stamp_1 time_stamp_2 time_stamp_1_duration
1 1 Nov 2016-Current Mar 2015-Nov 2016 70
2 2 May 2013-Current May 2008-May 2013 112
3 3 Oct 2015-Current Aug 2005-Current 83
4 4 May 2014-Current Oct 2014-Jan 2015 100
time_stamp_2_duration
1 20
2 60
3 205
4 3
CodePudding user response:
One possible solution using the function tstrsplit
from data.table package. Not that I am also using the built-in constant month.abb
.
df[c("duration1", "duration2")] = lapply(df[2:3], function(x) {
x = data.table::tstrsplit(sub("Current", "Sep 2022", x),
split="\\s|-",
type.convert=TRUE,
names=c("mo1", "yr1", "mo2", "yr2"))
x[c("mo1", "mo2")] = lapply(x[c("mo1", "mo2")], match, month.abb)
pmax(x$yr2 - x$yr1-1, 0) * 12 12-x$mo1 x$mo2
})
id time_stamp_1 time_stamp_2 duration1 duration2
1 1 Nov 2016-Current Mar 2015-Nov 2016 70 20
2 2 May 2013-Current May 2008-May 2013 112 60
3 3 Oct 2015-Current Aug 2005-Current 83 205
4 4 May 2014-Current Oct 2014-Jan 2015 100 3