Home > Mobile >  Vectorise date function
Vectorise date function

Time:04-15

I need to clean a series of Large dbs. One column is a Date vector. Depending on the year the variable needs to be cleaned differently.

I wrote a function but it is very, slow especially as I have to use it rowwise()

How can I go about making it better?

This is the convoluted cleaning function:

cleandate_fn <- function(date,year){

  if(year<=2010){
    date = str_pad(date,8,"left","0") 
    date = as.Date(date,format="%d%m%Y")
  } else if (year==2011) {
    date = str_pad(date,6,"left","0") 
    date = ifelse(str_sub(date,1,2)=="02",paste0("20",str_sub(date,3,8)),date)
    date = ifelse(str_sub(date,5,6)=="00",paste0(str_sub(date,1,4),"01",str_sub(date,7,8)),date)
    date = gsub("\\\\|/","0",date)
    date = ifelse(str_sub(date,5,6)=="00",paste0(str_sub(date,1,4),"01",str_sub(date,7,8)),date)
    date = ifelse(str_sub(date,5,8) %in% c("0229","0230","0231"),paste0(str_sub(date,1,4),"0131"),date)
    date = as.Date(date,format="%Y%m%d")
  } else {
    date = ifelse(str_sub(date,4,6)=="//1",paste0(str_sub(date,1,3),"/19",str_sub(date,7,8)),date)
    date = ifelse(str_sub(date,4,6)=="//2",paste0(str_sub(date,1,3),"/20",str_sub(date,7,8)),date)
    date = ifelse(str_sub(date,5,6)=="/9",paste0(str_sub(date,1,4),"19",str_sub(date,7,8)),date)    
    date = ifelse(str_sub(date,5,6)=="/0",paste0(str_sub(date,1,4),"20",str_sub(date,7,8)),date)
    date = gsub("\\\\|/","0",date)
    date = ifelse(str_sub(date,1,2)=="00",paste0("01",str_sub(date,3,8)),date)
    date = ifelse(str_sub(date,3,4)=="00",paste0(str_sub(date,1,2),"01",str_sub(date,5,8)),date)
    date = paste0(str_sub(date,1,2),"05",str_sub(date,5,8))
    date = as.Date(date,format="%d%m%Y")}

  return(date)

}

Toy example:

data <- data.frame(date=c("19052003","29062012","008//210","05/2/001","01011980"),
                   year=rep(2010,5)) 

data <- data %>% 
  rowwise() %>% 
  mutate(date=cleandate_fn(date,unique(data$year))) 

CodePudding user response:

It may be easier to do a grouping by 'year' and then get the first element of year' as entry to the function

library(dplyr)
df1 %>%
    group_by(year) %>%
    mutate(datenew = cleandate_fn(date, first(year))) %>%
    ungroup

-output

# A tibble: 5 × 3
  date      year datenew   
  <chr>    <dbl> <date>    
1 19052003  2010 2003-05-19
2 29062012  2010 2012-06-29
3 008//210  2010 NA        
4 05/2/001  2010 NA        
5 01011980  2010 1980-01-01
  • Related