I need to clean a series of Large dbs. One column is a Date vector. Depending on the year the variable needs to be cleaned differently.
I wrote a function but it is very, slow especially as I have to use it rowwise()
How can I go about making it better?
This is the convoluted cleaning function:
cleandate_fn <- function(date,year){
if(year<=2010){
date = str_pad(date,8,"left","0")
date = as.Date(date,format="%d%m%Y")
} else if (year==2011) {
date = str_pad(date,6,"left","0")
date = ifelse(str_sub(date,1,2)=="02",paste0("20",str_sub(date,3,8)),date)
date = ifelse(str_sub(date,5,6)=="00",paste0(str_sub(date,1,4),"01",str_sub(date,7,8)),date)
date = gsub("\\\\|/","0",date)
date = ifelse(str_sub(date,5,6)=="00",paste0(str_sub(date,1,4),"01",str_sub(date,7,8)),date)
date = ifelse(str_sub(date,5,8) %in% c("0229","0230","0231"),paste0(str_sub(date,1,4),"0131"),date)
date = as.Date(date,format="%Y%m%d")
} else {
date = ifelse(str_sub(date,4,6)=="//1",paste0(str_sub(date,1,3),"/19",str_sub(date,7,8)),date)
date = ifelse(str_sub(date,4,6)=="//2",paste0(str_sub(date,1,3),"/20",str_sub(date,7,8)),date)
date = ifelse(str_sub(date,5,6)=="/9",paste0(str_sub(date,1,4),"19",str_sub(date,7,8)),date)
date = ifelse(str_sub(date,5,6)=="/0",paste0(str_sub(date,1,4),"20",str_sub(date,7,8)),date)
date = gsub("\\\\|/","0",date)
date = ifelse(str_sub(date,1,2)=="00",paste0("01",str_sub(date,3,8)),date)
date = ifelse(str_sub(date,3,4)=="00",paste0(str_sub(date,1,2),"01",str_sub(date,5,8)),date)
date = paste0(str_sub(date,1,2),"05",str_sub(date,5,8))
date = as.Date(date,format="%d%m%Y")}
return(date)
}
Toy example:
data <- data.frame(date=c("19052003","29062012","008//210","05/2/001","01011980"),
year=rep(2010,5))
data <- data %>%
rowwise() %>%
mutate(date=cleandate_fn(date,unique(data$year)))
CodePudding user response:
It may be easier to do a grouping by 'year' and then get the first element of year' as entry to the function
library(dplyr)
df1 %>%
group_by(year) %>%
mutate(datenew = cleandate_fn(date, first(year))) %>%
ungroup
-output
# A tibble: 5 × 3
date year datenew
<chr> <dbl> <date>
1 19052003 2010 2003-05-19
2 29062012 2010 2012-06-29
3 008//210 2010 NA
4 05/2/001 2010 NA
5 01011980 2010 1980-01-01