Home > other >  for loop to clean multiple dataframes
for loop to clean multiple dataframes

Time:02-11

Here's my code:

library(PNADcIBGE)
#DOWNLOAD DATABASES#
data2018q1 <- get_pnadc(year = 2018, quarter = 1, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1)
data2018q2 <- get_pnadc(year = 2018, quarter = 2, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 2)
data2018q3 <- get_pnadc(year = 2018, quarter = 3, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 3)
data2018q4 <- get_pnadc(year = 2018, quarter = 4, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 4)

#CLEAN DATABASES#

data_list <- list(data_2018q1,data_2018q2,data_2018q3,data_2018q4,data_2019q1,data_2019q2,data_2019q3,data_2019q4,data_2020q1,data_2020q2,data_2020q3,data_2020q4,data_2021q1,data_2021q2,data_2021q3)

I'm trying to use a log10 for the variable VD4020 for all dataframes within data_lis. After log10, I want to do other things, such as renaming columns, adding columns and other cleaning stuff. I tried using lapply with no success, and couldnt do a For loop either. The way I was doing it was by manually going into each dataframe and doing a sub data frame, such as

data_2018q1 <- subset.data.frame(data_2018q2, VD4020!="NA")
data_2018q2 <- subset.data.frame(data_2018q2, VD4020!="NA")
data_2018q3 <- subset.data.frame(data_2018q2, VD4020!="NA")
 
data_2018q1$VD4020 <- log(data_2018q1$VD4020)
data_2018q2$VD4020 <- log(data_2018q2$VD4020)
data_2018q3$VD4020 <- log(data_2018q3$VD4020)

And so on with each and every command. This is really time consuming as I need to do it with 15 dataframes.

CodePudding user response:

Two thoughts.

  1. You didn't ask, but it may be easier to load the data this way:

    datalist <- lapply(setNames(1:4, paste0("data_2018q", 1:4)), function(qtr) {
      get_pnadc(year = 2018, quarter = qtr, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1)
    })
    
    
  2. Log-10 on one column. FYI, != "NA" is doing string comparisons, you may want !is.na(.).

    datalist2 <- lapply(datalist, function(dat) {
      dat$VD4020[!is.na(dat$VD4020)] <- log10(dat$VD4020[!is.na(dat$VD4020)])
      dat
    })
    

    or write back to datalist <- lapply(...), but for learning/debugging I often like to break it apart until I am confident about the process.


Edit: for multi-years, you can auto-list it with:

eg <- expand.grid(q = 1:4, y = 2018:2021)[,2:1] # so that the 'q' column goes more quickly
eg
#       y q
# 1  2018 1
# 2  2018 2
# 3  2018 3
# 4  2018 4
# 5  2019 1
# 6  2019 2
# 7  2019 3
# 8  2019 4
# 9  2020 1
# 10 2020 2
# 11 2020 3
# 12 2020 4
# 13 2021 1
# 14 2021 2
# 15 2021 3
# 16 2021 4

You said you only need through 2021q3, so we can drop row 16:

nms <- with(eg[-16,], sprintf("data_%sq%s", y, q))
nms
#  [1] "data_2018q1" "data_2018q2" "data_2018q3" "data_2018q4" "data_2019q1" "data_2019q2" "data_2019q3" "data_2019q4" "data_2020q1"
# [10] "data_2020q2" "data_2020q3" "data_2020q4" "data_2021q1" "data_2021q2" "data_2021q3"
datalist <- setNames(Map(function(yr, qtr) {
  get_pnadc(year = yr, quarter = qtr, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = yr, defperiod = 1)
}, eg$y, eg$q), nms)
  • Related