Here's my code:
library(PNADcIBGE)
#DOWNLOAD DATABASES#
data2018q1 <- get_pnadc(year = 2018, quarter = 1, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1)
data2018q2 <- get_pnadc(year = 2018, quarter = 2, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 2)
data2018q3 <- get_pnadc(year = 2018, quarter = 3, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 3)
data2018q4 <- get_pnadc(year = 2018, quarter = 4, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 4)
#CLEAN DATABASES#
data_list <- list(data_2018q1,data_2018q2,data_2018q3,data_2018q4,data_2019q1,data_2019q2,data_2019q3,data_2019q4,data_2020q1,data_2020q2,data_2020q3,data_2020q4,data_2021q1,data_2021q2,data_2021q3)
I'm trying to use a log10 for the variable VD4020 for all dataframes within data_lis. After log10, I want to do other things, such as renaming columns, adding columns and other cleaning stuff. I tried using lapply with no success, and couldnt do a For loop either. The way I was doing it was by manually going into each dataframe and doing a sub data frame, such as
data_2018q1 <- subset.data.frame(data_2018q2, VD4020!="NA")
data_2018q2 <- subset.data.frame(data_2018q2, VD4020!="NA")
data_2018q3 <- subset.data.frame(data_2018q2, VD4020!="NA")
data_2018q1$VD4020 <- log(data_2018q1$VD4020)
data_2018q2$VD4020 <- log(data_2018q2$VD4020)
data_2018q3$VD4020 <- log(data_2018q3$VD4020)
And so on with each and every command. This is really time consuming as I need to do it with 15 dataframes.
CodePudding user response:
Two thoughts.
You didn't ask, but it may be easier to load the data this way:
datalist <- lapply(setNames(1:4, paste0("data_2018q", 1:4)), function(qtr) { get_pnadc(year = 2018, quarter = qtr, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1) })
Log-10 on one column. FYI,
!= "NA"
is doing string comparisons, you may want!is.na(.)
.datalist2 <- lapply(datalist, function(dat) { dat$VD4020[!is.na(dat$VD4020)] <- log10(dat$VD4020[!is.na(dat$VD4020)]) dat })
or write back to
datalist <- lapply(...)
, but for learning/debugging I often like to break it apart until I am confident about the process.
Edit: for multi-years, you can auto-list it with:
eg <- expand.grid(q = 1:4, y = 2018:2021)[,2:1] # so that the 'q' column goes more quickly
eg
# y q
# 1 2018 1
# 2 2018 2
# 3 2018 3
# 4 2018 4
# 5 2019 1
# 6 2019 2
# 7 2019 3
# 8 2019 4
# 9 2020 1
# 10 2020 2
# 11 2020 3
# 12 2020 4
# 13 2021 1
# 14 2021 2
# 15 2021 3
# 16 2021 4
You said you only need through 2021q3, so we can drop row 16:
nms <- with(eg[-16,], sprintf("data_%sq%s", y, q))
nms
# [1] "data_2018q1" "data_2018q2" "data_2018q3" "data_2018q4" "data_2019q1" "data_2019q2" "data_2019q3" "data_2019q4" "data_2020q1"
# [10] "data_2020q2" "data_2020q3" "data_2020q4" "data_2021q1" "data_2021q2" "data_2021q3"
datalist <- setNames(Map(function(yr, qtr) {
get_pnadc(year = yr, quarter = qtr, vars = c("UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012", "V403322","VD4019", "VD4020"), labels = FALSE, deflator = TRUE, design = FALSE, defyear = yr, defperiod = 1)
}, eg$y, eg$q), nms)