Question relates to using "for" loops in R package and handling error message"undefined columns.
Sample date frame consists of 4 rows and 10 columns. Libraries used magritter and dplyr.
library(magrittr)
library(dplyr)
date <- c ("2020-12-31", "2021-01-29", "2021-02-26", "2021-03-31")
v_1 <- c(66.39, 70.46, 69.37, 67.73)
v_2 <- c(14.16, 12.31, 13.07, 12.85)
v_3 <- c(14.37, 13.02, 13.09, 14.88)
v_4 <- c(0.94, 0.83, 0.74, 0.79)
v_5 <- c(0.42, 0.37, 0.33, 0.29)
v_6 <- c(1.51, 1.30, 1.14, 1.12)
v_7 <- c(1.84, 1.37, 1.91, 1.99)
v_8 <- c(0.16, 0.14, 0.14, 0.14)
v_9 <- c(0.21, 0.20, 0.21, 0.21)
raw.data <- data.frame(date, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9)
The variables v_1 through v_9 are each assigned a priority, namely "High", "Medium" or "Low"
Priority <- c("High", "Medium", "Low")
High <- c("v_4", "v_7")
Medium <- c("v_2", "v_3", "v_9")
Low <- c("v_1", "v_5", "v_6", "v_8")
Required output in tidy/long format to contain 12 rows and 3 columns.
12 Rows: date variable that is repeated thrice.
3 columns: date, Priority and Priority Total. (Priority Total is the row wise sum of the variables falling under each priority category)
date Priority Priority_Total
1 2020-12-31 High 2.78
2 2021-01-29 High 2.20
3 2021-02-26 High 2.65
4 2021-03-31 High 2.78
5 2020-12-31 Medium 28.74
6 2021-01-29 Medium 25.53
7 2021-02-26 Medium 26.37
8 2021-03-31 Medium 27.94
9 2020-12-31 Low 68.48
10 2021-01-29 Low 72.27
11 2021-02-26 Low 70.98
12 2021-03-31 Low 69.28
Required output obtained successfully using the paste and copy method.
Note that "tmp" is a temporary work space.
pc_output <- NULL # Required output initialized
# High Priority
tmp <- raw.data[,c("date",High)] %>% # Selecting date and high priority variables.
mutate(Priority_Total=select(.,-1) %>% # Generating a priority total by summing across the rows.
apply(1,sum,na.rm=TRUE)) %>%
mutate(Priority="High") # Generating a priority category column containing value "High"
tmp <- tmp[,c("date","Priority","Priority_Total")] # Retaining the 3 required columns
pc_output <- rbind(pc_output,tmp) # Merging the derived result with pc_output
# The same procedure is used for medium and low priority variables
# Medium Priority
tmp <- raw.data[,c("date",Medium)] %>% # Selecting date and medium priority variables.
mutate(Priority_Total=select(.,-1) %>% # Generating a priority total by summing across the rows.
apply(1,sum,na.rm=TRUE)) %>%
mutate(Priority="Medium") # Generating a priority category column containing value "Medium"
tmp <- tmp[,c("date","Priority","Priority_Total")] # Retaining the 3 required columns
pc_output <- rbind(pc_output,tmp) # Merging the derived result with pc_output
# Low Priority
tmp <- raw.data[,c("date",Low)] %>% # Selecting date and low priority variables.
mutate(Priority_Total=select(.,-1) %>% # Generating a priority total by summing across the rows.
apply(1,sum,na.rm=TRUE)) %>%
mutate(Priority="Low") # Generating a priority category column containing value "Low"
tmp <- tmp[,c("date","Priority","Priority_Total")] # Retaining the 3 required columns
pc_output <- rbind(pc_output,tmp) # Merging the derived result with pc_output
rm(tmp) # Clearing the temporary work space
print(pc_output)
The "paste and copy" method was run to ensure that all the columns were properly defined.
However when the "for" loop was used, the "undefined columns" problem was encountered.
output <- NULL
for (i in (Priority)){
tmp <- raw.data[,c("date",i)] %>%
mutate(Priority_Total=select(.,-1) %>%
apply(1,sum,na.rm=TRUE)) %>%
mutate(Priority=i)
tmp <- tmp[,c("date","Priority","Priority_Total")]
output <- rbind(output,tmp)
}
Message : "Error in [.data.frame
(raw.data, , c("date", i)) : undefined columns selected"
CodePudding user response:
Loops are rarely the optimal solution to such data transformation problems, as vectorized alternatives exist that are both faster and likely easier to read. As for the error you encountered - c("date", 1)
evaluates to a vector of length 2. That is, R tries to evaluate raw.data[, c("date", "1")]
, while raw.data[, c("date", paste0("v_", i))]
might be what you intended to do. Here is dplyr
alternative without the for-loop:
raw.data %>%
pivot_longer(starts_with("v"),
names_to = "Priority") %>%
mutate(Priority = case_when(
Priority %in% c("v_4", "v_7") ~ "High",
Priority %in% c("v_2", "v_3", "v_9") ~ "Medium",
Priority %in% c("v_1", "v_5", "v_6", "v_8") ~ "Low",
TRUE ~ NA_character_
)) %>%
group_by(date, Priority) %>%
summarise(Priority_Total = sum(value, na.rm = T), .groups = "drop") %>%
arrange(factor(Priority, levels = c("High", "Medium", "Low")))
# A tibble: 12 x 3
date Priority Priority_Total
<chr> <chr> <dbl>
1 2020-12-31 High 2.78
2 2021-01-29 High 2.2
3 2021-02-26 High 2.65
4 2021-03-31 High 2.78
5 2020-12-31 Medium 28.7
6 2021-01-29 Medium 25.5
7 2021-02-26 Medium 26.4
8 2021-03-31 Medium 27.9
9 2020-12-31 Low 68.5
10 2021-01-29 Low 72.3
11 2021-02-26 Low 71.0
12 2021-03-31 Low 69.3