Using "for" loops in R package and error message "Error in `[.data.frame`(, c("d-CodePudding

Question relates to using "for" loops in R package and handling error message"undefined columns.

Sample date frame consists of 4 rows and 10 columns. Libraries used magritter and dplyr.

library(magrittr)
library(dplyr)

date <- c ("2020-12-31", "2021-01-29", "2021-02-26", "2021-03-31")
v_1 <- c(66.39, 70.46, 69.37, 67.73) 
v_2 <- c(14.16, 12.31, 13.07, 12.85) 
v_3 <- c(14.37, 13.02, 13.09, 14.88) 
v_4 <- c(0.94, 0.83, 0.74, 0.79) 
v_5 <- c(0.42, 0.37, 0.33, 0.29) 
v_6 <- c(1.51, 1.30, 1.14, 1.12) 
v_7 <- c(1.84, 1.37, 1.91, 1.99) 
v_8 <- c(0.16, 0.14, 0.14, 0.14)
v_9 <- c(0.21, 0.20, 0.21, 0.21) 

raw.data <- data.frame(date, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9)

The variables v_1 through v_9 are each assigned a priority, namely "High", "Medium" or "Low"

Priority <- c("High", "Medium", "Low")

  High <- c("v_4", "v_7")
  Medium <- c("v_2", "v_3", "v_9")
  Low <- c("v_1", "v_5", "v_6", "v_8")

Required output in tidy/long format to contain 12 rows and 3 columns.

12 Rows: date variable that is repeated thrice.

3 columns: date, Priority and Priority Total. (Priority Total is the row wise sum of the variables falling under each priority category)

      date        Priority    Priority_Total
  1  2020-12-31     High           2.78
  2  2021-01-29     High           2.20
  3  2021-02-26     High           2.65
  4  2021-03-31     High           2.78
  5  2020-12-31   Medium          28.74
  6  2021-01-29   Medium          25.53
  7  2021-02-26   Medium          26.37
  8  2021-03-31   Medium          27.94
  9  2020-12-31      Low          68.48
  10 2021-01-29      Low          72.27
  11 2021-02-26      Low          70.98
  12 2021-03-31      Low          69.28

Required output obtained successfully using the paste and copy method.

Note that "tmp" is a temporary work space.

  pc_output <- NULL # Required output initialized
  

   # High Priority
    
    tmp <- raw.data[,c("date",High)] %>% # Selecting date and high priority variables. 
    
    mutate(Priority_Total=select(.,-1) %>% # Generating a priority total by summing across the rows.
            apply(1,sum,na.rm=TRUE)) %>% 
            mutate(Priority="High") # Generating a priority category column containing value "High"
  
    tmp <- tmp[,c("date","Priority","Priority_Total")] # Retaining the 3 required columns
    
    pc_output <- rbind(pc_output,tmp) # Merging the derived result with pc_output 
  
# The same procedure is used for medium and low priority variables
  
    # Medium Priority 
    tmp <- raw.data[,c("date",Medium)] %>% # Selecting date and medium priority variables. 
      
      mutate(Priority_Total=select(.,-1) %>% # Generating a priority total by summing across the rows.
               apply(1,sum,na.rm=TRUE)) %>% 
      mutate(Priority="Medium") # Generating a priority category column containing value "Medium"
    
    tmp <- tmp[,c("date","Priority","Priority_Total")] # Retaining the 3 required columns
    
    pc_output <- rbind(pc_output,tmp) # Merging the derived result with pc_output 
    

    # Low Priority 
    tmp <- raw.data[,c("date",Low)] %>% # Selecting date and low priority variables. 
      mutate(Priority_Total=select(.,-1) %>% # Generating a priority total by summing across the rows.
      apply(1,sum,na.rm=TRUE)) %>% 
      mutate(Priority="Low") # Generating a priority category column containing value "Low"
    
    tmp <- tmp[,c("date","Priority","Priority_Total")] # Retaining the 3 required columns
    
    pc_output <- rbind(pc_output,tmp) # Merging the derived result with pc_output
    
    rm(tmp) # Clearing the temporary work space

    print(pc_output)

The "paste and copy" method was run to ensure that all the columns were properly defined.

However when the "for" loop was used, the "undefined columns" problem was encountered.

    output <- NULL

    for (i in (Priority)){
      tmp <- raw.data[,c("date",i)] %>% 
      mutate(Priority_Total=select(.,-1) %>%
          apply(1,sum,na.rm=TRUE)) %>%
          mutate(Priority=i)

      tmp <- tmp[,c("date","Priority","Priority_Total")]

  output <- rbind(output,tmp)
  }

Message : "Error in [.data.frame(raw.data, , c("date", i)) : undefined columns selected"

CodePudding user response：

Loops are rarely the optimal solution to such data transformation problems, as vectorized alternatives exist that are both faster and likely easier to read. As for the error you encountered - c("date", 1) evaluates to a vector of length 2. That is, R tries to evaluate raw.data[, c("date", "1")], while raw.data[, c("date", paste0("v_", i))] might be what you intended to do. Here is dplyr alternative without the for-loop:

raw.data %>%
  pivot_longer(starts_with("v"),
               names_to = "Priority") %>%
  mutate(Priority = case_when(
    Priority %in% c("v_4", "v_7") ~ "High",
    Priority %in% c("v_2", "v_3", "v_9") ~ "Medium",
    Priority %in% c("v_1", "v_5", "v_6", "v_8") ~ "Low",
    TRUE ~ NA_character_
  )) %>%
  group_by(date, Priority) %>%
  summarise(Priority_Total = sum(value, na.rm = T), .groups = "drop") %>%
  arrange(factor(Priority, levels = c("High", "Medium", "Low")))

# A tibble: 12 x 3
   date       Priority Priority_Total
   <chr>      <chr>             <dbl>
 1 2020-12-31 High               2.78
 2 2021-01-29 High               2.2 
 3 2021-02-26 High               2.65
 4 2021-03-31 High               2.78
 5 2020-12-31 Medium            28.7 
 6 2021-01-29 Medium            25.5 
 7 2021-02-26 Medium            26.4 
 8 2021-03-31 Medium            27.9 
 9 2020-12-31 Low               68.5 
10 2021-01-29 Low               72.3 
11 2021-02-26 Low               71.0 
12 2021-03-31 Low               69.3