I am new at writing functions in R, and I am trying to calculate Symmetric Mean Absolute Error (SMAPE) performance by month for one of my models. The basic function works but calculates a single value instead of different values for each month in the dataset. Here is a reproducible example:
structure(list(date = structure(c(18948, 18949, 18950, 18951,
18952, 18953, 18954, 18955, 18956, 18957, 18958, 18959, 18960,
18961, 18962, 18963, 18964, 18965, 18966, 18967, 18968, 18969,
18970, 18971, 18972, 18973, 18974, 18975, 18976, 18977, 18978,
18979, 18980, 18981, 18982, 18983, 18984, 18985, 18986, 18987,
18988, 18989, 18990, 18991, 18992, 18993, 18994, 18995, 18996,
18997, 18998, 18999, 19000, 19001, 19002, 19003, 19004, 19005,
19006, 19007, 19008, 19009, 19010, 19011, 19012, 19013, 19014,
19015, 19016, 19017, 19018, 19019, 19020, 19021, 19022, 19023,
19024, 19025, 19026, 19027, 19028, 19029, 19030, 19031, 19032,
19033, 19034, 19035, 19036, 19037, 19038, 19039, 19040, 19041,
19042, 19043), class = "Date"), actual = c(2875, 2755, 2440,
2220, 1378, 1352, 2616, 1709, 1475, 2315, 2223, 4357, 3037, 1725,
2332, 2358, 3135, 3232, 3497, 2876, 2971, 3530, 4268, 4692, 3589,
3496, 4233, 4336, 5810, 6943, 8921, 7491, 8607, 10450, 11309,
13367, 18607, 23426, 19244, 29256, 21001, 27023, 29346, 39840,
41210, 37503, 38473, 35618, 40713, 39363, 43142, 44309, 38706,
34988, 33483, 28847, 32719, 31248, 31502, 19896, 19025, 23586,
20977, 22323, 23900, 22966, 15038, 14283, 15827, 13900, 18274,
18325, 17514, 10616, 8828, 10580, 8888, 15072, 14208, 14426,
7815, 6841, 7257, 8003, 11034, 10637, 10189, 6143, 4401, 5911,
6164, 8030, 10151, 4180, 6929, 3377), consensus2 = c(2899, 2735,
2485, 2199, 1297, 1414, 3026, 1535, 1588, 2435, 2341, 3095, 2241,
2480, 3098, 2513, 2886, 3289, 3427, 3060, 3050, 3564, 3803, 4204,
3188, 3184, 4071, 4063, 4974, 5839, 6641, 6146, 6620, 8446, 11112,
13071, 14963, 18807, 20670, 21149, 22824, 28484, 29376, 31969,
37669, 37706, 42511, 39104, 41362, 44855, 48043, 46670, 40384.96296,
42612.53704, 37730, 38351, 33813, 35651, 31475, 19364, 19364,
19892, 20436, 21114, 21221, 23002, 18035, 15320, 16292, 15735,
14726, 17844, 17635.77778, 11904.48148, 10763.7037, 9986.611111,
9986.611111, 10604.22222, 14246.90741, 14113.55556, 9113.425926,
8236.5, 8759.888889, 7436.462963, 10489.37037, 10507.09259, 9969.5,
5272.111111, 5729.092593, 5989.055556, 6245, 8267.314815, 7844.481481,
3176.703704, 8661.944444, 3320.055556)), row.names = c(NA, -96L
), class = c("tbl_df", "tbl", "data.frame"))
library(lubridate)
library(tidyverse)
data<- data %>% dplyr::select (date, actual, consensus2) %>%
dput()
data$month<- lubridate::month(data$date,label = TRUE)
data<- data %>% mutate(month= as.factor(month))
#Function
smape1 <- function(a, f) {for (i in 1:(nlevels(data$month))) { return (1/length(a) * sum(2*abs(f-a) / (abs(a) abs(f))*100))}}
SMAPE_bymonth<- by(data,data$month, function(a,f)smape1(data$actual,data$consensus2))
SMAPE_bymonth
CodePudding user response:
Not clear about the for
loop inside the smape1
function. If we remove that and create the function with two arguments (a
, f
) that takes the columns from the data, then we just need to group by the 'month' and apply the function by selecting those columns
library(dplyr)
smape2 <- function(a, f)
{
return(1/length(a) * sum(2*abs(f-a) / (abs(a) abs(f))*100))
}
data %>%
group_by(month) %>%
summarise(smape = smape2(actual, consensus2), .groups = 'drop')
# A tibble: 4 × 2
month smape
<ord> <dbl>
1 Jan 8.87
2 Feb 12.1
3 Nov 11.3
4 Dec 12.0
Or using by
, the lambda function function(x)
returns the blocks of grouped data
from the first argument, which is used as input argument after extracting the column 'actual', 'consensus2' instead of from the whole data (data$
)
by(data, droplevels(data$month), function(x) smape2(x$actual,x$consensus2))
droplevels(data$month): Jan
[1] 8.870074
-----------------------------------------------------------------------------------------------------------------------
droplevels(data$month): Feb
[1] 12.05893
-----------------------------------------------------------------------------------------------------------------------
droplevels(data$month): Nov
[1] 11.26306
-----------------------------------------------------------------------------------------------------------------------
droplevels(data$month): Dec
[1] 11.96994