Home > Software design >  R: using rollmean inside ggplot produces erroneous drop at the end
R: using rollmean inside ggplot produces erroneous drop at the end

Time:11-09

I'm smoothing time-series data and plotting them with ggplot. In the past I smoothed the data with TTR, but recently started smoothing on the fly inside ggplot. However, it's producing two artefacts and I'm not sure what I'm missing here.

  1. smoothing inside ggplot shifts the data along the time axis
  2. smoothing inside ggplot produces an erroneous drop off at the end for one data series, but not the other.
ggplot(data=df, aes(x=date, y=x, color=group)) 
   geom_line(aes(y=rollmean(x, 10, fill=NA, align='left'), color=group), na.rm= TRUE, size=0.75) 

produces

GGplot with rollmean

whereas

df.1.ts<-read.zoo(df[df$group=='series1',], format = "%Y-%m-%d")
df.1.SMA10<-data.frame(apply(df.1.ts[,1,drop=F], 2, SMA, n=10))
df.1.SMA10<-cbind(as.Date(time(df.1.ts)), df.1.SMA10)
df.1.SMA10$group<-'series1'
names(df.1.SMA10)[1]<-'date'

df.2.ts<-read.zoo(df[df$group=='series2',], format = "%Y-%m-%d")
df.2.SMA10<-data.frame(apply(df.2.ts[,1,drop=F], 2, SMA, n=10))
df.2.SMA10<-cbind(as.Date(time(df.2.ts)), df.2.SMA10)
df.2.SMA10$group<-'series2'
names(df.2.SMA10)[1]<-'date'

df.SMA10<-rbind(df.1.SMA10, df.2.SMA10)

ggplot(data=df.SMA10, aes(x=date, y=x, color=group))  
  geom_line(size=0.75, na.rm=T) 

produces

Ggplot with pre-smoothed time-series

Sample data:

df<-structure(list(date = structure(c(14242, 14243, 14244, 14245, 
14246, 14247, 14248, 14249, 14250, 14251, 14252, 14253, 14254, 
14255, 14256, 14257, 14258, 14259, 14260, 14261, 14262, 14263, 
14264, 14265, 14266, 14267, 14268, 14269, 14270, 14271, 14272, 
14273, 14274, 14275, 14276, 14277, 14278, 14279, 14280, 14281, 
14282, 14283, 14284, 14285, 14286, 14287, 14288, 14289, 14290, 
14291, 14292, 14293, 14294, 14295, 14296, 14297, 14298, 14299, 
14300, 14301, 14302, 14303, 14304, 14305, 14306, 14307, 14308, 
14309, 14310, 14311, 14312, 14313, 14314, 14315, 14316, 14317, 
14318, 14319, 14320, 14321, 14322, 14323, 14324, 14325, 14326, 
14327, 14328, 14329, 14330, 14331, 14332, 14333, 14334, 14335, 
14214, 14215, 14216, 14217, 14218, 14219, 14220, 14221, 14222, 
14223, 14224, 14225, 14226, 14227, 14228, 14229, 14230, 14231, 
14232, 14233, 14234, 14235, 14236, 14237, 14238, 14239, 14240, 
14241, 14242, 14243, 14244, 14245, 14246, 14247, 14248, 14249, 
14250, 14251, 14252, 14253, 14254, 14255, 14256, 14257, 14258, 
14259, 14260, 14261, 14262, 14263, 14264, 14265, 14266, 14267, 
14268, 14269, 14270, 14271, 14272, 14273, 14274, 14275, 14276, 
14277, 14278, 14279, 14280, 14281, 14282, 14283, 14284, 14285, 
14286, 14287, 14288, 14289, 14290, 14291, 14292, 14293, 14294, 
14295, 14296, 14297, 14298, 14299, 14300, 14301, 14302, 14303, 
14304, 14305, 14306, 14307, 14308, 14309, 14310, 14311, 14312, 
14313, 14314, 14315, 14316, 14317, 14318, 14319, 14320, 14321, 
14322, 14323, 14324, 14325, 14326), class = "Date"), x = c(0.859649122807018, 
0.583333333333333, 0.868055555555556, 0.78125, 0.524305555555556, 
0.475694444444444, 0.538194444444444, 0.798611111111111, 0.576388888888889, 
0.819444444444444, 0.746527777777778, 0.725694444444444, 0.336805555555556, 
0.263888888888889, 0.486111111111111, 0.701388888888889, 0.864583333333333, 
0.701388888888889, 0.524305555555556, 0.916666666666667, 0.715277777777778, 
0.857638888888889, 0.305555555555556, 0.701388888888889, 0.774305555555556, 
0.857638888888889, 0.961805555555556, 0.840277777777778, 0.913194444444444, 
0.909722222222222, 0.746527777777778, 0.711805555555556, 0.895833333333333, 
0.666666666666667, 0.993055555555556, 0.96875, 0.597222222222222, 
0.725694444444444, 0.791666666666667, 0.875, 0.9375, 0.788194444444444, 
0.708333333333333, 0.951388888888889, 0.819444444444444, 0.989583333333333, 
0.965277777777778, 0.947916666666667, 0.996527777777778, 0.979166666666667, 
0.944444444444444, 0.902777777777778, 0.996527777777778, 0.975694444444444, 
1, 1, 1, 1, 0.96875, 0.993055555555556, 0.982638888888889, 0.729166666666667, 
1, 0.993055555555556, 0.975694444444444, 0.996527777777778, 0.993055555555556, 
0.975694444444444, 0.996527777777778, 0.989583333333333, 0.996527777777778, 
1, 0.975694444444444, 0.996527777777778, 1, 0.989583333333333, 
0.996527777777778, 1, 0.996527777777778, 0.975694444444444, 0.975694444444444, 
0.979166666666667, 0.944444444444444, 0.989583333333333, 1, 0.986111111111111, 
0.951388888888889, 0.979166666666667, 0.993055555555556, 0.989583333333333, 
0.951388888888889, 0.996527777777778, 0.993055555555556, 1, 0.0390070921985816, 
0.0173611111111111, 0.229166666666667, 0, 0, 0.107638888888889, 
0.0208333333333333, 0.0763888888888889, 0, 0.121527777777778, 
0.00694444444444444, 0.159722222222222, 0.59375, 0.131944444444444, 
0.131944444444444, 0.0138888888888889, 0.00694444444444444, 0.0659722222222222, 
0.461805555555556, 0.277777777777778, 0.638888888888889, 0.784722222222222, 
0.892361111111111, 0.6875, 0.631944444444444, 0.180555555555556, 
0.00347222222222222, 0.166666666666667, 0.152777777777778, 0, 
0.659722222222222, 0.53125, 0.159722222222222, 0.232638888888889, 
0.673611111111111, 0.670138888888889, 0.631944444444444, 0.760416666666667, 
0.829861111111111, 0.902777777777778, 0.788194444444444, 0.638888888888889, 
0.65625, 0.836805555555556, 0.680555555555556, 0.715277777777778, 
0.677083333333333, 0.798611111111111, 0.579861111111111, 0.788194444444444, 
0.826388888888889, 0.895833333333333, 0.899305555555556, 0.930555555555556, 
0.958333333333333, 0.90625, 0.861111111111111, 0.934027777777778, 
0.798611111111111, 0.888888888888889, 0.961805555555556, 0.975694444444444, 
0.993055555555556, 0.996527777777778, 0.850694444444444, 0.902777777777778, 
0.979166666666667, 0.986111111111111, 0.993055555555556, 0.975694444444444, 
0.809027777777778, 0.972222222222222, 0.951388888888889, 0.899305555555556, 
0.930555555555556, 0.961805555555556, 0.996527777777778, 0.989583333333333, 
0.961805555555556, 0.965277777777778, 0.989583333333333, 0.989583333333333, 
0.940972222222222, 0.996527777777778, 0.947916666666667, 0.982638888888889, 
1, 1, 0.979166666666667, 0.909722222222222, 0.930555555555556, 
0.704861111111111, 0.833333333333333, 0.902777777777778, 0.940972222222222, 
0.96875, 0.802083333333333, 0.836805555555556, 0.989583333333333, 
0.961805555555556, 1, 0.993055555555556, 0.809027777777778, 0.989583333333333, 
0.993055555555556, 0.954861111111111, 0.979166666666667, 0.989583333333333, 
0.982638888888889, 0.989583333333333, 1, 0.961805555555556, 0.925581395348837
), group = c("series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series1", 
"series1", "series1", "series1", "series1", "series1", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2", "series2", "series2", 
"series2", "series2", "series2", "series2")), row.names = c(NA, 
-207L), class = "data.frame")

CodePudding user response:

In your ggplot(.) code, you are calling rollmean(x, ...), which is rolling on all of x, regardless of group. If you want it to be per-group, you can do the following:

ggplot(data=df, aes(x=date, y=x, color=group)) 
   geom_line(aes(y=ave(x, group, FUN = function(Z) zoo::rollmean(Z, 10, fill=NA, align='left')), 
                 color=group), na.rm= TRUE, size=0.75)

enter image description here

I tend to put aggregating/data-munging code outside of ggplot2, though, and this would have helped identify the problem:

df$rolly <- zoo::rollmean(df$x, 10, fill=NA, align='left')

xtabs(~ group   is.na(rolly), data = df)
#          is.na(rolly)
# group     FALSE TRUE
#   series1    94    0
#   series2   104    9

df[c(1:3, 92:97, 196:207),]
#           date          x   group      rolly
# 1   2008-12-29 0.85964912 series1 0.68249269
# 2   2008-12-30 0.58333333 series1 0.67118056
# 3   2008-12-31 0.86805556 series1 0.68541667
# 92  2009-03-30 0.99652778 series1 0.34035904
# 93  2009-03-31 0.99305556 series1 0.24834515
# 94  2009-04-01 1.00000000 series1 0.14903960
# 95  2008-12-01 0.03900709 series2 0.06119238
# 96  2008-12-02 0.01736111 series2 0.05798611
# 97  2008-12-03 0.22916667 series2 0.07222222
# 196 2009-03-12 0.99305556 series2 0.96805556
# 197 2009-03-13 0.80902778 series2 0.96493056
# 198 2009-03-14 0.98958333 series2 0.97658592
# 199 2009-03-15 0.99305556 series2         NA
# 200 2009-03-16 0.95486111 series2         NA
# 201 2009-03-17 0.97916667 series2         NA
# 202 2009-03-18 0.98958333 series2         NA
# 203 2009-03-19 0.98263889 series2         NA
# 204 2009-03-20 0.98958333 series2         NA
# 205 2009-03-21 1.00000000 series2         NA
# 206 2009-03-22 0.96180556 series2         NA
# 207 2009-03-23 0.92558140 series2         NA

Where I would expect the last 9 rows of each series to be NA, not just one series. We can fix that with:

df$rolly <- ave(df$x, df$group, FUN = function(Z) zoo::rollmean(Z, 10, fill=NA, align='left'))
df[c(1:3, 82:97, 196:207),]
#           date          x   group      rolly
# 1   2008-12-29 0.85964912 series1 0.68249269
# 2   2008-12-30 0.58333333 series1 0.67118056
# 3   2008-12-31 0.86805556 series1 0.68541667
# 82  2009-03-20 0.97916667 series1 0.97638889
# 83  2009-03-21 0.94444444 series1 0.97812500
# 84  2009-03-22 0.98958333 series1 0.98298611
# 85  2009-03-23 1.00000000 series1 0.98402778
# 86  2009-03-24 0.98611111 series1         NA
# 87  2009-03-25 0.95138889 series1         NA
# 88  2009-03-26 0.97916667 series1         NA
# 89  2009-03-27 0.99305556 series1         NA
# 90  2009-03-28 0.98958333 series1         NA
# 91  2009-03-29 0.95138889 series1         NA
# 92  2009-03-30 0.99652778 series1         NA
# 93  2009-03-31 0.99305556 series1         NA
# 94  2009-04-01 1.00000000 series1         NA
# 95  2008-12-01 0.03900709 series2 0.06119238
# 96  2008-12-02 0.01736111 series2 0.05798611
# 97  2008-12-03 0.22916667 series2 0.07222222
# 196 2009-03-12 0.99305556 series2 0.96805556
# 197 2009-03-13 0.80902778 series2 0.96493056
# 198 2009-03-14 0.98958333 series2 0.97658592
# 199 2009-03-15 0.99305556 series2         NA
# 200 2009-03-16 0.95486111 series2         NA
# 201 2009-03-17 0.97916667 series2         NA
# 202 2009-03-18 0.98958333 series2         NA
# 203 2009-03-19 0.98263889 series2         NA
# 204 2009-03-20 0.98958333 series2         NA
# 205 2009-03-21 1.00000000 series2         NA
# 206 2009-03-22 0.96180556 series2         NA
# 207 2009-03-23 0.92558140 series2         NA

or, if you're comfortable with dplyr, then

library(dplyr)
df %>% 
  group_by(group) %>% 
  mutate(rolly = zoo::rollmean(x, 10, fill=NA, align='left')) %>% 
  ungroup() %>% 
  ggplot(aes(x=date, y=x, color=group))   
  geom_line(aes(y=rolly, color=group), na.rm= TRUE, size=0.75)
  • Related