Having an issue in saving fitted and observed values in lm model?-CodePudding

I run a linear regression model with looping over the categorical levels of one column of my data. I want to save some of the model summary output.

I achieved to save some of them include coefficients, R2 and Pvalue. However, when I save the fitted and observed values in a data.frame called obsFit, it returns the fitted and observed values of only one categorical level as I need 7 levels.

Any ideas and thoughts?

So sorry for hardcoding R, I am still a novice who is in the learning process.

Here is the chuck of code I used.

# Wilayah 1 vs MADA4 model
> Y1 <- yield[ ,2:5]
> mada4 <- stns[["S1"]][["Mada4_sum"]]
> unique(mada4$aggre_per)
[1] "3"     "4"     "5"     "6"     "7"     "mamjj"
[7] "amj"  
> 
> # create an empty matrix to fill results
> df <- matrix(rep(NA, 9))
> 
> for (agg.per in unique(mada4$aggre_per)) {
    
    subdata <- subset(mada4, aggre_per == agg.per)
    subdata <- cbind(subdata, Y1$I_S1)
    names(subdata)[4] <- "I_S1"
    
    #save model summary
    mod <- lm('I_S1 ~ value', subdata)
    slope     <- summary(mod)[[4]][2,1]
    intercept <- summary(mod)[[4]][1,1]
    r2        <- summary(mod)[[8]]
    adjr2     <- summary(mod)[[9]]
    fstat     <- summary(mod)[[10]][[1]]
    pval      <- summary(mod)[[4]][2,4]
    pearson   <- cor(subdata$I_S1,subdata$value )
    res <- c("Mada4_sum", agg.per, slope, intercept, r2, adjr2, fstat, pval, pearson)
    df <- cbind(df, res)
    
    #save the fitted and observed yields as data.frame
    obsFit <- data.frame(cbind("Mada4_sum", agg.per,subdata$I_S1, fitted(mod)))
    
  }
> 
> df <- data.frame(t(df[,-1]))
> df[,-2:-1] <- apply (df[,-2:-1], 2, as.numeric)
> df[,-2:-1] <- apply (df[,-2:-1], 2, round, digit =2)
> rownames(df) <- 1:nrow(df)
> names(df) <-c( "WI",
                 "aggre_per",
                 "slope",
                 "intercept",
                 "R2",
                 "adjustedR2",
                 "F-stat",
                 "p-value",
                 "correlation")
> head(df)
         WI aggre_per slope intercept   R2 adjustedR2
1 Mada4_sum         3 -1.17   6230.91 0.09       0.03
2 Mada4_sum         4 -0.27   6128.81 0.00      -0.07
3 Mada4_sum         5 -1.04   6240.46 0.02      -0.05
4 Mada4_sum         6 -2.88   6507.82 0.14       0.08
5 Mada4_sum         7 -1.12   6296.72 0.05      -0.02
6 Mada4_sum     mamjj -0.89   6760.92 0.18       0.12
  F-stat p-value correlation
1   1.40    0.26       -0.30
2   0.07    0.80       -0.07
3   0.23    0.64       -0.13
4   2.26    0.15       -0.37
5   0.76    0.40       -0.23
6   3.12    0.10       -0.43
> #check levels of `aggre_per`
> unique(df$aggre_per)
[1] "3"     "4"     "5"     "6"     "7"     "mamjj"
[7] "amj"  
> 
> names(obsFit) <- c("WI", "Aggr.per", "Observed", "Fitted")
> print(obsFit)
           WI Aggr.per Observed           Fitted
103 Mada4_sum      amj     5204 6019.05130014577
104 Mada4_sum      amj     5824 6061.66361454587
105 Mada4_sum      amj     5481 6129.49546195825
106 Mada4_sum      amj     5587 5962.52476063545
107 Mada4_sum      amj     6260 6124.27762754192
108 Mada4_sum      amj     5771   6017.312022007
109 Mada4_sum      amj     5899 5888.60543973734
110 Mada4_sum      amj     6338 6189.50055774613
111 Mada4_sum      amj     6340 6023.39949549272
112 Mada4_sum      amj     6013 6146.01860427665
113 Mada4_sum      amj     6075 6221.67720331355
114 Mada4_sum      amj     6484 5959.91584342728
115 Mada4_sum      amj     6771 5998.17996248043
116 Mada4_sum      amj     6493 6249.50565353401
117 Mada4_sum      amj     6386 6290.37868979533
118 Mada4_sum      amj     6465 6109.49376336229
>

Hereis the dput of my data?

> dput(mada4)
structure(list(year = c(2001, 2002, 2003, 2004, 2005, 2006, 2007, 
2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2001, 2002, 
2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 
2014, 2015, 2016, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 
2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2001, 2002, 2003, 
2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 
2015, 2016, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 
2010, 2011, 2012, 2013, 2014, 2015, 2016, 2001, 2002, 2003, 2004, 
2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 
2016, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 
2011, 2012, 2013, 2014, 2015, 2016), aggre_per = c("3", "3", 
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", 
"3", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", 
"4", "4", "4", "4", "5", "5", "5", "5", "5", "5", "5", "5", "5", 
"5", "5", "5", "5", "5", "5", "5", "6", "6", "6", "6", "6", "6", 
"6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"mamjj", "mamjj", "mamjj", "mamjj", "mamjj", "mamjj", "mamjj", 
"mamjj", "mamjj", "mamjj", "mamjj", "mamjj", "mamjj", "mamjj", 
"mamjj", "mamjj", "amj", "amj", "amj", "amj", "amj", "amj", "amj", 
"amj", "amj", "amj", "amj", "amj", "amj", "amj", "amj", "amj"
), value = c(189, 6, 212, 81, 101, 84, 163, 198, 143, 128, 456, 
93, 37, 23, 58, 0, 210, 196, 70, 122, 132, 184, 387, 57, 232, 
153, 11, 325, 228, 75, 41, 31, 141, 95, 219, 217, 178, 154, 139, 
112, 164, 62, 106, 202, 101, 123, 108, 252, 175, 186, 110, 252, 
95, 190, 150, 161, 125, 165, 176, 67, 221, 63, 65, 139, 100, 
267, 207, 202, 48, 322, 240, 250, 270, 298, 159, 164, 178, 29, 
94, 164, 815, 750, 818, 874, 554, 934, 1079, 778, 934, 806, 908, 
851, 765, 313, 366, 586, 526, 477, 399, 591, 405, 528, 676, 330, 
521, 380, 293, 594, 550, 261, 214, 422)), row.names = c(1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 
31L, 32L, 33L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 
45L, 46L, 47L, 48L, 49L, 50L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 69L, 70L, 71L, 72L, 
73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 86L, 
87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 
100L, 101L, 103L, 104L, 105L, 106L, 107L, 108L, 109L, 110L, 111L, 
112L, 113L, 114L, 115L, 116L, 117L, 118L), class = "data.frame")
>

> dput(yield)
structure(list(date = structure(c(11323, 11688, 12053, 12418, 
12784, 13149, 13514, 13879, 14245, 14610, 14975, 15340, 15706, 
16071, 16436, 16801), class = "Date"), I_S1 = c(5204, 5824, 5481, 
5587, 6260, 5771, 5899, 6338, 6340, 6013, 6075, 6484, 6771, 6493, 
6386, 6465), II_S1 = c(5081, 5674, 5589, 5809, 6055, 6231, 5728, 
5971, 6268, 5999, 6015, 6624, 6827, 6453, 6382, 6340), III_S1 = c(5071, 
5349, 5703, 5451, 5764, 5528, 5629, 5395, 5797, 5785, 5887, 5581, 
5899, 6179, 6086, 5734), IV_S1 = c(5515, 5491, 5969, 5971, 6265, 
6242, 6153, 6188, 6311, 6447, 6213, 6410, 6744, 6754, 6247, 6415
), I_S2 = c(4735, 5227, 5415, 5894, 3294, 5113, 6056, 6216, 5752, 
5758, 5540, 5418, 6461, 5346, 6437, 5633), II_S2 = c(4837, 5209, 
5111, 5482, 3203, 5389, 5209, 6041, 5617, 5927, 4993, 5342, 6380, 
5122, 6320, 5470), III_S2 = c(4720, 4986, 4846, 4720, 3968, 4912, 
4437, 5675, 5368, 5638, 4892, 4849, 6117, 4996, 5586, 4666), 
    IV_S2 = c(5054, 5584, 5546, 5666, 5116, 6288, 6089, 6529, 
    5880, 6156, 5263, 5358, 6804, 5212, 6174, 5560)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -16L))

CodePudding user response：

You could do this in a by which splits the data by condition and applies a function, later rbind the resulting list. The obsFit you could pass through as an attribute.

res <- by(mada4, mada4$aggre_per, \(x) {
  x <- cbind(x, yield[ ,2:5])
  sm <- summary(mod <- lm(I_S1 ~ value, x))
  o <- setNames(as.vector(sm$coefficients[, c(1, 4)])[c(1, 2, 4)],
                c('Intercept', 'slope', 'pval'))
  o <- c(o, sapply(sm[c('r.squared', 'adj.r.squared', 'fstatistic')], `[`, 1))
  o <- c(o, pearson=with(x, cor(I_S1, value)))[c(2, 1, 4, 5, 6, 3, 7)]
  ft <- cbind(x, x$I_S1, fitted(mod))
  return(`attr<-`(o, 'obsFit', ft))
})

Results

do.call(rbind, res) |> signif(4)
#         slope Intercept r.squared adj.r.squared fstatistic.value    pval  pearson
# 3     -1.1680      6231   0.09105      0.026130          1.40200 0.25600 -0.30170
# 4     -0.2730      6129   0.00464     -0.066460          0.06526 0.80210 -0.06812
# 5     -1.0350      6240   0.01636     -0.053900          0.23290 0.63680 -0.12790
# 6     -2.8780      6508   0.13910      0.077660          2.26300 0.15470 -0.37300
# 7     -1.1220      6297   0.05134     -0.016420          0.75760 0.39880 -0.22660
# amj   -0.8696      6476   0.07058      0.004188          1.06300 0.32000 -0.26570
# mamjj -0.8889      6761   0.18210      0.123700          3.11700 0.09926 -0.42670

obsFits

obsFits <- lapply(res, attr, 'obsFit')
obsFits$`3`  ## first aggre_per `3`
#    year aggre_per value I_S1 II_S1 III_S1 IV_S1 x$I_S1 fitted(mod)
# 1  2001         3   189 5204  5081   5071  5515   5204    6010.131
# 2  2002         3     6 5824  5674   5349  5491   5824    6223.904
# 3  2003         3   212 5481  5589   5703  5969   5481    5983.264
# 4  2004         3    81 5587  5809   5451  5971   5587    6136.292
# 5  2005         3   101 6260  6055   5764  6265   6260    6112.929
# 6  2006         3    84 5771  6231   5528  6242   5771    6132.788
# 7  2007         3   163 5899  5728   5629  6153   5899    6040.503
# 8  2008         3   198 6338  5971   5395  6188   6338    5999.618
# 9  2009         3   143 6340  6268   5797  6311   6340    6063.866
# 10 2010         3   128 6013  5999   5785  6447   6013    6081.389
# 11 2011         3   456 6075  6015   5887  6213   6075    5698.234
# 12 2012         3    93 6484  6624   5581  6410   6484    6122.274
# 13 2013         3    37 6771  6827   5899  6744   6771    6187.691
# 14 2014         3    23 6493  6453   6179  6754   6493    6204.045
# 15 2015         3    58 6386  6382   6086  6247   6386    6163.160
# 16 2016         3     0 6465  6340   5734  6415   6465    6230.913

Note: R >= 4.1 used.