Home > Software design >  Adding categorical control variables to nlsLM regression
Adding categorical control variables to nlsLM regression

Time:12-03

I am trying to run an Nonlinear Least Squares regression to estimate three parameters while controlling for categorical variables. I am currently using the nlsLM function from the minpack.lm package for this.

I have the following data set:

df <- data.frame(Year=c(1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1992, 1992, 1992, 1992, 1992, 1992, 1992, 1992, 1993, 1993, 1993, 1993,
                           1993, 1993, 1993, 1993, 1994, 1994, 1994, 1994, 1994, 1994, 1994, 1994, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996,
                           1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 2000, 2000, 2000, 2000,
                           2000, 2000, 2000, 2000, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003),
                Color=c("blue",  "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red", "white", 
                            "brown", "blue",  "green", "yellow", "orange", "purple", "red",  "white", "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",   
                            "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red", "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple",
                            "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange",
                            "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow",
                            "orange", "purple", "red",   "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green", 
                            "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",  
                            "green",  "yellow", "orange", "purple", "red",    "white",  "brown"),
                Y=c(6.9, 53.6,  3.9,  7.6, 17.3, 29.9, 35.1,  6.2,  6.9, 53.6,  3.6,  8.8, 10.6, 29.9, 23.2,  8.8,  5.8, 51.0,  5.8,  3.9,  9.9, 21.0, 35.8,  6.9,  3.9, 69.5,  5.4,  3.6,
                           13.2, 32.8, 27.3,  8.0,  6.2, 66.2,  3.2,  3.9, 10.6, 27.6, 23.9, 11.7,  8.8, 49.5,  4.3,  4.7,  7.3, 33.2, 18.8, 18.4,  8.8, 49.9,  2.5, 27.6, 11.4, 56.9, 16.9,  9.9,
                           3.6, 59.9,  0.6, 19.9, 16.2, 38.4, 19.9, 12.8,  7.3, 49.5,  2.5, 11.4, 11.4, 32.5, 25.8, 31.4,  4.7, 60.6,  5.4, 14.3, 16.5, 51.4, 26.5, 21.4,  6.5, 61.4,  5.1, 14.7,
                           12.1, 53.6, 22.1, 15.8,  6.5, 61.0,  3.9, 14.3, 12.1, 69.1, 28.4, 18.8,  6.5, 76.9,  1.7,  8.0,  9.1, 43.9, 21.0, 17.3,  3.6, 63.6,  2.8,  9.9,  5.1, 35.1, 20.6, 16.5),
                Value=c(45048.7, 218638.3,  39069.9,  10740.1,  62575.7,  76967.4, 226646.2,  36693.8,  40915.0, 247665.1,  43910.4,  11429.4,  60295.5,  76426.6, 244191.4,
                               36749.2,  35005.8, 228515.1,  42248.2,  10285.1,  60681.4,  72030.6, 229893.0,  36404.7,  43749.9, 268866.1,  38835.1,  11899.6,  58424.4,  82731.1,
                               255466.1,  31277.1,  55047.2, 305402.5,  39084.3,  13398.4,  65122.4,  79750.5, 281509.4,  35542.1,  47780.8, 327010.6,  44074.8,  14565.8,  70142.8,
                               104683.1, 315443.8,  46939.5,  41387.0, 327226.5,  44330.9,  16046.2,  67922.8, 122232.1, 323685.2,  44895.5,  36323.1, 346799.2,  43400.6,  16547.5,
                               77243.2, 111932.1, 331698.8,  47992.3,  34636.8, 357551.3,  41798.8,  17346.3,  87586.4,  99095.4, 366299.7,  53745.3,  39918.4, 357564.7,  43367.9,
                               17921.5,  96130.4, 101582.7, 399612.1,  40792.3,  45870.7, 360308.6,  46312.0,  20444.3, 101972.7,  96745.6, 439824.2,  49499.2,  48152.0, 346522.2,
                               54800.0,  20503.6,  98936.7, 105203.3, 436226.9,  40983.5,  53812.9, 351838.8,  55071.2,  20865.7,  99782.6, 112538.4, 474671.2,  43175.7,  53994.5,
                               333412.4,  54407.9,  19528.1,  95297.1, 101047.5, 470599.2,  33293.8),
                Amount=c(22357.1,  45323.2,   7060.7,      0.2, 103671.4, 100515.1, 122229.3,   1254.9,  78600.7,  48483.2,   6291.6,   1059.7,  28861.1, 179036.4,  40044.7,
                                12921.4,  19601.9,   6095.1,   4667.4,   2194.7,  22358.8, 161020.1,  40368.1,   4000.5, 139611.6,  45724.9,   1262.3,     86.4,  88898.4,  85844.9,
                                262167.2,  19233.5,  21174.3,  16797.2,    246.0,   4284.0, 124309.9, 109092.7,  80172.1,   5315.0,  17300.8,  58570.1,   4240.7,  29715.0,  67126.6,
                                42928.3, 132263.8,  12182.9,  77751.4, 117453.7,    443.9,  21868.6,  63683.6, 212790.1,  28990.6,      0.2,  39413.4, 134290.1,   4665.5,      0.2,
                                135307.1, 114914.2, 258602.7,      0.2,   3391.7,  74113.6,   3070.4,  17796.6,   6223.9, 188960.2, 260430.1,      0.2,  16379.0,  37389.8,   2587.3,
                                1149.9,  54814.3, 183559.8,  55877.1,      0.2,   5835.3,  39010.5,   8263.9,  13463.9,  40232.7, 152270.9, 314975.1, 119611.4,   5811.2, 102397.5,
                                6479.1,    890.6,  24356.6,  68414.0,  85800.6,  16564.8,   9218.9, 170079.5,   5181.0,   3378.0,  37603.9,  98078.2, 533192.3,   5753.8,  41286.3,
                                43227.9,   2494.7,   9025.1,  20819.6, 45227.4, 563984.9,  7129.6))

In the following function, I am estimating parameters z, k and g. Variables "Y", "Value" and "Amount" is given by my dataset. The following code works for me:

library(minpack.lm)

### I set the following starting values for z, k and g:
z <- 10
k <- 0.1
g <- 1

### This is my nls function and formula:
nlsfit <- nlsLM(formula = log(Y) ~ (k/z)*log(Value^z   g*Amount^z),
                   data = df,
                   control = nls.lm.control(ftol = 1e-10, ptol = 1e-10, maxiter = 280),
                   start = list(z = z, k = k, g = g))

However, I know that variables "color" and "Year" may have an impact on my regression and results, and thus I want to control for these. In a regular lm regression, I am able to add these categorical variables, but in the nlsLM function, I get an error. When adding Color as a control variable, I get:

> nlsfit <- nlsLM(formula = log(Y) ~ (k/z)*log(Value^z   g*Amount^z)   Color,
                     data = df,
                     control = nls.lm.control(ftol = 1e-10, ptol = 1e-10, maxiter = 280),
                     start = list(z = z, k = k, g = g))

Error in (k/z) * log(Value^z   g * Amount^z)   Color : 
  non-numeric argument to binary operato

And when adding factor(Year) as a control variable, I get:

> nlsfit <- nlsLM(formula = log(Y) ~ (k/z)*log(Value^z   g*Amount^z)   factor(Year),
                     data = df,
                     control = nls.lm.control(ftol = 1e-10, ptol = 1e-10, maxiter = 280),
                     start = list(z = z, k = k, g = g))

Error in numericDeriv(form[[3L]], names(ind), env) : 
  Missing value or an infinity produced when evaluating the model

I want to add both Color and Year in the (same) nls-function as categorical control variables.

I know NLS might have some problems with categorical variables. I am appreciative for any help or suggestions for other types of solutions or work-arounds.

CodePudding user response:

We can avoid having to provide most of the starting values by switching to nls and using the plinear algorithm. It only requires initial values for those parameters that enter non-linearly. The starting values for the linear parameters, i.e. k and all the Color and Year starting values, can be omitted.

Playing around with it a bit we notice that the g parameter has convergence problems so let us fix a sequence of g parameters and optimize on the rest taking the g which gives the least deviance (i.e. least residual sum of squares).

In fm only the Color and k parameters are significant so refit without the Year parameters (fm2).

g_seq <- seq(-1, 1, .01)
mColor <- model.matrix(~ Color, df)[, -1]
mYear <- model.matrix(~ Year, transform(df, Year = factor(Year)))[, -1]
rss <- sapply(g_seq, function(g)
  try(deviance(nls(log(Y) ~ cbind(kz = log(Value^z   g*Amount^z), mYear, mColor),
                    df, start = list(z = 1), algorithm = "plinear"))))
rss <- as.numeric(rss)
g <- g_seq[which.min(rss)]
g
## [1] 0.19

# fit with best g obtained above
fm <- nls(log(Y) ~ cbind(k = (1/z) * log(Value^z   g*Amount^z), mYear, mColor), df,
                    start = list(z = 1), algorithm = "plinear")

# fit without mYear
fm2 <- nls(log(Y) ~ cbind(k = (1/z) * log(Value^z   g*Amount^z), mColor), df,
                    start = list(z = 1), algorithm = "plinear")

summary(fm2)

giving

Formula: log(Y) ~ cbind(k = (1/z) * log(Value^z   g * Amount^z), mColor)

Parameters:
                 Estimate Std. Error t value Pr(>|t|)    
z                 0.97166    5.47654   0.177 0.859525    
.lin.k            0.16467    0.01472  11.185  < 2e-16 ***
.lin.Colorbrown   0.81941    0.15972   5.130 1.37e-06 ***
.lin.Colorgreen   1.98067    0.17444  11.354  < 2e-16 ***
.lin.Colororange  0.60442    0.14776   4.091 8.55e-05 ***
.lin.Colorpurple  0.53150    0.15857   3.352 0.001124 ** 
.lin.Colorred     1.70567    0.16161  10.554  < 2e-16 ***
.lin.Colorwhite   1.07264    0.16924   6.338 6.24e-09 ***
.lin.Coloryellow -0.60455    0.16543  -3.654 0.000408 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4083 on 103 degrees of freedom

Number of iterations to convergence: 7 
Achieved convergence tolerance: 9.09e-06

CodePudding user response:

How about this:

  df <- data.frame(Year=c(1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1992, 1992, 1992, 1992, 1992, 1992, 1992, 1992, 1993, 1993, 1993, 1993,
                          1993, 1993, 1993, 1993, 1994, 1994, 1994, 1994, 1994, 1994, 1994, 1994, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996,
                          1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 2000, 2000, 2000, 2000,
                          2000, 2000, 2000, 2000, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003),
                   Color=c("blue",  "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red", "white", 
                           "brown", "blue",  "green", "yellow", "orange", "purple", "red",  "white", "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",   
                           "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red", "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple",
                           "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange",
                           "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow",
                           "orange", "purple", "red",   "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green", 
                           "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",   "green",  "yellow", "orange", "purple", "red",    "white",  "brown",  "blue",  
                           "green",  "yellow", "orange", "purple", "red",    "white",  "brown"),
                   Y=c(6.9, 53.6,  3.9,  7.6, 17.3, 29.9, 35.1,  6.2,  6.9, 53.6,  3.6,  8.8, 10.6, 29.9, 23.2,  8.8,  5.8, 51.0,  5.8,  3.9,  9.9, 21.0, 35.8,  6.9,  3.9, 69.5,  5.4,  3.6,
                       13.2, 32.8, 27.3,  8.0,  6.2, 66.2,  3.2,  3.9, 10.6, 27.6, 23.9, 11.7,  8.8, 49.5,  4.3,  4.7,  7.3, 33.2, 18.8, 18.4,  8.8, 49.9,  2.5, 27.6, 11.4, 56.9, 16.9,  9.9,
                       3.6, 59.9,  0.6, 19.9, 16.2, 38.4, 19.9, 12.8,  7.3, 49.5,  2.5, 11.4, 11.4, 32.5, 25.8, 31.4,  4.7, 60.6,  5.4, 14.3, 16.5, 51.4, 26.5, 21.4,  6.5, 61.4,  5.1, 14.7,
                       12.1, 53.6, 22.1, 15.8,  6.5, 61.0,  3.9, 14.3, 12.1, 69.1, 28.4, 18.8,  6.5, 76.9,  1.7,  8.0,  9.1, 43.9, 21.0, 17.3,  3.6, 63.6,  2.8,  9.9,  5.1, 35.1, 20.6, 16.5),
                   Value=c(45048.7, 218638.3,  39069.9,  10740.1,  62575.7,  76967.4, 226646.2,  36693.8,  40915.0, 247665.1,  43910.4,  11429.4,  60295.5,  76426.6, 244191.4,
                           36749.2,  35005.8, 228515.1,  42248.2,  10285.1,  60681.4,  72030.6, 229893.0,  36404.7,  43749.9, 268866.1,  38835.1,  11899.6,  58424.4,  82731.1,
                           255466.1,  31277.1,  55047.2, 305402.5,  39084.3,  13398.4,  65122.4,  79750.5, 281509.4,  35542.1,  47780.8, 327010.6,  44074.8,  14565.8,  70142.8,
                           104683.1, 315443.8,  46939.5,  41387.0, 327226.5,  44330.9,  16046.2,  67922.8, 122232.1, 323685.2,  44895.5,  36323.1, 346799.2,  43400.6,  16547.5,
                           77243.2, 111932.1, 331698.8,  47992.3,  34636.8, 357551.3,  41798.8,  17346.3,  87586.4,  99095.4, 366299.7,  53745.3,  39918.4, 357564.7,  43367.9,
                           17921.5,  96130.4, 101582.7, 399612.1,  40792.3,  45870.7, 360308.6,  46312.0,  20444.3, 101972.7,  96745.6, 439824.2,  49499.2,  48152.0, 346522.2,
                           54800.0,  20503.6,  98936.7, 105203.3, 436226.9,  40983.5,  53812.9, 351838.8,  55071.2,  20865.7,  99782.6, 112538.4, 474671.2,  43175.7,  53994.5,
                           333412.4,  54407.9,  19528.1,  95297.1, 101047.5, 470599.2,  33293.8),
                   Amount=c(22357.1,  45323.2,   7060.7,      0.2, 103671.4, 100515.1, 122229.3,   1254.9,  78600.7,  48483.2,   6291.6,   1059.7,  28861.1, 179036.4,  40044.7,
                            12921.4,  19601.9,   6095.1,   4667.4,   2194.7,  22358.8, 161020.1,  40368.1,   4000.5, 139611.6,  45724.9,   1262.3,     86.4,  88898.4,  85844.9,
                            262167.2,  19233.5,  21174.3,  16797.2,    246.0,   4284.0, 124309.9, 109092.7,  80172.1,   5315.0,  17300.8,  58570.1,   4240.7,  29715.0,  67126.6,
                            42928.3, 132263.8,  12182.9,  77751.4, 117453.7,    443.9,  21868.6,  63683.6, 212790.1,  28990.6,      0.2,  39413.4, 134290.1,   4665.5,      0.2,
                            135307.1, 114914.2, 258602.7,      0.2,   3391.7,  74113.6,   3070.4,  17796.6,   6223.9, 188960.2, 260430.1,      0.2,  16379.0,  37389.8,   2587.3,
                            1149.9,  54814.3, 183559.8,  55877.1,      0.2,   5835.3,  39010.5,   8263.9,  13463.9,  40232.7, 152270.9, 314975.1, 119611.4,   5811.2, 102397.5,
                            6479.1,    890.6,  24356.6,  68414.0,  85800.6,  16564.8,   9218.9, 170079.5,   5181.0,   3378.0,  37603.9,  98078.2, 533192.3,   5753.8,  41286.3,
                            43227.9,   2494.7,   9025.1,  20819.6, 45227.4, 563984.9,  7129.6))
library(minpack.lm)
### I set the following starting values for z, k and g:
z <- 10
k <- 0.1
g <- 1


df$Year <- as.factor(df$Year)
df$Color <- as.factor(df$Color)
X <- model.matrix(~Color   Year, data=df)[,-1]
df <- cbind(df, X)

form <- paste0("log(Y) ~ (k/z)*log(Value^z   g*Amount^z)   ", paste(paste0("b", 1:ncol(X)), "*", colnames(X), collapse="   "))

nlsfit <- nlsLM(formula = form ,
                data = df,
                control = nls.lm.control(ftol = 1e-10, ptol = 1e-10, maxiter = 280),
                start = list(z = z, k = k, g = g, b1=0, 
                             b2=0, b3=0, b4=0, b5=0, b6=0, 
                             b7=0, b8=0, b9=0, b10=0, 
                             b11=0, b12=0, b13=0, b14=0, 
                             b15=0, b16=0, b17=0, b18=0, 
                             b19=0, b20=0))
pars <- nlsfit$m$getPars()
pars
#>             z             k             g            b1            b2 
#> -2.176726e 00  1.697185e-01 -1.520116e-12  7.285176e-01  1.961322e 00 
#>            b3            b4            b5            b6            b7 
#>  6.062530e-01  5.360258e-01  1.722828e 00  1.062541e 00 -6.178919e-01 
#>            b8            b9           b10           b11           b12 
#> -6.618152e-02 -1.544231e-01 -1.186151e-01 -1.567786e-01 -1.044543e-01 
#>           b13           b14           b15           b16           b17 
#>  6.997504e-02 -1.969730e-01 -4.729006e-02  2.103823e-01  1.488066e-01 
#>           b18           b19           b20 
#>  1.950499e-01 -1.005054e-01 -2.060658e-01
b <- pars[4:23]
xb <- X %*% b
fit <- (pars[2]/pars[1])*log(df$Value^pars[1]   pars[3]*df$Amount^pars[1])   xb
plot(fit, log(df$Y))

Created on 2022-12-02 by the reprex package (v2.0.1)

You get some warnings about NaNs being created throughout the optimization, but the final results don't produce any NaN values.

  • Related