I'm trying to implement a multivariable linear regression with gradient descent but when I try this:
# Initial parameters: one unit weight per feature (three features) and a zero bias.
w = np.ones(3, dtype=float)
b = 0.0
def gradient_descent():
    """Make one per-sample pass over the training set, updating ``w`` and ``b``.

    Reads the module-level ``x_train`` and ``y_train``, mutates the
    module-level weight vector ``w`` in place and rebinds the module-level
    bias ``b``. Returns nothing.
    """
    global w
    global b
    learning_rate = 0.0001
    for i in range(x_train.shape[0]):
        prediction = np.dot(x_train[i], w) + b
        # NOTE(review): with error = target - prediction, subtracting
        # error * x * learning_rate moves the parameters *up* the
        # squared-error gradient. The sign is kept as posted because it is
        # what the question is about; use prediction - target to descend.
        error = y_train[i] - prediction
        for j in range(w.shape[0]):
            w[j] = w[j] - (error * x_train[i][j] * learning_rate)
        # The bias is updated once per sample, after all weights.
        b = b - (error * learning_rate)
def train():
    """Run 10,000 passes of ``gradient_descent`` and print the parameters.

    After every pass the pass index and the current module-level ``w`` and
    ``b`` are printed, which is what produces the per-iteration output shown
    in the question.
    """
    for i in range(10_000):
        gradient_descent()
        print(i, ':', w, b)
# Start the training loop when this script is executed.
train()
the output is
0 : [inf inf inf] inf
1 : [inf inf inf] inf
2 : [inf inf inf] inf
3 : [inf inf inf] inf
4 : [inf inf inf] inf
5 : [inf inf inf] inf
6 : [inf inf inf] inf
....
So what did I do wrong? I tried decreasing the learning rate, but nothing changed.
data sample:
total_rooms,population,households,bedrooms(target)
5612.0,1015.0,472.0,1283.0
7650.0,1129.0,463.0,1901.0
720.0,333.0,117.0,174.0
1501.0,515.0,226.0,337.0
1454.0,624.0,262.0,326.0
where total_rooms, population and households form x_train with shape (17000, 3), and bedrooms is y_train with shape (17000, 1).
When I scale the data using sklearn.preprocessing.StandardScaler before splitting it:
from sklearn.preprocessing import StandardScaler
# Standardize every column (features and target alike), then split:
# the first three columns are the features, the last column is the target.
scaler = StandardScaler()
train_data = scaler.fit(train_data).transform(train_data)
x_train = train_data[:, 0:3]
y_train = train_data[:, -1]
I get nan instead of inf!
Note: the data works fine with sklearn.linear_model.LinearRegression, with or without scaling.
CodePudding user response:
As suggested in the comments: feature scaling is a good idea (scikit-learn includes StandardScaler, but it's also pretty straightforward to subtract the mean of each column and divide by its standard deviation yourself).
Also: the error term appears to be backwards; the residual is usually prediction - true.
# Residual as prediction minus target, so subtracting its gradient descends.
error = prediction - y_train[i]
CodePudding user response:
Without optimization or any warranty: normalization and a correctly applied gradient descent formula lead you to something like
import numpy as np
def gradient_descent(x_train, y_train, w=None, b=0.0, learning_rate=0.001):
    """Take one full-batch gradient-descent step for linear regression.

    The step follows the gradient of ``0.5 * sum((x_train @ w + b - y_train) ** 2)``.

    Args:
        x_train: 2-D array of shape (n_samples, n_features).
        y_train: Targets, one per sample. A column vector of shape
            (n_samples, 1) is flattened first.
        w: Current weights, shape (n_features,). Defaults to all ones.
        b: Current bias.
        learning_rate: Step size applied to both gradients.

    Returns:
        Tuple ``(w, b)`` with the updated weights and bias. The ``w`` passed
        in is not modified.
    """
    if w is None:
        # Size the default from the data instead of hard-coding 3 features.
        w = np.ones(np.shape(x_train)[1])
    # Without flattening, an (n, 1) target would broadcast against the (n,)
    # predictions into an (n, n) error matrix without raising.
    y_train = np.ravel(y_train)
    predictions = x_train @ w + b
    error = predictions - y_train
    w = w - learning_rate * (error @ x_train)
    b = b - learning_rate * np.sum(error)
    return w, b
def train():
    """Fit the five-row sample with batch gradient descent, printing progress.

    The data is normalized with a single global offset and factor, trained
    for 10,000 steps with ``gradient_descent``, and after every step the
    weights, the bias mapped back to the original scale, and the sum of
    squared residuals on the original data are printed. Returns nothing.
    """
    # data with last column being the target
    data = np.array(
        [
            [5612.0, 1015.0, 472.0, 1283.0],
            [7650.0, 1129.0, 463.0, 1901.0],
            [720.0, 333.0, 117.0, 174.0],
            [1501.0, 515.0, 226.0, 337.0],
            [1454.0, 624.0, 262.0, 326.0],
        ]
    )

    # One offset and one factor for the whole array (not per column); the
    # back-transformation derived below relies on features and target
    # sharing them.
    norm_offset = np.mean(data)
    norm_factor = 1 / np.std(data)
    data_normalized = (data - norm_offset) * norm_factor
    x_train = data_normalized[:, :-1]
    y_train = data_normalized[:, -1]

    # start values
    w = np.ones(x_train.shape[1])
    b = float(0)

    # train
    for i in range(10_000):
        w, b = gradient_descent(x_train, y_train, w, b)

        # o = offset, f = factor, w'/b' normalized parameters, w/b original parameters
        # y' = w' * x' + b'
        # f * (y - o) = w' * f * (x - o) + b'
        # y = w' * (x - o) + b' / f + o
        # y = w' * x - o * sum(w') + b' / f + o
        # => w = w', b = b' / f + o - o * sum(w')
        b_orig = b / norm_factor + norm_offset - np.sum(w) * norm_offset
        # Sum of squared residuals measured on the un-normalized data.
        ssr = np.sum((data[:, :-1] @ w + b_orig - data[:, -1]) ** 2)
        print(i, ':', w, b_orig, ssr)
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    train()
...
9999 : [0.13503938 0.69644619 0.75400302] -386.71116671360414 71015.11748640954