Error (Key Error 0) when trying to calculate gradient for an ML model. I think it has something to w-CodePudding

Computes the gradient for linear regression
Args:
X (ndarray (m,n)): Data, m examples with n features
y (ndarray (m,)) : target values
w (ndarray (n,)) : model parameters
b (scalar) : model parameter
Returns:
dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
dj_db (scalar): The gradient of the cost w.r.t. the parameter b.

import numpy as np
 
def gradient(X, y, w, b):
    """Compute the gradient of the mean squared-error cost for linear regression.

    Args:
        X: Data of shape (m, n), m examples with n features. May be a NumPy
            array or anything ``np.asarray`` can convert (e.g. a pandas
            DataFrame).
        y: Target values of shape (m,). May be a NumPy array or a pandas
            Series.
        w: Model weights of shape (n,).
        b: Scalar model bias.

    Returns:
        A tuple ``(dj_db, dj_dw)`` -- note the order:
        dj_db is the scalar gradient of the cost w.r.t. ``b``;
        dj_dw is an array of shape (n,), the gradient w.r.t. ``w``.

    Raises:
        ValueError: If ``X`` contains no examples.
    """
    # Converting to plain arrays drops any pandas index, so rows are always
    # addressed by position. Indexing a Series with y[i] looks up the *label*
    # i, which raises KeyError when the index is not 0..m-1.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()  # accept (m,) as well as (m, 1)
    w = np.asarray(w, dtype=float)

    m, n = X.shape  # (number of examples, number of features)
    if m == 0:
        raise ValueError("X must contain at least one example")

    # Prediction error for every example at once, shape (m,).
    err = X @ w + b - y

    # Average the per-example contributions: sum_i err_i * x_i and sum_i err_i.
    dj_dw = X.T @ err / m
    dj_db = err.sum() / m

    return dj_db, dj_dw

# Initial parameters used to sanity-check the gradient computation.
b_init = 785.1811367994083
w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618,-33.2342342])
# The traceback shows pandas objects reaching gradient(); indexing a Series
# with y[i] is a label lookup and raises KeyError when the index is not
# 0..m-1. Passing plain arrays makes every access positional.
tmp_dj_db, tmp_dj_dw = gradient(np.asarray(X_train), np.asarray(y_train), w_init, b_init)
print(f'dj_db at initial w,b: {tmp_dj_db}')
print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')

This is the exact error I'm getting

KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:

6 frames
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 1

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-73-1f87825d3f02> in <module>
      1 b_init = 785.1811367994083
      2 w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618,-33.2342342])
----> 3 tmp_dj_db, tmp_dj_dw = gradient(X_train, y_train, w_init, b_init)
      4 print(f'dj_db at initial w,b: {tmp_dj_db}')
      5 print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')

<ipython-input-72-52811c00c1ad> in gradient(X, y, w, b)
     16     dj_db = 0
     17     for i in range(m):
---> 18         err = (np.dot(X.iloc[i,],w) + b) - y[i]
     19         for j in range(n):
     20             dj_dw[j] = dj_dw[j] + err * X.iloc[i, j]

/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in __getitem__(self, key)
    940 
    941         elif key_is_scalar:
--> 942             return self._get_value(key)
    943 
    944         if is_hashable(key):

/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in _get_value(self, label, takeable)
   1049 
   1050         # Similar to Index.get_value, but we do not fall back to positional
-> 1051         loc = self.index.get_loc(label)
   1052         return self.index._get_values_for_loc(self, loc, label)
   1053 

/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 
   3365         if is_scalar(key) and isna(key) and not self.hasnans:

KeyError: 1

This is the dataset - it's a Drive link

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Dataset

CodePudding user response：

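As I suspected, the issue was that you were indexing into pandas objects (a DataFrame and a Series) as if they were NumPy arrays: `y[i]` on a Series looks up the label `i`, not position `i`. Converting both to NumPy arrays before calling the function should work: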

import numpy as np
import pandas as pd


def gradient(X, y, w, b):
    """Compute the gradient of the mean squared-error cost for linear regression.

    Args:
        X: Data of shape (m, n), m examples with n features (NumPy array or
            anything ``np.asarray`` can convert, such as a pandas DataFrame).
        y: Target values of shape (m,).
        w: Model weights of shape (n,).
        b: Scalar model bias.

    Returns:
        A tuple ``(dj_db, dj_dw)``: the scalar gradient w.r.t. ``b`` followed
        by the (n,) gradient w.r.t. ``w``.

    Raises:
        ValueError: If ``X`` contains no examples.
    """
    # Work on plain arrays so indexing is positional even if the caller
    # forgets to call .to_numpy() on pandas inputs.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()  # accept (m,) as well as (m, 1)
    w = np.asarray(w, dtype=float)

    m, n = X.shape  # (number of examples, number of features)
    if m == 0:
        raise ValueError("X must contain at least one example")

    residual = X @ w + b - y  # prediction error per example, shape (m,)

    dj_dw = (X.T @ residual) / m  # mean of residual_i * x_i over examples
    dj_db = residual.sum() / m    # mean residual

    return dj_db, dj_dw


# Starting point at which the gradient is evaluated.
w_init = np.array([0.39133535, 18.75376741, -53.36032453, -26.42131618, -33.2342342])
b_init = 785.1811367994083

# Load the housing data and separate the target column from the features.
data = pd.read_csv(r'USA_Housing - USA_Housing.csv')
y_train = data['Price']
X_train = data.drop(columns=['Price'], errors='ignore')

# Hand plain NumPy arrays to gradient() so its indexing is positional.
tmp_dj_db, tmp_dj_dw = gradient(
    X_train.to_numpy(),
    y_train.to_numpy(),
    w_init,
    b_init,
)
print(f'dj_db at initial w,b: {tmp_dj_db}')
print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')