Imagine this is the database
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
############
### DATA ###
############
# Training set: two dated observations (date 1 and 2) per person, with the
# occupation one-hot encoded as teacher / doctor / engineer.
TrainingData = {
    'name': ['Alex', 'Ben', 'Marry', 'Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0, 1, 0, 0],
    'doctor': [0, 1, 0, 0, 1, 0],
    'engineer': [0, 0, 1, 0, 0, 1],
    'age': [27, 32, 78, 27, 32, 78],
    'weight': [160, 209, 130, 164, 206, 132],
    'date': [1, 1, 1, 2, 2, 2],
}
# Test set: same people, but age and weight are unknown.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
TestData = {
    'name': ['Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0],
    'doctor': [0, 1, 0],
    'engineer': [0, 0, 1],
    'age': [np.nan, np.nan, np.nan],
    'weight': [np.nan, np.nan, np.nan],
    # NOTE(review): this key is 'data' while the training set uses 'date' --
    # looks like a typo; confirm before aligning the two frames by column name.
    'data': [3, 3, 3],
}
# Convert to pandas dataframes, named df_train / df_test to match the code
# that consumes them further down.
df_train = pd.DataFrame(TrainingData)
df_test = pd.DataFrame(TestData)
# Print
print(df_train)
print(df_test)
Train:
name teacher doctor engineer age weight date
0 Alex 1 0 0 27 160 1
1 Ben 0 1 0 32 209 1
2 Marry 0 0 1 78 130 1
3 Alex 1 0 0 27 164 2
4 Ben 0 1 0 32 206 2
5 Marry 0 0 1 78 132 2
Test:
name teacher doctor engineer age weight data
0 Alex 1 0 0 NaN NaN 3
1 Ben 0 1 0 NaN NaN 3
2 Marry 0 0 1 NaN NaN 3
I converted them to NumPy arrays to prepare them for the ML model:
# Target: the weight values collected per name. groupby(...).apply(...) returns
# a pandas object indexed by name, not a plain 2-D NumPy array.
Y=df_train.groupby('name')['weight'].apply(lambda x: (x.to_numpy()))
# Features: every column except the target.
df_train_x=df_train.drop('weight', axis=1)
# One NumPy array of feature rows per name. X itself is still a pandas Series
# (the traceback below goes through Series.__getitem__).
X= df_train_x.groupby('name').apply(lambda x: (x.to_numpy()))
# Number of neighbours passed to KNeighborsRegressor.
K=1
df_test_x=df_test.drop('weight', axis=1)
# Per-name boolean flags marking which test entries are non-null.
X_pred_null=df_test_x.groupby('name').apply(lambda x: (x.notnull()))
# Presumably a 2-D boolean array whose first row is used as a column mask below
# -- TODO confirm the shape.
PresentVariables = (X_pred_null.to_numpy())
Now I want to exclude the null columns from X, but when I use
# X is a pandas Series here, so the 2-D style index X[:, mask] is treated as a
# tuple key and raises the KeyError shown in the traceback below.
NearestNeighbor = KNeighborsRegressor(n_neighbors=K).fit(X[:, PresentVariables[0]], Y)
it gives the following error:
KeyError Traceback (most recent call last)
Input In [23], in <cell line: 1>()
----> 1 NearestNeighbor = KNeighborsRegressor(n_neighbors=K).fit(X[:, PresentVariables[0]], Y)
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/series.py:906, in Series.__getitem__(self, key)
903 key = np.asarray(key, dtype=bool)
904 return self._get_values(key)
--> 906 return self._get_with(key)
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/series.py:921, in Series._get_with(self, key)
916 raise TypeError(
917 "Indexing a Series with DataFrame is not "
918 "supported, use the appropriate DataFrame column"
919 )
920 elif isinstance(key, tuple):
--> 921 return self._get_values_tuple(key)
923 elif not is_list_like(key):
924 # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684
925 return self.loc[key]
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/series.py:956, in Series._get_values_tuple(self, key)
953 return result
955 if not isinstance(self.index, MultiIndex):
--> 956 raise KeyError("key of type tuple not found and not a MultiIndex")
958 # If key is contained, would have returned by now
959 indexer, new_index = self.index.get_loc_level(key)
KeyError: 'key of type tuple not found and not a MultiIndex'
The error comes from X[:, PresentVariables[0]]. Here I want to drop the columns that contain null values from the dataframe after it has been converted to a NumPy array. I appreciate your help, thanks.
CodePudding user response:
Initialize the df
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
############
### DATA ###
############
# Training set: two dated observations (date 1 and 2) per person, with the
# occupation one-hot encoded as teacher / doctor / engineer.
TrainingData = {
    'name': ['Alex', 'Ben', 'Marry', 'Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0, 1, 0, 0],
    'doctor': [0, 1, 0, 0, 1, 0],
    'engineer': [0, 0, 1, 0, 0, 1],
    'age': [27, 32, 78, 27, 32, 78],
    'weight': [160, 209, 130, 164, 206, 132],
    'date': [1, 1, 1, 2, 2, 2],
}
# Test set: same people at date 3, with age and weight unknown.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
TestData = {
    'name': ['Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0],
    'doctor': [0, 1, 0],
    'engineer': [0, 0, 1],
    'age': [np.nan, np.nan, np.nan],
    'weight': [np.nan, np.nan, np.nan],
    # Same key as the training set ('date'), so both frames share one schema.
    'date': [3, 3, 3],
}
# Convert to pandas dataframe
df_train = pd.DataFrame(TrainingData)
df_test = pd.DataFrame(TestData)
Assign the weight column as target
# Target vector: the weight column is what the regressor learns to predict.
y_train = df_train.loc[:, "weight"]
Create x_train by excluding the weight and age columns.
# Feature frame: drop the target (weight) and age, which is entirely NaN in
# the test set.
x_train = df_train.drop(columns=["weight", "age"])
The name column contains strings. Convert it to numbers with label encoding:
from sklearn.preprocessing import LabelEncoder

# Learn the name -> integer mapping on the training names; the fitted encoder
# stays in `le` so the test set can reuse the same mapping.
le = LabelEncoder().fit(x_train["name"])
x_train["name"] = le.transform(x_train["name"])
# The regressor is trained on a plain NumPy array.
x_train = x_train.to_numpy()
Fit the model
from sklearn.neighbors import KNeighborsRegressor

# fit() returns the estimator itself, so construction and training are chained.
neigh = KNeighborsRegressor(n_neighbors=2).fit(x_train, y_train)
Apply the same transformation in test as well
# Mirror the training preprocessing: drop the same columns, encode the names
# with the already-fitted encoder, and convert to a NumPy array.
x_test = (
    df_test.drop(columns=["age", "weight"])
    .assign(name=lambda frame: le.transform(frame["name"]))
    .to_numpy()
)
Run prediction
# Predicted weight for each row of the test set.
print(neigh.predict(X=x_test))
With respect to the age column, I dropped it in training because you were trying to drop the columns that contain null values.
You can instead retain the age column while training, but then you must impute it before the predict step (as the age column has null values in the testing dataframe).