I am trying to fit a logistic regression model and run some tests, but I keep getting this error. I'm not really sure what I have done differently from everyone else.
from sklearn import preprocessing
X = df.iloc[:,:len(df.columns)-1]
y = df.iloc[:,len(df.columns)-1]
This is how I am separating my columns into the features (X, every column except the last) and the target (y, the last column).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
That is my train/test split (TTS).
logReg = LogisticRegression(n_jobs=-1)
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_train)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:" , mae)
ValueError Traceback (most recent call last)
Cell In [112], line 1
----> 1 mae = mean_absolute_error(y_test, y_pred)
2 print("MAE:" , mae)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:196, in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
141 def mean_absolute_error(
142 y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
143 ):
144 """Mean absolute error regression loss.
145
146 Read more in the :ref:`User Guide <mean_absolute_error>`.
(...)
194 0.85...
195 """
--> 196 y_type, y_true, y_pred, multioutput = _check_reg_targets(
197 y_true, y_pred, multioutput
198 )
199 check_consistent_length(y_true, y_pred, sample_weight)
200 output_errors = np.average(np.abs(y_pred - y_true), weights=sample_weight, axis=0)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:100, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
66 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
67 """Check that y_true and y_pred belong to the same regression task.
68
69 Parameters
(...)
98 correct keyword.
99 """
--> 100 check_consistent_length(y_true, y_pred)
101 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
102 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:387, in check_consistent_length(*arrays)
385 uniques = np.unique(lengths)
386 if len(uniques) > 1:
--> 387 raise ValueError(
388 "Found input variables with inconsistent numbers of samples: %r"
389 % [int(l) for l in lengths]
390 )
ValueError: Found input variables with inconsistent numbers of samples: [25404, 101612]
I thought it was the way I split the columns, but that doesn't seem to be the issue. It works when the test size is 50/50, but no other test size works.
CodePudding user response:
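You are comparing the predicted labels for the train set (y_pred, 101612 samples) with the true labels for the test set (y_test, 25404 samples). The two have different sizes, hence the error. This is also why a 50/50 split appears to work: the train and test sets then have the same number of samples, so the length check passes, but the resulting score is still meaningless because the predictions and the labels belong to different rows.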
Replace
y_pred = logReg.predict(X_train)
with
y_pred = logReg.predict(X_test)