I want to plot the ROC curve for my model, but I get this error:
'<' not supported between instances of 'str' and 'int'
My data contains both numerical and categorical columns:
# Look at the dtype of every column in the frame.
data_type = dfj.dtypes
data_type
ZAG_LUDNOŚCI int32
GDP_NA_OS int32
DNI POWSZEDNIE int64
PORA_DNIA object
SPECJALNA_BUDOWA_OBIEKTU int32
ILOŚĆ_MIESZKAŃCÓW_OBIEKTU object
CZY_BUDYNEK_JEST_NORMALNIE_ZAJĘTY int32
CZY_CZUJNIK_DYMU_JEST_W_OBIEKCIE object
I handle them with a preprocessing pipeline:
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the columns by dtype: object columns go to the categorical branch,
# every other dtype goes to the numerical branch.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

# One transformer per column group; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()
preprocessor = ColumnTransformer(
    [
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ]
)

# Chain preprocessing and the classifier into a single estimator.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))

# Render estimators as an HTML diagram in the notebook, then show the pipeline.
set_config(display='diagram')
model
Then I fit the model and predict:
from sklearn.model_selection import train_test_split

# Hold out a test split; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    random_state=42,
)

# Fit the whole pipeline (preprocessing + classifier) on the training split.
_ = model.fit(data_train, target_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(data_test)
model.predict(data_test)[:10]
y_pred
At this point I want to plot a ROC curve, but I get an error message:
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

# ROC analysis needs continuous scores rather than hard labels; for a binary
# classifier decision_function scores the class stored in model.classes_[1].
y_score = model.decision_function(data_test)

# roc_curve's first argument is y_true, i.e. the TRUE LABELS of the test rows.
# Passing the feature frame (data_test) instead makes scikit-learn call
# np.unique on mixed str/int columns, which raises
# "TypeError: '<' not supported between instances of 'str' and 'int'".
fpr, tpr, _ = roc_curve(target_test, y_score, pos_label=model.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
The error:
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_20736/4039759825.py in <module>
4 y_score = model.decision_function(data_test)
5
----> 6 fpr, tpr, _ = roc_curve(data_test, y_score, pos_label=model.classes_[1])
7 roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
~\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py in roc_curve(y_true, y_score, pos_label, sample_weight, drop_intermediate)
977
978 """
--> 979 fps, tps, thresholds = _binary_clf_curve(
980 y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
981 )
~\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py in _binary_clf_curve(y_true, y_score, pos_label, sample_weight)
734 """
735 # Check to make sure y_true is valid
--> 736 y_type = type_of_target(y_true, input_name="y_true")
737 if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)):
738 raise ValueError("{0} format is not supported".format(y_type))
~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y, input_name)
284 raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
285
--> 286 if is_multilabel(y):
287 return "multilabel-indicator"
288
~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in is_multilabel(y)
171 )
172 else:
--> 173 labels = np.unique(y)
174
175 return len(labels) < 3 and (
<__array_function__ internals> in unique(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
260 ar = np.asanyarray(ar)
261 if axis is None:
--> 262 ret = _unique1d(ar, return_index, return_inverse, return_counts)
263 return _unpack_tuple(ret)
264
~\anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
321 aux = ar[perm]
322 else:
--> 323 ar.sort()
324 aux = ar
325 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: '<' not supported between instances of 'str' and 'int'
I don't know how to fix it. Thank you for your help! Edit: the target is PRZYPADKOWE_CZY_CELOWE (int32); it looks like this:
0 0
1 0
2 0
3 0
4 1
..
12534 0
12535 0
12536 0
12537 0
12538 0
Name: PRZYPADKOWE_CZY_CELOWE, Length: 12539, dtype: int32
CodePudding user response:
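The first parameter to roc_curve (y_true)
is supposed to be the true labels, but you are passing the independent variables (data_test). Pass the test labels instead: roc_curve(target_test, y_score, pos_label=model.classes_[1]).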