I use the function below to measure the effect of these feature-selection models on my data, and it works perfectly.
What I want is to also return the names of the features selected by each model — is there a clean way to do that based on my code?
def evaluating_model(fs, nm, model, X, y):
    """Cross-validate a feature-selection -> scaling -> model pipeline.

    Args:
        fs: a fitted-per-fold feature selector (e.g. SelectKBest, RFE).
        nm: a normalizer/scaler transformer.
        model: the final estimator.
        X: feature matrix (assumed to be a pandas DataFrame — confirm with caller).
        y: target vector.

    Returns:
        A pandas DataFrame of the cross_validate results (train and test
        scores for accuracy, f1, recall, roc_auc and precision over 5 folds).
    """
    # Selection and scaling happen inside the pipeline so each CV fold
    # fits them on its own training split only (no leakage).
    steps = [('fs', fs), ('sc', nm), ('model', model)]
    cv_results = cross_validate(
        Pipeline(steps=steps),
        X,
        y,
        cv=KFold(n_splits=5, random_state=1, shuffle=True),
        scoring=["accuracy", 'f1', 'recall', 'roc_auc', 'precision'],
        return_train_score=True,
        error_score='raise',
    )
    return pd.DataFrame(cv_results)
# Candidate feature-selection strategies, each capped at 10 features.
featureSelection = {
    "f_classif": SelectKBest(score_func=f_classif, k=10),
    "rfe_RandomForest": RFE(estimator=RandomForestClassifier(), n_features_to_select=10),
    "sf_XGBClassifier": SelectFromModel(XGBClassifier(), max_features=10),
}

# The dict keys are only labels here, so iterate the selectors directly.
for selector in featureSelection.values():
    evaluating_model(selector, anyNormalizer, anyModel, X, y)
Answer:
Because you're doing a 5-fold cross-validation, you get 5 different feature selections (one per fold) for each selection type — so you presumably want all 5 feature sets returned.
You can pass `return_estimator=True` to `cross_validate`, then extract each fitted selector's support (or feature names) from the returned estimators.
Answer:
Based on the accepted answer, the complete solution is:
def evaluating_model(fs, nm, model, X, y):
    """Cross-validate a selection -> scaling -> model pipeline and report
    both the CV scores and the features chosen in each fold.

    Args:
        fs: a feature selector (SelectKBest, RFE, SelectFromModel, ...).
        nm: a normalizer/scaler transformer.
        model: the final estimator.
        X: feature matrix — must be a pandas DataFrame, since selected
           feature names are taken from ``X.columns``.
        y: target vector.

    Returns:
        A tuple ``(scores, selected_features)`` where ``scores`` is the
        dict returned by ``cross_validate`` (including the fitted
        estimators) and ``selected_features`` is a list of 5 lists — the
        column names kept by the selector in each fold.
    """
    scores = cross_validate(
        Pipeline(steps=[('fs', fs), ('sc', nm), ('model', model)]),
        X,
        y,
        cv=KFold(n_splits=5, random_state=1, shuffle=True),
        scoring=["accuracy", 'f1', 'recall', 'roc_auc', 'precision'],
        return_train_score=True,
        error_score='raise',
        # Keep each fold's fitted pipeline so the selector can be inspected.
        return_estimator=True,
    )
    # Every sklearn feature selector (SelectKBest, RFE, SelectFromModel, ...)
    # inherits SelectorMixin and exposes get_support(), so no per-class
    # branching on type(...).__name__ is needed — this also generalizes to
    # any other selector placed in the 'fs' step.
    selected_features = [
        X.columns[est.named_steps["fs"].get_support()].tolist()
        for est in scores["estimator"]
    ]
    return scores, selected_features
# Candidate feature-selection strategies, each capped at 10 features.
featureSelection = {
    "f_classif": SelectKBest(score_func=f_classif, k=10),
    "rfe_RandomForest": RFE(estimator=RandomForestClassifier(), n_features_to_select=10),
    "sf_XGBClassifier": SelectFromModel(XGBClassifier(), max_features=10),
}

# The dict keys are only labels here, so iterate the selectors directly.
for selector in featureSelection.values():
    scores, SelectedFeature = evaluating_model(selector, anyNormalizer, anyModel, X, y)