I need to plot how each feature impacts the predicted probability for each sample from my LightGBM binary classifier, so I need to output SHAP values in terms of probability instead of the raw SHAP values. There does not appear to be any option to output them in terms of probability.
The example code below is what I use to generate a dataframe of SHAP values and to draw a force_plot for the first data sample. Does anyone know how I should modify the code to change the output? I'm new to SHAP values and the shap package. Thanks a lot in advance.
import pandas as pd
import numpy as np
import shap
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_train)
# force plot of first row for class 1
class_idx = 1
row_idx = 0
expected_value = explainer.expected_value[class_idx]
shap_value = shap_values[:,:,class_idx].values[row_idx]
shap.force_plot(base_value=expected_value, shap_values=shap_value, features=X_train.iloc[row_idx, :], matplotlib=True)
# dataframe of shap values for class 1
shap_df = pd.DataFrame(shap_values[:, :, 1].values, columns=shap_values.feature_names)
CodePudding user response:
You can consider running your output values through a softmax() function. For reference, it can be defined as:
def get_softmax_probabilities(x):
    # normalize along the last axis so that each row of per-class values sums to 1
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)
and there is a SciPy implementation as well:
from scipy.special import softmax
The output of softmax() will be probabilities proportional to the (relative) values in the vector x, which are your SHAP values.
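As a quick usage sketch (the array below is purely illustrative, not taken from the model), the hand-rolled function above and SciPy's softmax should agree row by row, with each row summing to 1:
import numpy as np
from scipy.special import softmax
# illustrative raw scores: one row per sample, one column per class
x = np.array([[2.0, -1.0, 0.5],
              [0.1,  0.2, 0.3]])
print(get_softmax_probabilities(x))  # hand-rolled version defined above
print(softmax(x, axis=1))            # SciPy equivalent; each row sums to 1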
CodePudding user response:
import pandas as pd
import numpy as np
import shap
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('X_train: ',X_train.shape)
print('X_test: ',X_test.shape)
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
# plot
# shap.summary_plot(shap_values[class_idx], X_train, plot_type='bar')
# shap.summary_plot(shap_values[class_idx], X_train)
# shap_value = shap_values[:,:,class_idx].values[row_idx]
# shap.force_plot (base_value = expected_value, shap_values = shap_value, features = X_train.iloc[row_idx, :], matplotlib=True)
# # dataframe of shap values for class 1
# shap_df = pd.DataFrame(shap_values[:,:, 1 ].values, columns = shap_values.feature_names)
# verification
def verification(index_number, class_idx):
    print('-----------------------------------')
    print('index_number: ', index_number)
    print('class_idx: ', class_idx)
    print('')
    y_base = explainer.expected_value[class_idx]
    print('y_base: ', y_base)
    player_explainer = pd.DataFrame()
    player_explainer['feature_value'] = X_train.iloc[index_number].values
    player_explainer['shap_value'] = shap_values[class_idx][index_number]
    print('verification: ')
    # the base value plus the sum of the SHAP values should equal the model's raw output
    print('y_base + sum_of_shap_values: %.2f' % (y_base + player_explainer['shap_value'].sum()))
    print('y_pred: %.2f' % (y_train[index_number]))
j = 10 # index
verification(j,0)
verification(j,1)
# show:
# X_train: (455, 30)
# X_test: (114, 30)
# -----------------------------------
# index_number: 10
# class_idx: 0
# y_base: -2.391423081639827
# verification:
# y_base + sum_of_shap_values: -9.40
# y_pred: 1.00
# -----------------------------------
# index_number: 10
# class_idx: 1
# y_base: 2.391423081639827
# verification:
# y_base + sum_of_shap_values: 9.40
# y_pred: 1.00
# Of the two totals, -9.40 and 9.40, class_idx 1 has the maximum (matching the predicted class), so the result is clearly correct.
I hope this helps you achieve it; the verification above confirms the reliability of the results.
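If what you ultimately want, as in the original question, is a probability rather than a log-odds total, here is a minimal follow-up sketch building on the code above. It assumes, as that code does, that explainer.shap_values returns a per-class list and that y_base plus the sum of a sample's SHAP values is the model's raw log-odds output; passing the class-1 total through the logistic sigmoid should then recover the predicted probability for that sample:
from scipy.special import expit  # logistic sigmoid
class_idx = 1
# raw log-odds for sample j: base value + sum of its class-1 SHAP values
margin = explainer.expected_value[class_idx] + shap_values[class_idx][j].sum()
# convert the log-odds total to a probability and compare with the model's own output
print('probability from SHAP: %.4f' % expit(margin))
print('model predict_proba  : %.4f' % model.predict_proba(X_train.iloc[[j]])[0, class_idx])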