My goal is to create bar plots with counts on the y-axis and to label each bar with its percentage of the group. The code below gets me halfway there:
from itertools import product

import matplotlib.pyplot as plt
import seaborn as sns

# Draw one countplot per feature, split by `survived`, and label every bar
# with a percentage.
titanic = sns.load_dataset("titanic")
features = ['sex', 'class', 'who', 'adult_male']
hue = 'survived'

# The denominator is identical for every bar, so compute it once.
total = titanic.survived.count()

plt.figure(figsize=[12, 14])
for n, f in enumerate(features, start=1):
    plt.subplot(3, 2, n)
    ax = sns.countplot(x=f, hue=hue, edgecolor='black', alpha=0.8, data=titanic)
    sns.despine()
    # Name the column that is actually used as the hue.
    plt.title("Countplot of {} by {}".format(f, hue))

    # NOTE: each label is the bar's count divided by the number of non-null
    # `survived` values in the whole dataset, not by the size of the bar's
    # own x-axis group. Zero-height bars get an empty label.
    for c in ax.containers:
        labels = [f'{h / total * 100:0.1f}%' if (h := v.get_height()) > 0 else '' for v in c]
        ax.bar_label(c, labels=labels, label_type='edge')

# Lay out the figure once, after every subplot and label exists.
plt.tight_layout()
The problem is that the percentages are incorrect. For example, in the "Countplot of sex by survived" chart, the male percentage is calculated as the share of males in the "0" class out of the entire dataset.
How do I adjust my code to calculate the percentage of males in the "0" class out of the male category? So the blue bar in the male category should be 81% and the orange bar should be 19%.
CodePudding user response:
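Generate the within-feature proportions manually, e.g. for the feature `sex`:
- Compute the proportions of `survived` within each `sex` using `titanic.groupby('sex')['survived'].value_counts(normalize=True)`.
- When building the bar labels, look up each bar's proportion by its group (the `sex` value) and by its container's label (the `survived` value).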
Full example with all features:
import matplotlib.pyplot as plt
import seaborn as sns

# Countplots of each feature split by `hue`, where every bar is labelled with
# its proportion within its own x-axis group rather than within the dataset.
titanic = sns.load_dataset('titanic')
features = ['sex', 'class', 'who', 'adult_male']
hue = 'survived'

fig, axs = plt.subplots(2, 2, figsize=(10, 10), constrained_layout=True)

for feat, ax in zip(features, axs.ravel()):
    # group ordering differs by dtype
    col = titanic[feat]
    if col.dtype == 'category':
        groups = col.cat.categories
    elif col.dtype == 'bool':
        groups = [False, True]
    else:
        groups = col.unique()

    # within-feature proportions
    proportions = titanic.groupby(feat)[hue].value_counts(normalize=True)

    sns.countplot(x=feat, hue=hue, edgecolor='k', alpha=0.8, data=titanic, ax=ax)
    # Plain text title: inside `$...$` mathtext the underscore in a name such
    # as `adult_male` would be rendered as a subscript.
    ax.set_title(f'Countplot of {feat} by {hue}')
    sns.despine()

    # retrieve proportions by the container's label (hue) and group (feature)
    for c in ax.containers:
        labels = [f'{proportions.loc[g, int(c.get_label())]:.1%}' for g in groups]
        ax.bar_label(c, labels)