I am trying to loop through all elements in a container but end up missing some. The code I am using is below. Notice that the labels on the endline bars do not include N.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Example survey data: ids 1..22, each appearing twice in a row (first as
# a 'baseline' record, then as an 'endline' record), plus the level recorded
# for each observation.
data = {
    'id': [rid for rid in range(1, 23) for _ in range(2)],
    'survey': ['baseline', 'endline'] * 22,
    'level': [
        'low', 'high', 'medium', 'low', 'high', 'medium', 'medium', 'high',
        'low', 'low', 'medium', 'high', 'low', 'medium', 'low', 'high',
        'low', 'low', 'medium', 'high', 'high', 'high', 'high', 'medium',
        'low', 'low', 'medium', 'high', 'low', 'medium', 'high', 'medium',
        'low', 'high', 'high', 'medium', 'medium', 'low', 'high', 'low',
        'low', 'low', 'low', 'low',
    ],
}
df = pd.DataFrame(data)
# Count rows per (survey, level) pair. After reset_index the 'id' column
# holds that count and the frame has one row per pair.
N = df.groupby(['survey', 'level']).count().sort_index(ascending=True).reset_index()
# Share of each level within its survey round, in percent.
N['%'] = 100 * N['id'] / N.groupby('survey')['id'].transform('sum')

sns.set_style('white')
ax = sns.barplot(data=N, x='survey', y='%', ci=None,
                 palette="rainbow", hue='level')

# NOTE: this is the labelling step the question is about, kept as posted.
# Iterating pd.DataFrame(N['id']).to_numpy() yields one-element rows, so the
# inner zip produces a single label per container instead of one per bar;
# the remaining bars do not receive an "(N=...)" label.
labels = [
    [f'{pct:.1f}% $(N={_n})$' for pct, _n in zip(c.datavalues, n)]
    for c, n in zip(ax.containers, pd.DataFrame(N['id']).to_numpy())
]
for container, label in zip(ax.containers, labels):
    ax.bar_label(container, label, fontsize=10)

# Cosmetics: drop the left spine, add horizontal grid lines, show the
# y axis as percentages and remove the axis titles.
sns.despine(ax=ax, left=True)
ax.grid(True, axis='y')
ax.yaxis.set_major_formatter(PercentFormatter(100))
ax.set_xlabel('')
ax.set_ylabel('')

plt.tight_layout()
# Legend is anchored just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.show()
Any help is appreciated. Thanks in advance!
CodePudding user response:
Seaborn creates one bar container for each of the hue values. To provide the labels for each container, the dataframe needs to be reduced to the corresponding hue value. Pandas' groupby
can be used to group per hue value.
As the ordering is not necessarily consistent, it helps to force one on the survey
and level
columns by making them categorical.
Note that plt.tight_layout()
should be called after the legend creation, to also fit the legend into the figure.
The following example code assumes seaborn 0.12.2, which uses errorbar=None
instead of ci=None
. A newline character allows the labels to be plotted on two lines. ax.margins
adds some extra white space to better fit the labels.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Same example data as in the question: ids 1..22, each listed twice in a
# row with alternating 'baseline' / 'endline' survey rounds, and one level
# per observation.
data = {
    'id': [rid for rid in range(1, 23) for _ in ('baseline', 'endline')],
    'survey': ['baseline', 'endline'] * 22,
    'level': [
        'low', 'high', 'medium', 'low', 'high', 'medium', 'medium', 'high',
        'low', 'low', 'medium', 'high', 'low', 'medium', 'low', 'high',
        'low', 'low', 'medium', 'high', 'high', 'high', 'high', 'medium',
        'low', 'low', 'medium', 'high', 'low', 'medium', 'high', 'medium',
        'low', 'high', 'high', 'medium', 'medium', 'low', 'high', 'low',
        'low', 'low', 'low', 'low',
    ],
}
df = pd.DataFrame(data)
# Force a fixed order on both grouping columns by making them categorical,
# so the aggregated rows and the plotted bars follow the same sequence.
df['survey'] = pd.Categorical(df['survey'], ['baseline', 'endline'])
df['level'] = pd.Categorical(df['level'], ['low', 'medium', 'high'])

# Count rows per (survey, level) pair; after reset_index the 'id' column
# holds that count. observed=False is passed explicitly so that every
# category combination is kept (with a count of 0 if it never occurs),
# independent of the pandas version's default for categorical groupers.
df_N = (
    df.groupby(['survey', 'level'], observed=False)
    .count()
    .sort_index(ascending=True)
    .reset_index()
)
# Share of each level within its survey round, in percent.
df_N['%'] = 100 * df_N['id'] / df_N.groupby('survey', observed=False)['id'].transform('sum')

sns.set_style('white')
ax = sns.barplot(data=df_N, x='survey', y='%', errorbar=None,
                 palette="rainbow", hue='level')

# Pair each bar container with the rows of the matching level, so every bar
# in the container gets its own percentage and N on a two-line label.
for container, (_level, df_N_level) in zip(ax.containers, df_N.groupby('level', observed=False)):
    labels = [f'{pct:.1f}%\n$(N={n})$' for pct, n in zip(df_N_level['%'], df_N_level['id'])]
    ax.bar_label(container, labels, fontsize=10)

# Cosmetics: drop the left spine, add horizontal grid lines, show the
# y axis as percentages and remove the axis titles.
sns.despine(ax=ax, left=True)
ax.grid(True, axis='y')
ax.yaxis.set_major_formatter(PercentFormatter(100))
ax.set_xlabel('')
ax.set_ylabel('')
ax.margins(y=0.15)  # optionally some more free space at the top

plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
# Called after the legend is created so the legend is fitted into the figure.
plt.tight_layout()
plt.show()