Bar plot 2 categorical variables-CodePudding

I have a dataset with two categorical variables and would like to draw a bar plot whose y-axis is in percent and whose bar labels show both the percentage and N (the number of observations). I have explored countplot, and it seems to achieve what I want, except that the y-axis is a count and not a percentage - I want the y-axis to be in %. The bar labels aren't what I want either: I would like each label to show both % and N, such as 65.5% (N=12). The percentages should be computed per x-value (survey), so that the percentages of the levels within each survey add up to 100. In other words, the level percentages for baseline should add up to 100, and the same for endline.

Below is the code

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter

# Twenty respondents, each interviewed twice: once at baseline, once at endline.
respondent_ids = range(1, 21)
survey_rounds = ['baseline', 'endline']

# Reported level for every interview, in (respondent, survey round) order.
levels = [
    'low', 'high', 'medium', 'low', 'high', 'medium', 'medium', 'high',
    'low', 'low', 'medium', 'high', 'low', 'medium', 'low', 'high',
    'low', 'low', 'medium', 'high', 'high', 'high', 'high', 'medium',
    'low', 'low', 'medium', 'high', 'low', 'medium', 'high', 'medium',
    'low', 'high', 'high', 'medium', 'medium', 'low', 'high', 'low',
]

data = {
    # Each id appears twice in a row (1, 1, 2, 2, ...).
    'id': [rid for rid in respondent_ids for _ in survey_rounds],
    # The two survey rounds alternate for every respondent.
    'survey': survey_rounds * len(respondent_ids),
    'level': levels,
}

df = pd.DataFrame(data)
# Sanity check: number of rows per survey round (only displayed in a notebook).
df.survey.value_counts()

# countplot can only draw raw counts, so the table is built first and the
# percentages are drawn with barplot instead.
# Rows are levels, columns are surveys; missing combinations become 0.
counts = pd.crosstab(df['level'], df['survey'])
# Share of each level within its survey: every column sums to 100.
percents = counts / counts.sum() * 100

# Pin the category order explicitly so that bars and labels cannot drift apart.
survey_order = list(counts.columns)
level_order = list(counts.index)

# seaborn expects long-form data: one row per (level, survey) pair.
long_pct = percents.reset_index().melt(
    id_vars='level', var_name='survey', value_name='pct'
)

plt.figure(figsize=(8, 5))
ax = sns.barplot(
    x='survey', y='pct', hue='level', data=long_pct,
    order=survey_order, hue_order=level_order, palette='rainbow',
)

# seaborn creates one bar container per hue level; inside a container the
# bars follow the x order, i.e. one bar per survey in `survey_order`.
for container, level in zip(ax.containers, level_order):
    labels = [
        f'{percents.loc[level, survey]:.1f}% $(N={counts.loc[level, survey]})$'
        for survey in survey_order
    ]
    ax.bar_label(container, labels=labels, fontsize=10)

sns.despine(ax=ax, left=True)
ax.grid(True, axis='y')
# Bar heights are already on a 0-100 scale, hence xmax=100.
ax.yaxis.set_major_formatter(PercentFormatter(100))
ax.set_xlabel('')
ax.set_ylabel('')
plt.tight_layout()
plt.show()

Thanks in advance!

CodePudding user response：

Here is one way to do it:

# Prep data: one row per (survey, level) pair holding the number of
# observations, sorted in descending index order.
df = (
    df.groupby(["survey", "level"])
    .count()
    .sort_index(ascending=False)
    .rename(columns={"id": "count"})
)

# Add percentages: divide each count by the total of its own survey, so the
# levels of every survey add up to 100. transform("sum") returns the group
# total aligned row by row, so no survey names need to be hard-coded.
survey_totals = df.groupby(level="survey")["count"].transform("sum")
df["pct"] = df["count"] * 100 / survey_totals
df = df.reset_index()

# Plot data
fig = plt.figure(figsize=(8, 5))

# Use the order in which the categories appear in df and pass it explicitly,
# so the labels below are guaranteed to line up with the bars.
survey_order = list(df["survey"].unique())
level_order = list(df["level"].unique())
ax = sns.barplot(
    x="survey",
    y="pct",
    data=df,
    palette="rainbow",
    hue="level",
    order=survey_order,
    hue_order=level_order,
)

# Add percentage and N as a two-line label centred inside each bar.
# seaborn creates one bar container per hue level, with one bar per survey
# in `survey_order`; bar_label positions the text itself, so no hand-tuned
# offsets are needed.
# NOTE(review): assumes every level occurs in every survey (true for this
# data); a missing combination would raise a KeyError in the lookup below.
stats = df.set_index(["level", "survey"]).sort_index()
for container, level in zip(ax.containers, level_order):
    labels = [
        f"{stats.loc[(level, survey), 'pct']:.1f}%\n"
        f"N={stats.loc[(level, survey), 'count']}"
        for survey in survey_order
    ]
    ax.bar_label(
        container,
        labels=labels,
        label_type="center",
        fontsize=10,
        color="w",
    )

Then, in a Jupyter notebook cell:

fig

Outputs: