Create barplot from string data using groupby and multiple columns in pandas dataframe

Question

I'd like to make a bar plot in Python with multiple x-categories from counts of data that are either "yes" or "no". I've started on some code, but I believe the track I'm on is a slow way of getting to the solution I want. I'd be fine with a solution that uses either seaborn, Matplotlib, or pandas, but not Bokeh, because I'd like to make publication-quality figures that scale.

Ultimately what I want is:

bar plot with the categories "canoe", "cruise", "kayak" and "ship" on the x-axis
grouped-by "color", so either Green or Red
showing the proportion of "yes" responses: so number of yes rows divided by the count of "red" and "greens" which in this case is 4 red and 4 green, but that could change.

Here's the dataset I'm working with:

import pandas as pd
data = [{'ship': 'Yes','canoe': 'Yes', 'cruise': 'Yes', 'kayak': 'No','color': 'Red'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'Yes','canoe': 'No','color': 'Green'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'No','canoe': 'No','color': 'Green'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'No','canoe': 'No','color': 'Red'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'Yes','canoe': 'No','color': 'Red'},{'ship': 'No', 'cruise': 'Yes', 'kayak': 'No','canoe': 'Yes','color': 'Green'},{'ship': 'No', 'cruise': 'No', 'kayak': 'No','canoe': 'No','color': 'Green'},{'ship': 'No', 'cruise': 'No', 'kayak': 'No','canoe': 'No','color': 'Red'}]
df = pd.DataFrame(data)

This is what I've started with:

# Print how many rows there are for each value of the 'color' column.
print(df['color'].value_counts())

# Per-color row counts, typed in by hand from the output printed above.
red = 4 # there must be a better way to code this rather than manually. Perhaps using len()?
green = 4

# get count per type: one value_counts() Series per boat-type column
ca = df['canoe'].value_counts()
cr = df['cruise'].value_counts()
ka = df['kayak'].value_counts()
sh = df['ship'].value_counts()
print(ca, cr, ka, sh)

# group by color: one groupby object per boat type, keyed on
# (that boat type's answer, color)
cac = df.groupby(['canoe','color'])
crc = df.groupby(['cruise','color'])
kac = df.groupby(['kayak','color'])
shc = df.groupby(['ship','color'])

# make plots: only the canoe grouping is plotted here; the other three
# groupby objects above are not used any further in this snippet.
cac2 = cac['color'].value_counts().unstack()
cac2.plot(kind='bar', title = 'Canoe by color')

But really what I want is all of the x-categories to be on one plot, only showing the result for "Yes" responses, and taken as the proportion of "Yes" rather than just counts. Help?

Can you attach a picture of what you expected chart should look like? — Scott Boston, Jul 30 '18 at 20:08
Yes @Scott Boston. I need something that looks like this https://stackoverflow.com/questions/31845258/pandas-multi-index-plotting/43999896#43999896 with seaborn — JAG2024, Aug 04 '18 at 16:13

score 4 · Answer 1 · answered Jul 29 '18 at 01:49

Not exactly sure if I understand the question correctly. It looks like it would make more sense to look at the proportion of answers per boat type and color.

import matplotlib.pyplot as plt
import pandas as pd
data = [{'ship': 'Yes','canoe': 'Yes', 'cruise': 'Yes', 'kayak': 'No','color': 'Red'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'Yes','canoe': 'No','color': 'Green'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'No','canoe': 'No','color': 'Green'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'No','canoe': 'No','color': 'Red'},{'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'Yes','canoe': 'No','color': 'Red'},{'ship': 'No', 'cruise': 'Yes', 'kayak': 'No','canoe': 'Yes','color': 'Green'},{'ship': 'No', 'cruise': 'No', 'kayak': 'No','canoe': 'No','color': 'Green'},{'ship': 'No', 'cruise': 'No', 'kayak': 'No','canoe': 'No','color': 'Red'}]
df = pd.DataFrame(data)

# Encode every answer as a boolean ("Yes" -> True, anything else -> False) so
# that the per-color mean is the proportion of "Yes" answers. Comparing with
# eq() avoids replace(["Yes", "No"], [1, 0]), which depends on pandas silently
# downcasting object columns to numbers (deprecated since pandas 2.2).
proportions = df.set_index("color").eq("Yes").groupby(level="color").mean()

# Transpose so the boat types sit on the x-axis with one bar per color.
# groupby sorts its keys, so the groups come out as Green, Red, which is the
# order the ["g", "r"] bar colors are listed in.
ax = proportions.transpose().plot.bar(color=["g","r"])
ax.set_title('Proportion of "Yes" answers per boat type and color')
plt.show()

This means e.g. that 25% of all green canoes answered "yes".

Nice, you could get to the data with `df.set_index('color').eq('Yes').mean(level='color').T` too. — Zero, Aug 04 '18 at 11:57
Thanks yeah I'd like something like this though with multiple x-axis categories: stackoverflow.com/questions/31845258/ — JAG2024, Aug 04 '18 at 16:14

score 2 · Accepted Answer · answered Aug 05 '18 at 04:48

Let's try.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby

# One dict per respondent: a Yes/No answer for each boat type plus a color.
data = [
    {'ship': 'Yes', 'canoe': 'Yes', 'cruise': 'Yes', 'kayak': 'No', 'color': 'Red'},
    {'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'Yes', 'canoe': 'No', 'color': 'Green'},
    {'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'No', 'canoe': 'No', 'color': 'Green'},
    {'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'No', 'canoe': 'No', 'color': 'Red'},
    {'ship': 'Yes', 'cruise': 'Yes', 'kayak': 'Yes', 'canoe': 'No', 'color': 'Red'},
    {'ship': 'No', 'cruise': 'Yes', 'kayak': 'No', 'canoe': 'Yes', 'color': 'Green'},
    {'ship': 'No', 'cruise': 'No', 'kayak': 'No', 'canoe': 'No', 'color': 'Green'},
    {'ship': 'No', 'cruise': 'No', 'kayak': 'No', 'canoe': 'No', 'color': 'Red'},
]
df = pd.DataFrame(data)

# Proportion of "Yes" answers per (color, boat type), as a one-column frame
# indexed by a two-level MultiIndex. Answers are encoded with eq("Yes")
# (True for "Yes", False otherwise) rather than replace(["Yes", "No"], [1, 0]),
# because the latter relies on pandas downcasting object columns to numbers,
# which is deprecated since pandas 2.2.
df1 = (
    df.set_index("color")
    .eq("Yes")
    .groupby(level="color")
    .mean()
    .stack()
    .rename('% Yes')
    .to_frame()
)


def add_line(ax, xpos, ypos):
    """Draw a short vertical gray separator on *ax*.

    The segment sits at horizontal position ``xpos`` and runs from
    ``ypos + .1`` down to ``ypos``. Both coordinates are interpreted through
    ``ax.transAxes``, and clipping is switched off on the line.
    """
    xs = [xpos, xpos]
    ys = [ypos + .1, ypos]
    separator = plt.Line2D(xs, ys, transform=ax.transAxes, color='gray')
    separator.set_clip_on(False)
    ax.add_line(separator)

def label_len(my_index,level):
    """Run-length encode the labels of one level of *my_index*.

    Returns a list of ``(label, run_length)`` tuples, one per run of
    consecutive equal labels at ``level``, in index order. Equal labels that
    are not adjacent produce separate entries.
    """
    level_values = my_index.get_level_values(level)
    runs = []
    for label, run in groupby(level_values):
        runs.append((label, len(list(run))))
    return runs

def label_group_bar_table(ax, df):
    """Write the index labels of *df* below *ax* as stacked, grouped rows.

    Index levels are processed from the innermost to the outermost. Each
    level gets its own row of text, starting at y = -.1 and moving down by .1
    per level. Within a row, every run of equal labels is written once,
    centered over the slots it spans, with a separator line drawn at the
    start of each run and one more after the last run.
    """
    row_y = -.1
    # Width of one index entry, as a fraction of the axes width.
    slot_width = 1./df.index.size
    for level in reversed(range(df.index.nlevels)):
        cursor = 0
        for label, span in label_len(df.index, level):
            center_x = (cursor + .5 * span)*slot_width
            ax.text(center_x, row_y, label, ha='center', transform=ax.transAxes)
            add_line(ax, cursor*slot_width, row_y)
            cursor += span
        # Closing separator on the right-hand edge of this row.
        add_line(ax, cursor*slot_width, row_y)
        row_y -= .1


# seaborn is used below but was not imported by this snippet's import block.
import seaborn as sns

colorlist = ['green','red']
cp = sns.color_palette(colorlist)

# One bar per (color, boat type) row of df1, colored by the first index
# level (the color group).
ax = sns.barplot(x=df1.index, y='% Yes', hue=df1.index.get_level_values(0), data=df1, palette=cp)
# Remove the default tick labels and axis label; the grouped labels drawn by
# label_group_bar_table take their place.
ax.set_xticklabels([])
ax.set_xlabel('')
label_group_bar_table(ax, df1)

Output:

DataPsycho · Answer 3 · 2018-08-01T15:26:08.213

Not sure if this is exactly what you are looking for; let me know if it works.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reshape to long form: one row per (color, boat type, answer).
tidy_df = df.melt(id_vars=['color'], var_name='variable', value_name='value')

# Number of rows per boat type, across all colors; used as the denominator.
total_df = tidy_df.groupby('variable').size().reset_index()

# Number of rows per (color, boat type, answer); used as the numerator.
tidy_df = tidy_df.groupby(['color', 'variable', 'value']).size().reset_index()

# Both size columns are named 0, so after the merge the suffixes turn them
# into '0_left' (numerator) and '0_right' (denominator).
merged_df = tidy_df.merge(total_df, on='variable', how='left', suffixes=('_left', '_right'))
merged_df['proportion'] = merged_df['0_left'].div(merged_df['0_right'])

# Only the "Yes" rows are plotted.
yes_rows = merged_df[merged_df['value'].eq('Yes')]

palette = {"Green": "green", "Red": "red"}  # optional; pick your own colors
plt.figure(figsize=(12, 6))
sns.barplot(x='variable', y='proportion', hue='color', data=yes_rows, palette=palette)
plt.xticks(rotation=65)
# To write the figure to disk, call plt.savefig('numbers.png') before plt.show().
plt.show()

Hi I want something like this though with multiple x-axis categories: https://stackoverflow.com/questions/31845258/pandas-multi-index-plotting/43999896#43999896 — JAG2024, Aug 04 '18 at 16:14

Create barplot from string data using groupby and multiple columns in pandas dataframe

3 Answers

Linked