-1

I have a dataframe with 95 columns where are Max(), Min() and Avg() values of different measures, I want to plot their histograms on a FacetGrid of 3 columns and 32 rows where 1st column is max value, 2nd is avg value and 3rd is min value, and the rows are the measure type.

I have this code right now:

fig, axes = plt.subplots(nrows=32, ncols=3, figsize=(20, 96))
columnas_numeric = df_agg_new.select_dtypes(include=['float64', 'int64']).columns
columnas_numeric = columnas_numeric.drop('season')

for i, colum in enumerate(columns):
    if colum.endswith('Max'):
        sns.histplot(
            data     = df_agg_new,
            x        = colum,
            stat     = "count",
            kde      = True,
            line_kws = {'linewidth': 2},
            alpha    = 0.3,
            ax       = axes[int(i/3)][0]
        )
        axes[int(i/3)][0].set_title(colum, fontsize = 7, fontweight = "bold")
        axes[int(i/3)][0].tick_params(labelsize = 6)
        axes[int(i/3)][0].set_xlabel("")
    elif colum.endswith('Avg'):
        sns.histplot(
            data     = df_agg_new,
            x        = colum,
            stat     = "count",
            kde      = True,
            line_kws = {'linewidth': 2},
            alpha    = 0.3,
            ax       = axes[int(i/3)][1]
        )
        axes[int(i/3)][1].set_title(colum, fontsize = 7, fontweight = "bold")
        axes[int(i/3)][1].tick_params(labelsize = 6)
        axes[int(i/3)][1].set_xlabel("")
    else:
        sns.histplot(
            data     = df_agg_new,
            x        = colum,
            stat     = "count",
            kde      = True,
            line_kws = {'linewidth': 2},
            alpha    = 0.3,
            ax       = axes[int(i/3)][2]
        )
        axes[int(i/3)][2].set_title(colum, fontsize = 7, fontweight = "bold")
        axes[int(i/3)][2].tick_params(labelsize = 6)
        axes[int(i/3)][2].set_xlabel("")
    
    
fig.tight_layout()
plt.subplots_adjust(top = 0.97)
fig.suptitle('Distribution plots', fontsize = 10, fontweight = "bold");

But don't work because some measure values go to other rows.

this is my list of columns:

Index(['Var1_phase2_Avg', 'Var1_phase2_Max',
       'Var1_phase2_Min', 'Var1_phase3_Avg',
       'Var1_phase3_Max', 'Var1_phase3_Min',
       'Var1_phase1_Avg', 'Var1_phase1_Max',
       'Var1_phase1_Min', 'Var1_phase4_Avg',
       'Var1_phase4_Max', 'Var1_phase4_Min',
       'Var2_phase2_Avg', 'Var2_phase2_Max',
       'Var2_phase3_Avg', 'Var2_phase3_Max',
       'Var2_phase1_Avg', 'Var2_phase1_Max',
       'Var2_phase4_Avg', 'Var2_phase4_Max',
       'Var3_phase2_Avg', 'Var3_phase2_Max',
       'Var3_phase2_Min', 'Var3_phase3_Avg',
       'Var3_phase3_Max', 'Var3_phase3_Min',
       'Var3_phase1_Avg', 'Var3_phase1_Max',
       'Var3_phase1_Min', 'Var3_phase4_Avg',
       'Var3_phase4_Max', 'Var3_phase4_Min', 
       'Var4_phase2_Avg', 'Var4_phase2_Max', 
       'Var4_phase3_Avg', 'Var4_phase3_Max', 
       'Var4_phase1_Avg', 'Var4_phase1_Max', 
       'Var4_phase4_Avg', 'Var4_phase4_Max', 
       'Var5_phase2_Avg', 'Var5_phase2_Max',
       'Var5_phase2_Min', 'Var5_phase3_Avg', 
       'Var5_phase3_Max', 'Var5_phase3_Min', 
       'Var5_phase1_Avg', 'Var5_phase1_Max', 
       'Var5_phase1_Min', 'Var5_phase4_Avg', 
       'Var5_phase4_Max', 'Var5_phase4_Min', 
       'Var6_phase2_Avg', 'Var6_phase2_Max', 
       'Var6_phase2_Min', 'Var6_phase3_Avg', 
       'Var6_phase3_Max', 'Var6_phase1_Avg', 
       'Var6_phase1_Max', 'Var7_phase2_Avg', 
       'Var7_phase2_Max', 'Var7_phase2_Min',
       'Var7_phase3_Avg', 'Var7_phase3_Max',
       'Var7_phase3_Min', 'Var7_phase1_Avg', 
       'Var7_phase1_Max', 'Var7_phase1_Min', 
       'Var7_phase4_Avg', 'Var7_phase4_Max', 
       'Var7_phase4_Min', 'Var8_phase2_Avg', 
       'Var8_phase2_Max', 'Var8_phase2_Min', 
       'Var8_phase3_Avg', 'Var8_phase3_Max', 
       'Var8_phase3_Min', 'Var8_phase1_Avg', 
       'Var8_phase1_Max', 'Var8_phase1_Min',
       'Var8_phase4_Avg', 'Var8_phase4_Max', 
       'Var8_phase4_Min'],
      dtype='object')

And this is how i want to get represented (with their histplot):

       'Var1_phase2_Avg', 'Var1_phase2_Max', 'Var1_phase2_Min', 
       'Var1_phase3_Avg', 'Var1_phase3_Max', 'Var1_phase3_Min',
       'Var1_phase1_Avg', 'Var1_phase1_Max', 'Var1_phase1_Min',
       'Var1_phase4_Avg', 'Var1_phase4_Max', 'Var1_phase4_Min',
       
       'Var2_phase2_Avg', 'Var2_phase2_Max',
       'Var2_phase3_Avg', 'Var2_phase3_Max',
       'Var2_phase1_Avg', 'Var2_phase1_Max',
       'Var2_phase4_Avg', 'Var2_phase4_Max',

       'Var3_phase2_Avg', 'Var3_phase2_Max', 'Var3_phase2_Min',
       'Var3_phase3_Avg', 'Var3_phase3_Max', 'Var3_phase3_Min',
       'Var3_phase1_Avg', 'Var3_phase1_Max', 'Var3_phase1_Min',
       'Var3_phase4_Avg', 'Var3_phase4_Max', 'Var3_phase4_Min', 
       
       'Var4_phase2_Avg', 'Var4_phase2_Max', 
       'Var4_phase3_Avg', 'Var4_phase3_Max', 
       'Var4_phase1_Avg', 'Var4_phase1_Max', 
       'Var4_phase4_Avg', 'Var4_phase4_Max', 

       'Var5_phase2_Avg', 'Var5_phase2_Max', 'Var5_phase2_Min', 
       'Var5_phase3_Avg', 'Var5_phase3_Max', 'Var5_phase3_Min', 
       'Var5_phase1_Avg', 'Var5_phase1_Max', 'Var5_phase1_Min',
       'Var5_phase4_Avg', 'Var5_phase4_Max', 'Var5_phase4_Min', 
       
       'Var6_phase2_Avg', 'Var6_phase2_Max', 'Var6_phase2_Min',
       'Var6_phase3_Avg', 'Var6_phase3_Max',
       'Var6_phase1_Avg', 'Var6_phase1_Max',

       'Var7_phase2_Avg', 'Var7_phase2_Max', 'Var7_phase2_Min',
       'Var7_phase3_Avg', 'Var7_phase3_Max', 'Var7_phase3_Min',
       'Var7_phase1_Avg', 'Var7_phase1_Max', 'Var7_phase1_Min',
       'Var7_phase4_Avg', 'Var7_phase4_Max', 'Var7_phase4_Min',

       'Var8_phase2_Avg', 'Var8_phase2_Max', 'Var8_phase2_Min', 
       'Var8_phase3_Avg', 'Var8_phase3_Max', 'Var8_phase3_Min',
       'Var8_phase1_Avg', 'Var8_phase1_Max', 'Var8_phase1_Min',
       'Var8_phase4_Avg', 'Var8_phase4_Max', 'Var8_phase4_Min'

Each column is for Avg, Max and Min values, and each row if for each phase of the day and later for each different measure.

  • Are your variables `columnas_numeric` and `columns` (used in the for loop) the same? Your use of `axes[int(i/3)][0]` supposes that the list of columns comes in nice groups of 3, which is unlikely as the size of the list isn't divisible by 3. – JohanC Mar 04 '23 at 12:55
  • @JohanC Yes, `columnas_numeric` and `columns` are the same. – Pablo Moreira Garcia Mar 04 '23 at 17:15

1 Answers1

1

Your use of axes[int(i/3)][0] supposes that the list of columns comes in nice groups of 3, which is not the case.

Given how your columns are named, you could:

  • create a list of the names without the suffix (so e.g. 'Var1_phase2' for 'Var1_phase2_Avg')
  • optionally sort that list
  • use the number of names to calculate nrows=len(filas)
  • loop through the row names, and check whether a column with that name and one of the suffixes exists
  • if the column exists, draw the plot
  • if the column doesn't exist, remove the empty plot

Optionally, you could share the x and/or y axes of the plots, which makes them easier to compare. These are parameters in plt.subplots(..., sharex=True, sharey=True. This would also skip the repeated labeling of the axes, which saves some space (but could be unwanted if you have many rows; if so you can enable them again via axes[i, j].tick_params(..., labelbottom=True)).

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

columnas_numeric = df_agg_new.select_dtypes(include=['float64', 'int64']).columns
columnas_numeric = columnas_numeric.drop('season')
# columnas_numeric = ['Var1_phase2_Avg', 'Var1_phase2_Max', 'Var1_phase2_Min', 'Var1_phase3_Avg', 'Var1_phase3_Max', 'Var1_phase3_Min', 'Var1_phase1_Avg', 'Var1_phase1_Max', 'Var1_phase1_Min', 'Var1_phase4_Avg', 'Var1_phase4_Max', 'Var1_phase4_Min', 'Var2_phase2_Avg', 'Var2_phase2_Max', 'Var2_phase3_Avg', 'Var2_phase3_Max', 'Var2_phase1_Avg', 'Var2_phase1_Max', 'Var2_phase4_Avg', 'Var2_phase4_Max', 'Var3_phase2_Avg', 'Var3_phase2_Max', 'Var3_phase2_Min', 'Var3_phase3_Avg', 'Var3_phase3_Max', 'Var3_phase3_Min', 'Var3_phase1_Avg', 'Var3_phase1_Max', 'Var3_phase1_Min', 'Var3_phase4_Avg', 'Var3_phase4_Max', 'Var3_phase4_Min', 'Var4_phase2_Avg', 'Var4_phase2_Max', 'Var4_phase3_Avg', 'Var4_phase3_Max', 'Var4_phase1_Avg', 'Var4_phase1_Max', 'Var4_phase4_Avg', 'Var4_phase4_Max', 'Var5_phase2_Avg', 'Var5_phase2_Max', 'Var5_phase2_Min', 'Var5_phase3_Avg', 'Var5_phase3_Max', 'Var5_phase3_Min', 'Var5_phase1_Avg', 'Var5_phase1_Max', 'Var5_phase1_Min', 'Var5_phase4_Avg', 'Var5_phase4_Max', 'Var5_phase4_Min', 'Var6_phase2_Avg', 'Var6_phase2_Max', 'Var6_phase2_Min', 'Var6_phase3_Avg', 'Var6_phase3_Max', 'Var6_phase1_Avg', 'Var6_phase1_Max', 'Var7_phase2_Avg', 'Var7_phase2_Max', 'Var7_phase2_Min', 'Var7_phase3_Avg', 'Var7_phase3_Max', 'Var7_phase3_Min', 'Var7_phase1_Avg', 'Var7_phase1_Max', 'Var7_phase1_Min', 'Var7_phase4_Avg', 'Var7_phase4_Max', 'Var7_phase4_Min', 'Var8_phase2_Avg', 'Var8_phase2_Max', 'Var8_phase2_Min', 'Var8_phase3_Avg', 'Var8_phase3_Max', 'Var8_phase3_Min', 'Var8_phase1_Avg', 'Var8_phase1_Max', 'Var8_phase1_Min', 'Var8_phase4_Avg', 'Var8_phase4_Max', 'Var8_phase4_Min']

filas = [colum[:-4] for colum in columnas_numeric if colum.endswith('_Avg')]
filas = sorted(filas) # optionally sort the list for the rows

fig, axes = plt.subplots(nrows=len(filas), ncols=3, figsize=(20, 96))

for ax_row, fila in zip(axes, filas):
    for ax, suffix in zip(ax_row, ['Max', 'Avg', 'Min']):
        colum = fila + '_' + suffix
        if not colum in columnas_numeric:
            ax.remove()  # remove empty subplot when there is no data
        else:
            sns.histplot(
                data=df_agg_new,
                x=colum,
                stat="count",
                kde=True,
                line_kws={'linewidth': 2},
                alpha=0.3,
                ax=ax
            )
            ax.set_title(colum, fontsize=7, fontweight="bold")
            ax.tick_params(labelsize=6, labelbottom=True)
            ax.set_xlabel("")

fig.tight_layout()
plt.subplots_adjust(top=0.97)
fig.suptitle('Distribution plots', fontsize=10, fontweight="bold");
plt.show()
JohanC
  • 71,591
  • 8
  • 33
  • 66