0

enter image description hereI'm trying to plot a normal distribution curve for a set of values. Unfortunately, the below code (taken from this post) doesn't seem to be plotting the curve correctly over the histograms (please refer attached image). I'm sure I'm missing something or have done something silly but can't seem to figure out. Can someone please help? I've included my code below - I'm getting the values from a dataframe but have included these as a list s for convenience:

import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.mlab import normpdf
mu = 0
sigma = 1
n_bins = 50
s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4][![enter image description here][1]][1]
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

#histogram
n, bins, patches = axes[1].hist(s, n_bins, normed=True, alpha=.1, edgecolor='black' )
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
print(pdf)
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)

#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)

#to ensure pdf and bins line up to use fill_between.
bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR
bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)]
pdf_1 = pdf[:int(len(pdf)/2)]
pdf_2 = pdf[int(len(pdf)/2):]
pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))]
pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))]

#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
#axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')
#axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange')

#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')

axes[1].set_ylabel('Probability Density')

#top boxplot
axes[0].boxplot(s, 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')

plt.rcParams["figure.figsize"] = (10,10)

plt.subplots_adjust(hspace=0)
plt.show()
Chipmunk_da
  • 467
  • 2
  • 9
  • 27
  • This answer needs to be updated due to the problem with `from matplotlib.mlab import normpdf`. Please see [issue](https://github.com/materialsproject/pymatgen/issues/1657). you can find the updated answer [here](https://stackoverflow.com/a/69595392/10452700) inspired from original answer. – Mario Oct 16 '21 at 12:05

2 Answers2

1

You have set mu and sigma arbitrarily to 0 and 1 respectively but you should calculate it for your actual data:

data = pd.Series(s)
mu = data.mean()
sigma = data.std()


Update with full working example:
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
n_bins = 50
s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4]
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

#histogram
n, bins, patches = axes[1].hist(s, n_bins, density=True, alpha=.1, edgecolor='black' )
data = pd.Series(s)
mu = data.mean()
sigma = data.std()
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)

#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)

#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
iqr = 1.5 * (q3-q1)
x1 = np.linspace(q1 - iqr, q1)
x2 = np.linspace(q3, q3 + iqr)
pdf1 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x1-mu)**2/(2*sigma**2))
pdf2 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x2-mu)**2/(2*sigma**2))
axes[1].fill_between(x1, pdf1, 0, alpha=.6, color='orange')
axes[1].fill_between(x2, pdf2, 0, alpha=.6, color='orange')

#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q1)    -norm(mu, sigma).cdf(q1-iqr))), xy=(q1-iqr/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)    -norm(mu, sigma).cdf(q1)    )), xy=(median  , 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+iqr)-norm(mu, sigma).cdf(q3)    )), xy=(q3+iqr/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')

axes[1].set_ylabel('Probability Density')

#top boxplot
axes[0].boxplot(s, 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')

enter image description here

Stef
  • 28,728
  • 2
  • 24
  • 52
  • Thanks for your solution! The x axis seems to show the list values (0, 5,10,15..) - shouldn't this show the std dev from the mean (..-2,-1,0,1,2..) like in this post - https://stackoverflow.com/questions/49630427/how-to-plot-normal-distribution-with-percentage-of-data-as-label-in-each-band-bi? Also, when I uncomment `axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')` I get an error `Argument dimensions are incompatible`, which I don't get when `s = np.random.normal(mu, sigma, 50)` - any idea why this happens and is there a way to fix this? Thanks – Chipmunk_da Jun 05 '20 at 10:12
  • 1
    shouldn't this show the std dev from the mean (..-2,-1,0,1,2..) --> no, why do you think so? In the linked example the std dev is set to 1 and the mean to 0, so the it seems to show the std dev but in fact it shows the x values – Stef Jun 05 '20 at 11:16
  • The reason I thought so was because in the linked example the values in `s` are between 0 and 1 but the x axis ranges from -3 to 3 and histograms & curve are plotted along that – Chipmunk_da Jun 05 '20 at 11:21
  • 1
    ...any idea why this happens --> this is because bins_1 and pdf_1 have different lengths (same for bins_2 and pdf_2). The reason is that the pdf range is divided in the middle which is OK for the symmetrical linked example but not in the general case – Stef Jun 05 '20 at 11:22
  • Thanks a lot for the full working example. It works like a charm! The only edit I'd suggest is adding `plt.subplots_adjust(hspace=0)` `plt.show()` at the bottom to show the plot. Thanks! – Chipmunk_da Jun 05 '20 at 14:36
0

Putting it all in a fuction:

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

def CTD(df):
    for col in df.columns:
        n_bins = 50

        fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

        #histogram
        n, bins, patches = axes[1].hist(boston[col], n_bins, density=True, alpha=.1, edgecolor='black' )
        #data = pd.Series(s)
        mu = boston[col].mean()
        sigma = boston[col].std()
        pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
        median, q1, q3 = np.percentile(boston.age, 50), np.percentile(boston[col], 25), np.percentile(boston[col], 75)

        #probability density function

        axes[1].plot(bins, pdf, color='orange', alpha=.6)
        #axes[1].figsize=(10,20)
        #fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
        iqr = 1.5 * (q3-q1)
        x1 = np.linspace(q1 - iqr, q1)
        x2 = np.linspace(q3, q3 + iqr)
        pdf1 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x1-mu)**2/(2*sigma**2))
        pdf2 = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(x2-mu)**2/(2*sigma**2))
        axes[1].fill_between(x1, pdf1, 0, alpha=.6, color='orange')
        axes[1].fill_between(x2, pdf2, 0, alpha=.6, color='orange')
        #add text to bottom graph.
        axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q1)    -norm(mu, sigma).cdf(q1-iqr))), xy=(q1-iqr/2, 0), ha='center')
        axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)    -norm(mu, sigma).cdf(q1)    )), xy=(median  , 0), ha='center')
        axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+iqr)-norm(mu, sigma).cdf(q3)    )), xy=(q3+iqr/2, 0), ha='center')
        axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
        axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')

        #dashed lines
        plt.axvline(df[col].quantile(0),color='b', linestyle='-.')
        plt.axvline(df[col].quantile(0.25),color='g', linestyle='--')
        plt.axvline(df[col].quantile(0.50),color='g', linestyle='--')
        plt.axvline(df[col].quantile(0.75),color='b', linestyle='--')
        plt.axvline(df[col].quantile(1),color='r', linestyle='-.')

        axes[1].set_ylabel('Probability Density')

        #top boxplot
        axes[0].boxplot(df[col], 0, 'gD', vert=False)
        axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
        axes[0].axis('off')
        plt.rcParams["figure.figsize"] = (18,10)

calling function:

CTD(boston)

If this doesn't work for you:

Try this:

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

def CTD(df):
    for col in df.columns:
        sns.set(rc={'figure.figsize':(24,6)})
        plt.figure()
        plt.subplot(121)
        sns.distplot(df[col])
        plt.axvline(np.mean(df[col]),color='b', linestyle='--') # Blue line for mean
        plt.axvline(np.median(df[col]),color='r', linestyle='--')# Red line for Median
        plt.subplot(122)
        sns.distplot(df[col])
        plt.axvline(df[col].quantile(0),color='b', linestyle='-.')
        plt.axvline(df[col].quantile(0.25),color='g', linestyle='--')
        plt.axvline(df[col].quantile(0.50),color='g', linestyle='--')
        plt.axvline(df[col].quantile(0.75),color='b', linestyle='--')
        plt.axvline(df[col].quantile(1),color='r', linestyle='-.')

This Create dashed lines on KDE plot having quantiles.

Cody Gray - on strike
  • 239,200
  • 50
  • 490
  • 574
Amit Bidlan
  • 71
  • 1
  • 4