Distribution fitting of Multiple columns

Question

I am trying to fit probability distributions to my data using scipy.stats. The data contains multiple columns (col_1, col_2, col_3) in a single CSV file.

The problem is that my distribution-fitting code only takes a single column when identifying the best-fitting distribution, as shown in the code below.

How can I run the distribution fitting for all columns at the same time, e.g. for col_1, col_2, and col_3?

 import warnings
 warnings.filterwarnings("ignore")

 import pandas as pd
 import numpy as np
 import scipy
 from sklearn.preprocessing import StandardScaler
 import scipy.stats
 import matplotlib.pyplot as plt

 # Load data and select first column

 # Use scikit-learn's bundled breast-cancer dataset as the example data.
 from sklearn import datasets
 data_set = datasets.load_breast_cancer()

 # Pull the first three feature columns out as separate 1-D arrays.
 col_1, col_2, col_3 = (data_set.data[:, k] for k in range(3))

 # Number of observations, plus a matching 0..size-1 index array.
 size = len(col_1)
 x = np.arange(size)

 # Quick visual check of the raw first column.
 plt.hist(col_1)
 plt.show()

 # Standardise the first column. StandardScaler works on 2-D input, so the
 # column is reshaped to (n, 1) for scaling and flattened back to 1-D after.
 sc = StandardScaler()
 y_std = sc.fit_transform(col_1.reshape(-1, 1)).flatten()


 # Names of the scipy.stats distributions to try, in the order they are tested.
 dist_names = [
     'beta', 'expon', 'gamma', 'lognorm', 'norm',
     'pearson3', 'triang', 'uniform', 'weibull_min', 'weibull_max',
 ]

 # One entry per candidate distribution is appended to each of these lists.
 chi_square, p_values = [], []

 # Build 50 bins whose edges are the 0th, 2nd, 4th, ..., 100th percentiles of
 # the standardised data, so the observed counts are roughly equal per bin.
 percentile_bins = np.linspace(0, 100, 51)
 percentile_cutoffs = np.percentile(y_std, percentile_bins)
 observed_frequency, bins = np.histogram(y_std, bins=percentile_cutoffs)
 cum_observed_frequency = observed_frequency.cumsum()



 # Loop through candidate distributions

 for distribution in dist_names:
     # Look up the scipy.stats distribution by name and fit its parameters
     # to the standardised data.
     dist = getattr(scipy.stats, distribution)
     param = dist.fit(y_std)

     # Kolmogorov-Smirnov test of the data against the fitted distribution.
     # Index 1 of the result is the p-value; store it rounded to 5 decimals.
     p = scipy.stats.kstest(y_std, distribution, args=param)[1]
     p = np.around(p, 5)
     p_values.append(p)

     # Evaluate the fitted cumulative distribution function (cdf) at the bin
     # edges. The last two fitted parameters are passed as loc and scale;
     # anything before them is passed positionally as shape parameters.
     cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
                           scale=param[-1])

     # The probability mass in each bin is the difference between the cdf at
     # consecutive edges. np.diff computes this directly, which avoids an
     # index loop whose variable `bin` would shadow the builtin of that name.
     # Multiplying by the sample size turns the masses into expected counts.
     expected_frequency = np.diff(cdf_fitted) * size

     # Goodness-of-fit score on cumulative counts (smaller means closer).
     # NOTE(review): the textbook Pearson chi-square uses per-bin counts and
     # divides by the expected count; this formula uses cumulative counts and
     # divides by the observed ones. Kept as-is so the ranking is unchanged;
     # confirm this is the intended statistic.
     cum_expected_frequency = np.cumsum(expected_frequency)
     ss = sum(((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
     chi_square.append(ss)

 # Collate results and sort by goodness of fit (best at top)

 # One row per candidate distribution, ordered so the smallest chi-square
 # score (the best fit by this measure) comes first.
 results = pd.DataFrame({
     'Distribution': dist_names,
     'chi_square': chi_square,
     'p_value': p_values,
 })
 results = results.sort_values(['chi_square'])

 # Print the ranked table under a short header.
 print('\nDistributions sorted by goodness of fit:')
 print('----------------------------------------')
 print(results)

Each column's data has its own best-fitting distribution and characteristics. Merging the columns into one would not give a clear picture of each column's distribution. — Case Msee, Apr 10 '20 at 08:44
What do you mean by "at the same time"? If the columns are independent, then fitting them one after another would be just fine. Do they share a parameter? Then it becomes trickier. — mikuszefski, Apr 20 '20 at 07:45

Distribution fitting of Multiple columns

0 Answers0