I was trying to build a class that could perform the Kruskal-Wallis test. The class uses the following formula to compute H:
However, it yields a different H-value than the kruskal
function of scipy. Does anyone know why this is the case?
import numpy as np
from scipy.stats import rankdata
from scipy.stats import kruskal
class Kruskal_Wallis():
def __init__(self):
pass
def fit(self, groups):
"""
Performs Kruskal-Wallis test.
:param groups: list containing 1D group arrays
Adds the following attributes:
- n: size of total population
- n_groups: number of groups (n_groups = len(n_i) = len(r_i))
- n_i: array containing group sizes
- df: degrees of freedom
- r2_i: array containing the square of the sum of ranks for each group
- h: kruskal-wallis statistic
"""
def sum_ranks_per_group(groups):
n_groups = len(groups)
n_i = np.array([group.shape[0] for group in groups])
data = np.array([])
for group in groups:
data = np.concatenate((data, group), axis=0)
ranked_data = rankdata(data, method="average")
ranked_groups = ranked_data.reshape((n_groups, n_i[0])) #works only if groups have equal size
summed_ranks = ranked_groups.sum(axis=1)
return summed_ranks
def get_h(n, r2_i, n_i):
summed_r2_i_per_n_i = (r2_i/n_i).sum()
h = (12/(n*(n-1)) * summed_r2_i_per_n_i) - 3*(n+1)
return h
n_groups = len(groups)
n_i = np.array([group.shape[0] for group in groups])
n = sum(n_i)
df = n_groups - 1
r2_i = sum_ranks_per_group(groups)**2
h = get_h(n, r2_i, n_i)
self.n_groups = n_groups
self.n_i = n_i
self.n = n
self.df = df
self.r2_i = r2_i
self.h = h
## Compare results yielded by scipy.stats.kruskal and Kruskal_Wallis class
groups = [np.arange(1,3),
np.arange(3,5)]
res = kruskal(groups[0], groups[1])
kruskal_wallis = Kruskal_Wallis()
kruskal_wallis.fit(groups)
print(res)
print(kruskal_wallis.h)