I believe you need weighted median. I used function weighted_median
from here, you can also try wquantile
's weighted.median
, but it interpolates in a bit different way so you may achieve nonexpected results):
import numpy as np
import pandas as pd
# from here: https://stackoverflow.com/a/32921444/3025981, CC BY-SA by Afshin @ SE
def weighted_median(values, weights):
''' compute the weighted median of values list. The
weighted median is computed as follows:
1- sort both lists (values and weights) based on values.
2- select the 0.5 point from the weights and return the corresponding values as results
e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities.
sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on
weight corresponds to the first item which is 0. so the weighted median is 0.'''
#convert the weights into probabilities
sum_weights = sum(weights)
weights = np.array([(w*1.0)/sum_weights for w in weights])
#sort values and weights based on values
values = np.array(values)
sorted_indices = np.argsort(values)
values_sorted = values[sorted_indices]
weights_sorted = weights[sorted_indices]
#select the median point
it = np.nditer(weights_sorted, flags=['f_index'])
accumulative_probability = 0
median_index = -1
while not it.finished:
accumulative_probability += it[0]
if accumulative_probability > 0.5:
median_index = it.index
return values_sorted[median_index]
elif accumulative_probability == 0.5:
median_index = it.index
it.iternext()
next_median_index = it.index
return np.mean(values_sorted[[median_index, next_median_index]])
it.iternext()
return values_sorted[median_index]
# end from
def wmed(group):
return weighted_median(group['score'], group['times'])
import pandas as pd
df = pd.DataFrame([
['user1', 1, 4],
['user1', 7, 2],
['user2', 3, 1],
['user2', 10, 2]
], columns = ['user', 'score', 'times'])
groups = df.groupby('user')
groups.apply(wmed)
# user
# user1 1
# user2 10
# dtype: int64