I am trying to create a matching algorithm using pulp but the results for the sample data I'm getting are wrong as I think the function is flawed.
Sample data:
users = {
1: (5.0, 4.0, 1.0, 2, 1, 1),
2: (8.0, 6.0, 2.0, 3, 2, 1)
}
dataset = pd.DataFrame([
{'id': 1, 'group': 'A', 'weight': 1},
{'id': 2, 'group': 'A', 'weight': 2},
{'id': 3, 'group': 'A', 'weight': 3},
{'id': 4, 'group': 'A', 'weight': 3},
{'id': 5, 'group': 'A', 'weight': 4},
{'id': 6, 'group': 'A', 'weight': 6},
{'id': 7, 'group': 'A', 'weight': 7},
{'id': 8, 'group': 'A', 'weight': 8},
{'id': 9, 'group': 'B', 'weight': 2},
{'d': 10, 'group': 'B', 'weight': 1}
])
I would like to match different ids to users (without repetition). For each user I have a total weight, group A weight, group B weight, unique id count, group A unique id count, group B unique id count.
For the sample above the correct answer should be:
{'id': 5, 'group': 'A', 'weight': 4, 'user_id': 1}
{'id': 10, 'group': 'B', 'weight': 1, 'user_id': 1}
{'id': 3, 'group': 'A', 'weight': 3, 'user_id': 2}
{'id': 4, 'group': 'A', 'weight': 3, 'user_id': 2}
{'id': 9, 'group': 'B', 'weight': 2, 'user_id': 2}
My first attempt:
from pulp import *
import pandas as pd
from itertools import product
def match_weights(users, dataset):
matched_rows = []
variables = LpVariable.dicts("Item", range(len(dataset)), lowBound=0, cat='Binary')
user_vars = {}
for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
user_vars[user_id] = {}
user_vars[user_id]['total_weight'] = LpVariable("TotalWeight_{}".format(user_id), lowBound=0, upBound=total_weight)
user_vars[user_id]['group_a_weight'] = LpVariable("GroupAWeight_{}".format(user_id), lowBound=0, upBound=group_a_weight)
user_vars[user_id]['group_b_weight'] = LpVariable("GroupBWeight_{}".format(user_id), lowBound=0, upBound=group_b_weight)
user_vars[user_id]['total_unique_users'] = LpVariable("TotalUniqueUsers_{}".format(user_id), lowBound=0, upBound=total_unique_users, cat='Integer')
user_vars[user_id]['group_a_unique_users'] = LpVariable("GroupAUniqueUsers_{}".format(user_id), lowBound=0, upBound=group_a_unique_users, cat='Integer')
user_vars[user_id]['group_b_unique_users'] = LpVariable("GroupBUniqueUsers_{}".format(user_id), lowBound=0, upBound=group_b_unique_users, cat='Integer')
prob = LpProblem("MatchingProblem", LpMaximize)
prob += lpSum(variables[i] for i in range(len(dataset)))
for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
group_a_items = dataset[dataset['group'] == 'A'].index.tolist()
group_b_items = dataset[dataset['group'] == 'B'].index.tolist()
# Total weight constraint
prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in range(len(dataset))) <= user_vars[user_id]['total_weight']
# Group A weight constraint
prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_a_items) <= user_vars[user_id]['group_a_weight']
# Group B weight constraint
prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_b_items) <= user_vars[user_id]['group_b_weight']
# Total unique user constraint
unique_users = set()
for i in range(len(dataset)):
if variables[i].value() == 1:
unique_users.add(dataset.loc[i, 'id'])
prob += lpSum(1 for u in unique_users) <= user_vars[user_id]['total_unique_users']
# Group A unique user constraint
unique_users_a = set()
for i in group_a_items:
if variables[i].value() == 1:
unique_users_a.add(dataset.loc[i, 'id'])
prob += lpSum(1 for u in unique_users_a) <= user_vars[user_id]['group_a_unique_users']
# Group B unique user constraint
unique_users_b = set()
for i in group_b_items:
if variables[i].value() == 1:
unique_users_b.add(dataset.loc[i, 'id'])
prob += lpSum(1 for u in unique_users_b) <= user_vars[user_id]['group_b_unique_users']
prob.solve()
for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
matched_user_rows = []
for i in range(len(dataset)):
if variables[i].value() == 1:
matched_row = dataset.loc[i].copy()
matched_row['user_id'] = user_id
matched_user_rows.append(matched_row)
matched_rows.extend(matched_user_rows)
return matched_rows
However the results are:
{1: {'group_a': [2], 'group_b': [10]}, 2: {'group_a': [2], 'group_b': [10]}}
Looks like my results might overwrite each other but also look wrong.
I tried to rewrite it and got similar incorrect results:
def match_weights(users, dataset):
model = LpProblem("MatchingProblem", LpMaximize)
variables = LpVariable.dicts("Item", dataset.index, lowBound=0, cat='Binary')
model += lpSum(variables[i] for i in dataset.index)
# Add constraints for each user
for user_id, (total_weight, group_a_weight, group_b_weight, _, _, _) in users.items():
# Filter dataset based on user group
group_a_indices = dataset[dataset['group'] == 'A'].index
group_b_indices = dataset[dataset['group'] == 'B'].index
# Total weight constraint
model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in dataset.index) <= total_weight
# Group A weight constraint
model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_a_indices) <= group_a_weight
# Group B weight constraint
model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_b_indices) <= group_b_weight
unique_user_set = set(dataset['respondent_id'])
for user_id, (total_weight, _, _, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
group_a_indices = dataset[dataset['group'] == 'A'].index
group_b_indices = dataset[dataset['group'] == 'B'].index
# Total unique users constraint
model += lpSum(variables[i] for i in dataset.index if dataset.loc[i, 'respondent_id'] in unique_user_set) \
<= total_unique_users
# Group A unique users constraint
model += lpSum(variables[i] for i in group_a_indices if dataset.loc[i, 'respondent_id'] in unique_user_set) \
<= group_a_unique_users
# Group B unique users constraint
model += lpSum(variables[i] for i in group_b_indices if dataset.loc[i, 'respondent_id'] in unique_user_set) \
<= group_b_unique_users
model.solve()
results = {}
for user_id, (_, _, _, _, _, _) in users.items():
group_a_indices = dataset[dataset['group'] == 'A'].index
group_b_indices = dataset[dataset['group'] == 'B'].index
matched_a = [dataset.loc[i, 'respondent_id'] for i in group_a_indices if variables[i].value() == 1]
matched_b = [dataset.loc[i, 'respondent_id'] for i in group_b_indices if variables[i].value() == 1]
results[user_id] = {'group_a': matched_a, 'group_b': matched_b}
return results
Where am I going wrong?