
I am trying to create a matching algorithm using PuLP, but the results I'm getting for the sample data below are wrong, so I think my formulation is flawed.

Sample data:

users = {
    1: (5.0, 4.0, 1.0, 2, 1, 1),
    2: (8.0, 6.0, 2.0, 3, 2, 1)
}

dataset = pd.DataFrame([
    {'id': 1, 'group': 'A', 'weight': 1},
    {'id': 2, 'group': 'A', 'weight': 2},
    {'id': 3, 'group': 'A', 'weight': 3},
    {'id': 4, 'group': 'A', 'weight': 3},
    {'id': 5, 'group': 'A', 'weight': 4},
    {'id': 6, 'group': 'A', 'weight': 6},
    {'id': 7, 'group': 'A', 'weight': 7},
    {'id': 8, 'group': 'A', 'weight': 8},
    {'id': 9, 'group': 'B', 'weight': 2},
    {'d': 10, 'group': 'B', 'weight': 1}
])

I would like to match ids to users (without repetition). For each user I have six limits, given by the tuple values in users in this order: total weight, group A weight, group B weight, unique id count, group A unique id count, group B unique id count.

For the sample above the correct answer should be:

{'id': 5, 'group': 'A', 'weight': 4, 'user_id': 1}
{'id': 10, 'group': 'B', 'weight': 1, 'user_id': 1}
{'id': 3, 'group': 'A', 'weight': 3, 'user_id': 2}
{'id': 4, 'group': 'A', 'weight': 3, 'user_id': 2}
{'id': 9, 'group': 'B', 'weight': 2, 'user_id': 2}

My first attempt:

from pulp import *
import pandas as pd
from itertools import product

def match_weights(users, dataset):
    matched_rows = []
    
    variables = LpVariable.dicts("Item", range(len(dataset)), lowBound=0, cat='Binary')
    user_vars = {}
    
    for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
        user_vars[user_id] = {}
        user_vars[user_id]['total_weight'] = LpVariable("TotalWeight_{}".format(user_id), lowBound=0, upBound=total_weight)
        user_vars[user_id]['group_a_weight'] = LpVariable("GroupAWeight_{}".format(user_id), lowBound=0, upBound=group_a_weight)
        user_vars[user_id]['group_b_weight'] = LpVariable("GroupBWeight_{}".format(user_id), lowBound=0, upBound=group_b_weight)
        user_vars[user_id]['total_unique_users'] = LpVariable("TotalUniqueUsers_{}".format(user_id), lowBound=0, upBound=total_unique_users, cat='Integer')
        user_vars[user_id]['group_a_unique_users'] = LpVariable("GroupAUniqueUsers_{}".format(user_id), lowBound=0, upBound=group_a_unique_users, cat='Integer')
        user_vars[user_id]['group_b_unique_users'] = LpVariable("GroupBUniqueUsers_{}".format(user_id), lowBound=0, upBound=group_b_unique_users, cat='Integer')

    prob = LpProblem("MatchingProblem", LpMaximize)
    prob += lpSum(variables[i] for i in range(len(dataset)))

    for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
        group_a_items = dataset[dataset['group'] == 'A'].index.tolist()
        group_b_items = dataset[dataset['group'] == 'B'].index.tolist()

        # Total weight constraint
        prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in range(len(dataset))) <= user_vars[user_id]['total_weight']

        # Group A weight constraint
        prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_a_items) <= user_vars[user_id]['group_a_weight']

        # Group B weight constraint
        prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_b_items) <= user_vars[user_id]['group_b_weight']

        # Total unique user constraint
        unique_users = set()
        for i in range(len(dataset)):
            if variables[i].value() == 1:
                unique_users.add(dataset.loc[i, 'id'])
        prob += lpSum(1 for u in unique_users) <= user_vars[user_id]['total_unique_users']

        # Group A unique user constraint
        unique_users_a = set()
        for i in group_a_items:
            if variables[i].value() == 1:
                unique_users_a.add(dataset.loc[i, 'id'])
        prob += lpSum(1 for u in unique_users_a) <= user_vars[user_id]['group_a_unique_users']

        # Group B unique user constraint
        unique_users_b = set()
        for i in group_b_items:
            if variables[i].value() == 1:
                unique_users_b.add(dataset.loc[i, 'id'])
        prob += lpSum(1 for u in unique_users_b) <= user_vars[user_id]['group_b_unique_users']

    prob.solve()
    for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
        matched_user_rows = []
        for i in range(len(dataset)):
            if variables[i].value() == 1:
                matched_row = dataset.loc[i].copy()
                matched_row['user_id'] = user_id
                matched_user_rows.append(matched_row)
        matched_rows.extend(matched_user_rows)

    return matched_rows

However the results are:

{1: {'group_a': [2], 'group_b': [10]}, 2: {'group_a': [2], 'group_b': [10]}}

It looks like the per-user results overwrite each other, and the assignments themselves look wrong.

I tried to rewrite it and got similar incorrect results:

def match_weights(users, dataset):
   
    model = LpProblem("MatchingProblem", LpMaximize)
    variables = LpVariable.dicts("Item", dataset.index, lowBound=0, cat='Binary')
    model += lpSum(variables[i] for i in dataset.index)

    # Add constraints for each user
    for user_id, (total_weight, group_a_weight, group_b_weight, _, _, _) in users.items():
        # Filter dataset based on user group
        group_a_indices = dataset[dataset['group'] == 'A'].index
        group_b_indices = dataset[dataset['group'] == 'B'].index

        # Total weight constraint
        model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in dataset.index) <= total_weight

        # Group A weight constraint
        model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_a_indices) <= group_a_weight

        # Group B weight constraint
        model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_b_indices) <= group_b_weight


    unique_user_set = set(dataset['respondent_id'])
    for user_id, (total_weight, _, _, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
 
        group_a_indices = dataset[dataset['group'] == 'A'].index
        group_b_indices = dataset[dataset['group'] == 'B'].index

        # Total unique users constraint
        model += lpSum(variables[i] for i in dataset.index if dataset.loc[i, 'respondent_id'] in unique_user_set) \
                 <= total_unique_users

        # Group A unique users constraint
        model += lpSum(variables[i] for i in group_a_indices if dataset.loc[i, 'respondent_id'] in unique_user_set) \
                 <= group_a_unique_users

        # Group B unique users constraint
        model += lpSum(variables[i] for i in group_b_indices if dataset.loc[i, 'respondent_id'] in unique_user_set) \
                 <= group_b_unique_users


    model.solve()
    results = {}
    for user_id, (_, _, _, _, _, _) in users.items():

        group_a_indices = dataset[dataset['group'] == 'A'].index
        group_b_indices = dataset[dataset['group'] == 'B'].index
        matched_a = [dataset.loc[i, 'respondent_id'] for i in group_a_indices if variables[i].value() == 1]
        matched_b = [dataset.loc[i, 'respondent_id'] for i in group_b_indices if variables[i].value() == 1]

        results[user_id] = {'group_a': matched_a, 'group_b': matched_b}

    return results

Where am I going wrong?

Olivia
  • First, there appears to be a typo in the last row of your `dataset` – Reinderien Jul 10 '23 at 15:08
  • I am unclear on why you have `prob += lpSum(variables[i] for i in range(len(dataset)))`. You have not mentioned an optimization direction or optimization objective so this seems like "constraint programming" and there would be no objective provided. – Reinderien Jul 10 '23 at 15:24

2 Answers


You've set an optimisation objective and an optimisation sense when I believe there should be no objective (and the sense left at its default). In other words, once you're done formulating, print(prob) should include:

MINIMIZE
None
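
In your code, that means constructing the problem with the default sense and never adding an objective expression; a minimal sketch:

from pulp import LpProblem

# default (minimise) sense and no `prob += lpSum(...)` objective afterwards,
# so print(prob) shows MINIMIZE / None -- a pure feasibility model
prob = LpProblem("MatchingProblem")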

You should not have variables for total weight, total unique users, etc. Instead, have only one kind of decision variable: a binary assignment indicating whether an ID has been assigned to a user.

The "total" constraint is redundant and you can deal with the individual group values without the total.

Since you're in Pandas, so long as your data are shaped correctly, you do not need to use lpSum; you can operate on the frames and series directly (.dot(), .sum()).
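
For example, purely as an illustration (hypothetical names, not part of the solution below):

import pandas as pd
import pulp

# a Series of binary PuLP variables aligned with a Series of weights:
# .dot() builds the weighted sum, .sum() builds the plain count -- both are
# ordinary LpAffineExpressions, no explicit lpSum needed
weights = pd.Series({1: 1.0, 2: 2.0, 3: 3.0})
x = pd.Series({i: pulp.LpVariable(f'x_{i}', cat=pulp.LpBinary) for i in weights.index})

weighted = x.dot(weights)   # equivalent to lpSum(x[i] * weights[i] for i in weights.index)
count = x.sum()             # equivalent to lpSum(x[i] for i in weights.index)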

`dataset` is a frame (good!), but `id` needs to become the index. `users` should be converted from a dictionary to a frame.

import pandas as pd
import pulp

requirements = pd.DataFrame(
    data={
        1: (5.0, 4.0, 1.0, 2, 1, 1),
        2: (8.0, 6.0, 2.0, 3, 2, 1),
    },
    # row index, soon to turn into column names via .T
    index=pd.MultiIndex.from_product(
        iterables=(('weight', 'count'), ('total', 'A', 'B')),
        names=('requirement', 'group'),
    ),
).drop(labels='total', level='group').T  # redundant
requirements.index.name = 'user'
user_values = requirements.index.to_series()
requirements = requirements.stack(level='group')
print(requirements, end='\n\n')

dataset = pd.DataFrame([
    {'id': 1, 'group': 'A', 'weight': 1},
    {'id': 2, 'group': 'A', 'weight': 2},
    {'id': 3, 'group': 'A', 'weight': 3},
    {'id': 4, 'group': 'A', 'weight': 3},
    {'id': 5, 'group': 'A', 'weight': 4},
    {'id': 6, 'group': 'A', 'weight': 6},
    {'id': 7, 'group': 'A', 'weight': 7},
    {'id': 8, 'group': 'A', 'weight': 8},
    {'id': 9, 'group': 'B', 'weight': 2},
    {'id': 10, 'group': 'B', 'weight': 1},
]).set_index('id')
print(dataset, end='\n\n')

combos = pd.merge(left=user_values, right=dataset.index.to_series(), how='cross')
asn_name = 'asn_u' + combos.user.astype(str) + '_i' + combos.id.astype(str)
combos['asn'] = asn_name.apply(pulp.LpVariable, cat=pulp.LpBinary)
combos = combos.set_index(['user', 'id']).asn.unstack(level='user')
print(combos, end='\n\n')

prob = pulp.LpProblem(name='user_assignment')

# Every ID can be assigned to at most one user
for user, total in combos.sum(axis=1).items():
    prob.addConstraint(name=f'excl_u{user}', constraint=total <= 1)

for group, dataset_weights in dataset.groupby('group'):
    group_assigns = combos.loc[dataset_weights.index]
    for user, user_assigns in group_assigns.items():
        requirement = requirements.loc[(user, group), :]
        prob.addConstraint(
            name=f'count_u{user}_g{group}',
            constraint=user_assigns.sum() == requirement['count'],
        )
        prob.addConstraint(
            name=f'weight_u{user}_g{group}',
            constraint=user_assigns.dot(dataset_weights.weight) == requirement['weight'],
        )

print(prob)
prob.solve()
assert prob.status == pulp.LpStatusOptimal

combos = combos.applymap(pulp.LpVariable.value).astype(int)
combos = combos[combos.any(axis=1)]
print(combos)

Output:

requirement  count  weight
user group                
1    A         1.0     4.0
     B         1.0     1.0
2    A         2.0     6.0
     B         1.0     2.0

   group  weight
id              
1      A       1
2      A       2
3      A       3
4      A       3
5      A       4
6      A       6
7      A       7
8      A       8
9      B       2
10     B       1

user           1           2
id                          
1      asn_u1_i1   asn_u2_i1
2      asn_u1_i2   asn_u2_i2
3      asn_u1_i3   asn_u2_i3
4      asn_u1_i4   asn_u2_i4
5      asn_u1_i5   asn_u2_i5
6      asn_u1_i6   asn_u2_i6
7      asn_u1_i7   asn_u2_i7
8      asn_u1_i8   asn_u2_i8
9      asn_u1_i9   asn_u2_i9
10    asn_u1_i10  asn_u2_i10

user_assignment:
MINIMIZE
None
SUBJECT TO
excl_u1: asn_u1_i1 + asn_u2_i1 <= 1
excl_u2: asn_u1_i2 + asn_u2_i2 <= 1
...

count_u1_gA: asn_u1_i1 + asn_u1_i2 + asn_u1_i3 + asn_u1_i4 + asn_u1_i5
 + asn_u1_i6 + asn_u1_i7 + asn_u1_i8 = 1

weight_u1_gA: asn_u1_i1 + 2 asn_u1_i2 + 3 asn_u1_i3 + 3 asn_u1_i4
 + 4 asn_u1_i5 + 6 asn_u1_i6 + 7 asn_u1_i7 + 8 asn_u1_i8 = 4
...
VARIABLES
0 <= asn_u1_i1 <= 1 Integer
0 <= asn_u1_i10 <= 1 Integer
0 <= asn_u1_i2 <= 1 Integer
0 <= asn_u1_i3 <= 1 Integer
0 <= asn_u1_i4 <= 1 Integer
0 <= asn_u1_i5 <= 1 Integer
0 <= asn_u1_i6 <= 1 Integer
0 <= asn_u1_i7 <= 1 Integer
0 <= asn_u1_i8 <= 1 Integer
0 <= asn_u1_i9 <= 1 Integer
0 <= asn_u2_i1 <= 1 Integer
0 <= asn_u2_i10 <= 1 Integer
0 <= asn_u2_i2 <= 1 Integer
0 <= asn_u2_i3 <= 1 Integer
0 <= asn_u2_i4 <= 1 Integer
0 <= asn_u2_i5 <= 1 Integer
0 <= asn_u2_i6 <= 1 Integer
0 <= asn_u2_i7 <= 1 Integer
0 <= asn_u2_i8 <= 1 Integer
0 <= asn_u2_i9 <= 1 Integer
...
Result - Optimal solution found

user  1  2
id        
3     0  1
4     0  1
5     1  0
9     0  1
10    1  0
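
If you want the long format from the question (one row per matched id with a user_id column), the solved frame can be reshaped; for example, something along these lines:

matched = combos.stack()                      # MultiIndex (id, user) -> 0/1
matched = matched[matched == 1].reset_index(name='assigned')
matched = matched.merge(dataset, left_on='id', right_index=True)
matched = matched.rename(columns={'user': 'user_id'})[['id', 'group', 'weight', 'user_id']]
print(matched)
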
Reinderien
  • That works, thanks. Just need to test that it scales well with thousands of ids in my dataset :) – Olivia Jul 11 '23 at 09:21

You are on the right track... kinda.

I couldn't really follow the logic in the first attempt. It isn't clear why you were creating so many variables. At the core, this is an assignment problem, where we are assigning ids to users. So the only variable we should really need is something like:

assign[item, user]  # a binary variable indicating assignment

Then, if we need the total weight assigned, we can just sum up the assignments times the weights in standard fashion. No need for more variables.
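
Something like this (just a sketch, using the names defined in the code below):

# total weight assigned to user u -- an ordinary expression, not a new variable
user_total_weight = pulp.lpSum(assign[i, u] * dataset.loc[i, 'weight'] for i in dataset.index)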

The second attempt is on a better track, but the variable is set up wrong. You really want to double-index it, as I show below; then things fall into place. The more you get used to thinking about this in a math programming context, the easier things get. You need the correct variable plus a few subsets in your problem: the set of all ids and the per-group index subsets. If you had many groups, I'd probably reorganize the data to be able to index by group as well, but for 2 groups this is fine.

Regarding the loops you have, you can combine what I did below into a single "for each user" loop and include all the constraints there, but I think it is easier to digest with one loop per constraint. In the end it is a matter of preference.

I think you can probably finish what I've started below to scoop up the rest of the constraints and solve. Comment back if you are stuck.

Code:

# group assignment

import pandas as pd
import pulp

### DATA

users = {
    1: (5.0, 4.0, 1.0, 2, 1, 1),
    2: (8.0, 6.0, 2.0, 3, 2, 1)
}

dataset = pd.DataFrame([
    {'id': 1, 'group': 'A', 'weight': 1},
    {'id': 2, 'group': 'A', 'weight': 2},
    {'id': 3, 'group': 'A', 'weight': 3},
    {'id': 4, 'group': 'A', 'weight': 3},
    {'id': 5, 'group': 'A', 'weight': 4},
    {'id': 6, 'group': 'A', 'weight': 6},
    {'id': 7, 'group': 'A', 'weight': 7},
    {'id': 8, 'group': 'A', 'weight': 8},
    {'id': 9, 'group': 'B', 'weight': 2},
    {'id': 10, 'group': 'B', 'weight': 1}   # <-- note you had a typo in this row
]).set_index('id')

### a little tidying & conveniences...
for user in users:
    users[user] = dict(zip( ('wt_limit', 'a_wt_limit', 'b_wt_limit', 'item_limit', 'a_item_limit', 'b_item_limit'), users[user]))
ids = dataset.index
print(ids)

# subsets
grp_a_idx = dataset[dataset['group'] == 'A'].index
grp_b_idx = dataset[dataset['group'] == 'B'].index

### PROBLEM SETUP

prob = pulp.LpProblem('id_assigner', pulp.LpMaximize)

### SETS

id_user = { (i, u) for i in dataset.index for u in users }

### VARS

assign = pulp.LpVariable.dicts('assign', indices=id_user, cat=pulp.LpBinary)

### OBJ:  Maximize assignments

prob += pulp.lpSum(assign)

### CONSTRAINTS

# 1.  Each id can be assigned at most once (or not at all)
for i in ids:
    prob += sum(assign[i, user] for user in users) <= 1

# 2.  Don't overassign total weight, for each user
for user in users:
    prob += sum(assign[i, user]*dataset.loc[i, 'weight'] for i in ids ) <= users[user]['wt_limit']

# 3.  Don't overassign weights for grp A, for each user
for user in users:
    prob += sum(assign[i, user]*dataset.loc[i, 'weight'] for i in grp_a_idx) <= users[user]['a_wt_limit']

# .....  similarly for other constraints.  Use the subsets as needed for group-specific things, like #3 above.

# check it:
print(prob)

### SOLVE

### RESULTS
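
One possible way to finish, sketched in the same style (here the three count limits are assumed to simply cap how many ids each user receives in total and per group):

# 4.  Don't overassign weights for grp B, for each user
for user in users:
    prob += sum(assign[i, user]*dataset.loc[i, 'weight'] for i in grp_b_idx) <= users[user]['b_wt_limit']

# 5.  Respect the unique id count limits (total, grp A, grp B), for each user
for user in users:
    prob += sum(assign[i, user] for i in ids) <= users[user]['item_limit']
    prob += sum(assign[i, user] for i in grp_a_idx) <= users[user]['a_item_limit']
    prob += sum(assign[i, user] for i in grp_b_idx) <= users[user]['b_item_limit']

prob.solve()
assert prob.status == pulp.LpStatusOptimal   # confirm the solver reported an optimal solution

# collect the chosen (id, user) pairs
matched = sorted((i, user) for (i, user), var in assign.items() if var.value() == 1)
print(matched)
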
AirSquid
  • The other answer works but yours seems to give the wrong result. I added: `for user in users: prob += sum(assign[i, user]*dataset.iloc[i]['count'] for i in ids) <= users[user]['item_limit']`, `for user in users: prob += sum(assign[i, user]*dataset.iloc[i]['count'] for i in grp_a_idx) <= users[user]['a_item_limit']`, `for user in users: prob += sum(assign[i, user]*dataset.iloc[i]['count'] for i in grp_b_idx) <= users[user]['b_item_limit']`. Then after solving I run: `for i in assign: print(assign[i], assign[i].value())` – Olivia Jul 11 '23 at 09:22
  • For user id 1 I get indexes 3, 9 (so ids 4, 10) and for user id 2 I get indexes 0, 1, 8 (ids 1, 2, 9). – Olivia Jul 11 '23 at 09:22
  • 1
    Hi Olivia. This off-by-one was because in my answer I did not reset the dataframe index to `id`. I was using the default index, so everything was off-by one. That is fixed in the edit above along with the accompanying change from `.iloc` -> `.loc`. Both of these answers are highly similar and just show a range of possible approaches, depending on your comfort manipulating pandas and preferences. @Reinderien gets a hat-tip for remembering to show/emphasize the check that the solver status is optimal. – AirSquid Jul 11 '23 at 14:38