I ended up with the following algorithm; it is not particularly clean or clever, but it seems to do the job:
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
 * Demonstrates pruning the rows of a joined A/B table so that, per identifier,
 * every A.id and every B.id takes part in at most one matched row.
 *
 * <p>Rows without an A.id are dropped. Matched rows are accepted greedily in
 * table order; an A.id whose matched rows were all rejected is re-added once
 * with a {@code null} B.id. Because the pairing is greedy, it is not guaranteed
 * to find the largest possible number of matched pairs.
 */
class SyncAlgorithm {

    /** One row of the joined table; {@code aId} or {@code bId} may be {@code null}. */
    static class JoinResult {
        public final Integer aId;
        public final Integer bId;
        public final String identifier;

        /**
         * @param aId        id from table A, or {@code null} if there is no A match
         * @param bId        id from table B, or {@code null} if there is no B match
         * @param identifier value the two tables were joined on
         */
        public JoinResult(Integer aId, Integer bId, String identifier) {
            this.aId = aId;
            this.bId = bId;
            this.identifier = identifier;
        }
    }

    /**
     * Builds the sample table, prints it, prunes it and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<JoinResult> table = makeTestTable();
        System.out.println("Initial table:");
        printTable(table);
        System.out.println();

        pruneTable(table);

        System.out.println("Result table:");
        printTable(table);
    }

    /**
     * Removes unnecessary rows from {@code table} in place and appends a
     * {@code (aId, null, identifier)} row for every A.id that lost all of its rows.
     *
     * <p>Appended rows follow the order in which their A.id was first rejected,
     * so the result is deterministic for a given input order.
     *
     * @param table mutable list of join rows; must support {@link Iterator#remove()}
     */
    static void pruneTable(List<JoinResult> table) {
        // A.id values already present in a kept row, per identifier
        Map<String, Set<Integer>> aUsed = new HashMap<String, Set<Integer>>();
        // B.id values already present in a kept row, per identifier
        Map<String, Set<Integer>> bUsed = new HashMap<String, Set<Integer>>();
        // Matched rows that were rejected, in encounter order
        List<JoinResult> rejected = new ArrayList<JoinResult>();

        Iterator<JoinResult> iter = table.iterator();
        while (iter.hasNext()) {
            JoinResult row = iter.next();

            // No match in A: the row is of no interest
            if (row.aId == null) {
                iter.remove();
                continue;
            }

            Set<Integer> usedA = idsFor(aUsed, row.identifier);

            // A.id without a B.id: keep the row and remember the A.id
            if (row.bId == null) {
                usedA.add(row.aId);
                continue;
            }

            Set<Integer> usedB = idsFor(bUsed, row.identifier);
            if (usedA.contains(row.aId) || usedB.contains(row.bId)) {
                // Either id is already paired: discard, but remember the row,
                // since its A.id may end up with no kept row at all
                iter.remove();
                rejected.add(row);
            } else {
                // Both ids are new: keep the row and mark them as used
                usedA.add(row.aId);
                usedB.add(row.bId);
            }
        }

        // Re-add every A.id that never made it into a kept row. Set.add returns
        // false for ids that are already used, which also prevents duplicates.
        for (JoinResult row : rejected) {
            if (idsFor(aUsed, row.identifier).add(row.aId)) {
                table.add(new JoinResult(row.aId, null, row.identifier));
            }
        }
    }

    /** Returns the id set stored under {@code identifier}, creating it if absent. */
    private static Set<Integer> idsFor(Map<String, Set<Integer>> idsByIdentifier, String identifier) {
        Set<Integer> ids = idsByIdentifier.get(identifier);
        if (ids == null) {
            ids = new HashSet<Integer>();
            idsByIdentifier.put(identifier, ids);
        }
        return ids;
    }

    /** Builds the sample joined table used by {@link #main(String[])}. */
    static List<JoinResult> makeTestTable() {
        List<JoinResult> table = new ArrayList<JoinResult>();
        table.add(new JoinResult(100, null, "capital"));
        table.add(new JoinResult(201, 1001, "bat"));
        table.add(new JoinResult(201, 1002, "bat"));
        table.add(new JoinResult(201, 1003, "bat"));
        table.add(new JoinResult(202, 1001, "bat"));
        table.add(new JoinResult(202, 1002, "bat"));
        table.add(new JoinResult(202, 1003, "bat"));
        table.add(new JoinResult(null, 5010, "keyboard"));
        table.add(new JoinResult(501, 3001, "foo"));
        table.add(new JoinResult(502, 3001, "foo"));
        return table;
    }

    /**
     * Prints a header line followed by one line per row to standard output.
     * A {@code null} id is rendered as the text {@code null}.
     *
     * @param table rows to print
     */
    static void printTable(List<JoinResult> table) {
        System.out.println("A.id B.id identifier");
        for (JoinResult row : table) {
            // %n keeps the line terminator consistent with println
            System.out.printf("%-8d%-8d%s%n", row.aId, row.bId, row.identifier);
        }
    }
}
Output:
Initial table:
A.id B.id identifier
100 null capital
201 1001 bat
201 1002 bat
201 1003 bat
202 1001 bat
202 1002 bat
202 1003 bat
null 5010 keyboard
501 3001 foo
502 3001 foo
Result table:
A.id B.id identifier
100 null capital
201 1001 bat
202 1002 bat
501 3001 foo
502 null foo