Here is some example data I have:
# Each entry is a tuple of (word, word index, group number).
# The same word can appear under several group numbers
# (e.g. 'ticket' in groups 8, 10 and 20; 'and' in groups 16, 24 and 36).
word_indices = [
    ('bus', 554, 1),
    ('bus', 719, 1),
    ('bus', 808, 1),
    ('accessibility', 572, 2),
    ('accessibility', 724, 2),
    ('accessibility', 809, 2),
    ('ada', 725, 3),
    ('ada', 810, 3),
    ('accessible', 695, 4),
    ('accessible', 707, 4),
    ('accessible', 726, 4),
    ('accessible', 811, 4),
    ('get', 10, 5),
    ('get', 17, 5),
    ('get', 98, 5),
    ('get', 179, 5),
    ('get', 733, 5),
    ('get', 812, 5),
    ('tickets', 734, 6),
    ('tickets', 813, 6),
    ('tickets', 907, 6),
    ('nov', 736, 7),
    ('nov', 815, 7),
    ('ticket', 816, 8),
    ('ticket', 818, 8),
    ('ticket', 828, 8),
    ('information', 817, 9),
    ('ticket', 816, 10),
    ('ticket', 818, 10),
    ('ticket', 828, 10),
    ('includes', 819, 11),
    ('includes', 834, 11),
    ('wine', 760, 12),
    ('wine', 820, 12),
    ('beer', 821, 13),
    ('supper', 822, 14),
    ('performance', 262, 15),
    ('performance', 278, 15),
    ('performance', 399, 15),
    ('performance', 823, 15),
    ('and', 97, 16),
    ('and', 178, 16),
    ('and', 261, 16),
    ('and', 366, 16),
    ('and', 370, 16),
    ('and', 397, 16),
    ('and', 501, 16),
    ('and', 581, 16),
    ('and', 636, 16),
    ('and', 677, 16),
    ('and', 711, 16),
    ('and', 824, 16),
    ('and', 833, 16),
    ('and', 852, 16),
    ('and', 871, 16),
    ('and', 928, 16),
    ('and', 1017, 16),
    ('and', 1026, 16),
    ('and', 1044, 16),
    ('and', 1088, 16),
    ('and', 1092, 16),
    ('and', 1111, 16),
    ('and', 1126, 16),
    ('and', 1150, 16),
    ('and', 1160, 16),
    ('and', 1166, 16),
    ('and', 1178, 16),
    ('and', 1181, 16),
    ('light', 502, 17),
    ('light', 825, 17),
    ('dessert', 826, 18),
    ('benefactor', 827, 19),
    ('ticket', 816, 20),
    ('ticket', 818, 20),
    ('ticket', 828, 20),
    ('adds', 829, 21),
    ('additional', 831, 22),
    ('contribution', 832, 23),
    ('and', 97, 24),
    ('and', 178, 24),
    ('and', 261, 24),
    ('and', 366, 24),
    ('and', 370, 24),
    ('and', 397, 24),
    ('and', 501, 24),
    ('and', 581, 24),
    ('and', 636, 24),
    ('and', 677, 24),
    ('and', 711, 24),
    ('and', 824, 24),
    ('and', 833, 24),
    ('and', 852, 24),
    ('and', 871, 24),
    ('and', 928, 24),
    ('and', 1017, 24),
    ('and', 1026, 24),
    ('and', 1044, 24),
    ('and', 1088, 24),
    ('and', 1092, 24),
    ('and', 1111, 24),
    ('and', 1126, 24),
    ('and', 1150, 24),
    ('and', 1160, 24),
    ('and', 1166, 24),
    ('and', 1178, 24),
    ('and', 1181, 24),
    ('includes', 819, 25),
    ('includes', 834, 25),
    ('special', 240, 26),
    ('special', 255, 26),
    ('special', 316, 26),
    ('special', 465, 26),
    ('special', 759, 26),
    ('special', 836, 26),
    ('special', 1027, 26),
    ('goodie', 837, 27),
    ('bag', 838, 28),
    ('upon', 839, 29),
    ('departure', 841, 30),
    ('health', 90, 31),
    ('health', 171, 31),
    ('health', 842, 31),
    ('health', 1000, 31),
    ('health', 1292, 31),
    ('health', 1313, 31),
    ('adhere', 845, 32),
    ('centers', 848, 33),
    ('disease', 850, 34),
    ('control', 851, 35),
    ('and', 97, 36),
    ('and', 178, 36),
    ('and', 261, 36),
    ('and', 366, 36),
    ('and', 370, 36),
    ('and', 397, 36),
    ('and', 501, 36),
    ('and', 581, 36),
    ('and', 636, 36),
    ('and', 677, 36),
    ('and', 711, 36),
    ('and', 824, 36),
    ('and', 833, 36),
    ('and', 852, 36),
    ('and', 871, 36),
    ('and', 928, 36),
    ('and', 1017, 36),
    ('and', 1026, 36),
    ('and', 1044, 36),
    ('and', 1088, 36),
    ('and', 1092, 36),
    ('and', 1111, 36),
    ('and', 1126, 36),
    ('and', 1150, 36),
    ('and', 1160, 36),
    ('and', 1166, 36),
    ('and', 1178, 36),
    ('and', 1181, 36),
    ('prevention', 853, 37),
    ('cdc', 268, 38),
    ('cdc', 854, 38),
    ('covid', 855, 39),
    ('guidelines', 271, 40),
    ('guidelines', 856, 40),
    ('living', 51, 41),
    ('living', 132, 41),
    ('living', 188, 41),
    ('living', 195, 41),
    ('living', 213, 41),
    ('living', 233, 41),
    ('living', 303, 41),
    ('living', 588, 41),
    ('living', 859, 41),
    ('living', 942, 41),
    ('living', 978, 41),
    ('living', 986, 41),
    ('living', 1227, 41),
    ('living', 1245, 41),
    ('room', 52, 42),
    ('room', 133, 42),
    ('room', 189, 42),
    ('room', 196, 42),
    ('room', 214, 42),
    ('room', 234, 42),
    ('room', 304, 42),
    ('room', 860, 42),
    ('room', 943, 42),
    ('room', 979, 42),
    ('room', 987, 42),
    ('room', 1228, 42),
    ('room', 1246, 42),
    ('tour', 40, 43),
    ('tour', 53, 43),
    ('tour', 121, 43),
    ('tour', 134, 43),
    ('tour', 190, 43),
    ('tour', 197, 43),
    ('tour', 215, 43),
    ('tour', 235, 43),
    ('tour', 305, 43),
    ('tour', 861, 43),
    ('tour', 944, 43),
    ('tour', 980, 43),
    ('tour', 988, 43),
    ('tour', 1201, 43),
    ('tour', 1220, 43),
    ('tour', 1229, 43),
    ('tour', 1247, 43),
]
Each entry, e.g. ('bus', 719, 1), is a tuple of (word, word index, group number).
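I am trying to find the best path (i.e., the one with minimal total distance) through the groups, picking exactly one word index from each group. (A path visits the groups in
sequential group-number order.)
The distance of a path is the sum of the absolute differences between the word indices chosen for consecutive groups.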
Example Output:
- In group 1, we should get ('bus', 808, 1)
- In group 2, we should get ('accessibility', 809, 2)
- In group 3, we should get ('ada', 810, 3)
- In group 4, We should get ('accessible', 811, 4)
- In group 5, we should get ('get', 812, 5)
- In group 6, we should get ('tickets', 813, 6) and so on....
That is, choose the path (808, 809, 810, 811, 812, 813) rather than (719, 724, 725, 726, 733, 734), because the first path has the smaller total distance (i.e., sum of absolute differences between consecutive indices).
I am trying to find an efficient and scalable approach, but I can't figure out the logic. Here is what I have so far:
from itertools import groupby
from operator import itemgetter


def find_min_distance_path(word_indices):
    """Pick one entry per group so that the total index distance is minimal.

    Args:
        word_indices: iterable of ``(word, index, group)`` tuples. Entries
            may arrive in any order; they are stably sorted by ``group``.

    Returns:
        A list with exactly one of the input tuples per group, in ascending
        group order, minimising ``sum(abs(a - b))`` over the indices of
        consecutive chosen entries. Returns ``[]`` for empty input. Ties are
        broken in favour of the candidate that appears first in its group.

    The search is a dynamic program over consecutive groups (Viterbi-style):
    for every candidate in a group we keep only the cheapest way to reach it,
    so the work per pair of adjacent groups is ``len(prev) * len(cur)``
    instead of enumerating every combination of candidates.
    """
    by_group = itemgetter(2)
    ordered = sorted(word_indices, key=by_group)
    groups = [list(members) for _, members in groupby(ordered, key=by_group)]
    if not groups:
        return []

    # costs[j] = cheapest total distance of a path ending at candidate j of
    # the most recently processed group.
    costs = [0] * len(groups[0])
    # back_pointers[g][k] = index, within groups[g], of the best predecessor
    # of candidate k in groups[g + 1].
    back_pointers = []

    for previous, current in zip(groups, groups[1:]):
        new_costs = []
        pointers = []
        for entry in current:
            position = entry[1]
            # Tuples compare cost first, then candidate index, so the
            # earliest predecessor wins on equal cost.
            best_cost, best_j = min(
                (costs[j] + abs(prev_entry[1] - position), j)
                for j, prev_entry in enumerate(previous)
            )
            new_costs.append(best_cost)
            pointers.append(best_j)
        back_pointers.append(pointers)
        costs = new_costs

    # Start from the cheapest end point and walk the back-pointers to the
    # first group, then flip the result into group order.
    j = min(range(len(costs)), key=costs.__getitem__)
    path = [groups[-1][j]]
    for pointers, group in zip(reversed(back_pointers), reversed(groups[:-1])):
        j = pointers[j]
        path.append(group[j])
    path.reverse()
    return path


if __name__ == "__main__":
    # `word_indices` is the list of (word, index, group) tuples defined
    # earlier in this file.
    matches = find_min_distance_path(word_indices)
    for match in matches:
        print(match)
Please help if you are familiar with an approach that will work. Much appreciated, thank you!