I have a PyTorch tensor that contains the labels of some samples.
I want to split each label into n_groups
groups, introducing new virtual labels.
For example, for the labels:
labels = torch.as_tensor([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=torch.long)
One possible solution to subdivide each label into n_groups=2
is the following:
subdivided_labels = [0, 3, 0, 1, 4, 1, 2, 5, 2]
The constraints are the following:
- There is no assumption about the order of the initial labels
- The labels should be well distributed among the groups
- The label ranking should be consistent among different groups, i.e., the first label
0
in the first group should be the first label also in any other group. - The following should always be true
torch.equal(labels, subdivided_labels % num_classes)
- It is possible that the number of groups is greater than the number of samples for a given label
The following tests should pass for the desired algorithm:
@pytest.mark.parametrize(
"labels",
(
torch.randint(100, size=(50,)),
torch.arange(100),
torch.ones(100),
torch.randint(100, size=(50,)).repeat(4),
torch.arange(100).repeat(4),
torch.ones(100).repeat(4),
torch.randint(100, size=(50,)).repeat_interleave(4),
torch.arange(100).repeat_interleave(4),
torch.ones(100).repeat_interleave(4),
),
)
@pytest.mark.parametrize("n_groups", (1, 2, 3, 4, 5, 50, 150))
def test_subdivide_labels(labels, n_groups):
subdivided_labels = subdivide_labels(labels, n_groups=n_groups, num_classes=100)
assert torch.equal(labels, subdivided_labels % 100)
@pytest.mark.parametrize(
"labels, n_groups, n_classes, expected_result",
(
(
torch.tensor([0, 0, 1, 1, 2, 2]),
2,
3,
torch.tensor([0, 3, 1, 4, 2, 5]),
),
(
torch.tensor([0, 0, 1, 1, 2, 2]),
2,
10,
torch.tensor([0, 10, 1, 11, 2, 12]),
),
(
torch.tensor([0, 0, 1, 1, 2, 2]),
1,
10,
torch.tensor([0, 0, 1, 1, 2, 2]),
),
(
torch.tensor([0, 0, 2, 2, 1, 1]),
2,
3,
torch.tensor([0, 3, 2, 5, 1, 4]),
),
(
torch.tensor([0, 0, 2, 2, 1, 1]),
30,
3,
torch.tensor([0, 3, 2, 5, 1, 4]),
),
),
)
def test_subdivide_labels_with_gt(labels, n_groups, n_classes, expected_result):
subdivided_labels = subdivide_labels(labels, n_groups=n_groups, num_classes=n_classes)
assert torch.equal(expected_result, subdivided_labels)
assert torch.equal(labels, subdivided_labels % n_classes)
I have a non-vectorized solution:
import torch
def subdivide_labels(labels: torch.Tensor, n_groups: int, num_classes: int) -> torch.Tensor:
"""Divide each label in groups introducing virtual labels.
Args:
labels: the tensor containing the labels, each label should be in [0, num_classes)
n_groups: the number of groups to create for each label
num_classes: the number of classes
Returns:
a tensor with the same shape of labels, but with each label partitioned in n_groups virtual labels
"""
unique, counts = labels.unique(
sorted=True,
return_counts=True,
return_inverse=False,
)
virtual_labels = labels.clone().detach()
max_range = num_classes * (torch.arange(counts.max()) % n_groups)
for value, count in zip(unique, counts):
virtual_labels[labels == value] = max_range[:count] + value
return virtual_labels
labels = torch.as_tensor([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=torch.long)
subdivide_labels(labels, n_groups=2, num_classes=3)
tensor([0, 3, 0, 1, 4, 1, 2, 5, 2])
Is it possible to vectorize this algorithm? Alternatively, are there any faster algorithms to perform the same operation?