I am generating class Objects and putting them into std::vector. Before adding, I need to check if they intersect with the already generated objects. As I plan to have millions of them, I need to parallelize this function as it takes a lot of time (The function must check each new object against all previously generated).
Unfortunately, the speed increase is not significant. The profiler also shows very low efficiency (all overhead). Any advise would be appreciated.
bool
Generator::_check_cube (std::vector<Cube> &cubes, const cube &cube)
{
auto ptr_cube = &cube;
auto npol = cubes.size();
auto ptr_cubes = cubes.data();
const auto nthreads = omp_get_max_threads();
bool check = false;
#pragma omp parallel shared (ptr_cube, ptr_cubes, npol, check)
{
#pragma omp single nowait
{
const auto batch_size = npol / nthreads;
for (int32_t i = 0; i < nthreads; i++)
{
const auto bstart = batch_size * i;
const auto bend = ((bstart + batch_size) > npol) ? npol : bstart + batch_size;
#pragma omp task firstprivate(i, bstart, bend) shared (check)
{
struct bd bd1{}, bd2{};
bd1 = allocate_bd();
bd2 = allocate_bd();
for (auto j = bstart; j < bend; j++)
{
bool loc_check;
#pragma omp atomic read
loc_check = check;
if (loc_check) break;
if (ptr_cube->cube_intersecting(ptr_cubes[j], &bd1, &bd2))
{
#pragma omp atomic write
check = true;
break;
}
}
free_bd(&bd1);
free_bd(&bd2);
}
}
}
}
return check;
}
UPDATE: The Cube is actually made of smaller objects Cuboids, each of them have size (L, W, H), position coordinates and rotation. The intersect function:
bool
Cube::cube_intersecting(Cube &other, struct bd *bd1, struct bd *bd2) const
{
const auto nom = number_of_cuboids();
const auto onom = other.number_of_cuboids();
for (int32_t i = 0; i < nom; i++)
{
get_mcoord(i, bd1);
for (int32_t j = 0; j < onom; j++)
{
other.get_mcoord(j, bd2);
if (check_gjk_intersection(bd1, bd2))
{
return true;
}
}
}
return false;
}
//get_mcoord calculates vertices of the cuboids
void
Cube::get_mcoord(int32_t index, struct bd *bd) const
{
for (int32_t i = 0; i < 8; i++)
{
for (int32_t j = 0; j < 3; j++)
{
bd->coord[i][j] = _cuboids[index].get_coord(i)[j];
}
}
}
inline struct bd
allocate_bd()
{
struct bd bd{};
bd.numpoints = 8;
bd.coord = (double **) malloc(8 * sizeof(double *));
for (int32_t i = 0; i < 8; i++)
{
bd.coord[i] = (double *) malloc(3 * sizeof(double));
}
return bd;
}
Typical values: npol > 1 million, threads 32, and each npol Cube consists of 1 - 3 smaller cuboids which are directly checked against other if intersect.