I am trying to apply the loop unrolling to find the inverse of a matrix by the Gauss Jorda method, to reduce the number of memory accesses (bottleneck) when the size of the matrices is very large and does not fit in the caches. I get it to go faster, but the result I get is wrong and I don't know why.
for(k=0;k<size;k++)
{
pivot=original[k][k];
for(j=0;j<size;j++)
{
original[k][j]/=pivot;
inverse[k][j]/=pivot;
}
for(i=0;i<size;i++)
{
if(i!=k)
{
pivot = original[i][k];
for(j=0;j<size;j++)
{
original[i][j] -= original[k][j]*pivot;
inverse[i][j] -= inverse[k][j]*pivot;
}
}
}
}
I hope that the execution of the problem is faster by receiving the number of memory accesses.
my loop unrrolling version is the following:
for(k=0;k<sizeOfMatrix;k += 2)
{
pivot=original[k][k];
for(j=0;j<sizeOfMatrix;j++)
{
original[k][j]/=pivot;
inverse[k][j]/=pivot;
}
for(i=0;i<sizeOfMatrix;i++)
{
if(i!=k && i!=k+1)
{
pivot = original[i][k];
for(j=0;j<sizeOfMatrix;j++)
{
original[i][j] -= original[k][j]*pivot;
inverse[i][j] -= inverse[k][j]*pivot;
}
}
if(i!=k+1)
{
pivot=original[k][k];
for(j=0;j<sizeOfMatrix;j++)
{
original[k+1][j]/=pivot;
inverse[k+1][j]/=pivot;
}
pivot = original[i][k+1];
for(j=0;j<sizeOfMatrix;j++)
{
original[i][j] -= original[k+1][j]*pivot;
inverse[i][j] -= inverse[k+1][j]*pivot;
}
}
}
}