I'm absolute beginner in multitasking who read some basics and try to get it to my project for object visualization. The problem is that my multithreaded solution which I implemented is slower than single threaded one and I don't know why and I have unexpected application code for unknown reason. I give you two cases where I tried to achieve better performance. I would like to know what I don't understand and where I have mistakes in general point of view. I give you part of source code and summarize all questions at the end.
Here is my thread factory implementation (very basic but it's only beginning):
threadfactory.h
#pragma once
#include <vector>
#include "ThreadInterface.h"
#include "../MemoryManagement/MemoryMgr.h"
#include "../Logging/LoggingDefines.h"
class CThreadFactory : public CThreadIntearface
{
public:
CThreadFactory();
CThreadFactory(BYTE max_threads);
~CThreadFactory();
void Init(BYTE max_threads);
void Clear(void);
//update waves
virtual void UpdateWavesInternalPoints(CWaves& waves);
virtual void UpdateWavesNormals(CWaves& waves);
//update vertices
virtual void TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix);
static const char* GetHeapName(void) { return "Thread factory"; }
#if (defined(DEBUG) | defined(_DEBUG))
/**
* Return class name. This function is compiled only in debug mode.
* \return class name
*/
NAME_FUNC();
#endif
private:
void Join(vector<std::thread>& threads);
void ReleaseThreads(vector<std::thread>& threads);
private:
UINT muiNumberofThreads;
private:
DECLARE_HEAP;
};
threadfactory.cpp
#include "ThreadFactory.h"
CThreadFactory::CThreadFactory()
{
TRACE(LOG_DEBUG, string("Start of initialization of object \"") + GetName() + string("\""));
muiNumberofThreads = 1;
TRACE(LOG_DEBUG, string("End of initialization of object \"") + GetName() + string("\""));
}
CThreadFactory::CThreadFactory(BYTE max_threads)
{
TRACE(LOG_DEBUG, string("Start of initialization of object \"") + GetName() + string("\""));
Init(max_threads);
TRACE(LOG_DEBUG, string("End of initialization of object \"") + GetName() + string("\""));
}
CThreadFactory::~CThreadFactory()
{
TRACE(LOG_DEBUG, string("Start of releasing of object \"") + GetName() + string("\""));
Clear();
TRACE(LOG_DEBUG, string("End of releasing of object \"") + GetName() + string("\""));
}
void CThreadFactory::Init(BYTE max_threads)
{
muiNumberofThreads = max_threads;
}
void CThreadFactory::Clear(void)
{
}
void CThreadFactory::Join(vector<std::thread>& threads)
{
for (auto& it : threads)
{
if (it.joinable())
it.join();
}
}
void CThreadFactory::ReleaseThreads(vector<std::thread>& threads)
{
/*for (auto& it : threads)
{
}*/
threads.clear();
}
void CThreadFactory::UpdateWavesInternalPoints(CWaves& waves)
{
if (muiNumberofThreads <= 1)
{
waves.UpdateWaveInteriorPoints(1, waves.RowCount() - 1);
}
else
{
vector<std::thread> threads(muiNumberofThreads - 1);
UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;
DWORD dwMinRow = 1, dwMaxRow = 1 + dwWavePartDifference;
for (UINT i = 0; i < muiNumberofThreads - 1; i++)
{
threads[i] = move(std::thread{ &CWaves::UpdateWaveInteriorPoints, &waves, dwMinRow, dwMaxRow });
dwMinRow += dwWavePartDifference;
dwMaxRow += dwWavePartDifference;
}
waves.UpdateWaveInteriorPoints(dwMinRow, dwMaxRow);
Join(threads);
ReleaseThreads(threads);
}
}
void CThreadFactory::UpdateWavesNormals(CWaves& waves)
{
if (muiNumberofThreads <= 1)
{
waves.UpdateWaveNormals(1, waves.RowCount() - 1);
}
else
{
vector<std::thread> threads(muiNumberofThreads - 1);
UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;
DWORD dwMinRow = 1, dwMaxRow = 1 + dwWavePartDifference;
for (UINT i = 0; i < muiNumberofThreads - 1; i++)
{
threads[i] = move(std::thread{ &CWaves::UpdateWaveNormals, &waves, dwMinRow, dwMaxRow });
dwMinRow += dwWavePartDifference;
dwMaxRow += dwWavePartDifference;
}
waves.UpdateWaveNormals(dwMinRow, dwMaxRow);
Join(threads);
ReleaseThreads(threads);
}
}
void CThreadFactory::TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix)
{
if (output.size() != input.size())
output.resize(input.size());
if ((muiNumberofThreads <= 1) || (input.size() < 1000))
{
object.TransformVerticesSet(input.begin(), output.begin(), input.size() - 1, matrix);
}
else
{
vector<std::thread> threads(muiNumberofThreads - 1);
UINT uiThreadVertexCount = input.size() / muiNumberofThreads;
UINT uiStartVertexIndex = 0;
for (UINT i = 0; i < muiNumberofThreads - 1; i++)
{
if (uiStartVertexIndex >= input.size())
uiStartVertexIndex = input.size() - 1;
threads[i] = move(std::thread{ &CObject::TransformVerticesSet, &object, input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix });
uiStartVertexIndex += uiThreadVertexCount;
}
object.TransformVerticesSet(input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix);
Join(threads);
ReleaseThreads(threads);
}
}
#if (defined(DEBUG) | defined(_DEBUG))
NAME_BODY(CThreadFactory, "Threads");
#endif
DEFINE_HEAP(CThreadFactory, GetHeapName());
1. Wave update:
I'm using object called Wave. This object has implicitly around 40 000 vertices. I'm using these functions to update it for each frame:
void CWaves::UpdateWaveInteriorPoints(DWORD min_row, DWORD max_row)
{
if (min_row < 1)
min_row = 1;
if (max_row > (RowCount() - 1))
max_row = (RowCount() - 1);
for (DWORD i = min_row; i < max_row; ++i)
{
for (DWORD j = 1; j < ColumnCount() - 1; ++j)
{
// After this update we will be discarding the old previous
// buffer, so overwrite that buffer with the new update.
// Note how we can do this inplace (read/write to same element)
// because we won't need prev_ij again and the assignment happens last.
// Note j indexes x and i indexes z: h(x_j, z_i, t_k)
// Moreover, our +z axis goes "down"; this is just to
// keep consistent with our row indices going down.
GetPrevSolutionVertices()[i*ColumnCount() + j].Position.y =
GetK1()*GetPrevSolutionVertices()[i*ColumnCount() + j].Position.y +
GetK2()*mpObjectMesh->mVertices[i*ColumnCount() + j].Position.y +
GetK3()*(mpObjectMesh->mVertices[(i + 1)*ColumnCount() + j].Position.y +
mpObjectMesh->mVertices[(i - 1)*ColumnCount() + j].Position.y +
mpObjectMesh->mVertices[i*ColumnCount() + j + 1].Position.y +
mpObjectMesh->mVertices[i*ColumnCount() + j - 1].Position.y);
}
}
}
void CWaves::UpdateWaveNormals(DWORD min_row, DWORD max_row)
{
if (min_row < 1)
min_row = 1;
if (max_row >(RowCount() - 1))
max_row = (RowCount() - 1);
for (UINT i = min_row; i < max_row; ++i)
{
for (UINT j = 1; j < ColumnCount() - 1; ++j)
{
float l = mpObjectMesh->mVertices[i*ColumnCount() + j - 1].Position.y;
float r = mpObjectMesh->mVertices[i*ColumnCount() + j + 1].Position.y;
float t = mpObjectMesh->mVertices[(i - 1)*ColumnCount() + j].Position.y;
float b = mpObjectMesh->mVertices[(i + 1)*ColumnCount() + j].Position.y;
mpObjectMesh->mVertices[i*ColumnCount() + j].Normal.x = -r + l;
mpObjectMesh->mVertices[i*ColumnCount() + j].Normal.y = 2.0f*GetSpatialStep();
mpObjectMesh->mVertices[i*ColumnCount() + j].Normal.z = b - t;
XMVECTOR n = XMVector3Normalize(XMLoadFloat3(&mpObjectMesh->mVertices[i*ColumnCount() + j].Normal));
XMStoreFloat3(&mpObjectMesh->mVertices[i*ColumnCount() + j].Normal, n);
mpObjectMesh->mVertices[i*ColumnCount() + j].TangentU = XMFLOAT3(2.0f*GetSpatialStep(), r - l, 0.0f);
XMVECTOR T = XMVector3Normalize(XMLoadFloat3(&mpObjectMesh->mVertices[i*ColumnCount() + j].TangentU));
XMStoreFloat3(&mpObjectMesh->mVertices[i*ColumnCount() + j].TangentU, T);
}
}
}
void CWaves::UpdateWave(float dt)
{
static float t_base = 0.0f;
if ((g_Timer->TotalTime() - t_base) >= 0.25f)
{
t_base += 0.25f;
DWORD i, j;
do
{
i = 5 + rand() % (RowCount() - 5);
j = 5 + rand() % (ColumnCount() - 5);
} while (!((i > 1) && (i < (RowCount() - 2)) &&
(j > 1) && (j < (ColumnCount() - 2))));
float r = MathHelper::RandF(1.0f, 2.0f);
Disturb(i, j, r);
}
static float t = 0;
// Accumulate time.
t += dt;
// Only update the simulation at the specified time step.
if (t >= TimeStep())
{
// Only update interior points; we use zero boundary conditions.
if (g_ThreadFactory)
{
g_ThreadFactory->UpdateWavesInternalPoints(*this);
}
else
{
UpdateWaveInteriorPoints(1, RowCount() - 1);
}
// We just overwrote the previous buffer with the new data, so
// this data needs to become the current solution and the old
// current solution becomes the new previous solution.
std::swap(GetPrevSolutionVertices(), mpObjectMesh->mVertices);
t = 0.0f; // reset time
if (mShapeDescription.mShapeProperties.bLightedObject)
{
//
// Compute normals using finite difference scheme.
//
if (g_ThreadFactory)
{
g_ThreadFactory->UpdateWavesNormals(*this);
}
else
{
UpdateWaveNormals(1, RowCount() - 1);
}
}
}
}
In that case I was thinking that the problem is in CWaves object which I gave all threads and I thought that this cause continuously locking. So I change approach for another case, where I tried to transform vertices using given transformation matrix. Instead of whole object, I'm using container iterators.
2. Vertices transformation
Vertices transformation method called from thread factory displayed above:
void CObject::TransformVerticesSet(vector<TVertex>::const_iterator input, vector<XMFLOAT3>::iterator output, UINT number_of_vertices, const CXNAMatrix& matrix) const
{
for (UINT i = 0; i <= number_of_vertices; i++)
{
CMatrixTransformations::TransformPoint(input[i].Position, matrix, output[i]);
}
}
In that case I tried to use iterators instead of giving whole vertices vector, but the result is equal to previous solution. It's slower than single threaded solution.
EDIT<8.12.2014>
In previous code I use following macros:
TRACE - using for logging system, in release mode is empty
NAME_FUNC, NAME_BODY - macro for declaration and definition of class method which return class name
DECLARE_HEAP, DEFINE_HEAP - create declaration and definition for overloaded new and delete operators
None of this should influence performance in multithreading operations.
Here is output from VS 2013 after I close my application (note that I don't use multithreading for previous situations in that case):
The thread 0x229c has exited with code 27 (0x1b).
The thread 0x22dc has exited with code 27 (0x1b).
The thread 0x11ac has exited with code 27 (0x1b).
The thread 0x328c has exited with code 27 (0x1b).
The thread 0x205c has exited with code 27 (0x1b).
The thread 0xf4c has exited with code 27 (0x1b).
The thread 0x894 has exited with code 27 (0x1b).
The thread 0x3094 has exited with code 27 (0x1b).
The thread 0x2eb4 has exited with code 27 (0x1b).
The thread 0x2ef8 has exited with code 27 (0x1b).
The thread 0x22f4 has exited with code 27 (0x1b).
The thread 0x2810 has exited with code 27 (0x1b).
The thread 0x29e0 has exited with code 27 (0x1b).
The thread 0x2e54 has exited with code 27 (0x1b).
D3D11 WARNING: Process is terminating. Using simple reporting. Please call ReportLiveObjects() at runtime for standard reporting. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING: Live Producer at 0x012F05A0, Refcount: 8. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING: Live Object at 0x012F1D38, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING: Live Object at 0x013BA3F8, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN]
The program '[13272] EngineDX.exe' has exited with code 27 (0x1b).
It seems that third party APIs (maybe DX) is creating threads but in process manager I see usage only of one thread. That could be a problem...
So here are my questions:
- Is my implementation of thread factory so wrong or updating of 40 000 vertices don't have to be divided into more threads?
- If I got locking I would like to know why. Solution for vertices transformation is using iterators and vertices vector container is divided so I shouldn't have locking.
- I decided to create threads for each function calling from one reason. At first I had thread vector container as a member class of thread factory. But this caused memory leak in debug mode (release mode didn't have this problem). Just pure declaration without doing anything. I never found out why. Is there something else which is necessary to release thread correctly?
- Now my application ended with code 27 because all threads returned this error code. What does it mean?
- The weird think is, that when I use 8 threads (7 + main thread on 8 threaded CPU), in debug mode I see that all 8 threads do something. But in release mode, there is no change according to using only one thread (main one). Does it wrong behavior or it could be expected for some reasons?
Sorry for long text but I wanted to be more precise to avoid misinterpretations. Thanks for answers.
EDIT 17.12.2014:
I reimplemented the function which threads use (and make it independent from Wave class), there are no share object references or variables, but it still doesn't work. I don't understand why.... Interesting thing is, that when I set 8 threads to use, in debug executable I see my Core i7 run at 100%, but with no benefits in frame rates. With release executable I see only 4 threads run up and CPU on 25%.
New multithreaded function:
void UpdateWaveInteriorPoints(TVertexFieldIterator previous_vertex_field, TVertexFieldIterator actual_vertex_field, DWORD min_row, DWORD max_row, float k1, float k2, float k3, UINT column_count)
{
if (min_row < 1)
min_row = 1;
/*if (max_row >(RowCount() - 1))
max_row = (RowCount() - 1);*/
for (DWORD i = min_row; i < max_row; ++i)
{
for (DWORD j = 1; j < column_count - 1; ++j)
{
// After this update we will be discarding the old previous
// buffer, so overwrite that buffer with the new update.
// Note how we can do this inplace (read/write to same element)
// because we won't need prev_ij again and the assignment happens last.
// Note j indexes x and i indexes z: h(x_j, z_i, t_k)
// Moreover, our +z axis goes "down"; this is just to
// keep consistent with our row indices going down.
previous_vertex_field[i*column_count + j].Position.y =
k1*previous_vertex_field[i*column_count + j].Position.y +
k2*actual_vertex_field[i*column_count + j].Position.y +
k3*(actual_vertex_field[(i + 1)*column_count + j].Position.y +
actual_vertex_field[(i - 1)*column_count + j].Position.y +
actual_vertex_field[i*column_count + j + 1].Position.y +
actual_vertex_field[i*column_count + j - 1].Position.y);
}
}
}
Function which create the threads:
TVertexFieldIterator tActualVertexIterator = waves.mpObjectMesh->mVertices.begin();
TVertexFieldIterator tPreviousVertexIterator = waves.GetPrevSolutionVertices().begin();
std::vector<std::thread> threads;
//std::vector<std::future<void>> threads;
UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;
DWORD dwMinRow = 1, dwMaxRow = dwWavePartDifference;
DWORD dwVertexCount = dwWavePartDifference*waves.ColumnCount();
for (UINT i = 0; i < muiNumberofThreads - 1; i++)
{
//threads.emplace_back(std::async( std::launch::async, &CWaves::UpdateWaveInteriorPoints, &waves, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount() ));
threads.emplace_back(std::thread(&UpdateWaveInteriorPoints, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount()));
tActualVertexIterator += dwVertexCount;
tPreviousVertexIterator += dwVertexCount;
}
tPreviousVertexIterator -= waves.ColumnCount(); //row - 1
tActualVertexIterator -= waves.ColumnCount(); //row - 1
waves.UpdateWaveInteriorPoints(tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount());
for (UINT i = 0; i < muiNumberofThreads -1; i++)
{
//threads[i].wait();
threads[i].join();
}
Marek