I'm trying to convert this scalar code:
struct CICDecimator {
static constexpr int64_t scale = ((int64_t)1) << 32;
int mStages;
int mFactor;
float mGainCorrection;
int64_t *pIntegrators;
int64_t *pCombs;
CICDecimator(int stages = 4, int factor = 8) {
assert(stages > 0);
assert(factor > 0);
mStages = stages;
pIntegrators = new int64_t[mStages + 1]{};
pCombs = new int64_t[mStages]{};
if (mFactor != factor) {
mFactor = factor;
mGainCorrection = 1.0f / (float)(pow(mFactor, mStages));
}
}
~CICDecimator() {
delete[] pIntegrators;
delete[] pCombs;
}
float process(const float *buffer) {
for (int i = 0; i < mFactor; i++) {
pIntegrators[0] = buffer[i] * scale;
for (int j = 1; j <= mStages; j++) {
pIntegrators[j] += pIntegrators[j - 1];
}
}
int64_t s = pIntegrators[mStages];
for (int i = 0; i < mStages; i++) {
int64_t t = s;
s -= pCombs[i];
pCombs[i] = t;
}
return mGainCorrection * (s / (float)scale);
}
};
to the SSE/SIMD
version (built with -march=nocona
), trying to gain performances on process()
function (which is called heavy, within an audio application).
The main problem is that buffer's input will be m128
(4 x float) values. And it seems CICDecimator
need 64 bit integer for precision to work proper (so, only two slot).
Here's my actual code:
struct CICDecimatorV4 {
const __m128 scale4 = _mm_set1_ps((float)(((int64_t)1) << 32));
int mStages;
int mFactor;
__m128 mGainCorrection4;
__m128d *pIntegrators4_1;
__m128d *pIntegrators4_2;
__m128d *pCombs4_1;
__m128d *pCombs4_2;
CICDecimatorV4(int stages = 4, int factor = 8) {
assert(stages > 0);
assert(factor > 0);
mStages = stages;
pIntegrators4_1 = new __m128d[mStages + 1]{};
pIntegrators4_2 = new __m128d[mStages + 1]{};
pCombs4_1 = new __m128d[mStages]{};
pCombs4_2 = new __m128d[mStages]{};
if (mFactor != factor) {
mFactor = factor;
mGainCorrection4 = _mm_div_ps(_mm_set1_ps(1.0f), _mm_set1_ps((float)pow(mFactor, mStages)));
}
}
~CICDecimatorV4() {
delete[] pIntegrators4_1;
delete[] pIntegrators4_2;
delete[] pCombs4_1;
delete[] pCombs4_2;
}
__m128 process(const __m128 *buffer) {
for (int i = 0; i < mFactor; i++) {
// float to double
__m128i bufferInt = _mm_cvttps_epi32(_mm_mul_ps(buffer[i], scale4));
pIntegrators4_1[0] = _mm_cvtepi32_pd(bufferInt);
pIntegrators4_2[0] = _mm_cvtepi32_pd(_mm_srli_si128(bufferInt, 8));
for (int j = 1; j <= mStages; j++) {
pIntegrators4_1[j] += pIntegrators4_1[j - 1];
pIntegrators4_2[j] += pIntegrators4_2[j - 1];
}
}
__m128d sD1 = pIntegrators4_1[mStages];
__m128d sD2 = pIntegrators4_2[mStages];
for (int i = 0; i < mStages; i++) {
__m128d t1 = sD1;
__m128d t2 = sD2;
sD1 -= pCombs4_1[i];
sD2 -= pCombs4_2[i];
pCombs4_1[i] = t1;
pCombs4_2[i] = t2;
}
// double to float
__m128 sF1 = _mm_cvtpd_ps(sD1);
__m128 sF2 = _mm_cvtpd_ps(sD2);
__m128 sF; // <<<< TODO: i need to merge the two float on a single one
return _mm_mul_ps(mGainCorrection4, _mm_div_ps(sF, scale4));
}
};
I converted int64_t
to m128d
(two slots both), split the input m128
to two m128d
and duplicate each calculation for keep vector and double precision calculation.
Questions:
- is this approch correct?
- how can I convert two
m128d
to a singlem128
? (first on lower section, second on upper part) (i.e. the<<<< TODO part
) - can be the whole improved? (:P)
Thanks