Using SSE 2, on Intel core2Duo.
The time spent in sse_add() and normal_add() is not constant across multiple runs, and in fact, after several modifications, it now always comes out as 0.
The program basically finds the sum of each of the columns of the following matrix:
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
1,2,10,13,15,160,6,19
I have verified the results, and both functions produce the correct output:
results= 8, 16, 80, 104, 120, 1280, 48, 152
Why is it happening? Or is it because I am not measuring the time properly? Can you please run the same code on your machine and verify?
Updated: Based on the suggestions, I have put in a for loop as shown below, but the time is still coming out as 0 (obviously I have to divide the total time by the number of iterations to get the correct value, but why am I getting a total time of 0?):
// variable declarations used for time calculation
double elapTicks;
double elapMilli ;
double begin = BeginTimer();
for(int i=0; i<1000000000;i++)
{
    //sum of consecutive rows
    __m128i t1=_mm_adds_epi16( x1[0] , x2[0] );
    __m128i t2=_mm_adds_epi16( x3[0] , x4[0] );
    __m128i t3=_mm_adds_epi16( x5[0] , x6[0] );
    __m128i t4=_mm_adds_epi16( x7[0] , x8[0] );
    //t5=t1+t2 & t6=t3 + t4
    __m128i t5=_mm_adds_epi16( t1 ,t2 );
    __m128i t6=_mm_adds_epi16( t3 ,t4 );
    ///t7=t6+t5, which is final answer
    // NOTE: t7 is never read after this point, so an optimising compiler may
    // discard the additions (and with them the whole loop); that would make
    // the measured time 0 regardless of the iteration count.
    __m128i t7=_mm_adds_epi16( t5 ,t6 );
}
// Take the closing sample right after the loop so that the console output
// below is not part of the measured interval.
// EndTimer() returns an absolute reading, hence the subtraction of begin.
elapTicks = EndTimer(begin)-begin; // stop the timer, and calculate the time taken
printf ("Timer set to: %f\n", begin);
elapMilli = elapTicks/1000; // milliseconds from Begin to End
printf ("Time in SSE in Milliseconds : %f\n", elapMilli);
}
The Original programs are as under.
*UPDATE: Removed all printf and malloc*
Timing functions one by one in separate programs:
SSE version
/*
 * Opens a measurement by sampling the processor time used so far.
 *
 * Returns the clock() reading converted to microseconds, so that the
 * difference of two readings divided by 1000 is a time in milliseconds.
 */
clock_t BeginTimer()
{
    /* clock() counts CLOCKS_PER_SEC ticks per second. Convert with that
       standard constant rather than multiplying by the non-standard CLK_TCK
       macro, which strict C compilers do not define. */
    return (clock_t)((double)clock() * 1000000.0 / CLOCKS_PER_SEC);
}
/*
 * Takes the closing sample of a measurement.
 *
 * begin: the reading obtained when the measurement started. It is accepted
 *        but not used here.
 *
 * Returns the current clock() reading converted to microseconds. This is an
 * absolute reading, not an elapsed time: the caller has to subtract `begin`
 * from the result to get the duration.
 */
clock_t EndTimer(clock_t begin)
{
    (void)begin; /* not needed to take the sample */

    /* Convert ticks with the standard CLOCKS_PER_SEC constant rather than the
       non-standard CLK_TCK macro. */
    return (clock_t)((double)clock() * 1000000.0 / CLOCKS_PER_SEC);
}
/* sse_add() is defined further down in the file; declare it before use,
   because calling an undeclared function is not valid in C99 and later. */
void sse_add();

/* Entry point: runs the SSE measurement once, then waits for a key press. */
int main( )
{
    sse_add();
    getch(); /* block until a key is pressed */
    return 0;
}
/*
 * Sums the columns of an 8x8 matrix of 16-bit values with SSE2 and prints
 * the time the additions took.
 *
 * Each row is one 128-bit vector. The eight rows are reduced pairwise in
 * three levels (4 + 2 + 1 additions), leaving the column sums in t7.
 *
 * Output: the start reading, then the elapsed time in milliseconds.
 */
void sse_add()
{
    __declspec(align(16)) unsigned short a1[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a2[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a3[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a4[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a5[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a6[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a7[8]={1,2,10,13,15,160,6,19};
    __declspec(align(16)) unsigned short a8[8]={1,2,10,13,15,160,6,19};

    // View each 16-byte aligned row as a single 128-bit vector.
    __m128i *x1 = (__m128i*) &a1[0];
    __m128i *x2 = (__m128i*) &a2[0];
    __m128i *x3 = (__m128i*) &a3[0];
    __m128i *x4 = (__m128i*) &a4[0];
    __m128i *x5 = (__m128i*) &a5[0];
    __m128i *x6 = (__m128i*) &a6[0];
    __m128i *x7 = (__m128i*) &a7[0];
    __m128i *x8 = (__m128i*) &a8[0];

    // _mm_adds_epi16 : adds the 8 signed 16-bit integers in a to the 8 signed
    // 16-bit integers in b and saturates.
    // NOTE(review): the rows are declared unsigned short while this add is
    // signed-saturating; the values used here are small enough that the
    // distinction does not affect the result.

    clock_t begin = BeginTimer();

    //sum of consecutive rows
    __m128i t1=_mm_adds_epi16( x1[0] , x2[0] );
    __m128i t2=_mm_adds_epi16( x3[0] , x4[0] );
    __m128i t3=_mm_adds_epi16( x5[0] , x6[0] );
    __m128i t4=_mm_adds_epi16( x7[0] , x8[0] );
    //t5=t1+t2 & t6=t3 + t4
    __m128i t5=_mm_adds_epi16( t1 ,t2 );
    __m128i t6=_mm_adds_epi16( t3 ,t4 );
    //t7=t6+t5, which is final answer
    // NOTE: t7 is never read afterwards, so an optimising compiler may drop
    // these additions; seven vector adds are also far shorter than one
    // clock() tick, so a single pass can legitimately measure as 0.
    __m128i t7=_mm_adds_epi16( t5 ,t6 );

    // EndTimer() returns an absolute reading, so subtract the start reading
    // to obtain the elapsed ticks.
    double elapTicks = (double)(EndTimer(begin) - begin);
    double elapMilli = elapTicks/1000; // milliseconds from Begin to End

    // Console output happens only after the timer is stopped so that printf
    // is not part of the measured interval.
    printf ("Timer set to: %.2f\n", (double)begin); // print the start reading
    printf ("Time in SSE in Milliseconds : %.2f\n", elapMilli);
}
Normal version
/*
 * Starts a measurement by reading the processor time consumed so far.
 *
 * Returns the clock() value expressed in microseconds; dividing the
 * difference of two such readings by 1000 yields milliseconds.
 */
clock_t BeginTimer()
{
    /* Scale ticks by the standard CLOCKS_PER_SEC constant. The CLK_TCK macro
       used previously is non-standard and missing on strict C compilers. */
    return (clock_t)((double)clock() * 1000000.0 / CLOCKS_PER_SEC);
}
/*
 * Reads the clock at the end of a measurement.
 *
 * begin: the start reading; accepted but not used by this function.
 *
 * Returns the current clock() value in microseconds. The value is an
 * absolute reading rather than a duration, so the caller must subtract
 * `begin` to obtain the elapsed time.
 */
clock_t EndTimer(clock_t begin)
{
    (void)begin; /* not needed to take the sample */

    /* Scale ticks by the standard CLOCKS_PER_SEC constant instead of the
       non-standard CLK_TCK macro. */
    return (clock_t)((double)clock() * 1000000.0 / CLOCKS_PER_SEC);
}
/* normal_add() is defined further down in the file; declare it before use,
   because calling an undeclared function is not valid in C99 and later. */
void normal_add();

/* Entry point: runs the scalar measurement once, then waits for a key press. */
int main( )
{
    normal_add();
    getch(); /* block until a key is pressed */
    return 0;
}
/*
 * Sums the columns of an 8x8 matrix of unsigned 16-bit values with plain
 * scalar loops and prints the time the additions took.
 *
 * The eight rows are reduced pairwise in three levels: t1..t4 hold the sums
 * of adjacent rows, t5 and t6 combine those, and t7 holds the column sums.
 *
 * Output: the start reading, then the elapsed time in milliseconds.
 */
void normal_add()
{
    unsigned short a1[8]={1,2,10,13,15,160,6,19};
    unsigned short a2[8]={1,2,10,13,15,160,6,19};
    unsigned short a3[8]={1,2,10,13,15,160,6,19};
    unsigned short a4[8]={1,2,10,13,15,160,6,19};
    unsigned short a5[8]={1,2,10,13,15,160,6,19};
    unsigned short a6[8]={1,2,10,13,15,160,6,19};
    unsigned short a7[8]={1,2,10,13,15,160,6,19};
    unsigned short a8[8]={1,2,10,13,15,160,6,19};
    unsigned short t1[8], t2[8], t3[8], t4[8],t5[8], t6[8], t7[8];

    clock_t begin1 = BeginTimer();

    // level 1: sums of adjacent rows
    for(int i=0; i<8;i++)
    {
        t1[i]=a1[i] +a2[i];
    }
    for(int i=0; i<8;i++)
    {
        t2[i]=a3[i] +a4[i];
    }
    for(int i=0; i<8;i++)
    {
        t3[i]=a5[i] +a6[i];
    }
    for(int i=0; i<8;i++)
    {
        t4[i]=a7[i] +a8[i];
    }
    // level 2: combine the partial sums
    for(int i=0; i<8;i++)
    {
        t5[i]=t1[i] +t2[i];
    }
    for(int i=0; i<8;i++)
    {
        t6[i]=t3[i] +t4[i];
    }
    // level 3: final column sums
    // NOTE: t7 is never read afterwards, so an optimising compiler may drop
    // these loops; a single pass is also far shorter than one clock() tick,
    // so it can legitimately measure as 0.
    for(int i=0; i<8;i++)
    {
        t7[i]=t5[i] +t6[i];
    }

    // EndTimer() returns an absolute reading, so subtract the start reading
    // to obtain the elapsed ticks.
    double elapTicks = (double)(EndTimer(begin1) - begin1);
    double elapMilli = elapTicks/1000; // milliseconds from Begin to End

    // Console output happens only after the timer is stopped so that printf
    // is not part of the measured interval.
    printf ("Timer reset to: %f\n", (double)begin1); // print the start reading
    printf ("Time spent in normal add in Milliseconds : %.2f\n", elapMilli);
}