I'm working on an application that very often needs to convert 6 to 8 signed 32-bit integers to 32-bit real numbers. I replaced the Delphi code with custom assembler code, and to my great surprise the FPU conversion is always at least as fast as the SSE conversion, and on some computers a good amount faster. Here's some code that illustrates this:
program Project1;
{$R *.res}
uses
windows,dialogs,sysutils;
type
// Four packed signed 32-bit integers (16 bytes): the input of one SSE conversion.
piiii=^tiiii;
tiiii=record i1,i2,i3,i4:longint; end;
// Four packed singles (16 bytes): used both for the scale factors and the output.
pssss=^tssss;
tssss=record s1,s2,s3,s4:single; end;
var
// Scale factor every converted value is multiplied by.
convert_value:single=13579.02468;
// Converts one signed 32-bit integer to single with the x87 FPU and scales it
// by convert_value.
//   adata: value to convert (read from EAX).
//   Result: adata * convert_value, left in ST(0).
function convert_x87(adata:longint):single;
asm
// FILD only takes a memory operand, so spill the argument to the stack.
// PUSH is used instead of writing to [esp-4]: memory below ESP is not
// reserved and may be overwritten asynchronously (there is no red zone on
// 32-bit Windows).
push eax
fild dword ptr [esp]
fmul [convert_value]
// Release the temporary slot; the result stays in ST(0).
add esp,4
end;
// Converts four packed int32 values to singles with SSE2, multiplies them by
// four packed scale factors and stores the four results.
//   afrom: source integers      (read through EAX)
//   ato:   destination singles  (written through EDX)
//   aconv: scale factors        (read through ECX)
// All three pointers must be 16-byte aligned: each is used as the memory
// operand of an SSE instruction that requires alignment.
procedure convert_sse(afrom,ato,aconv:pointer);
asm
// xmm0 := four int32 at [eax] converted to single
CVTDQ2PS xmm0,[eax]
// xmm0 := xmm0 * four singles at [ecx]
mulps xmm0,[ecx]
// store the four results at [edx]
movaps [edx],xmm0
end;
// Allocates a small heap block and derives a 16-byte aligned pointer into it.
//   p1: receives the raw pointer from getmem; pass this one to freemem.
//   p2: receives the first 16-byte aligned address at or after p1.
// 31 bytes are requested so that 16 usable bytes remain after p2 even when
// p1 is only 1 byte past an alignment boundary.
procedure get_mem(var p1,p2:pointer);
const
  align_mask=15;
var
  raw_addr:longint;
begin
  getmem(p1,16+align_mask);
  raw_addr:=longint(p1);
  // round up to the next multiple of 16
  p2:=pointer((raw_addr+align_mask) and (not align_mask));
end;
// Benchmark state:
//   a   - loop counter / input value for the x87 run
//   b   - tick count at the start of the current run
//   c,d - elapsed ticks of the x87 run and the SSE run
//   z   - receives the x87 result (never read afterwards)
var
a,b,c,d:cardinal;
z:single;
// i = aligned int32 input, s1 = aligned scale factors, s2 = aligned output
i:piiii;
s1,s2:pssss;
// w1..w3 hold the raw getmem pointers so the blocks can be freed later
w1,w2,w3:pointer;
begin
// --- x87 run: one call per value; ends when the cardinal counter returns to 0
// (relies on wraparound, i.e. assumes overflow checking is off - TODO confirm)
b:=gettickcount;
a:=0;
repeat
z:=convert_x87(a);
inc(a);
until a=0;
c:=gettickcount-b;
// --- set up three 16-byte aligned buffers for the SSE run
get_mem(pointer(w1),pointer(i));
get_mem(pointer(w2),pointer(s1));
get_mem(pointer(w3),pointer(s2));
// same scale factor in all four lanes
s1.s1:=convert_value;
s1.s2:=convert_value;
s1.s3:=convert_value;
s1.s4:=convert_value;
// --- SSE run: four values per call; the lanes start at 0..3 and each
// advances by 4 per iteration, ending when lane 1 returns to 0
b:=gettickcount;
i.i1:=0;
i.i2:=1;
i.i3:=2;
i.i4:=3;
repeat
convert_sse(i,s2,s1);
inc(i.i1,4);
inc(i.i2,4);
inc(i.i3,4);
inc(i.i4,4);
until i.i1=0;
d:=gettickcount-b;
// free via the raw pointers, not the aligned ones
freemem(w1);
freemem(w2);
freemem(w3);
// report both elapsed tick counts
showmessage('FPU:'+inttostr(c)+'/SSE:'+inttostr(d));
end.
There needs to be a rescaling (so a multiply) during conversion; that's why there's one in there. The value used is just a random one I picked, but the result was the same no matter what value I used. Also, there is a very tiny difference in rounding between the FPU and SSE, but it doesn't matter in this case.
But if you run that code you'll see that the FPU path is never slower than the SSE path, which doesn't make sense. Does anyone have an idea what's going on?
EDIT: Here's different source code with the loop in assembler. The results are really interesting. If the increment instructions are commented out, the SSE version is faster than the FPU version by a noticeable amount, but if the increment instructions are included, then they are roughly the same speed:
program Project1;
{$R *.res}
uses
windows,dialogs,sysutils;
type
// Four packed signed 32-bit integers (16 bytes): the input of one SSE conversion.
piiii=^tiiii;
tiiii=record i1,i2,i3,i4:longint; end;
// Four packed singles (16 bytes): used both for the scale factors and the output.
pssss=^tssss;
tssss=record s1,s2,s3,s4:single; end;
var
// Scale factor every converted value is multiplied by.
convert_value:single=13579.02468;
// x87 benchmark loop: converts one int32 to single, scales it by
// convert_value and stores the result, once per iteration.
// EBX starts at 0 and is decremented until it reaches 0 again, so the loop
// body runs 2^32 times.
procedure test_convert_x87;
asm
// init test data
push ebx
xor ebx,ebx
// Reserve 8 bytes of scratch space instead of using [esp-4]/[esp-8]:
// memory below ESP is not reserved and may be overwritten asynchronously
// (there is no red zone on 32-bit Windows).
//   [esp+4] = int32 input, [esp] = single output
sub esp,8
mov dword ptr [esp+4],$98765432
// convert and multiply 1 int32 to 1 single
@next_loop:
// inc dword ptr [esp+4]
fild dword ptr [esp+4]
fmul [convert_value]
fstp dword ptr [esp]
// loop
dec ebx
jnz @next_loop
// release the scratch space before restoring EBX
add esp,8
pop ebx
end;
// SSE benchmark loop: converts four int32 to four singles, scales them and
// stores the result, once per iteration.
//   afrom: source integers      (EAX) - overwritten with test data here
//   ato:   destination singles  (EDX)
//   aconv: scale factors        (ECX)
// All three pointers must be 16-byte aligned (memory operands of SSE
// instructions that require alignment).
// EBX starts at 0 and drops by 4 per iteration until it reaches 0 again, so
// the loop body runs 2^30 times, i.e. 2^32 values are converted in total.
procedure test_convert_sse(afrom,ato,aconv:pointer);
asm
// init test data
push ebx
xor ebx,ebx
// NOTE(review): these stores carry no explicit operand size; presumably they
// are assembled as 32-bit stores - confirm against the generated code.
mov [eax+0],$98765432
mov [eax+4],$98765432
mov [eax+8],$98765432
mov [eax+12],$98765432
// convert and multiply 4 int32 to 4 single
@next_loop:
// inc [eax+0]
// inc [eax+4]
// inc [eax+8]
// inc [eax+12]
cvtdq2ps xmm0,[eax]
mulps xmm0,[ecx]
movaps [edx],xmm0
// loop
sub ebx,4
jnz @next_loop
pop ebx
end;
// Allocates a small heap block and derives a 16-byte aligned pointer into it.
//   p1: receives the raw pointer from getmem; pass this one to freemem.
//   p2: receives the first 16-byte aligned address at or after p1.
// 31 bytes are requested so that 16 usable bytes remain after p2 even when
// p1 is only 1 byte past an alignment boundary.
procedure get_mem(var p1,p2:pointer);
const
  align_mask=15;
var
  raw_addr:longint;
begin
  getmem(p1,16+align_mask);
  raw_addr:=longint(p1);
  // round up to the next multiple of 16
  p2:=pointer((raw_addr+align_mask) and (not align_mask));
end;
// Benchmark state:
//   b   - tick count at the start of the current run
//   c,d - elapsed ticks of the x87 run and the SSE run
var
b,c,d:cardinal;
// i = aligned int32 input, s1 = aligned scale factors, s2 = aligned output
i:piiii;
s1,s2:pssss;
// w1..w3 hold the raw getmem pointers so the blocks can be freed later
w1,w2,w3:pointer;
begin
// --- x87 run: the whole loop lives inside the assembler routine
b:=gettickcount;
test_convert_x87;
c:=gettickcount-b;
// --- set up three 16-byte aligned buffers for the SSE run
get_mem(pointer(w1),pointer(i));
get_mem(pointer(w2),pointer(s1));
get_mem(pointer(w3),pointer(s2));
// same scale factor in all four lanes
s1.s1:=convert_value;
s1.s2:=convert_value;
s1.s3:=convert_value;
s1.s4:=convert_value;
// --- SSE run: the whole loop lives inside the assembler routine
b:=gettickcount;
test_convert_sse(i,s2,s1);
d:=gettickcount-b;
// free via the raw pointers, not the aligned ones
freemem(w1);
freemem(w2);
freemem(w3);
// report both elapsed tick counts
showmessage('FPU:'+inttostr(c)+'/SSE:'+inttostr(d));
end.