Did you intend heaviside = 0.5*(sign(1,x)+1)
? In any case testing with gcc 4.8.1 fortran shows High Performance Mark's idea should be beneficial. Here are 3 possibilities:
heaviside1 - original
heaviside2 - High Performance Mark's idea
heaviside3 - another variation
function heaviside1 (x)
double precision heaviside1, x
heaviside1 = 0.5 * (sign(1d0,x) + 1)
end
function heaviside2 (x)
double precision heaviside2, x
heaviside2 = sign(0.5d0,x) + 0.5
end
function heaviside3 (x)
double precision heaviside3, x
heaviside3 = 0
if (x .ge. 0) heaviside3 = 1
end
program demo
double precision heaviside1, heaviside2, heaviside3, x, a, b, c
do
x = 0.5 - RAND(0)
a = heaviside1(x)
b = heaviside2(x)
c = heaviside3(x)
print *, "x=", x, "heaviside(x)=", a, b, c
enddo
end
When compiled, gcc generates these 3 stand-alone functions:
<heaviside1_>:
vmovsd xmm0,QWORD PTR [rcx]
vandpd xmm0,xmm0,XMMWORD PTR [rip+0x2d824]
vorpd xmm0,xmm0,XMMWORD PTR [rip+0x2d80c]
vaddsd xmm0,xmm0,QWORD PTR [rip+0x2d7f4]
vmulsd xmm0,xmm0,QWORD PTR [rip+0x2d81c]
ret
<heaviside2_>:
vmovsd xmm0,QWORD PTR [rcx]
vandpd xmm0,xmm0,XMMWORD PTR [rip+0x2d844]
vorpd xmm0,xmm0,XMMWORD PTR [rip+0x2d85c]
vaddsd xmm0,xmm0,QWORD PTR [rip+0x2d844]
ret
<heaviside3_>:
vxorpd xmm0,xmm0,xmm0
vmovsd xmm1,QWORD PTR [rip+0x2d844]
vcmplesd xmm0,xmm0,QWORD PTR [rcx]
vandpd xmm0,xmm1,xmm0
ret
When compiled with gcc, heaviside1 generates a multiply that might slow execution.
heaviside2 eliminates the multiply.
heaviside3 has the same number of instructions as heaviside2, but uses 2 fewer memory accesses.
For the stand-alone functions:
instruction memory reference
count count
heaviside1 6 5
heaviside2 5 4
heaviside3 5 2
The inline code for these functions avoids the need for the return instruction and ideally passes the arguments in registers and preloads other registers with needed constants. The exact result depends on the compiler used and the calling code. An estimate for inlined code:
instruction memory reference
count count
heaviside1 4 0
heaviside2 3 0
heaviside3 2 0
It looks like the function could be handled by as few as two compiler generated instructions: vcmplesd+vandpd. The first instruction creates a mask of all zeros if the argument is negative, or a mask of all ones otherwise. The second instruction applies the mask to a register constant value of one in order to produce the result value of zero or one.
Though I have not benchmarked these functions, it looks like the heaviside function should not take much execution time.
---09/23/2013: adding x86_64 assembly language versions and C language benchmark---
file functions.s
//----------------------------------------------------------------------------
.intel_syntax noprefix
.text
//-----------------------------------------------------------------------------
// this heaviside function generates its own register constants
// double heaviside_a1 (double arg);
.globl heaviside_a1
heaviside_a1:
mov rax,0x3ff0000000000000
xorpd xmm1,xmm1 # xmm1: constant 0.0
cmplesd xmm1,xmm0 # xmm1: mask (all Fs or all 0s)
movq xmm0,rax # xmm0: constant 1.0
andpd xmm0,xmm1
retq
//-----------------------------------------------------------------------------
// this heaviside function uses register constants passed from caller
// double heaviside_a2 (double arg, double const0, double const1);
.globl heaviside_a2
heaviside_a2:
cmplesd xmm1,xmm0 # xmm1: mask (all Fs or all 0s)
movsd xmm0,xmm2 # xmm0: constant 1.0
andpd xmm0,xmm1
retq
//-----------------------------------------------------------------------------
file ctest.c
#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>
// functions.s
double heaviside_a1 (double x);
double heaviside_a2 (double arg, double const0, double const1);
//-----------------------------------------------------------------------------
static double heaviside_c1 (double x)
{
double result = 0;
if (x >= 0) result = 1;
return result;
}
//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
// count directly.
uint64_t queryPerformanceCounter (void)
{
LARGE_INTEGER int64;
QueryPerformanceCounter (&int64);
return int64.QuadPart;
}
//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly.
uint64_t queryPerformanceFrequency (void)
{
LARGE_INTEGER int64;
QueryPerformanceFrequency (&int64);
return int64.QuadPart;
}
//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
{
uint64_t carryOut = data >> 63;
uint64_t maskOrZ = -carryOut;
return (data << 1) ^ (maskOrZ & mask);
}
//---------------------------------------------------------------------------
int runtests (uint64_t pattern, uint64_t mask)
{
uint64_t startCount, elapsed, index, loops = 800000000;
double ns;
double total = 0;
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
double x, result;
pattern = lfsr64gpr (pattern, mask);
x = (double) (int64_t) pattern;
result = heaviside_c1 (x);
total += result;
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("heaviside_c1: %7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
double x, result;
pattern = lfsr64gpr (pattern, mask);
x = (double) (int64_t) pattern;
result = heaviside_a1 (x);
//printf ("heaviside_a1 (%lf): %lf\n", x, result);
total += result;
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("heaviside_a1: %7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
double x, result;
const double const0 = 0.0;
const double const1 = 1.0;
pattern = lfsr64gpr (pattern, mask);
x = (double) (int64_t) pattern;
result = heaviside_a2 (x, const0, const1);
//printf ("heaviside_a2 (%lf): %lf\n", x, result);
total += result;
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("heaviside_a2: %7.2f ns\n", ns);
return total;
}
//---------------------------------------------------------------------------
int main (void)
{
uint64_t mask;
mask = 0xBEFFFFFFFFFFFFFF;
// raise our priority to increase measurement accuracy
SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);
printf ("using pseudo-random data\n");
runtests (1, mask);
return 0;
}
//---------------------------------------------------------------------------
mingw64 build command: gcc -Wall -Wextra -O3 -octest.exe ctest.c functions.s
Program output from Intel Core i7-2600K at 4.0 GHz:
using pseudo-random data
heaviside_c1: 2.24 ns
heaviside_a1: 2.00 ns
heaviside_a2: 2.00 ns
These timing results include execution of pseudo-random argument generation and result totalization code needed to keep the optimizer from eliminating the otherwise unused heaviside_c1 local function.
heaviside_c1 is from the original fortran suggestion, ported to C.
heaviside_a1 is an assembly language implementation.
heaviside_a2 is a modification of the assembly language version that uses register constants passed by the caller to avoid the overhead of generating them. For my processor, benchmarking shows no advantage to passing constants.
The assembly language functions assume xmm0 returns the result and xmm1 and xmm2 are available as scratch registers. This is valid for the x64 calling convention used by Windows. This assumption should be confirmed for other calling conventions.
In order to avoid memory accesses, the assembly language version expects the argument to be passed by value in a register (XMM0). Because this is not the fortran default, a special declaration is required. This one seems to work properly for gfortran 64-bit:
interface
real(c_double) function heaviside_a1(x)
use iso_c_binding, only: c_double
real(c_double), VALUE :: x
end function heaviside_a1
end interface