1

I've written quite a big library for matrix operations for Delphi and FPC. The library now has an extension that uses the Intel AVX instruction set, but I could only manage to get that compiled in FPC. My idea was to create .o files in FPC which contain the AVX assembler code and include these files in Delphi. I tried to follow this question here: Linking FPC .o files into Delphi

but without success. I was able to dump the function names and tried to import these in the Delphi unit. The problem is that I always get an error saying that the .o file is in the wrong format.

I use CodeTyphoon for compilation which internally uses FPC 3.1.1 and Delphi2010 as a first try.

The code is compiled once in FPC and once in Delphi using the appropriate ifdefs.

My base code looks like this (just an excerpt):

// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2011, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################


unit AVXMatrixMultOperations;

interface

{$IFDEF CPUX64}
{$DEFINE x64}
{$ENDIF}
{$IFDEF cpux86_64}
{$DEFINE x64}
{$ENDIF}
{$IFNDEF x64}

uses MatrixConst;

{$IFNDEF FPC}
// this fails -> wrong object format
{$L '.\AVXPrecompiled\win32\AVXMatrixMultOperations.o'}
{$ENDIF}

// full matrix operations
// AVX matrix multiplication routine. Under FPC the body is the assembler
// implementation in the implementation section of this unit. Under any other
// compiler the declaration is bound via "external" to the FPC-mangled symbol
// name given below, which is expected to come from the object file linked
// with the $L directive above.
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
{$IFNDEF FPC} external '' name 'AVXMATRIXMULTOPERATIONS_$$_AVXMATRIXMULTALIGNED$crc2A67AB04'; {$ENDIF}

{$ENDIF}

implementation

{$IFDEF FPC} {$ASMMODE intel} {$ENDIF}

{$IFNDEF x64}

{$IFDEF FPC}

// Multiplies the matrix mt1 (height1 rows, width1 columns) with the matrix mt2
// (width1 rows, width2 columns) and stores the result in dest (32 bit version).
//
//   dest          - first element of the result matrix
//   destLineWidth - distance in bytes between two rows of dest
//   mt1, mt2      - first element of the left / right operand
//   width1        - number of columns of mt1 (= number of rows of mt2 that are read)
//   height1       - number of rows of mt1 (= number of rows written to dest)
//   width2        - number of columns of mt2 (= number of columns written to dest)
//   height2       - not referenced by this routine
//   LineWidth1/2  - distance in bytes between two rows of mt1 / mt2
//
// The routine uses aligned moves (vmovapd): 32 byte loads from the rows of mt1,
// 16 byte loads from mt2 and 16 byte stores to dest.
// NOTE(review): there are no guards for width1 <= 0 or height1 <= 0 - the loop
// counters would wrap around, so callers are expected to pass positive sizes.
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
var bytesWidth2, destOffset : TASMNativeInt;
    iter : TASMNativeInt;
{$IFDEF FPC}
begin
{$ENDIF}
asm
   // prolog - save the registers that have to be preserved
   push ebx;
   push edi;
   push esi;

   // ecx: running pointer into dest
   mov ecx, dest;

   // iter := -width1*sizeof(double);
   mov edi, width1;
   imul edi, -8;
   mov iter, edi;

   // let mt1 point to the end of the first row so that [mt1 + iter] addresses
   // the first element and the index counts upwards to zero
   sub mt1, edi;

   //destOffset := destLineWidth - Width2*sizeof(double);
   mov ebx, Width2;
   shl ebx, 3;
   mov eax, destLineWidth;
   sub eax, ebx;
   mov destOffset, eax;

   //bytesWidth2 := width2*sizeof(double);
   mov bytesWidth2, ebx;

   // for y := 0 to height1 - 1 do
   @@foryloop:

      // esi -> counter to width2 (two columns are handled per iteration)
      mov esi, width2;
      sub esi, 2;
      jl @LastXColumn;

      @@forxloop:
      // for x := 0 to width2 div 2 - 1
          // edx: mt1 (end of the current row)
          // ebx: mt2 (current column pair)
          // eax: negative byte index into the row of mt1
          // edi: LineWidth2
          mov edx, mt1;
          mov ebx, mt2;
          mov eax, iter;
          mov edi, LineWidth2;

          // ymm0, ymm1: accumulators for the two result columns
          vxorpd ymm0, ymm0, ymm0;
          vxorpd ymm1, ymm1, ymm1;

          // less than 4 elements in the row -> only the single element loop
          cmp eax, -32;
          jg @@Innerloop2Begin;

          // for z := 0 to width1 - 1do
          // AVX part:
          @@InnerLoop1:
             // 4x4 block
             vmovapd xmm2, [ebx];
             add ebx, edi;
             vmovapd xmm4, xmm2;

             vmovapd xmm3, [ebx];
             add ebx, edi;

             // shuffle so we can multiply

             // swap such that we can immediately multiply
             vmovlhps xmm2, xmm2, xmm3;
             vmovhlps xmm3, xmm3, xmm4;

             // next 4 elements
             vmovapd xmm4, [ebx];
             add ebx, edi;
             vmovapd xmm6, xmm4;

             vmovapd xmm5, [ebx];
             add ebx, edi;

             // 4 consecutive elements of the current row of mt1
             vmovapd ymm7, [edx + eax];

             vmovlhps xmm4, xmm4, xmm5;
             vmovhlps xmm5, xmm5, xmm6;

             vinsertf128 ymm2, ymm2, xmm4, 1;
             vinsertf128 ymm3, ymm3, xmm5, 1;

             // now multiply and add
             vmulpd ymm2, ymm2, ymm7;
             vmulpd ymm3, ymm3, ymm7;

             vaddpd ymm0, ymm0, ymm2;
             vaddpd ymm1, ymm1, ymm3;
          add eax, 32;
          jl @@InnerLoop1;

          // reduce the 4 partial sums of each accumulator to 2
          vextractf128 xmm2, ymm0, 1;
          vextractf128 xmm3, ymm1, 1;

          vhaddpd xmm0, xmm0, xmm2;
          vhaddpd xmm1, xmm1, xmm3;

          // no elements left in this row?
          test eax, eax;
          jz @@InnerLoopEnd2;

          @@Innerloop2Begin:

          // rest in single elements
          @@InnerLoop2:
             vmovapd xmm2, [ebx];
             add ebx, edi;

             vmovddup xmm3, [edx + eax];

             vmulpd xmm2, xmm2, xmm3;
             vmovhlps xmm4, xmm4, xmm2;

             vaddsd xmm0, xmm0, xmm2;
             vaddsd xmm1, xmm1, xmm4;
          add eax, 8;
          jnz @@InnerLoop2;

          @@InnerLoopEnd2:

          // final horizontal addition
          vhaddpd xmm0, xmm0, xmm1;

          vmovapd [ecx], xmm0;

          // increment the pointers
          // inc(mt2, 2), inc(dest, 2);
          //add dword ptr [mt2], 8;
          add mt2, 16;
          add ecx, 16;

      // end for x := 0 to width2 div 2 - 1
      sub esi, 2;
      jge @@forxloop;

      @LastXColumn:

      // esi = -1 -> width2 is odd and one column is left
      cmp esi, -1;
      jne @NextLine;

      // last column of mt2
      // edx and edi are reloaded here: for width2 = 1 the column pair loop
      // above is never entered so they would not hold mt1 and LineWidth2.
      mov edx, mt1;
      mov edi, LineWidth2;
      mov eax, iter;
      mov ebx, mt2;

      vxorpd xmm0, xmm0, xmm0;

      @InnerLoop2:
         vmovsd xmm1, [edx + eax];
         vmovsd xmm2, [ebx];

         vmulsd xmm1, xmm1, xmm2;
         vaddsd xmm0, xmm0, xmm1;

         add ebx, edi;
      add eax, 8;
      jnz @InnerLoop2;

      vmovsd [ecx], xmm0;
      add ecx, 8;
      add mt2, 8;

      @NextLine:
      // dec(mt2, Width2);
      // inc(PByte(mt1), LineWidth1);
      // inc(PByte(dest), destOffset);
      //mov ebx, bytesWidth2;
      //sub dword ptr [mt2], ebx;
      mov eax, bytesWidth2;
      sub mt2, eax;
      mov eax, LineWidth1;
      add mt1, eax;
      add ecx, destOffset;

   // end for y := 0 to height1 - 1
   //dec eax;
   dec height1;
   jnz @@foryloop;

   // epilog
   vzeroupper;

   pop esi;
   pop edi;
   pop ebx;
end;
{$IFDEF FPC}
end;
{$ENDIF}

{$ENDIF}

{$ENDIF}

end.
mrabat
  • 802
  • 7
  • 15
  • 1
    Compile into a DLL and use that. I don't think delphi can handle the object format. – David Heffernan Feb 09 '18 at 21:48
  • AFAIK, FPC can produce COFF and ELF files (and perhaps more). I guess you should set the options (`-Acoff`) to make FPC produce COFF files. AFAIK, Delphi won't link to ELF files. Note that probably both formats can have the `.o` extension. https://www.freepascal.org/docs-html/user/usersu15.html – Rudy Velthuis Feb 09 '18 at 21:59
  • Hmmm... could it be that your FPC produces 64 bit code and not 32 bit code? – Rudy Velthuis Feb 09 '18 at 22:34
  • 2
    The win32/64 version of FPC should produce coff, but of course the FPC target should match the delphi target, 32/64-wise. That said, the RTL identifiers it might link to, including stuff related to unit initialization is probably different. It doesn't help that you don't get a precise message what is wrong. Try to link an empty file first? Btw *nix/Cygwin's "file" command is useful for a quick check to see what type a file really is. Avoid debug info, it is definitely different. Strip if needed. – Marco van de Voort Feb 09 '18 at 22:42
  • I strongly pursue the idea of statically including the object files to get monolithic bpl or exe files in Delphi - so I'm going to try to explicitly create COFF files in the fpc settings and completely remove debug settings. – mrabat Feb 12 '18 at 09:53
  • Actually I tried to add the -Acoff param to my codetyphoon custom params in the compiler section. But... It won't even compile then.. strange. So.. I'm going to check out Arnauds suggestion. – mrabat Feb 13 '18 at 06:49
  • Ok. Another update. I tried to use Arnaud's linked object converter tool and it seems that the files are already coff32 files which afaik Delphi should be able to read. The thing is that I still get an E2045 wrong file format. So I guess these files are exceeding one of the limits described in http://docs.embarcadero.com/products/rad_studio/delphiAndcpp2009/HelpUpdate2/EN/html/devcommon/cm_bad_dri_obj_xml.html – mrabat Feb 13 '18 at 06:59
  • Maybe you can use an external assembler like nasm: fpc -Anasm tells fpc to use nasm for assembling. – FPK Feb 18 '18 at 12:25

1 Answer

2

Since there is a single function involved here, the easiest is IMHO to convert the FPC AVXMatrixMultOperations.o file directly.

Use the great Object file converter tool.

You may try to convert from one binary format to another, accepted by Delphi.

But I guess that the cleanest way is to convert it to asm:

objconv -fasm AVXMatrixMultOperations.o

It will create a AVXMatrixMultOperations.asm file, which could be used to replace the unknown AVX instructions by simple db ..,..,..,.. bytes. Typically, the generated .asm file has the assembler on the left side, and the raw hexadecimal bytes on the right side.

This is how I dealt with old Delphi compilers in my libraries, for instance:

function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
asm // eax=crc, edx=buf, ecx=len
        not     eax
        test    ecx, ecx
        jz      @0
        test    edx, edx
        jz      @0
@3:     test    edx, 3
        jz      @8 // align to 4 bytes boundary
        {$ifdef ISDELPHI2010}
        crc32   eax, byte ptr[edx]
        {$else}
        db      $F2, $0F, $38, $F0, $02
        {$endif}
        inc     edx
        ....

So in your case, something like

{$ifdef FPC}
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
{$else}
db $xx,$yy,$zz
db $xx,$yy,$zz
{$endif}
Arnaud Bouchez
  • 42,305
  • 3
  • 71
  • 159
  • Do new ones do AVX(2) then? My Seattle doesn't, and I simply package FPC dlls with my Delphi apps. – Marco van de Voort Feb 10 '18 at 21:45
  • As I explained, you can convert the AVX(2) into db $xx,$xx,$xx using the .asm generated by the ObjConvert tool. It is much cleaner than using an external dll. – Arnaud Bouchez Feb 12 '18 at 08:49
  • For me not. Deployment is not really a problem, and the db stuff is IMHO dirtier. I keep it in mind for exceptional cases though. – Marco van de Voort Feb 12 '18 at 09:33
  • Sorry for the late comment.... Thanks for the input. That was actually my first idea. The thing is that there are about 50k of code with AVX instructions. The first one was only an example. I actually want to try the fpc object linking thing since this is also easier to handle in case there are changes. – mrabat Feb 12 '18 at 09:51
  • Since Delphi definitely does not recognize the FPC coff object files I guess my last resort besides the dll approach is to write some kind of automatic AVX to db $xx,$yy,$zz converter based on the asm files. Staying tuned... – mrabat Feb 13 '18 at 08:59
  • @mrabat Even when you convert the FPC .o file via objconvert into the proper format expected by Delphi? – Arnaud Bouchez Feb 13 '18 at 10:05
  • @Arnaud: I tried... the output was: format already coff32 nothing to do... – mrabat Feb 13 '18 at 10:16
  • @Arnaud: Yeah right (should have read remy's article more thoroughly)!!! thx for the hint. Though I managed to write an automation tool to convert the fpc output to DB statements (via the objconv tool) – mrabat Feb 14 '18 at 10:56
  • Alright I got my automatic db creation tool working based on @Arnaud 's suggestions. Thank you! – mrabat Feb 14 '18 at 13:55
  • @mrabat It may perhaps be worth sharing the tool as OpenSource, since it may help others... – Arnaud Bouchez Feb 19 '18 at 12:55
  • @Arnaud: The tool is available at https://github.com/mikerabat/mrmath/tree/master/AVXPrecompiled . (I know the naming could be better). The tool itself has a few restrictions and I don't have a clue to what extent it can be used for other projects. But feel free to use and adjust it ;) – mrabat Feb 19 '18 at 13:43
  • Thanks for sharing! :) – Arnaud Bouchez Feb 20 '18 at 09:51