12

I am in search of a data structure which enables me to quickly (preferably O(1)-quickly) determine whether a given GUID is a member of a collection of GUIDs or not.

My current approach is to use a TDictionary with 0 as values.

While this works quickly, it seems to be a waste to use a hash map to rehash a GUID, which is by definition considered to be unique, and to have the dictionary handle values which are unneeded.

There must be a better solution for this, but I can't find one. Can you?

David Heffernan
  • 601,492
  • 42
  • 1,072
  • 1,490
sum1stolemyname
  • 4,506
  • 3
  • 26
  • 44

3 Answers3

13

Very few data structures offer O(1) access. One is the array, another is the hash map (David's answer), and I only know of one other: the trie. Here follows a simple implementation of a bit-wise trie, which has some interesting properties:

  • Immune to memory fragmentation since no re-allocations take place.
  • O(1) add and existence test. Of course, the constant involved in O(1) is fairly large.

The code:

program Project23;

{$APPTYPE CONSOLE}

uses
  SysUtils, Generics.Collections;

type

  // A trie node has one child slot per bit value (False = bit clear,
  // True = bit set). In the last level the slot holds the marker value
  // Pointer(1) instead of a real child node (see TGuidTrie.Add).
  PGuidTrieNode=^TGuidTrieNode;
  TGuidTrieNode = record
    Sub:array[Boolean] of PGuidTrieNode;
  end;
  // Byte-wise view of a GUID; overlaid on a TGUID via "absolute" in the
  // methods below, which assert that both types have the same size.
  TGuidByteArray = array[0..15] of Byte;

  // Set of GUIDs stored as a bit-wise trie: each GUID is a path of
  // 16 * 8 bit decisions starting at Root.
  TGuidTrie = class
  protected
    // Entry node of the trie; allocated in Create, released in Destroy.
    Root: PGuidTrieNode;
  public
    constructor Create;
    destructor Destroy;override;

    // Inserts G into the set.
    procedure Add(G: TGUID);
    // Returns True when G's complete bit path is present in the trie.
    function Exists(G: TGUID): Boolean;
  end;

{ TGuidTrie }

// Inserts G into the trie.
//
// The GUID is walked bit by bit (byte 0..15, bit 0..7 within each byte).
// Every bit selects the Sub[False] or Sub[True] branch of the current node;
// missing nodes are created on the way. The very last bit does not get a
// node of its own: its slot is set to the marker Pointer(1), which only
// serves as a non-nil "present" flag.
procedure TGuidTrie.Add(G: TGUID);
var GBA: TGuidByteArray absolute G;
    Node: PGuidTrieNode;
    Child: PGuidTrieNode;
    i: Integer;
    Bit: Integer;
    IsBitSet: Boolean;
const BitMask: array[0..7] of Byte = (1, 2, 4, 8, 16, 32, 64, 128);
begin
  Assert(SizeOf(G) = SizeOf(TGuidByteArray));
  Node := Root;
  for i:=0 to High(GBA) do
  begin
    for Bit := 0 to 7 do
    begin
      IsBitSet := (GBA[i] and BitMask[Bit]) <> 0;
      if (i = High(GBA)) and (Bit = 7) then
        begin
          // Payload
          Node.Sub[IsBitSet] := Pointer(1);
        end
      else
        begin
          if not Assigned(Node.Sub[IsBitSet]) then
          begin
            // GetMemory returns uninitialised memory. The child slots must
            // start out as nil, otherwise the Assigned() tests here and in
            // Exists would follow garbage pointers.
            Child := GetMemory(SizeOf(TGuidTrieNode));
            FillChar(Child^, SizeOf(TGuidTrieNode), 0);
            Node.Sub[IsBitSet] := Child;
          end;
          Node := Node.Sub[IsBitSet];
        end;
    end;
  end;
end;

// Creates an empty trie consisting of a single root node without children.
constructor TGuidTrie.Create;
begin
  inherited Create;
  Root := GetMemory(SizeOf(TGuidTrieNode));
  // GetMemory does not clear the block; both child slots must be nil so
  // that an empty trie reports every GUID as absent.
  FillChar(Root^, SizeOf(TGuidTrieNode), 0);
end;

// Releases every node of the trie, then the object itself.
destructor TGuidTrie.Destroy;

  // Frees Node and, recursively, everything below it. Both branches are
  // visited; slots holding nil or the Pointer(1) payload marker are not
  // real nodes and are skipped.
  procedure KillNode(Node: PGuidTrieNode);
  var Branch: Boolean;
  begin
    for Branch := False to True do
      if Assigned(Node.Sub[Branch]) and (Node.Sub[Branch] <> Pointer(1)) then
        KillNode(Node.Sub[Branch]);
    FreeMemory(Node);
  end;

begin
  // Root is still nil if the constructor failed before allocating it.
  if Assigned(Root) then
    KillNode(Root);
  inherited;
end;

// Reports whether G has been added to the trie.
//
// Follows the same bit path as Add (byte 0..15, bit 0..7 of each byte) and
// gives up as soon as a branch is missing. Reaching the end of the path
// means the final slot holds the payload marker, i.e. the GUID is present.
function TGuidTrie.Exists(G: TGUID): Boolean;
const
  BitMask: array[0..7] of Byte = (1, 2, 4, 8, 16, 32, 64, 128);
var
  Bytes: TGuidByteArray absolute G;
  Cursor: PGuidTrieNode;
  ByteIdx, BitIdx: Integer;
begin
  Assert(SizeOf(G) = SizeOf(TGuidByteArray));
  Result := False;
  Cursor := Root;
  for ByteIdx := 0 to 15 do
    for BitIdx := 0 to 7 do
    begin
      // Step into the branch selected by the current bit.
      Cursor := Cursor.Sub[(Bytes[ByteIdx] and BitMask[BitIdx]) <> 0];
      if not Assigned(Cursor) then
        Exit;
    end;
  Result := True;
end;

const
  G1: TGUID = '{68D09F12-3E0D-4963-B32C-4EE3BD90F69C}';
  G2: TGUID = '{BEED37F6-9757-41DC-8463-AF094392652B}';

var
  Trie: TGuidTrie;

// Prints whether G is currently a member of the global Trie.
procedure ReportMembership(const G: TGUID);
begin
  if Trie.Exists(G) then
    WriteLn('Exists')
  else
    WriteLn('NOT Exists');
end;

// Demo: query each GUID before and after adding it to the trie.
begin
  try
    Trie := TGuidTrie.Create;
    try
      ReportMembership(G1);
      Trie.Add(G1);
      ReportMembership(G1);

      ReportMembership(G2);
      Trie.Add(G2);
      ReportMembership(G2);
    finally
      Trie.Free;
    end;
  except
    on E: Exception do
      Writeln(E.ClassName, ': ', E.Message);
  end;
end.
menjaraz
  • 7,551
  • 4
  • 41
  • 81
Cosmin Prund
  • 25,498
  • 2
  • 60
  • 104
  • 3
    I don't think a Trie is inherently slower than a Hash Table, especially when used with natural data; and they do have interesting guaranteed properties, unlike the "statistical" properties offered by a hash table. But given the nature of the data to be indexed, I'd say GUIDs are the worst case for a Trie *and* the best case for a Hash Table. A hash table will simply love that random data, while a Trie will be unable to find enough common prefixes to use efficient storage. – Cosmin Prund Mar 14 '11 at 19:41
  • SO at its best: Get a solution to your problem AND gain new insight into the problem's domain! +1 – sum1stolemyname Mar 15 '11 at 13:53
7

I think you are 99% of the way there.

Hashing sounds like the right solution. The obvious way to take advantage of the special nature of the GUID is to supply your own hash function which combines into a single 32 bit integer the 4 32 bit integers that make up a GUID. I'd just XOR the 4 integers.

I presume you are using Generics.Collections.TDictionary. You can supply your own hash function by passing a custom comparer to the constructor. I wouldn't worry about storing spare values, I don't think it will affect performance in a discernible way.

I trust that you are storing your GUIDs as 128 bit integers and not as strings.

Finally, it has occurred to me that the default comparer for a GUID might indeed already do the hash code generation this way. It's worth checking that out before making any changes.

EDIT

Default hash code uses Bob Jenkins hash applied to the binary data. An XOR would be faster, but the default hash code doesn't seem like it would be a performance bottleneck.

In other words, I think that TDictionary<TGUID,Integer> will serve your needs perfectly adequately.

David Heffernan
  • 601,492
  • 42
  • 1,072
  • 1,490
  • 1
    "I'd just sum the 4 integers " A XOR operation would also do it. – Andrej Kirejeŭ Mar 14 '11 at 12:43
  • @Andrei Yes, that would be better, no need to mess with range checking. Thanks. – David Heffernan Mar 14 '11 at 12:55
  • From the asm POV, xor is not faster than adding - e.g. the ZLib authors, in creating Adler32, made some good speed optimization. – Arnaud Bouchez Mar 14 '11 at 21:02
  • @A.Bouchez I guess it just avoids fiddling with range checking. Of course you could just do it with inline assembler and then the compiler would never need to know! – David Heffernan Mar 14 '11 at 21:06
  • I upvoted yesterday, I can't upvote again for the Edit. I wanted to add something: For one of my "pet" projects I used a TDictionary, and I indexed huge amounts of data. It all worked smoothly until I ran out of RAM and then it abruptly failed. If the data fits into RAM I have no problem using a TDictionary to index it. For data that doesn't fit into RAM I implemented a HashTable that's backed up by a file on disk, and for my workload it outperforms my B-Tree implementation. All in all the `TDictionary<>` is one amazing data structure, and this answer should get more upvotes. – Cosmin Prund Mar 15 '06:43
2
type
    PGuidDictionaryItem = ^TGuidDictionaryItem;

    // One entry of a bucket's singly linked collision chain.
    TGuidDictionaryItem = record
        Key: TGuid;
        // Stored as an untyped pointer; Add receives it as TObject and
        // TryFind casts it back to TObject.
        Value: Pointer;
        // Next entry in the same bucket, or nil at the end of the chain.
        Next: PGuidDictionaryItem;
    end;

    // GUID -> object map implemented as a fixed-size hash table with
    // separate chaining. The dictionary frees its own chain entries in
    // Destroy but never frees the stored objects.
    TGuidDictionary = class
    private
    const
        // Number of buckets; the table is never resized.
        HashSize = 2048;
    var
        // NOTE(review): not referenced by any method shown here - confirm
        // whether it is still needed.
        Size: integer;
        // Bucket heads; nil means the bucket is empty.
        FTable: array [0..HashSize-1] of PGuidDictionaryItem;

        // Maps a GUID to a bucket index.
        // NOTE(review): shares its name with TObject.GetHashCode in newer
        // Delphi versions - confirm that hiding it is intended.
        function GetHashCode(Guid: TGUID): integer;
    public
        constructor Create;
        destructor Destroy; override;

        // Inserts Key with Value, or replaces the value of an existing Key.
        procedure Add(Key: TGUID; Value: TObject);
        // Looks Key up; Value receives the stored object, or nil if absent.
        function TryFind(Key: TGUID; out Value: TObject): boolean;
        // True when Key is present.
        function Contains(Key: TGUID): Boolean;
        // Removes Key if present; does nothing otherwise.
        procedure Remove(Key: TGuid);
    end;

{ TGuidDictionary }

// Associates Value with Key.
//
// If Key is already stored its value is overwritten; otherwise a new entry
// is allocated and pushed onto the front of the bucket's chain.
procedure TGuidDictionary.Add(Key: TGUID; Value: TObject);
var
    Bucket: integer;
    Item: PGuidDictionaryItem;
begin
    Bucket := GetHashCode(Key);

    // Search the bucket's chain for an entry with the same key.
    Item := FTable[Bucket];
    while (Item <> nil) and not TGuidEx.EqualGuids(Item.Key, Key) do
        Item := Item.Next;

    if Item = nil then
    begin
        // Not found: create a new entry and make it the chain head.
        New(Item);
        Item.Key := Key;
        Item.Next := FTable[Bucket];
        FTable[Bucket] := Item;
    end;

    Item.Value := Value;
end;

// Returns True when Key is stored in the dictionary. Implemented on top of
// TryFind; the object found is discarded.
function TGuidDictionary.Contains(Key: TGUID): Boolean;
var
    Ignored: TObject;
begin
    Result := TryFind(Key, Ignored);
end;

// Creates an empty dictionary: every bucket head is set to nil.
constructor TGuidDictionary.Create;
begin
    inherited;

    // Clear the whole bucket array in one go (nil is the all-zero pointer).
    FillChar(FTable, SizeOf(FTable), 0);
end;

// Disposes every chain entry in every bucket. The objects referenced by
// the entries are not freed here.
destructor TGuidDictionary.Destroy;
var
    Bucket: integer;
    Current, Doomed: PGuidDictionaryItem;
begin
    for Bucket := Low(FTable) to High(FTable) do
    begin
        Current := FTable[Bucket];
        while Assigned(Current) do
        begin
            // Advance first, then release the entry just left behind.
            Doomed := Current;
            Current := Current.Next;
            Dispose(Doomed);
        end;
    end;

    inherited;
end;

// Maps a GUID to a bucket index in 0..HashSize-1.
//
// The four 32-bit words of the GUID are XOR-ed together and reduced modulo
// HashSize. The arithmetic is done on unsigned words: the signed variant
// "Abs(x) mod HashSize" is fragile because Abs(Low(Integer)) is still
// negative, so the result would only be a valid index while HashSize
// happens to be a power of two.
function TGuidDictionary.GetHashCode(Guid: TGUID): integer;
var
    N: array [0..3] of Cardinal absolute Guid;
begin
    Result := Integer((N[0] xor N[1] xor N[2] xor N[3]) mod Cardinal(HashSize));
end;

// Removes the entry for Key, if there is one, and disposes it. The object
// stored as its value is left untouched. Unknown keys are ignored.
procedure TGuidDictionary.Remove(Key: TGuid);
var
    Bucket: Integer;
    Prev, Cur: PGuidDictionaryItem;
begin
    Bucket := GetHashCode(Key);

    Prev := nil;
    Cur := FTable[Bucket];
    while Cur <> nil do
    begin
        if TGuidEx.EqualGuids(Cur.Key, Key) then
        begin
            // Unlink: either the bucket head or the predecessor's Next
            // has to skip over the matching entry.
            if Prev = nil then
                FTable[Bucket] := Cur.Next
            else
                Prev.Next := Cur.Next;

            Dispose(Cur);
            Exit;
        end;

        Prev := Cur;
        Cur := Cur.Next;
    end;
end;

// Looks up Key.
//
// Returns True and the stored object in Value when the key is present;
// returns False and sets Value to nil otherwise.
function TGuidDictionary.TryFind(Key: TGUID; out Value: TObject): boolean;
var
    Item: PGuidDictionaryItem;
begin
    Value := nil;

    // Walk the chain of the bucket the key hashes to.
    Item := FTable[GetHashCode(Key)];
    while Item <> nil do
    begin
        if TGuidEx.EqualGuids(Item.Key, Key) then
        begin
            Value := TObject(Item.Value);
            Result := True;
            Exit;
        end;
        Item := Item.Next;
    end;

    Result := False;
end;

// Exercises TGuidDictionary: insertion, overwriting an existing key,
// lookup, removal and re-insertion after removal.
procedure TestDictMisc.TestGuidDictionary;
const
    G1: TGUID = '{68D09F12-3E0D-4963-B32C-4EE3BD90F69C}';
    G2: TGUID = '{BEED37F6-9757-41DC-8463-AF094392652B}';
var
    Dict: TGuidDictionary;
    First, Second, Found: TObject;
begin
    Dict := TGuidDictionary.Create;
    First := TObject.Create();
    Second := TObject.Create();
    try
        // An empty dictionary knows nothing.
        CheckFalse(Dict.Contains(G1));

        // Plain inserts.
        Dict.Add(G1, First);
        CheckTrue(Dict.Contains(G1));

        Dict.Add(G2, Second);
        CheckTrue(Dict.Contains(G2));

        // Adding an existing key again must not break anything.
        Dict.Add(G2, Second);
        CheckTrue(Dict.Contains(G2));

        // Lookups return the objects that were stored.
        CheckTrue(Dict.TryFind(G1, {out} Found));
        CheckSame(First, Found);

        CheckTrue(Dict.TryFind(G2, {out} Found));
        CheckSame(Second, Found);

        // A removed key is gone for both query methods.
        Dict.Remove(G1);
        CheckFalse(Dict.Contains(G1));
        CheckFalse(Dict.TryFind(G1, {out} Found));

        // The key can be added again after removal.
        Dict.Add(G1, First);
        CheckTrue(Dict.TryFind(G1, {out} Found));
        CheckSame(First, Found);

    finally
        First.Free();
        Second.Free();

        Dict.Free;
    end;
end;
bummi
  • 27,123
  • 14
  • 62
  • 101