4

I have a lot of files (about 160,000) and I need to store the positions of individual words in each file (full-text index). So I used a dictionary like this:

WordDict : TDictionary<string, TDictionary<string, TIntegerDynArray>>;

Now I know that WORD1 is in FILE1, FILE3 and FILE100, and its positions in each file are <1,3,5>, <2,8,35>, etc. I can fill it and I can use it - it's very fast. But I don't know how to store the dictionary to a file efficiently.

EDIT: by "efficiently" I mean fast to save/load and a small file size.

user1990191
  • 43
  • 1
  • 4
  • What do you mean by "effectively" or "efficiently"? A small file? Fast loading of the whole file? Fast search for a word without loading the file? Currently it looks like the most straightforward approach would be to use TMemIniFile. – Arioch 'The Jan 18 '13 at 11:37
  • Use a JSON emitter to dump it, and a JSON parser to load it – David Heffernan Jan 18 '13 at 11:37
  • `TIntegerDynArray` - better use `TArray<Integer>`, since you use generics anyway. See http://stackoverflow.com/questions/11029353 http://stackoverflow.com/questions/14383093 – Arioch 'The Jan 18 '13 at 11:40
  • @David does SuperObject or mORMot's JSON work "out of the box" with generics ? especially creating objects when loading ? And using DBX JSON would hardly be fastest and bug-free – Arioch 'The Jan 18 '13 at 11:42
  • I'd also compress the text storage with DiUCL or ZIP (I do not know if there is a PPMd implementation for Delphi, and relying on 7z wrappers would be overkill) to make it load faster and take less space. Most of the redundancy would come from the repeated file names, and compression should deal with that fine, like it does for OpenOffice and MS Office 2007+. Zip would also check the file for corruption when unpacking. – Arioch 'The Jan 18 '13 at 11:47
  • 2
    Another approach would be to use an embedded SQL database like NexusDB or SQLite - normalization would remove the aforementioned redundancy, yet the database would introduce its own overhead. – Arioch 'The Jan 18 '13 at 11:48

1 Answers1

13

You can use Delphi's streaming system to write a proprietary stream format. If file size matters more than speed, you can additionally zip the stream. Here is some code:

type
  { Positions stored for one word inside a single file. }
  TFilePos = TArray<Integer>;

  { Maps a file name to the positions stored for that file. }
  TFileDict = TDictionary<string, TFilePos>;

  { Maps a word to its per-file position table and adds persistence
    helpers for plain files and zip archives. }
  TWordDict = class(TDictionary<string, TFileDict>)
  private
    { Stream-level serialization shared by the public file/zip routines. }
    procedure SaveToStream(stream: TStream);
    procedure LoadFromStream(stream: TStream);
  public
    { Uncompressed storage in a plain file. }
    procedure SaveToFile(const AFileName: string);
    procedure LoadFromFile(const AFileName: string);
    { Compressed storage as an entry inside a zip archive. }
    procedure SaveToZip(const AFileName: string);
    procedure LoadFromZip(const AFileName: string);
  end;

{ Opens the zip archive AFileName for reading, extracts the entry named
  'worddict' and rebuilds the dictionary from its contents. }
procedure TWordDict.LoadFromZip(const AFileName: string);
var
  archive: TZipFile;
  entryHeader: TZipHeader;
  entryData: TStream;
begin
  archive := TZipFile.Create;
  try
    archive.Open(AFileName, zmRead);
    // Read hands back a stream in entryData; this routine releases it.
    archive.Read('worddict', entryData, entryHeader);
    try
      LoadFromStream(entryData);
    finally
      entryData.Free;
    end;
    archive.Close;
  finally
    archive.Free;
  end;
end;

{ Serializes the dictionary into memory and stores the result as the
  entry 'worddict' in the zip archive AFileName. }
procedure TWordDict.SaveToZip(const AFileName: string);
var
  buffer: TStream;
  archive: TZipFile;
begin
  buffer := TMemoryStream.Create;
  try
    SaveToStream(buffer);
    // Rewind so the archive reads the serialized data from its start.
    buffer.Position := 0;

    archive := TZipFile.Create;
    try
      archive.Open(AFileName, zmWrite);
      archive.Add(buffer, 'worddict');
      archive.Close;
    finally
      archive.Free;
    end;
  finally
    buffer.Free;
  end;
end;

{ Writes the whole dictionary to stream through a TWriter.

  Layout: an outer list of words; each word is followed by an inner list
  of (file name, position count, positions...) records. }
procedure TWordDict.SaveToStream(stream: TStream);
var
  output: TWriter;
  wordEntry: System.Generics.Collections.TPair<string, TFileDict>;
  fileEntry: System.Generics.Collections.TPair<string, TFilePos>;
  idx: Integer;
begin
  output := TWriter.Create(stream, 4096);
  try
    output.WriteListBegin;
    for wordEntry in Self do
    begin
      output.WriteString(wordEntry.Key);

      output.WriteListBegin;
      for fileEntry in wordEntry.Value do
      begin
        output.WriteString(fileEntry.Key);
        // The count comes first so the loader knows how many integers follow.
        output.WriteInteger(Length(fileEntry.Value));
        for idx := 0 to High(fileEntry.Value) do
          output.WriteInteger(fileEntry.Value[idx]);
      end;
      output.WriteListEnd;
    end;
    output.WriteListEnd;
  finally
    output.Free;
  end;
end;

{ Replaces the dictionary contents with the data read from stream.

  Expects the layout produced by SaveToStream: an outer list of words,
  each followed by an inner list of (file name, position count,
  positions...) records.

  Any exception raised while reading (or by Add on a duplicate key) is
  propagated to the caller; the partially built per-word table is freed
  first so it does not leak. }
procedure TWordDict.LoadFromStream(stream: TStream);
var
  sFiles: TFileDict;
  aPosi: TFilePos;
  size: Integer;
  i: Integer;
  sWord: string;
  reader: TReader;
  sFile: string;
begin
  // NOTE(review): the base TDictionary presumably does not own its values,
  // so TFileDict instances dropped by Clear are not freed here - confirm
  // who owns them before reloading into a non-empty dictionary.
  Clear;
  reader := TReader.Create(stream, 1024);
  try
    reader.ReadListBegin;
    while not reader.EndOfList do
    begin
      sWord := reader.ReadString;
      sFiles := TFileDict.Create;
      try
        reader.ReadListBegin;
        while not reader.EndOfList do
        begin
          sFile := reader.ReadString;
          size := reader.ReadInteger;
          // Drop the reference to the previous array so SetLength allocates
          // a fresh one; it can then be stored directly without Copy.
          aPosi := nil;
          SetLength(aPosi, size);
          for i := 0 to size - 1 do
            aPosi[i] := reader.ReadInteger;
          sFiles.Add(sFile, aPosi);
        end;
        reader.ReadListEnd;
        Add(sWord, sFiles);
      except
        // sFiles is not yet reachable through Self, so release it here.
        sFiles.Free;
        raise;
      end;
    end;
    reader.ReadListEnd;
  finally
    reader.Free;
  end;
end;

{ Replaces the dictionary contents with the data stored in the plain
  (uncompressed) file AFileName.

  The file is opened read-only with fmShareDenyWrite so that other readers
  may have it open at the same time; a bare fmOpenRead requests exclusive
  access on Windows and fails whenever the file is already open elsewhere.
  Exceptions from opening or parsing the file propagate to the caller. }
procedure TWordDict.LoadFromFile(const AFileName: string);
var
  stream: TStream;
begin
  stream := TFileStream.Create(AFileName, fmOpenRead or fmShareDenyWrite);
  try
    LoadFromStream(stream);
  finally
    stream.Free;
  end;
end;

{ Serializes the dictionary, uncompressed, into the file AFileName.
  The file is opened with fmCreate. }
procedure TWordDict.SaveToFile(const AFileName: string);
var
  target: TStream;
begin
  target := TFileStream.Create(AFileName, fmCreate);
  try
    SaveToStream(target);
  finally
    target.Free;
  end;
end;
Uwe Raabe
  • 45,288
  • 3
  • 82
  • 130