
Updated Question:

Hi. I'm trying to sort an index file in-place. The file consists of 14 B data chunks and is usually too large to be loaded into RAM. The first 8 B of each chunk are the key I want to sort by. I implemented a heapsort algorithm which, apart from performance, has been working great so far!

I am wondering whether my implementation could be improved and how I could speed the process up by using some RAM. I was thinking about partially keeping the heap in RAM, but I'm not sure how that would work.

My code so far:

sortidx.h

#ifndef SORTIDX_H
#define SORTIDX_H

// Includes
#include <atomic>
#include <fstream>
#include <iostream>
#include <limits>
#include <mutex> // needed for std::lock_guard<std::mutex> below
#include <string>
#include <thread>

// Constants
constexpr size_t hashSize = 8;
constexpr size_t offsetSize = 6;
constexpr size_t writeSize = hashSize + offsetSize;

// Typedefs & Structs
typedef std::lock_guard<std::mutex> scoped_lock;

struct IndexEntry {
    unsigned char hash[hashSize]; // First 64 bits of the hash
    unsigned char position[offsetSize]; // Position of word in dictionary (48-bit little endian integer)
} __attribute__( (__packed__) );

// Functions
bool operator>( const IndexEntry &lhs, const IndexEntry &rhs );

constexpr size_t getParent( size_t i ) {
    return (i - 1) / 2;
}

constexpr size_t getLeft( size_t i ) {
    return i * 2 + 1;
}

constexpr size_t getRight( size_t i ) {
    return i * 2 + 2;
}

void sortIDX( std::string idxFile );

void heapifyIDX( size_t heapifyLimit );
void sortIDXHeap( size_t numDataSets );

void readData( IndexEntry* entry, size_t pos );
void writeData( IndexEntry* entry, size_t pos );
bool isInHeap( size_t pos );
void orderHeap( IndexEntry &top, size_t posTop );

#endif

sortidx.cpp

#include "sortidx.h"

using namespace std;

streampos fileSize;
size_t numDataSets;
size_t limit;
atomic<size_t> pos;
fstream* file;

bool operator>( const IndexEntry &lhs, const IndexEntry &rhs ) {
    // Byte-wise lexicographic comparison of the 8-byte hashes
    for ( size_t i = 0; i < hashSize; i++ ) {
        if ( lhs.hash[i] > rhs.hash[i] )
            return true;
        else if ( lhs.hash[i] < rhs.hash[i] )
            return false;
    }

    return false;
}
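
// Note: because this is a plain byte-wise lexicographic comparison, the
// loop above could equivalently be written with std::memcmp from <cstring>:
//   return std::memcmp( lhs.hash, rhs.hash, hashSize ) > 0;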

void sortIDX( string idxFile ) {
    file = new fstream( idxFile, ios::in | ios::out | ios::binary | ios::ate );
    fileSize = file->tellg();
    numDataSets = fileSize / writeSize;
    limit = numDataSets - 1;
    const size_t localLimit = limit;
    const size_t heapifyLimit = getParent( limit );
    thread* sorterThread;

    sorterThread = new thread( heapifyIDX, heapifyLimit );

    while ( pos <= heapifyLimit ) {
        // Some progressbar stuff (uses pos)
    }

    sorterThread->join();
    delete sorterThread;

    pos = 0;
    sorterThread = new thread( sortIDXHeap, localLimit );

    while ( pos < localLimit ) {
        // Some progressbar stuff (uses pos)
    }

    sorterThread->join();
    delete sorterThread;

    file->close();
    delete file;
}

// Builds a max-heap on the file by sifting down every node that has
// children, starting at the last parent and walking back to the root.
void heapifyIDX( size_t heapifyLimit ) {
    IndexEntry top;
    size_t posTop;

    for ( pos = 0; pos <= heapifyLimit; pos++ ) {
        posTop = heapifyLimit - pos;

        readData( &top, posTop );

        orderHeap( top, posTop );
    }
}

// Repeatedly swaps the heap's root (the current maximum) with the last
// element of the shrinking heap, then sifts the new root back down.
void sortIDXHeap( size_t numDataSets ) {
    IndexEntry top;
    IndexEntry last;
    size_t posLast;
    size_t posTop;

    for ( pos = 0; pos < numDataSets; pos++ ) {
        posLast = numDataSets - pos;
        posTop = 0;
        limit = posLast - 1;

        readData( &top, posTop );   // current maximum (the heap's root)
        readData( &last, posLast ); // last element of the heap
        writeData( &top, posLast ); // maximum goes to its final position

        orderHeap( last, posTop );  // sift the old last element down
    }
}

void readData( IndexEntry* entry, size_t pos ) {
    file->seekg( pos * writeSize );
    file->read( (char*)entry, writeSize );
}

void writeData( IndexEntry* entry, size_t pos ) {
    file->seekp( pos * writeSize );
    file->write( (char*)entry, writeSize );
}

bool isInHeap( size_t pos ) {
    return pos <= limit;
}

// Sifts `top` down from posTop: as long as a child is larger, the larger
// child moves up one level; finally `top` is written to its resting place.
void orderHeap( IndexEntry &top, size_t posTop ) {
    IndexEntry child;
    size_t posChild;

    while ( true ) {
        size_t posLeft = getLeft( posTop );
        size_t posRight = getRight( posTop );

        if ( !isInHeap( posLeft ) )
            break;

        readData( &child, posLeft );
        posChild = posLeft;

        if ( isInHeap( posRight ) ) {
            IndexEntry right;
            readData( &right, posRight );

            if ( right > child ) {
                child = right;
                posChild = posRight;
            }
        }

        if ( !( child > top ) )
            break;

        writeData( &child, posTop ); // move the larger child up
        posTop = posChild;
    }

    writeData( &top, posTop );
}

Original Question:

I hope you can help me out a little with a problem I have been stuck on for quite some time.
I'm implementing a simple lookup table to quickly search a file. My current problem is the index file. I iterate over the data file and create my index entries: 8 bytes of data that I'm going to look for, followed by 6 bytes indicating the location of that data set in the original file. So my index file consists of 14-byte blocks of data. Now I want to sort that file so I can easily find my data by doing a binary search in the index file. That is the part I'm struggling with so far.
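
For context, the lookup I plan to do afterwards is a plain binary search over the sorted records, roughly like this (findEntry is only an illustration; it uses the readData helper and the IndexEntry struct from the code above):

#include <cstring>

// Returns the position of the record whose hash equals key, or
// numDataSets if no such record exists.
size_t findEntry( const unsigned char key[hashSize], size_t numDataSets ) {
    IndexEntry entry;
    size_t lo = 0;
    size_t hi = numDataSets; // search the half-open range [lo, hi)

    while ( lo < hi ) {
        size_t mid = lo + (hi - lo) / 2;
        readData( &entry, mid );
        int cmp = std::memcmp( entry.hash, key, hashSize );

        if ( cmp < 0 )
            lo = mid + 1; // key lies in the upper half
        else if ( cmp > 0 )
            hi = mid;     // key lies in the lower half
        else
            return mid;   // exact match
    }

    return numDataSets;   // not found
}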

I need to sort these 14-byte entries in-place by their first 8 bytes. Sorting by the first 8 bytes shouldn't be too much of an issue; I'm rather puzzled over how to sort the file itself.
I was thinking about implementing an "iterator" class for the file so I could pass it to std::sort, which should do the job fairly well. But since I'm not sure what interface I would need to provide for that to work, and since I also couldn't read the current progress that way, I did some research and was reminded of the heapsort algorithm, which sounds extremely good: it runs in O(n·log n), works in-place, and lets me estimate the progress fairly well.

So far so good. I'm still a bit puzzled over the actual implementation, since I'm not sure what the best way of swapping several bytes of data within a file would be. I'm also interested to hear whether you have alternative suggestions for sorting this file, since the index files are several GB in size and performance matters a lot!

BrainStone
  • I'm afraid your question is a bit too broad as it is. Can you narrow your problem down to one particular issue? Also show what you have tried so far in code, which usually helps. – E_net4 Sep 19 '16 at 08:23
  • Be sure to have enough RAM to load the file fully into RAM; that is one precondition for best performance. – Peter G. Sep 19 '16 at 08:25
  • @PeterG. I can't load a 17 GB file into RAM when I only have 8 GB. Also this code should be able to run on any machine that can compile it. Regardless of the specs of said machine. – BrainStone Sep 19 '16 at 08:28

2 Answers


Why do you need to sort it in-place? Your hard disk should have no problem storing another 17 GB file.

I would sort it this way (a rough code sketch follows the steps):

  1. read the file in chunks you can handle in RAM and sort each chunk with any sorting algorithm you want (e.g. quicksort, heapsort, ...)

  2. write the sorted chunk to the hard disk (into a separate file for each chunk)

  3. go to 1 until you have reached the end of your data

  4. merge the sorted chunk files and write the overall sorted result as a final sorted file

  5. delete the sorted-chunk files
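
A rough sketch of steps 1 to 4, assuming the 14-byte entries from your question; the Entry type, the file naming scheme, and the chunk size are only illustrative:

#include <algorithm>
#include <cstring>
#include <fstream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct Entry { unsigned char bytes[14]; }; // 8 B key + 6 B offset

static bool keyLess( const Entry &a, const Entry &b ) {
    return std::memcmp( a.bytes, b.bytes, 8 ) < 0; // order by the 8-byte key
}

void externalSort( const std::string &in, const std::string &out,
                   size_t entriesPerChunk ) {
    std::ifstream src( in, std::ios::binary );
    std::vector<std::string> chunkFiles;
    std::vector<Entry> buf( entriesPerChunk );

    // Steps 1-3: sort RAM-sized chunks and spill each to its own temp file
    while ( true ) {
        src.read( (char*)buf.data(), buf.size() * sizeof(Entry) );
        size_t n = src.gcount() / sizeof(Entry);
        if ( n == 0 )
            break;

        std::sort( buf.begin(), buf.begin() + n, keyLess );

        std::string name = out + ".chunk" + std::to_string( chunkFiles.size() );
        std::ofstream( name, std::ios::binary )
            .write( (char*)buf.data(), n * sizeof(Entry) );
        chunkFiles.push_back( name );
    }

    // Step 4: k-way merge of the sorted chunks via a min-heap of run heads
    std::vector<std::ifstream> runs;
    for ( const std::string &name : chunkFiles )
        runs.emplace_back( name, std::ios::binary );

    typedef std::pair<Entry, size_t> Head; // current entry + index of its run
    auto headGreater = []( const Head &a, const Head &b ) {
        return keyLess( b.first, a.first ); // min-heap: smallest key on top
    };
    std::priority_queue<Head, std::vector<Head>, decltype(headGreater)>
        heap( headGreater );

    Entry e;
    for ( size_t i = 0; i < runs.size(); i++ )
        if ( runs[i].read( (char*)&e, sizeof e ) )
            heap.push( Head( e, i ) );

    std::ofstream dst( out, std::ios::binary );
    while ( !heap.empty() ) {
        Head h = heap.top();
        heap.pop();
        dst.write( (char*)h.first.bytes, sizeof(Entry) );
        if ( runs[h.second].read( (char*)&e, sizeof e ) )
            heap.push( Head( e, h.second ) );
    }
    // Step 5: remove the chunk files here (e.g. std::remove from <cstdio>)
}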

MrSmith42
  • Doing it in-place saves a lot of disk space, to be honest. Also, your suggestion would most likely work in-place too: sorting chunks of the file in RAM and then merging them. – BrainStone Sep 19 '16 at 08:47
  • @BrainStone storing the sorted chunks in the original file will get you in trouble when you also do the final merge in-place in that file, because you will need to ensure that you do not overwrite data of chunks that have not been merged yet. – MrSmith42 Sep 19 '16 at 09:58

Since the first few elements of the heap are accessed the most, I decided to load the first elements into RAM, up to a limit that is passed as a parameter. To achieve that I modified my code like this:

// ...
size_t arraySize = 0;   // number of entries currently cached in RAM
IndexEntry* cacheArray; // holds the first arraySize entries of the heap

void readIntoArray( size_t numElements ) {
    if ( arraySize != 0 )
        writeFromArray();

    arraySize = numElements;
    cacheArray = new IndexEntry[arraySize];
    file->seekg( 0 );

    for ( size_t i = 0; i < arraySize; i++ ) {
        file->read( (char*)(cacheArray + i), writeSize );
    }
}

void writeFromArray() {
    file->seekp( 0 );

    for ( size_t i = 0; i < arraySize; i++ ) {
        file->write( (char*)(cacheArray + i), writeSize );
    }

    arraySize = 0;
    delete[] cacheArray;
}

void sortIDX( string idxFile, size_t cacheSize, bool quiet ) {
    // ...

    cacheSize /= writeSize;
    readIntoArray( min(cacheSize, numDataSets) );

    sorterThread = new thread( heapifyIDX, heapifyLimit );

    // ...

    sorterThread->join();
    delete sorterThread;

    writeFromArray();

    file->close();
    delete file;
}

void readData( IndexEntry* entry, size_t pos ) {
    if ( pos < arraySize ) {
        *entry = cacheArray[pos];
    } else {
        file->seekg( pos * writeSize );
        file->read( (char*)entry, writeSize );
    }
}

void writeData( IndexEntry* entry, size_t pos ) {
    if ( pos < arraySize ) {
        cacheArray[pos] = *entry;
    } else {
        file->seekp( pos * writeSize );
        file->write( (char*)entry, writeSize );
    }
}
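
A call then looks like this (the file name and cache size are just examples):

// Hypothetical call: sort words.idx using up to 1 GiB of RAM as cache.
sortIDX( "words.idx", 1024UL * 1024 * 1024, false );

With a cache of n entries, the top log2(n) levels of the heap never touch the disk, which is where most of the sift-down accesses happen.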
BrainStone