Updated Question:
Hi. I'm trying to sort a index file in-place. The file consits out of 14B data chunks and usually are too large to be loaded into the RAM. The first 8B are the bytes I want to sort after. I implemented a Heapsort algorithm which so far except for performance is working great!
I am wondering whether my implementation could be improved and how I could possibly speed this process up by using some RAM. I was thinking about possibly partially keeping the heap in RAM but I'm not sure on how that would work.
My code so far:
sortidx.h
#ifndef SORTIDX_H
#define SORTIDX_H
// Includes
#include <atomic>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <thread>
// Constants
constexpr size_t hashSize = 8;
constexpr size_t offsetSize = 6;
constexpr size_t writeSize = hashSize + offsetSize;
// Typedefs & Structs
typedef std::lock_guard<std::mutex> scoped_lock;
struct IndexEntry {
unsigned char hash[hashSize]; // First 64 bits of the hash
unsigned char position[offsetSize]; // Position of word in dictionary (48-bit little endian integer)
} __attribute__( (__packed__) );
// Functions
bool operator>( const IndexEntry &rhs, const IndexEntry &lhs );
constexpr size_t getParent( size_t i ) {
return (i - 1) / 2;
}
constexpr size_t getLeft( size_t i ) {
return i * 2 + 1;
}
constexpr size_t getRight( size_t i ) {
return i * 2 + 2;
}
void sortIDX( std::string idxFile );
void heapifyIDX( size_t heapifyLimit );
void sortIDXHeap( size_t numDataSets );
void readData( IndexEntry* entry, size_t pos );
void writeData( IndexEntry* entry, size_t pos );
bool isInHeap( size_t pos );
void orderHeap( IndexEntry &top, size_t posTop );
#endif
sortidx.cpp
#include "sortidx.h"
using namespace std;
streampos fileSize;
size_t numDataSets;
size_t limit;
atomic<size_t> pos;
fstream* file;
bool operator>( const IndexEntry &rhs, const IndexEntry &lhs ) {
for ( size_t i = 0; i < hashSize; i++ ) {
if ( rhs.hash[i] > lhs.hash[i] )
return true;
else if ( rhs.hash[i] < lhs.hash[i] )
return false;
}
return false;
}
void sortIDX( string idxFile ) {
file = new fstream( idxFile, ios::in | ios::out | ios::binary | ios::ate );
fileSize = file->tellg();
numDataSets = fileSize / writeSize;
limit = numDataSets - 1;
const size_t localLimit = limit;
const size_t heapifyLimit = getParent( limit );
thread* sorterThread;
sorterThread = new thread( heapifyIDX, heapifyLimit );
while ( pos <= heapifyLimit ) {
// Some progressbar stuff (uses pos)
}
sorterThread->join();
delete sorterThread;
pos = 0;
sorterThread = new thread( sortIDXHeap, localLimit );
while ( pos < localLimit ) {
// Some progressbar stuff (uses pos)
}
sorterThread->join();
delete sorterThread;
file->close();
delete file;
}
void heapifyIDX( size_t heapifyLimit ) {
IndexEntry top;
size_t posTop;
for ( pos = 0; pos <= heapifyLimit; pos++ ) {
posTop = heapifyLimit - pos;
readData( &top, posTop );
orderHeap( top, posTop );
}
}
void sortIDXHeap( size_t numDataSets ) {
IndexEntry last;
IndexEntry top;
size_t posLast;
size_t posTop;
for ( pos = 0; pos < numDataSets; pos++ ) {
posLast = numDataSets - pos;
posTop = 0;
limit = posLast - 1;
readData( &last, posTop );
readData( &top, posLast );
writeData( &last, posLast );
orderHeap( top, posTop );
}
}
void readData( IndexEntry* entry, size_t pos ) {
file->seekg( pos * writeSize );
file->read( (char*)entry, writeSize );
}
void writeData( IndexEntry* entry, size_t pos ) {
file->seekp( pos * writeSize );
file->write( (char*)entry, writeSize );
}
bool isInHeap( size_t pos ) {
return pos <= limit;
}
void orderHeap( IndexEntry &top, size_t posTop ) {
static IndexEntry left;
static IndexEntry right;
static size_t posLeft;
static size_t posRight;
static bool swapped;
do {
posLeft = getLeft( posTop );
posRight = getRight( posTop );
if ( isInHeap( posLeft ) ) {
readData( &left, posLeft );
if ( isInHeap( posRight ) ) {
readData( &right, posRight );
if ( right > left ) {
if ( right > top ) {
writeData( &right, posTop );
posTop = posRight;
swapped = true;
} else {
swapped = false;
}
} else {
if ( left > top ) {
writeData( &left, posTop );
posTop = posLeft;
swapped = true;
} else {
swapped = false;
}
}
} else {
if ( left > top ) {
writeData( &left, posTop );
posTop = posLeft;
swapped = true;
} else {
swapped = false;
}
}
} else {
swapped = false;
}
} while ( swapped );
writeData( &top, posTop );
}
Original Question:
I hope you can help me out a little with a problem I have been stuck on quite some time.
I'm implementing a simple lookup table to quickly search a file. My current problem is the index file. Currently I'm iterating over the data file and create my index entries with 8 bytes of data that I'm going to look for followed by 6 bytes of data indicating the location of that data set in the original file. So my index file consits out of 14 byte blocks of data. Now I want to sort that file so I can easily find my data by doing binary search in the index file. That is the part where I'm struggeling so far.
I need to sort these 14 byte entries in-place by their first 8 bytes. Just sorting by the first 8 bytes shouldn't be too much of an issue. I'm rather puzzled over how I can sort the file itself.
I was thingking about either trying to implement a "iterator" class for the file so I could pass it to std::sort
which should do the job fairly well. But since I'm not sure what interfaces I should provide for that to work and I also can't read the current progress I did some research and was reminded of the Heapsort algorithm which sounds extremly good since it has O(n*log(n))
, is in-place and I can estimate the progress fairly well.
So far so good. I'm still a bit puzzeled over the actual implementation of this since I'm not sure what would be the best way of swapping several bytes of data in a file. Also I'm interested to hear whether you have alternative suggestions on how to sort this file since the index files are several GBs in size and performance matters a lot!