24

While answering another question, I wrote the program below to compare different search methods on a sorted array. Basically I compared two implementations of interpolation search against one implementation of binary search, measuring performance by counting the cycles spent (on the same data set) by each variant.

However, I'm sure there are ways to optimize these functions to make them even faster. Does anyone have ideas on how to make this search function faster? A solution in C or C++ is acceptable, but I need it to process an array of 100,000 elements.

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <assert.h>

// Read the CPU time-stamp counter; ".byte 0x0f, 0x31" emits the x86
// rdtsc instruction, and "=A" returns EDX:EAX in x on 32-bit targets.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned long long int x;
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
}

int interpolationSearch(int sortedArray[], int toFind, int len) {
    // Returns index of toFind in sortedArray, or -1 if not found
    int64_t low = 0;
    int64_t high = len - 1;
    int64_t mid;

    int l = sortedArray[low];
    int h = sortedArray[high];

    while (l <= toFind && h >= toFind) {
        // NB: if h == l this divides by zero (see Michael Burr's comment below)
        mid = low + (int64_t)((int64_t)(high - low)*(int64_t)(toFind - l))/((int64_t)(h-l));

        int m = sortedArray[mid];

        if (m < toFind) {
            l = sortedArray[low = mid + 1];
        } else if (m > toFind) {
            h = sortedArray[high = mid - 1];
        } else {
            return mid;
        }
    }

    if (sortedArray[low] == toFind)
        return low;
    else
        return -1; // Not found
}

int interpolationSearch2(int sortedArray[], int toFind, int len) {
    // Returns index of toFind in sortedArray, or -1 if not found
    int low = 0;
    int high = len - 1;
    int mid;

    int l = sortedArray[low];
    int h = sortedArray[high];

    while (l <= toFind && h >= toFind) {
        mid = low + ((float)(high - low)*(float)(toFind - l))/(1+(float)(h-l));
        int m = sortedArray[mid];

        if (m < toFind) {
            l = sortedArray[low = mid + 1];
        } else if (m > toFind) {
            h = sortedArray[high = mid - 1];
        } else {
            return mid;
        }
    }

    if (sortedArray[low] == toFind)
        return low;
    else
        return -1; // Not found
}

int binarySearch(int sortedArray[], int toFind, int len) 
{
    // Returns index of toFind in sortedArray, or -1 if not found
    int low = 0;
    int high = len - 1;
    int mid;

    int l = sortedArray[low];
    int h = sortedArray[high];

    while (l <= toFind && h >= toFind) {
        mid = (low + high)/2;

        int m = sortedArray[mid];

        if (m < toFind) {
            l = sortedArray[low = mid + 1];
        } else if (m > toFind) {
            h = sortedArray[high = mid - 1];
        } else {
            return mid;
        }
    }

    if (sortedArray[low] == toFind)
        return low;
    else
        return -1; // Not found
}

int order(const void *p1, const void *p2) { return *(int*)p1-*(int*)p2; }

int main(void) {
    int i = 0, j = 0, size = 100000, trials = 10000;
    int searched[trials];
    srand(-time(0));
    for (j=0; j<trials; j++) { searched[j] = rand()%size; }

    while (size > 10){
        int arr[size];
        for (i=0; i<size; i++) { arr[i] = rand()%size; }
        qsort(arr,size,sizeof(int),order);

        unsigned long long totalcycles_bs = 0;
        unsigned long long totalcycles_is_64 = 0;
        unsigned long long totalcycles_is_float = 0;
        unsigned long long totalcycles_new = 0; // spare slot for timing a new implementation
        int res_bs, res_is_64, res_is_float, res_new; // res_new is likewise unused for now
        for (j=0; j<trials; j++) {
            unsigned long long tmp, cycles = rdtsc();
            res_bs = binarySearch(arr,searched[j],size);
            tmp = rdtsc(); totalcycles_bs += tmp - cycles; cycles = tmp;

            res_is_64 = interpolationSearch(arr,searched[j],size);
            assert(res_is_64 == res_bs || arr[res_is_64] == searched[j]); 
            tmp = rdtsc(); totalcycles_is_64 += tmp - cycles; cycles = tmp;

            res_is_float = interpolationSearch2(arr,searched[j],size);
            assert(res_is_float == res_bs || arr[res_is_float] == searched[j]); 
            tmp = rdtsc(); totalcycles_is_float += tmp - cycles; cycles = tmp;
        }
        printf("----------------- size = %10d\n", size);
        printf("binary search          = %10llu\n", totalcycles_bs);
        printf("interpolation uint64_t = %10llu\n",  totalcycles_is_64);
        printf("interpolation float    = %10llu\n",  totalcycles_is_float);
        printf("new                    = %10llu\n",  totalcycles_new);
        printf("\n");
        size >>= 1;
    }
}
kriss
  • And how does this compare to STL functions? – Oliver Charlesworth Jan 20 '11 at 23:54
  • Out of interest, what are the results of your code, on your system? – Andrew Cooper Jan 20 '11 at 23:58
  • @Andrew Cooper: 100000 * 10000 : binary search = 3874977, interpolation uint64_t = 2986578, interpolation float = 2334951. – kriss Jan 21 '11 at 00:08
  • @Oli Charlesworth: I didn't try to compare them, so I don't know. But it could indeed be interesting to test them. – kriss Jan 21 '11 at 00:10
  • I believe that `interpolationSearch()` will hit a divide-by-0 if the array to search has only one element that happens to be what's being looked for. – Michael Burr Jan 21 '11 at 00:54
  • One more vote and this question will be closed! I'm wondering what the rationale is for people voting to close. Polemic question? Not a real question? I'm asking for actual alternate implementations of a search function to test performance. – kriss Jan 21 '11 at 08:40
  • I would have thought this question is far from being ready to close. – Andrew Bainbridge Jan 21 '11 at 10:43
  • @Oli Charlesworth, @Jon, @David Heffernan, @dalle, @Graviton: Hey guys, don't you understand that when someone upvotes a question, it implies they believe answering it is possible? OK, I understand you are not interested in the answer, but then why not downvote instead of closing? Because it's cheaper (no rep)? Also, would you bother to explain your rationale? I thought the question was simple: "propose/suggest the fastest C or C++ implementation to solve the search problem with the provided interface and testing context"... what is not real here? How can I improve it in your eyes if you give no hint? – kriss Jan 21 '11 at 11:50
  • @kriss: My rationale was "not a real question"! It's not solving a real problem, there's no fixed answer, it doesn't really help anyone. Hence my comment above "how does it compare to STL?" The point being, this problem (a fast binary search) has already been solved in the overwhelming majority of situations. – Oliver Charlesworth Jan 21 '11 at 11:57
  • @Oli Charlesworth: Questions like this are fun and informative. Learning how to make the code faster is the aim. And the question is written in C, where STL doesn't exist. – Andrew Bainbridge Jan 21 '11 at 12:19
  • @Andrew: They may be fun and informative, but they don't have an answer. They're really more of a discussion, and so aren't really appropriate. The question may have been written in C, but that doesn't stop you reading an STL implementation, and translating the algorithm. (Note also that there was a C++ tag.) – Oliver Charlesworth Jan 21 '11 at 12:23
  • @Oli: you mean one should never question and test the actual efficiency of "solved" subjects, or ever speak of well-known solutions (probably 99% of SO content)? If STL is actually faster in this precise case (I published my testing environment), it should be simple to propose an answer using it, and the last word would be *use STL*. That's exactly what you are forbidding others to do, including me, by voting to close. But what bothers me more is that you seem to imply there is no way to improve the question... and that the subject is doomed. Why not let others try and see if it's true? – kriss Jan 21 '11 at 12:27
  • Whether something has an answer is only part of the usefulness of the question being here on SO. There are clearly things to be learned here. Besides, the question is answerable. I have a faster implementation that I now can't post. – Andrew Bainbridge Jan 21 '11 at 12:39
  • @Andrew: I don't doubt that there's useful things to be learnt here, but it's more appropriate for community wiki. There's no definitive answer (not least because the definition of "fastest" depends on platform and other things), and it could easily degenerate into a pissing contest. – Oliver Charlesworth Jan 21 '11 at 12:46
  • @Oli: What is "community wiki"? http://communitywiki.org/ doesn't seem to have anything to do with programming. – Andrew Bainbridge Jan 21 '11 at 14:00
  • @Andrew: See http://meta.stackexchange.com/questions/11740/what-are-community-wiki-posts. – Oliver Charlesworth Jan 21 '11 at 14:24
  • Cycle counting makes your metric highly chip dependent, and since you express the algorithms in C, compiler dependent as well. Bad metric. – dmckee --- ex-moderator kitten Jan 22 '11 at 23:41
  • I cast the final vote to reopen the question. I agree with Oli that this may be mostly an academic exercise, and when circumstances allow, you should use one of the standard sorts provided by a library of your choice. But that doesn't make it "not a real question". I also edited the question to remove the "Code Golf" aspects, and convert it into a strictly code-based optimization question. I don't see anyone else's rationale for closing that I find compelling. – Cody Gray - on strike Jan 24 '11 at 04:56
  • @dmckee: true enough, the metric used is highly context dependent. That implies I should expose results as a matrix including processors, compilers, input data distribution, etc., to specify the context. And what can you suggest as a better metric in the *real world*? When running a program I'm not much interested in its theoretical Big-O run time (I was when looking for a good algorithm), but in its actual run time. – kriss Jan 24 '11 at 06:15
  • @kriss: It means the answer is *always* "try it and see". Or more often, "try any O(ln N) search and quit 'cause it's fast enough". And it means this question is Too Localized. – dmckee --- ex-moderator kitten Jan 25 '11 at 02:09
  • @dmckee: well, I believe I chose my title poorly. The goal was more about how implementation details can change the behavior of an algorithm, and, say, move the point where interpolationSearch becomes faster or slower than binary search. But the actual answers are more about changing the algorithm than changing the implementation. So indeed it's probably too localized. But really, in my daily job I work on similar problems where extracting the last bit of speed out of a real system is important. – kriss Jan 25 '11 at 09:45
  • The fastest search method is going to depend on the values in your array. Binary will probably be faster than interpolation when the data is smooth (values are equally spaced), but interpolation methods will be better when the data is exponential or logarithmic. You could also consider an algorithm like this one (http://stackoverflow.com/questions/4171966/two-egg-problem-confusion), which would guarantee a hit within X accesses to the array. – oosterwal Feb 01 '11 at 21:59

8 Answers

17

If you have some control over the in-memory layout of the data, you might want to look at Judy arrays.

Or to put a simpler idea out there: a binary search always cuts the search space in half. An optimal cut point can be found with interpolation (the cut point should NOT be the place where the key is expected to be, but the point which minimizes the statistical expectation of the search space for the next step). This minimizes the number of steps, but... not all steps have equal cost. Hierarchical memories allow executing a number of tests in the same time as a single test, if locality can be maintained. Since a binary search's first M steps only touch a maximum of 2**M unique elements, storing these together yields a much better reduction of search space per cache-line fetch (not per comparison), which is higher performance in the real world.

n-ary trees work on that basis, and then Judy arrays add a few less important optimizations.

Bottom line: even "Random Access Memory" (RAM) is faster when accessed sequentially than randomly. A search algorithm should use that fact to its advantage.
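For a concrete flavor of that layout idea, here is a minimal sketch (mine, not from this answer) of a search over a BFS-order ("Eytzinger") arrangement, where node k's children sit at indices 2k and 2k+1, so the first few levels of the implicit tree share a handful of cache lines. Building the eytzinger array from the sorted data is assumed to happen beforehand.

#include <stddef.h>

/* Sketch: binary search over an implicit tree stored in BFS order.
   eytzinger[1] is the root; the children of node k are at 2k and 2k+1,
   so the first few probes land on a few adjacent cache lines.
   Returns the eytzinger index of key, or 0 if not found. */
size_t eytzingerSearch(const int *eytzinger, size_t n, int key) {
    size_t k = 1;
    while (k <= n) {
        if (eytzinger[k] == key)
            return k;
        k = 2 * k + (eytzinger[k] < key); /* left child if key is smaller */
    }
    return 0; /* fell off the tree: key is absent */
}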

Ben Voigt
9

Benchmarked on Win32, Core2 Quad Q6600, gcc v4.3 (msys). Compiled with g++ -O3, nothing fancy.

Observation: the assert, timing and loop overhead is about 40%, so any gains listed below should be divided by 0.6 to get the actual improvement in the algorithms under test.

Simple answers:

  1. On my machine, replacing the int64_t with int for "low", "high" and "mid" in interpolationSearch gives a 20% to 40% speed-up (see the sketch after this list). This is the fastest easy method I could find. It takes about 150 cycles per look-up on my machine (for the array size of 100000). That's roughly the same number of cycles as a cache miss, so in real applications, looking after your cache is probably going to be the biggest factor.

  2. Replacing binarySearch's "/2" with a ">>1" gives a 4% speed up.

  3. Using STL's binary_search algorithm, on a vector containing the same data as "arr", is about the same speed as the hand-coded binarySearch, although at the smaller sizes STL is much slower, around 40%.
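To make item 1 concrete, here is a hedged sketch (function name mine) of what the narrowed-index variant might look like. Note that the interpolation product itself must stay 64-bit, since (high - low) * (toFind - l) can overflow int for this data, and the h == l divide-by-zero noted in the question's comments still applies.

#include <stdint.h>

/* Sketch of item 1: "low", "high" and "mid" as plain int; only the
   interpolation product is widened to 64 bits to avoid overflow. */
int interpolationSearchInt(int sortedArray[], int toFind, int len) {
    int low = 0, high = len - 1;
    int l = sortedArray[low], h = sortedArray[high];
    while (l <= toFind && h >= toFind) {
        /* as in the original, h == l here would divide by zero */
        int mid = low + (int)((int64_t)(high - low) * (toFind - l) / (h - l));
        int m = sortedArray[mid];
        if (m < toFind)      l = sortedArray[low = mid + 1];
        else if (m > toFind) h = sortedArray[high = mid - 1];
        else                 return mid;
    }
    return (sortedArray[low] == toFind) ? low : -1;
}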

Andrew Bainbridge
  • Nice stats. Kindly point me to the tool etc. you used for benchmarking; I am totally new to this field. – Anupam Saini Jul 08 '11 at 10:08
  • I didn't use any tool, just the compiler. The original poster's program already timed itself. I just ran it many times with various modifications. I seem to remember that I changed exactly what was timed as well, but I've forgotten the details. – Andrew Bainbridge Jul 08 '11 at 13:52
  • Ah, in that case I will modify the program itself. Thanks for the help, Andrew. – Anupam Saini Jul 12 '11 at 05:30
  • +1 for benchmarking it and taking the time to post your results. – SmacL May 24 '13 at 07:25
4

I have an excessively complicated solution, which requires a specialized sorting function. The sort is slightly slower than a good quicksort, but all of my tests show that the search function is much faster than a binary or interpolation search. I called it a regression sort before I found out that the name was already taken, but didn't bother to think of a new name (ideas?).

There are three files to demonstrate.

The regression sort/search code:

#include <sstream>
#include <math.h>
#include <ctime>
#include "limits.h"

void insertionSort(int array[], int length) {
   int key, j;
   for(int i = 1; i < length; i++) {
      key = array[i];
      j = i - 1;
      while (j >= 0 && array[j] > key) {
         array[j + 1] = array[j];
         --j;
      }
      array[j + 1] = key;
   }
}

class RegressionTable {
   public:
      RegressionTable(int arr[], int s, int lower, int upper, double mult, int divs);
      RegressionTable(int arr[], int s);
      void sort(void);
      int find(int key);
      void printTable(void);
      void showSize(void);
   private:
      void createTable(void);
      inline unsigned int resolve(int n);
      int * array;
      int * table;
      int * tableSize;
      int size;
      int lowerBound;
      int upperBound;
      int divisions;
      int divisionSize;
      int newSize;
      double multiplier;
};

RegressionTable::RegressionTable(int arr[], int s) {
   array = arr;
   size = s;
   multiplier = 1.35;
   divisions = sqrt(size);
   upperBound = INT_MIN;
   lowerBound = INT_MAX;
   for (int i = 0; i < size; ++i) {
      if (array[i] > upperBound)
         upperBound = array[i];
      if (array[i] < lowerBound)
         lowerBound = array[i];
   }
   createTable();
}

RegressionTable::RegressionTable(int arr[], int s, int lower, int upper, double mult, int divs) {
   array = arr;
   size = s;
   lowerBound = lower;
   upperBound = upper;
   multiplier = mult;
   divisions = divs;
   createTable();
}

void RegressionTable::showSize(void) {
   int bytes = sizeof(*this);
   bytes = bytes + sizeof(int) * 2 * (divisions + 1);
}

void RegressionTable::createTable(void) {
   divisionSize = size / divisions;
   newSize = multiplier * double(size);
   table = new int[divisions + 1];
   tableSize = new int[divisions + 1];

   for (int i = 0; i <= divisions; ++i) {  // include the last slot; both arrays have divisions + 1 entries
      table[i] = 0;
      tableSize[i] = 0;
   }

   for (int i = 0; i < size; ++i) {
      ++table[((array[i] - lowerBound) / divisionSize) + 1];
   }

   for (int i = 1; i <= divisions; ++i) {
      table[i] += table[i - 1];
   }
   table[0] = 0;

   for (int i = 0; i < divisions; ++i) {
      tableSize[i] = table[i + 1] - table[i];
   }
}

int RegressionTable::find(int key) {
   double temp = multiplier;
   multiplier = 1;

   int minIndex = table[(key - lowerBound) / divisionSize];
   int maxIndex = minIndex + tableSize[(key - lowerBound) / divisionSize]; // same bucket as minIndex
   int guess = resolve(key);
   double t;
   while (array[guess] != key) {
      // uncomment this line if you want to see where it is searching.
      //cout << "Regression Guessing " << guess << ", not there." << endl;
      if (array[guess] < key) {
         minIndex = guess + 1;
      }
      if (array[guess] > key) {
         maxIndex = guess - 1;
      }
      if (array[minIndex] > key || array[maxIndex] < key) {
         return -1;
      }
      t = ((double)key - array[minIndex]) / ((double)array[maxIndex] - array[minIndex]);
      guess = minIndex + t * (maxIndex - minIndex);
   }

   multiplier = temp;

   return guess;
}

inline unsigned int RegressionTable::resolve(int n) {
   float temp;
   int subDomain = (n - lowerBound) / divisionSize;
   temp = (n - lowerBound) % divisionSize;  // offset within the sub-domain
   temp /= divisionSize;
   temp *= tableSize[subDomain];
   temp += table[subDomain];
   temp *= multiplier;
   return (unsigned int)temp;
}

void RegressionTable::sort(void) {
   int * out = new int[int(size * multiplier)];
   bool * used = new bool[int(size * multiplier)];
   int higher, lower;
   bool placed;

   // new bool[] leaves the flags uninitialized; they must start out clear
   for (int i = 0; i < int(size * multiplier); ++i)
      used[i] = false;

   for (int i = 0; i < size; ++i) {

      /* Figure out where to put the darn thing */
      higher = resolve(array[i]);
      lower = higher - 1;

      if (higher > newSize) {
         higher = size;
         lower = size - 1;
      } else if (lower < 0) {
         higher = 0;
         lower = 0;
      }
      placed = false;
      while (!placed) {
         if (higher < size && !used[higher]) {
            out[higher] = array[i];
            used[higher] = true;
            placed = true;
         } else if (lower >= 0 && !used[lower]) {
            out[lower] = array[i];
            used[lower] = true;
            placed = true;
         }
         --lower;
         ++higher;
      }
   }
   int index = 0;
   for (int i = 0; i < size * multiplier; ++i) {
      if (used[i]) {
         array[index] = out[i];
         ++index;
      }
   }

   delete [] out;   // free the scratch buffers to avoid leaking them
   delete [] used;

   insertionSort(array, size);
}

And then there is the regular search functions:

#include <iostream>
using namespace std;

int binarySearch(int array[], int start, int end, int key) {
   // Determine the search point.
   int searchPos = (start + end) / 2;
   // If we crossed over our bounds or met in the middle, then it is not here.
   if (start >= end)
      return -1;
   // Search the bottom half of the array if the query is smaller.
   if (array[searchPos] > key)
      return binarySearch (array, start, searchPos - 1, key);
   // Search the top half of the array if the query is larger.
   if (array[searchPos] < key)
      return binarySearch (array, searchPos + 1, end, key);
   // Otherwise we found it (the > and < cases are excluded above), so
   // return unconditionally; this also ensures every path returns a value.
   return searchPos;
}

int binarySearch(int array[], int size, int key) {
   return binarySearch(array, 0, size - 1, key);
}

int interpolationSearch(int array[], int size, int key) {
   int guess = 0;
   double t;
   int minIndex = 0;
   int maxIndex = size - 1;
   while (array[guess] != key) {

      t = ((double)key - array[minIndex]) / ((double)array[maxIndex] - array[minIndex]);
      guess = minIndex + t * (maxIndex - minIndex);

      if (array[guess] < key) {
         minIndex = guess + 1;
      }
      if (array[guess] > key) {
         maxIndex = guess - 1;
      }
      if (array[minIndex] > key || array[maxIndex] < key) {
         return -1;
      }
   }

   return guess;
}

And then I wrote a simple main to test out the different sorts.

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <string>
#include "regression.h"
#include "search.h"
using namespace std;

void randomizeArray(int array[], int size) {
   for (int i = 0; i < size; ++i) {
      array[i] = rand() % size;
   }
}

int main(int argc, char * argv[]) {

   int size = 100000;
   string arg;
   if (argc > 1) {
      arg = argv[1];
      size = atoi(arg.c_str());
   }
   srand(time(NULL));
   int * array;
   cout << "Creating Array Of Size " << size << "...\n";
   array = new int[size];

   randomizeArray(array, size);
   cout << "Sorting Array...\n";
   RegressionTable t(array, size, 0, size*2.5, 1.5, size);
   //RegressionTable t(array, size);
   t.sort();
   int trials = 10000000;
   int start;

   cout << "Binary Search...\n";
   start = clock();
   for (int i = 0; i < trials; ++i) {
      binarySearch(array, size, i % size);
   }
   cout << clock() - start << endl;

   cout << "Interpolation Search...\n";
   start = clock();
   for (int i = 0; i < trials; ++i) {
      interpolationSearch(array, size, i % size);
   }
   cout << clock() - start << endl;

   cout << "Regression Search...\n";
   start = clock();
   for (int i = 0; i < trials; ++i) {
      t.find(i % size);
   }
   cout << clock() - start << endl;

   return 0;
}

Give it a try and tell me if it's faster for you. It's super complicated, so it's really easy to break if you don't know what you are doing. Be careful about modifying it.

I compiled the main with g++ on ubuntu.

regality
  • It looks interesting. I will test it tonight. But to make things clear, the test run is done on an already-sorted array; hence the sort performance will not be included in the test, just that of the search function. – kriss Jan 21 '11 at 09:51
  • I tried it and it is 2 times faster than interpolation. Yet I will have to look at the details to understand if it really is an answer (if it is, I could also accept set or hash table implementations, whose find will also be faster). +1 anyway. – kriss Jan 24 '11 at 06:31
3

Look first at the data and at whether a big gain can be had from a data-specific method over a general method.

For large static sorted datasets, you can create an additional index to provide partial pigeonholing, based on the amount of memory you're willing to use. E.g., say we create a 256x256 two-dimensional array of ranges, which we populate with the start and end positions in the search array of elements with the corresponding high-order bytes. When we come to search, we then use the high-order bytes of the key to find the range/subset of the array we need to search. If we had ~20 comparisons on our binary search of 100,000 elements, O(log2(n)), we're now down to ~4 comparisons for 16 elements, or O(log2(n/15)). The memory cost here is about 512k.
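A minimal sketch of that scheme, with some simplifying assumptions of mine: non-negative int keys, the top 16 bits as the bucket number, and a flat bucketStart array of 65537 start positions instead of a 256x256 start/end table (bucket b's range is then [bucketStart[b], bucketStart[b+1])).

#define BUCKETS 65536 /* one bucket per 16-bit high-order prefix */

/* Build the index: bucketStart[b] is the first position whose key has a
   high-order prefix >= b; bucketStart[BUCKETS] ends up equal to len. */
void buildIndex(const int sortedArray[], int len, int bucketStart[BUCKETS + 1]) {
    int pos = 0;
    for (int b = 0; b <= BUCKETS; b++) {
        while (pos < len && (sortedArray[pos] >> 16) < b)
            pos++;
        bucketStart[b] = pos;
    }
}

/* Search with a plain binary search, but only inside the key's bucket. */
int indexedSearch(const int sortedArray[], const int bucketStart[BUCKETS + 1],
                  int toFind) {
    int low = bucketStart[toFind >> 16];
    int high = bucketStart[(toFind >> 16) + 1] - 1;
    while (low <= high) {
        int mid = low + (high - low) / 2;
        if (sortedArray[mid] < toFind)      low = mid + 1;
        else if (sortedArray[mid] > toFind) high = mid - 1;
        else                                return mid;
    }
    return -1;
}

Note that the payoff depends on the keys actually spreading across the prefixes; the question's rand() % size data would land almost entirely in buckets 0 and 1.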

Another method, again suited to data that doesn't change much, is to divide the data into arrays of commonly sought items and rarely sought items. For example, if you leave your existing search in place, run a wide range of real-world cases over a protracted testing period, and log the details of the item being sought, you may well find that the distribution is very uneven, i.e. some values are sought far more often than others. If this is the case, break your array into a much smaller array of commonly sought values and a larger remaining array, and search the smaller array first. If the data is right (big if!), you can often achieve broadly similar improvements to the first solution without the memory cost.
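A hedged sketch of that split, reusing the question's binarySearch; hotPos (my name) is a side table mapping hot-array slots back to positions in the original array:

/* Sketch: probe a small, cache-resident array of frequently sought
   values first, then fall back to the full array. */
int splitSearch(int hot[], int hotPos[], int hotLen,
                int sortedArray[], int len, int toFind) {
    int i = binarySearch(hot, toFind, hotLen);     /* small and hot in cache */
    if (i >= 0)
        return hotPos[i];  /* translate the hot slot to the original index */
    return binarySearch(sortedArray, toFind, len); /* full fallback */
}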

There are many other data specific optimizations which score far better than trying to improve on tried, tested and far more widely used general solutions.

SmacL
  • 22,555
  • 12
  • 95
  • 149
3

Unless your data is known to have special properties, pure interpolation search has the risk of taking linear time. If you expect interpolation to help with most data but don't want it to hurt in the case of pathological data, I would use a (possibly weighted) average of the interpolated guess and the midpoint, ensuring a logarithmic bound on the run time.
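A minimal sketch of such a hybrid (mine), using the question's data types and the equal weighting R.. proposes in the comments below; the blended probe always lies in the middle half of [low, high], so every step discards at least a quarter of the range, which keeps the bound logarithmic.

#include <stdint.h>

int hybridSearch(int sortedArray[], int toFind, int len) {
    int low = 0, high = len - 1;
    int l = sortedArray[low], h = sortedArray[high];
    while (l <= toFind && h >= toFind) {
        /* interpolated guess (the +1 dodges a divide by zero) ... */
        int mid = low + (int)((int64_t)(high - low) * (toFind - l) / (h - l + 1));
        /* ... averaged with the bisection midpoint */
        mid = (2 * mid + low + high) / 4;
        int m = sortedArray[mid];
        if (m < toFind)      l = sortedArray[low = mid + 1];
        else if (m > toFind) h = sortedArray[high = mid - 1];
        else                 return mid;
    }
    return (sortedArray[low] == toFind) ? low : -1;
}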

R.. GitHub STOP HELPING ICE
  • @R..: in that case I'm just curious; in the actual cases I encounter, a binary search is usually more than good enough for me. The data set tested here is pseudo-random, as you can see, and I'm not really worried about seeing interpolation degenerate, but maybe comparing the interpolated point and the midpoint could give a way to choose whether to continue the next steps using binary search only. – kriss Jan 21 '11 at 00:17
  • As a real-world example of interpolation degenerating, one application for search is seek-to-timestamp in unindexed video files. For any video where the typical bitrate is low but the peak bitrate is very high, interpolation-based search can lead to pathologically many seek-to-byte operations on the underlying file (or worse yet, network stream). – R.. GitHub STOP HELPING ICE Jan 21 '11 at 00:41
  • I think it's easier to use an approach similar to that of introsort to ensure logarithmic run-time bounds for degenerate cases: count the number of iterations done so far, and if it takes too many iterations (defined as some multiple of the logarithm) to find the correct position, switch to binary search. That might have a smaller impact on performance than generally modifying the search position for non-degenerate cases (assuming the branch that switches to binary gets correctly predicted, which should be the case). Of course this prediction is purely theoretical... – Grizzly Jan 21 '11 at 02:16
  • I think you fail to understand how easy the hybrid algorithm is. In your interpolation version after the line `mid = ...`, just add `mid = (2*mid + low + hi)/4;` – R.. GitHub STOP HELPING ICE Jan 21 '11 at 04:14
  • @R..: I tried the hybrid version. On my test set it's slower than both interpolation search and binary search in nearly every case, including degenerate ones (which happen with random data). However, I should probably prepare a worst-case test set to see how it really performs. – kriss Jan 21 '11 at 08:34
  • Try searching an exponentially spaced array, e.g. 1,2,4,8,16,32,64,128,... Use base 1.03125 or so instead of base 2 if you want to make it big enough to test well. – R.. GitHub STOP HELPING ICE Jan 21 '11 at 16:20
3

One way of approaching this is to use a space versus time trade-off. There are any number of ways that could be done. The extreme way would be to simply make an array with the max size being the max value of the sorted array. Initialize each position with the index into sortedArray. Then the search would simply be O(1).
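A minimal sketch of that extreme version, assuming non-negative keys bounded by a known maxValue (the names here are mine):

#include <stdlib.h>

/* Sketch: one slot per possible key value. positions[v] holds an index
   of v in sortedArray, or -1 if v is absent; a search is then one load. */
int *buildDirectTable(const int sortedArray[], int len, int maxValue) {
    int *positions = malloc(((size_t)maxValue + 1) * sizeof *positions);
    if (positions == NULL)
        return NULL;
    for (int v = 0; v <= maxValue; v++)
        positions[v] = -1;
    for (int i = 0; i < len; i++)
        positions[sortedArray[i]] = i; /* any duplicate's index will do */
    return positions;
}

/* usage: int idx = positions[toFind];   // -1 means not found */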

The following version, however, might be a little more realistic and possibly be useful in the real world. It uses a "helper" structure that is initialized on the first call. It maps the search space down to a smaller space by dividing by a number that I pulled out of the air without much testing. It stores the index of the lower bound for a group of values in sortedArray into the helper map. The actual search divides the toFind number by the chosen divisor and extracts the narrowed bounds of sortedArray for a normal binary search.

For example, if the sorted values range from 1 to 1000 and the divisor is 100, then the lookup array might contain 10 "sections". To search for value 250, it would divide it by 100 to yield integer index position 250/100=2. map[2] would contain the sortedArray index for values 200 and larger. map[3] would have the index position of values 300 and larger thus providing a smaller bounding position for a normal binary search. The rest of the function is then an exact copy of your binary search function.

The initialization of the helper map might be more efficient by using a binary search to fill in the positions rather than a simple scan, but it is a one time cost so I didn't bother testing that. This mechanism works well for the given test numbers which are evenly distributed. As written, it would not be as good if the distribution was not even. I think this method could be used with floating point search values too. However, extrapolating it to generic search keys might be harder. For example, I am unsure what the method would be for character data keys. It would need some kind of O(1) lookup/hash that mapped to a specific array position to find the index bounds. It's unclear to me at the moment what that function would be or if it exists.

I kludged the setup of the helper map in the following implementation pretty quickly. It is not pretty and I'm not 100% sure it is correct in all cases but it does show the idea. I ran it with a debug test to compare the results against your existing binarySearch function to be somewhat sure it works correctly.

The following are example numbers:

100000 * 10000 : cycles binary search          = 10197811
100000 * 10000 : cycles interpolation uint64_t = 9007939
100000 * 10000 : cycles interpolation float    = 8386879
100000 * 10000 : cycles binary w/helper        = 6462534

Here is the quick-and-dirty implementation:

#define REDUCTION 100  // pulled out of the air
typedef struct {
    int init;  // have we initialized it?
    int numSections;
    int *map;
    int divisor;
} binhelp;

int binarySearchHelp( binhelp *phelp, int sortedArray[], int toFind, int len)
{
    // Returns index of toFind in sortedArray, or -1 if not found
    int low;
    int high;
    int mid;

    if ( !phelp->init && len > REDUCTION ) {
        int i;
        int numSections = len / REDUCTION;
        int divisor = (( sortedArray[len-1] - 1 ) / numSections ) + 1;
        int threshold;
        int arrayPos;

        phelp->init = 1;
        phelp->divisor = divisor;
        phelp->numSections = numSections;
        phelp->map = (int*)malloc((numSections+2) * sizeof(int));
        phelp->map[0] = 0;
        phelp->map[numSections+1] = len-1;
        arrayPos = 0;
        // Scan through the array and set up the mapping positions.  Simple linear
        // scan but it is a one-time cost.
        for ( i = 1; i <= numSections; i++ ) {
            threshold = i * divisor;
            while ( arrayPos < len && sortedArray[arrayPos] < threshold )
                arrayPos++;
            if ( arrayPos < len )
                phelp->map[i] = arrayPos;
            else
                // kludge to take care of aliasing
                phelp->map[i] = len - 1;
        }
    }

    if ( phelp->init ) {
        int section = toFind / phelp->divisor;
        if ( section > phelp->numSections )
            // it is bigger than all values
            return -1;

        low = phelp->map[section];
        if ( section == phelp->numSections )
            high = len - 1;
        else
            high = phelp->map[section+1];
    } else {
        // use normal start points
        low = 0;
        high = len - 1;
    }

    // the following is a direct copy of kriss's binarySearch
    int l = sortedArray[low];
    int h = sortedArray[high];

    while (l <= toFind && h >= toFind) {
        mid = (low + high)/2;

        int m = sortedArray[mid];

        if (m < toFind) {
            l = sortedArray[low = mid + 1];
        } else if (m > toFind) {
            h = sortedArray[high = mid - 1];
        } else {
            return mid;
        }
    }

    if (sortedArray[low] == toFind)
        return low;
    else
        return -1; // Not found
}

The helper structure needs to be initialized (and memory freed):

    binhelp help;   // declare the helper before the timing loops
    help.init = 0;
    unsigned long long totalcycles4 = 0;
    ... make the calls same as for the other ones but pass the structure ...
        binarySearchHelp(&help, arr, searched[j], size);
    if ( help.init )
        free( help.map );
    help.init = 0;
Mark Wilkins
2

Posting my current version before the question is closed (hopefully I will thus be able to enhance it later). For now it is worse than every other version (if someone understands why my changes to the end of the loop have this effect, comments are welcome).

int newSearch(int sortedArray[], int toFind, int len) 
{
    // Returns index of toFind in sortedArray, or -1 if not found
    int low = 0;
    int high = len - 1;
    int mid;

    int l = sortedArray[low];
    int h = sortedArray[high];

    while (l < toFind && h > toFind) {
        mid = low + ((float)(high - low)*(float)(toFind - l))/(1+(float)(h-l));

        int m = sortedArray[mid];

        if (m < toFind) {
            l = sortedArray[low = mid + 1];
        } else if (m > toFind) {
            h = sortedArray[high = mid - 1];
        } else {
            return mid;
        }
    }

    if (l == toFind)
        return low;
    else if (h == toFind)
        return high;
    else
        return -1; // Not found
}
kriss
0

The implementation of the binary search that was used for the comparisons can be improved. The key idea is to "normalize" the range initially, so that after the first step the target is always > a minimum and < a maximum. This increases the size of the termination delta. It also special-cases targets that are less than the first element of the sorted array or greater than the last element. Expect approximately a 15% improvement in search time. Here is what the code might look like in C++.

int binarySearch(int * &array, int target, int min, int max)
{ // binarySearch
  // normalize min and max so that we know the target is > min and < max
  if (target <= array[min]) // if min not normalized
  {
      if (target == array[min]) return min;
      return -1;
  }
  // min is now normalized

  if (target >= array[max]) // if max not normalized
  {
      if (target == array[max]) return max;
      return -1;
  }
  // max is now normalized

  while (min + 1 < max)
  { // delta >= 2
    int tempi = min + ((max - min) >> 1); // index approximately midway between min and max
    int atempi = array[tempi];            // just in case the compiler does not optimize this
    if (atempi > target)
      max = tempi;  // the target is smaller, so we can decrease max and it stays normalized
    else if (atempi < target)
      min = tempi;  // the target is bigger, so we can increase min and it stays normalized
    else
      return tempi; // we found the target; return its index
    // Note: the equality test comes last because it rarely succeeds.
  } // end delta >= 2
  return -1; // nothing in between the normalized min and max
} // end binarySearch
Peter Baum