-3

I have tried to implement a C QuickSelect algorithm as described in this post (3 way quicksort (C implementation)). However, all I get are performances 5 to 10 times less than the default qsort (even with an initial shuffling). I tried to dig into the original qsort source code as provide here (https://github.com/lattera/glibc/blob/master/stdlib/qsort.c), but it's too complex. Does anybody have a simpler, and better algorithm? Any idea is welcomed. Thanks, NB: my original problem is to try to get the Kth smallest values of an array to the first Kth indices. So I planned to call quickselect K times EDIT 1: Here is the Cython Code as copied and adapted from the link above

cdef void qswap(void* a, void* b, const size_t size) nogil:
    cdef char temp[size]# C99, use malloc otherwise
    #char serves as the type for "generic" byte arrays

    memcpy(temp, b,    size)
    memcpy(b,    a,    size)
    memcpy(a,    temp, size)

cdef void qshuffle(void* base, size_t num, size_t size) nogil: #implementation of Fisher
    cdef int i, j, tmp# create local variables to hold values for shuffle

    for i in range(num - 1, 0, -1): # for loop to shuffle
        j = c_rand() % (i + 1)#randomise j for shuffle with Fisher Yates
        qswap(base + i*size, base + j*size, size)

cdef void partition3(void* base,
                      size_t *low, size_t *high, size_t size,
                      QComparator compar) nogil:       
    # Modified median-of-three and pivot selection.                      
    cdef void *ptr = base
    cdef size_t lt = low[0]
    cdef size_t gt = high[0] # lt is the pivot
    cdef size_t i = lt + 1# (+1 !) we don't compare pivot with itself
    cdef int c = 0

    while (i <= gt):
        c = compar(ptr + i * size, ptr + lt * size)
        if (c < 0):# base[i] < base[lt] => swap(i++,lt++)
            qswap(ptr + lt * size, ptr + i * size, size)
            i += 1
            lt += 1
        elif (c > 0):#base[i] > base[gt] => swap(i, gt--)
            qswap(ptr + i * size, ptr + gt* size, size)
            gt -= 1
        else:#base[i] == base[gt]
            i += 1
    #base := [<<<<<lt=====gt>>>>>>]
    low[0] = lt                                          
    high[0] = gt 


cdef void qselectk3(void* base, size_t lo, size_t hi, 
   size_t size, size_t k, 
   QComparator compar) nogil:                                             
    cdef size_t low = lo                                          
    cdef size_t high = hi                                                      

    partition3(base, &low, &high,  size, compar)    

    if ((k - 1) < low): #k lies in the less-than-pivot partition           
        high = low - 1
        low = lo                      
    elif ((k - 1) >= low and  (k - 1) <= high): #k lies in the equals-to-pivot partition
        qswap(base, base + size*low, size)
        return                              
    else: # k > high => k lies in the greater-than-pivot partition                    
        low = high + 1
        high = hi 
    qselectk3(base, low, high, size, k, compar)

"""
 A selection algorithm to find the nth smallest elements in an unordered list. 
 these elements ARE placed at the nth positions of the input array                                                                         
"""
cdef void qselect(void* base, size_t num, size_t size,
                              size_t n,
                              QComparator compar) nogil:
    cdef int k
    qshuffle(base, num, size)
    for k in range(n):
        qselectk3(base + size*k, 0, num - k - 1, size, 1, compar)

I use python timeit to get the performance of both method pyselect(with N=50) and pysort. Like this

def testPySelect():
    A = np.random.randint(16, size=(10000), dtype=np.int32)
    pyselect(A, 50)
timeit.timeit(testPySelect, number=1)

def testPySort():
    A = np.random.randint(16, size=(10000), dtype=np.int32)
    pysort(A)
timeit.timeit(testPySort, number=1)
melpomene
  • 84,125
  • 8
  • 85
  • 148
N. Wells
  • 143
  • 1
  • 12
  • You need to show what you have tried, and also how you compiled it (compiling without optimisations on is the second most frequent cause of performance problems). – n. m. could be an AI Aug 25 '18 at 10:45
  • I have used to the code provided in the link AS IS. For compilation, I am using GCC 5 on Mac OS X Lion. – N. Wells Aug 25 '18 at 10:47
  • But which flags/options were used during the compilation? – ead Aug 25 '18 at 11:01
  • I also use openmp in the project. So besides numpy, includes, only openmp compile and linkage option (-fopenmp). – N. Wells Aug 25 '18 at 11:10
  • 1
    **compiling without optimisations on is the second most frequent cause of performance problems** – n. m. could be an AI Aug 25 '18 at 11:44
  • How big is K? And how big is N, the data set? Calling quick select multiple times rapidly becomes slower than calling quick sort once. AFAICR, quick select is an O(N.logN) operation like quick sort, so calling it K times risks the overall performance. – Jonathan Leffler Aug 25 '18 at 13:53
  • Benchmarking for N=50 makes little sense, algorithms are designed for larger data sets. And for such small problems, shuffling is counterproductive. –  Aug 25 '18 at 14:00
  • That#s obviously not C. Cython is a completely different language - apparently. – too honest for this site Aug 25 '18 at 14:05

2 Answers2

1

The answer by @chqrlie is the good and final answer, yet to complete the post, I am posting the Cython version along with the benchmarking results. In short, the proposed solution is 2 times faster than qsort on long vectors!


    cdef void qswap2(void *aptr, void *bptr, size_t size) nogil:
        cdef uint8_t* ac = <uint8_t*>aptr
        cdef uint8_t* bc = <uint8_t*>bptr
        cdef uint8_t t
        while (size > 0): t = ac[0]; ac[0] = bc[0]; bc[0] = t; ac += 1; bc += 1; size -= 1

    cdef struct qselect2_stack:
        uint8_t *base
        uint8_t *last

    cdef void qselect2(void *base, size_t nmemb, size_t size,
                      size_t k, QComparator compar) nogil:
        cdef qselect2_stack stack[64]
        cdef qselect2_stack *sp = &stack[0]

        cdef uint8_t *lb
        cdef uint8_t*ub
        cdef uint8_t *p
        cdef uint8_t *i
        cdef uint8_t *j
        cdef uint8_t *top

        if (nmemb < 2 or size <= 0):
            return

        top = <uint8_t *>base
        if(k < nmemb): 
            top += k*size 
        else: 
            top += nmemb*size

        sp.base = <uint8_t *>base
        sp.last = <uint8_t *>base + (nmemb - 1) * size
        sp += 1

        cdef size_t offset

        while (sp > stack):
            sp -= 1
            lb = sp.base
            ub = sp.last

            while (lb < ub and lb < top):
                #select middle element as pivot and exchange with 1st element
                offset = (ub - lb) >> 1
                p = lb + offset - offset % size
                qswap2(lb, p, size)

                #partition into two segments
                i = lb + size
                j = ub
                while 1:
                    while (i < j and compar(lb, i) > 0):
                        i += size
                    while (j >= i and compar(j, lb) > 0):
                        j -= size
                    if (i >= j):
                        break
                    qswap2(i, j, size)
                    i += size
                    j -= size

                # move pivot where it belongs
                qswap2(lb, j, size)

                # keep processing smallest segment, and stack largest
                if (j - lb <= ub - j):
                    sp.base = j + size
                    sp.last = ub
                    sp += 1
                    ub = j - size
                else:
                    sp.base = lb
                    sp.last = j - size
                    sp += 1
                    lb = j + size

    cdef int int_comp(void* a, void* b) nogil:
        cdef int ai = (<int*>a)[0] 
        cdef int bi = (<int*>b)[0]
        return (ai > bi ) - (ai < bi)

    def pyselect2(numpy.ndarray[int, ndim=1, mode="c"] na, int n):
        cdef int* a = <int*>&na[0]
        qselect2(a, len(na), sizeof(int), n, int_comp)

Here are the benchmark results (1,000 tests):

#of elements   K      #qsort (s)                     #qselect2 (s)
1,000          50     0.1261                         0.0895
1,000          100    0.1261                         0.0910

10,000         50     0.8113                         0.4157
10,000         100    0.8113                         0.4367
10,000         1,000  0.8113                         0.4746

100,000        100    7.5428                         3.8259
100,000        1,000  7,5428                         3.8325
100,000        10,000 7,5428                         4.5727

For those who are curious, this piece of code is a jewel in the field of surface reconstruction using neural networks. Thanks again to @chqrlie, your code is unique on The Web.

N. Wells
  • 143
  • 1
  • 12
0

Here is a quick implementation for your purpose: qsort_select is a simple implementation of qsort with automatic pruning of unnecessary ranges.

Without && lb < top, it behaves like the regular qsort except for pathological cases where more advanced versions have better heuristics. This extra test prevents complete sorting of ranges that are outside the target 0 .. (k-1). The function selects the k smallest values and sorts them, the rest of the array has the remaining values in an undefinite order.

#include <stdio.h>
#include <stdint.h>

static void exchange_bytes(uint8_t *ac, uint8_t *bc, size_t size) {
    while (size-- > 0) { uint8_t t = *ac; *ac++ = *bc; *bc++ = t; }
}

/* select and sort the k smallest elements from an array */
void qsort_select(void *base, size_t nmemb, size_t size,
                  int (*compar)(const void *a, const void *b), size_t k)
{
    struct { uint8_t *base, *last; } stack[64], *sp = stack;
    uint8_t *lb, *ub, *p, *i, *j, *top;

    if (nmemb < 2 || size <= 0)
        return;

    top = (uint8_t *)base + (k < nmemb ? k : nmemb) * size;
    sp->base = (uint8_t *)base;
    sp->last = (uint8_t *)base + (nmemb - 1) * size;
    sp++;
    while (sp > stack) {
        --sp;
        lb = sp->base;
        ub = sp->last;
        while (lb < ub && lb < top) {
            /* select middle element as pivot and exchange with 1st element */
            size_t offset = (ub - lb) >> 1;
            p = lb + offset - offset % size;
            exchange_bytes(lb, p, size);

            /* partition into two segments */
            for (i = lb + size, j = ub;; i += size, j -= size) {
                while (i < j && compar(lb, i) > 0)
                    i += size;
                while (j >= i && compar(j, lb) > 0)
                    j -= size;
                if (i >= j)
                    break;
                exchange_bytes(i, j, size);
            }
            /* move pivot where it belongs */
            exchange_bytes(lb, j, size);

            /* keep processing smallest segment, and stack largest */
            if (j - lb <= ub - j) {
                sp->base = j + size;
                sp->last = ub;
                sp++;
                ub = j - size;
            } else {
                sp->base = lb;
                sp->last = j - size;
                sp++;
                lb = j + size;
            }
        }
    }
}

int int_cmp(const void *a, const void *b) {
    int aa = *(const int *)a;
    int bb = *(const int *)b;
    return (aa > bb) - (aa < bb);
}

#define ARRAY_SIZE  50000

int array[ARRAY_SIZE];

int main(void) {
    int i;
    for (i = 0; i < ARRAY_SIZE; i++) {
        array[i] = ARRAY_SIZE - i;
    }
    qsort_select(array, ARRAY_SIZE, sizeof(*array), int_cmp, 50);
    for (i = 0; i < 50; i++) {
        printf("%d%c", array[i], i + 1 == 50 ? '\n' : ',');
    }
    return 0;
}
chqrlie
  • 131,814
  • 10
  • 121
  • 189
  • @N.Wells: this is a simple implementation of `qsort` with automatic pruning of unnecessary ranges: without ` && lb < top`, it behaves like the regular `qsort` except for pathological cases where more advanced versions have better heuristics. The extra test prevents complete sorting of ranges that are outside the target **0 .. (k-1)**. The function selects the `k` smallest values and sorts them, the rest of the array has the remaining values in an undefinite order. – chqrlie Aug 25 '18 at 15:32