1

I made this segmented sieve that uses wheel factorization. Here you find the explanation.

By setting the wheel size to 210 and using a segment vector uint8_t of size 277140 = 6 * (11 * 13 * 17 * 19 + 1) = nB*(segment_size+1) the sieve is fast enough for n=10^9.

If you want to decrease the segment memory, for example setting segment_size_min=2048 and int64_t segment_size = 4; a vector uint8_t of size 58350 = 6 * (4 * 11 * 13 * 17 + 1) is used but the execution time is longer, about double.



///     This is a implementation of the bit wheel segmented sieve 
///     with max base wheel size choice  30 , 210 , 2310

#include <iostream>
#include <cmath>
#include <vector>
#include <cstdlib>
#include <stdint.h>

const int64_t n_PB_max = 5;
const int64_t PrimesBase[n_PB_max] = {2,3,5,7,11};

const int64_t del_bit[8] =
{
  ~(1 << 0),~(1 << 1),~(1 << 2),~(1 << 3),
  ~(1 << 4),~(1 << 5),~(1 << 6),~(1 << 7)
};

const int64_t bit_count[256] =
{
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

int64_t Euclidean_Diophantine( int64_t coeff_a, int64_t  coeff_b)
{
    // return y in  Diophantine equation  coeff_a x + coeff_b y  = 1
    int64_t k=1;
    std::vector<int64_t> div_t; 
    std::vector<int64_t> rem_t;
    std::vector<int64_t> coeff_t;
    div_t.push_back(coeff_a);
    rem_t.push_back(coeff_b);
    coeff_t.push_back((int64_t) 0);
    div_t.push_back((int64_t) div_t[0] / rem_t[0]);
    rem_t.push_back((int64_t) div_t[0] % rem_t[0]);
    coeff_t.push_back((int64_t) 0);
    while (rem_t[k] > 1)
    {
        k++;
        div_t.push_back((int64_t) rem_t[k - 2] / rem_t[k - 1]);
        rem_t.push_back((int64_t) rem_t[k - 2] % rem_t[k - 1]);
        coeff_t.push_back((int64_t) 0);
    }
    k--;
    coeff_t[k] = -div_t[k + 1];
    if (k > 0)
        coeff_t[k - 1] = (int64_t) 1;
    while (k > 1)
    {
        k--;
        coeff_t[k - 1] = coeff_t[k + 1];
        coeff_t[k] += (int64_t) (coeff_t[k + 1] * (-div_t[k + 1]));
    }
    if (k == 1)
        return (int64_t) (coeff_t[k - 1] + coeff_t[k] * ( -div_t[k]));
    else
        return (int64_t) (coeff_t[0]);
}

int64_t segmented_bit_sieve_wheel(uint64_t n,int64_t max_bW)
{

    int64_t segment_size_min = 4096;   //can be scaled down to have smaller segments
    
    int64_t sqrt_n = (int64_t) std::sqrt(n);

    int64_t  count_p = (int64_t)0;

    int64_t n_PB = (int64_t) 3;    
    int64_t bW = (int64_t) 30;
    
    //get bW base wheel equal to p1*p2*...*pn <=min(max_bW,sqrt_n)  with n=n_PB
    while(n_PB < n_PB_max && (bW * PrimesBase[n_PB] <= std::min(max_bW , sqrt_n)))
    {
        bW *= PrimesBase[n_PB];
        n_PB++;
    }

    for (int64_t i = 0; i < n_PB; i++)
        if (n > (uint64_t) PrimesBase[i])
            count_p++;


    if (n > (uint64_t) (1 + PrimesBase[n_PB - 1])){

        int64_t k_end = (n < (uint64_t)bW) ? (int64_t) 2 :(int64_t) (n / (uint64_t) bW + 1);
        int64_t n_mod_bW = (int64_t) (n % (uint64_t) bW);
        int64_t k_sqrt = (int64_t) std::sqrt(k_end / bW) + 1;

        //find reduct residue set modulo bW
        std::vector<char> Remainder_i_t(bW + 1,true); 
        for (int64_t i = 0; i < n_PB; i++)
            for (int64_t j = PrimesBase[i]; j < bW + 1; j += PrimesBase[i])
                Remainder_i_t[j] = false;
        std::vector<int64_t> RW;
        for (int64_t j = 2; j < bW + 1; j++)
            if (Remainder_i_t[j] == true)
                RW.push_back(-bW + j);
        RW.push_back(1);
        int64_t  nR = RW.size();   //nR=phi(bW)

        std::vector<int64_t> C1(nR * nR);
        std::vector<int64_t> C2(nR * nR);
        for (int64_t j = 0; j < nR - 2; j++)
        {
            int64_t rW_t , rW_t1;
            rW_t1 = Euclidean_Diophantine(bW , -RW[j]);
            for (int64_t i = 0; i < nR; i++)
            {
                if (i == j)
                {
                    C2[nR * i + j] = 0;
                    C1[nR * i + j] = RW[j] + 1;
                }
                else if(i == nR - 3 - j)
                {
                    C2[nR * i + j] = 1;
                    C1[nR * i + j] = RW[j] - 1;
                }
                else
                {
                    rW_t = (int64_t) (rW_t1 * (-RW[i])) % bW;
                    if (rW_t > 1)
                        rW_t -= bW;
                    C1[nR * i + j] = rW_t + RW[j];
                    C2[nR * i + j] = (int64_t) (rW_t * RW[j]) / bW + 1;
                    if (i == nR - 1)
                        C2[nR * i + j] -= 1;
                }
            }
            C2[nR * j + nR - 2] = (int64_t) 1;
            C1[nR * j + nR - 2] = -(bW + RW[j]) - 1;
            C1[nR * j + nR - 1] = RW[j] + 1;
            C2[nR * j + nR - 1] = (int64_t )0;
        }
        for (int64_t i = nR - 2; i < nR; i++)
        {
            C2[nR * i + nR - 2] = (int64_t) 0;
            C1[nR * i + nR - 2] = -RW[i] - 1;
            C1[nR * i + nR - 1] = RW[i] + 1;
            C2[nR * i + nR - 1] = (int64_t) 0;
        }

        int64_t segment_size = 1; //can be scaled up to have larger segments
        int64_t p_mask_i = 3;    //number primes for pre-sieve vector mask 
        for (int64_t i = 0; i < p_mask_i; i++)
            segment_size *= (bW + RW[i]);
        while (segment_size < std::max(k_sqrt , segment_size_min) && p_mask_i < 7 )
        {
            segment_size *= (bW + RW[p_mask_i]);
            p_mask_i++;
        }

        int64_t  nB = nR / 8;     //nB number of byte for residue of congruence class equal to nR=8*nB        
        int64_t segment_size_b = nB * segment_size;
        
        //vector used for the first segment containing prime numbers less than sqrt(n)
        std::vector<uint8_t> Primes(nB + segment_size_b, 0xff);
 
        int64_t  p , pb , mb , mb_i , ib , i , jb , j , jp , k , kb;
        int64_t kmax = (int64_t) std::sqrt(segment_size / bW) + 1;
        //sieve for the first segment
        for (k = (int64_t)1; k  <= kmax; k++)
        {
            kb = nB * k;
            mb_i = kb * k * bW;     //nB * k * k  * bW     
            for (jb = 0; jb < nB; jb++)
            {
                for (j = 0; j < 8; j++)
                {
                    if(Primes[kb + jb] & (1 << j))
                    {
                        jp = j + jb * 8;
                        pb = nB * (bW * k + RW[jp]);
                        for (ib = 0; ib < nB; ib++)
                        {
                            for (i = 0; i < 8; i++)
                            {
                                mb = mb_i + nB * k * C1[(i + ib * 8) * nR + jp] + nB * C2[(i + ib * 8) * nR + jp];
                                for (; mb <= segment_size_b; mb += pb)
                                    Primes[mb + ib] &= del_bit[i];
                            }
                        }
                    }
                }
            }
        }
        //count the prime numbers in the first segment
        for (kb = nB; kb < std::min (nB + segment_size_b , nB * k_end); kb++)
            count_p += bit_count[Primes[kb]]; 
        if (kb == nB * k_end && kb <= segment_size_b && kb > 0)
            for (ib = 0; ib < nB; ib++)
                for (i = 0; i  < 8; i++)
                    if(Primes[kb + ib]& (1 << i) && RW[i + ib * 8] < (n_mod_bW - bW))
                        count_p++;
 
        if (k_end > segment_size) 
        {
            // vector mask pre-sieve multiples of primes bW+RW[j]  with 0<j<p_mask_i
            std::vector<uint8_t> Segment_i(nB+segment_size_b , 0xff);
            for (j = 0; j < p_mask_i; j++)
            {
                p = bW+RW[j];
                pb = p * nB;                
                for (ib = 0; ib < nB; ib++)
                {
                    for (i = 0; i < 8; i++)
                    {
                        mb = -segment_size + bW + C1[(i + ib  * 8) * nR + j] + C2[(i + ib * 8) * nR + j];
                        if (mb < 0)
                            mb=(mb % p + p) % p;
                        mb *= nB;                    
                        for (; mb <= segment_size_b; mb += pb)
                            Segment_i[mb + ib] &= del_bit[i];
                    }
                }
            }
            
            //vector used for subsequent segments of size (nB+segment_size_b)=nB*(1+segment_size)       
            std::vector<uint8_t> Segment_t(nB + segment_size_b);
            int64_t k_low , kb_low;
            for (k_low = segment_size; k_low < k_end; k_low += segment_size)
            {
                kb_low = k_low * nB;
                for (kb = (int64_t)0; kb < (nB + segment_size_b); kb++)
                    Segment_t[kb] = Segment_i[kb];
                
                kmax = (std::min(segment_size , (int64_t) std::sqrt((k_low + segment_size) / bW) + 2));
                j = p_mask_i;
                for(k = (int64_t) 1; k <= kmax; k++)
                {
                    kb = k * nB;
                    mb_i = -k_low + bW * k * k;    
                    for (jb = 0; jb < nB; jb++)
                    {
                        for (; j < 8; j++)
                        {
                            if (Primes[kb + jb] & (1 << j))
                            {
                                jp = j + jb * 8;
                                p = bW * k + RW[jp];
                                pb = p * nB;
                                for (ib = 0; ib < nB; ib++)
                                {
                                    for (i = 0; i < 8; i++)
                                    {
                                        mb = mb_i + k * C1[(i + ib * 8) * nR+jp] + C2[(i + ib * 8) * nR+jp];
                                        if (mb < 0)
                                            mb = (mb % p + p) % p;
                                        mb *= nB;
                                        for (; mb <= segment_size_b; mb += pb)
                                            Segment_t[mb + ib] &= del_bit[i];
                                    }
                                }
                            }
                        }
                        j = (int64_t) 0;
                    }
                }
                for ( kb = nB + kb_low; kb < std::min (kb_low + segment_size_b + nB , nB * k_end); kb++)
                    count_p += bit_count[Segment_t[kb - kb_low]];
            }
            if (kb == nB * k_end && kb - kb_low <= segment_size_b && kb - kb_low > (int64_t) 0)
                for (ib = 0; ib < nB; ib++)
                    for (i = 0; i < 8; i++)
                        if(Segment_t[kb - kb_low + ib]& (1 << i) && RW[i + ib * 8] < (n_mod_bW - bW))
                            count_p++;
        }
    }

    return count_p;
}

int main()
{
    int64_t n=1000000000;

    //segmented_bit_sieve_wheel(n,max_bW) with max base wheel size choice max_bW= 30 , 210 , 2310
    std::cout << " primes < " << n << ": "<< segmented_bit_sieve_wheel(n , 210) << std::endl;    

    return 0;
}


I was expecting the opposite behavior can someone help me understand what the size of the segment should be to optimize in terms of speed?

EDIT: I want to clarify that the Euclidean_Diophantine function is used only in the initial phase to find the arrays C1 and C2 and can be eliminated by replacing

rW_t1 = Euclidean_Diophantine(bW , -RW[j]);

with

int64_t j1=0;
while((RW[j] * RW[j1]) % bW != bW - 1 && j1 < nR - 1)
    j1++;
rW_t1 = RW[j1];

Also to get an idea about timing by compiling using rextester.com g++ with the -O3 option you get absolute running time: 0.51s

so i tested the time spent on switching blocks by deleting in the second part

for(k = (int64_t) 1; k <= kmax; k++)
{
...
}

we therefore obtain absolute running time equal to 0.18s which does not vary much in the two configurations.

user140242
  • 159
  • 6

1 Answers1

1

The rule is benchmark, benchmark, benchmark.

But not just overall timing. Where does time go inside of your graph? If you were using more functions, a flame graph would help. But you aren't.

So I'm going to instead recommend that you add a little instrumentation, possibly hidden behind a #ifdef. What you want to do is divide your code into sections based on different kinds of activities. Like setting up a new block to sieve versus sieving the block. Have a timer going. Every time you switch, record a new timestamp, add the elapsed for the previous activity, and save the new timestamp to time the current activity.

As for why smaller blocks would be slower, that's because you spend time sieving, and also spend time every time you switch blocks calculating where thing are. Going from blocks of size 277140 to 58350 you wind up switching blocks about 4.75 times as often. If you were spending 1/3.75 of the time on switching before (36.36%) you'll now be spending the same 2.75/3.75 on sieving plus 4.75/3.75 on switching blocks, resulting in taking 2x as long.

One idea to speed it up is that you are calling Euclidean_Diophantine with the same first parameter a lot. (I think the parameter works out to be 210.) It might be faster if you first a lookup table solving the problem mod 210, and then use that to skip most of the calculation.

btilly
  • 43,296
  • 3
  • 59
  • 88