Implementing HLL in python to estimate the cardinality

Question

I'm trying to implement the HLL algorithm in python.

I'm using data folders with the format of 13 bytes

"\x00\x01\x02\x03\x05\x06\x07\x08\x09\x0a\x0b\x0c",described as follows:

srcIP = "\x00\x01\x02\x03"
srcPort = "\x04\x05"
dstIP = "\x06\x07\x08\x09"
dstPort = "\x0a\x0b"
protocol = "\x0c"

The code is as follows:

import hashlib
import math

#Actual
with open(""""folder location"""") as fl:
        tmp_trace = []
        pairSrcDst ={}
        while True:
            five_tuple = fl.read(13)
            ipPair = five_tuple[0:4] + five_tuple[6:10]
            if five_tuple == b"":
                break
            tmp_trace.append(ipPair)

for pair in tmp_trace:
   if pair[0:4] in pairSrcDst.keys():
       pairSrcDst[pair[0:4]] += 1
   else:
       pairSrcDst[pair[0:4]] = 1

accuratecount = len(pairSrcDst)
print(accuratecount)

#Estimation

def harmonic_mean(arr):
    n = len(arr)
    sum = 0
    for i in range(n):
        if arr[i] != 0:
            sum += 1.0 / arr[i]
    result = n / sum
    return result

def count_leading_zeros(string):
    count = 0
    for char in string:
        if char == '0':
            count += 1
        else:
            return count
    print(count)
    return count


with open(""""folder location"""") as fl:
    j = 0
    numberofcontainers = 32  # NUMBER OF BITS FOR BUCKET
    numberofbucketbits = math.ceil(math.log2(numberofcontainers))
    containerlist = [0] * numberofcontainers
    while True:
        five_tuple = fl.read(13)
        IPset = five_tuple[0:4] + five_tuple[6:10]
        if five_tuple == b"":
            break


        hashvalue = hashlib.sha256(IPset).digest()
        integer_value = int.from_bytes(hashvalue, byteorder='big')
        hashvalueinbit = bin(integer_value)[2:]

        bucket1 = hashvalueinbit[:numberofbucketbits] 
        bucket2 = hashvalueinbit[numberofbucketbits:]
        containerindex = int(bucket1,2)
        leadingzeros = count_leading_zeros(bucket2)
        if containerlist[containerindex] < leadingzeros:
            containerlist[containerindex] = leadingzeros


    print("container values",containerlist)

    # STEP 2 - Counting
    HarmonicMean = harmonic_mean(containerlist)
    #print(HarmonicMean)
    twopow = 2 ** HarmonicMean
    phi = 0.77351
    bitxphi = numberofcontainers / phi
    #print(bitxphi)
    Estimatedcount = twopow * bitxphi

    print("the estimated count is:", Estimatedcount)
    print("the accuracte count is :", accuratecount)

    perc = (1 - abs(accuratecount - Estimatedcount) / accuratecount) * 100

    print("the accuracy percentage is", perc)

I have tried several bucket sizes, hash functions, still the estimation is completely higher than the accurate counting.

Could anyone see a potential issues with this please? Thank you

Implementing HLL in python to estimate the cardinality

0 Answers0