I'm trying to implement the HLL algorithm in python.
I'm using data folders with the format of 13 bytes
"\x00\x01\x02\x03\x05\x06\x07\x08\x09\x0a\x0b\x0c",described as follows:
srcIP = "\x00\x01\x02\x03"
srcPort = "\x04\x05"
dstIP = "\x06\x07\x08\x09"
dstPort = "\x0a\x0b"
protocol = "\x0c"
The code is as follows:
import hashlib
import math
#Actual
with open(""""folder location"""") as fl:
tmp_trace = []
pairSrcDst ={}
while True:
five_tuple = fl.read(13)
ipPair = five_tuple[0:4] + five_tuple[6:10]
if five_tuple == b"":
break
tmp_trace.append(ipPair)
for pair in tmp_trace:
if pair[0:4] in pairSrcDst.keys():
pairSrcDst[pair[0:4]] += 1
else:
pairSrcDst[pair[0:4]] = 1
accuratecount = len(pairSrcDst)
print(accuratecount)
#Estimation
def harmonic_mean(arr):
n = len(arr)
sum = 0
for i in range(n):
if arr[i] != 0:
sum += 1.0 / arr[i]
result = n / sum
return result
def count_leading_zeros(string):
count = 0
for char in string:
if char == '0':
count += 1
else:
return count
print(count)
return count
with open(""""folder location"""") as fl:
j = 0
numberofcontainers = 32 # NUMBER OF BITS FOR BUCKET
numberofbucketbits = math.ceil(math.log2(numberofcontainers))
containerlist = [0] * numberofcontainers
while True:
five_tuple = fl.read(13)
IPset = five_tuple[0:4] + five_tuple[6:10]
if five_tuple == b"":
break
hashvalue = hashlib.sha256(IPset).digest()
integer_value = int.from_bytes(hashvalue, byteorder='big')
hashvalueinbit = bin(integer_value)[2:]
bucket1 = hashvalueinbit[:numberofbucketbits]
bucket2 = hashvalueinbit[numberofbucketbits:]
containerindex = int(bucket1,2)
leadingzeros = count_leading_zeros(bucket2)
if containerlist[containerindex] < leadingzeros:
containerlist[containerindex] = leadingzeros
print("container values",containerlist)
# STEP 2 - Counting
HarmonicMean = harmonic_mean(containerlist)
#print(HarmonicMean)
twopow = 2 ** HarmonicMean
phi = 0.77351
bitxphi = numberofcontainers / phi
#print(bitxphi)
Estimatedcount = twopow * bitxphi
print("the estimated count is:", Estimatedcount)
print("the accuracte count is :", accuratecount)
perc = (1 - abs(accuratecount - Estimatedcount) / accuratecount) * 100
print("the accuracy percentage is", perc)
I have tried several bucket sizes, hash functions, still the estimation is completely higher than the accurate counting.
Could anyone see a potential issues with this please? Thank you