2

For my project, I have to compress a file using Huffman algorithm. I figured out most of the part, however I am having hard time dealing with other ASCII characters such as newlines, tabs, etc.This is what I have done so far, however I am not able to get the other characters while handling the file creation.

import operator

def getFreq(mylst):
    dct = {'a':0,'b':0,'c':0,'d':0,'e':0,'f':0,'g':0,'h':0,'i':0,'j':0,'k':0,'l':0,
       'm':0,'n':0,'o':0,'p':0,'q':0,'r':0,'s':0,'t':0,'u':0,'v':0,'w':0,'x':0,'y':0,'z':0,' ':0,
       'A':0,'B':0,'C':0,'D':0,'E':0,'F':0,'G':0,'H':0,'I':0,'J':0,'K':0,'L':0,
       'M':0,'N':0,'O':0,'P':0,'Q':0,'R':0,'S':0,'T':0,'U':0,'V':0,'W':0,'X':0,'Y':0,'Z':0,'.':0,',':0,
       '1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0,'9':0,'0':0, '-':0,'(':0, ')':0}

    for k, v in dct.items():
        for i in mylst:
            if i == k:
                dct[k] += 1
                up_dct = sorted(dct.items(), key=operator.itemgetter(1), reverse=True)

    srt_dct = dict((k, v) for k, v in up_dct)

    return srt_dct


def assign_code(nodes, label, result, prefix = ''):
    childs = nodes[label]
    tree = {}
    if len(childs) == 2:
        tree['0'] = assign_code(nodes, childs[0], result, prefix+'0')
        tree['1'] = assign_code(nodes, childs[1], result, prefix+'1')
        return tree
    else:
        result[label] = prefix
        return label

def Huffman_code(_vals):
    vals = _vals.copy()
    nodes = {}
    for n in vals.keys(): # leafs initialization
        nodes[n] = []

    while len(vals) > 1: # binary tree creation
        s_vals = sorted(vals.items(), key=lambda x:x[1])
        a1 = s_vals[0][0]
        a2 = s_vals[1][0]
        vals[a1+a2] = vals.pop(a1) + vals.pop(a2)
        nodes[a1+a2] = [a1, a2]
    code = {}
    root = a1+a2
    tree = {}
    tree = assign_code(nodes, root, code)   # assignment of the code for the given binary tree
    return code, tree

r_file = open('test.txt', 'r')

ro = r_file.read()


lst = list(ro)

freq = getFreq(lst)

code, tree = Huffman_code(freq)

encoded = ''.join([code[t] for t in ro])
print('Encoded text:',encoded)

w_file = open('encrypt.txt','wt')
w_file.write(encoded)
w_file.close()

d_file = open('encrypt.txt', 'r')

dec_ode = d_file.read()

decoded = []
i = 0
while i < len(dec_ode): # decoding using the binary graph
    ch = dec_ode[i]
    act = tree[ch]
    while not isinstance(act, str):
        i += 1
        ch = dec_ode[i]
        act = act[ch]
    decoded.append(act)
    i += 1


test2_file = open('decode.txt', 'wt')
test2_file.write(''.join(decoded))

test2_file.close()

print('Decoded text:',''.join(decoded))
Alden
  • 2,229
  • 1
  • 15
  • 21
Jay Patel
  • 33
  • 3
  • Aren't you supposed to encode byte sequence, not character sequence? – Mikhail Antonov Apr 17 '17 at 17:54
  • 1
    Step one would be to get the direction of your slashes right. It's `\n`, `\t`, etc., with backslashes, not forward slashes. – user2357112 Apr 17 '17 at 18:02
  • 1
    Also if you'd encode bytes, you would get rid of that monstrous dictionary initialization. You'd use something like `dct = [0] * 256`. Array index would be the byte value then. – Mikhail Antonov Apr 17 '17 at 18:05

1 Answers1

1

Try this as your getFreq function instead:

def getFreq(mylst):
    counters = [0 for _ in range(256)]
    for c in mylst:
        counters[ord(c)] += 1
    return { chr(c): v for c, v in enumerate(counters) }

You need to view your list of characters as bytes rather than distinct characters. This function converts the input characters to their corresponding ASCII value and uses that to index into a list of counters. For example, every time a is encountered, the 97th element of counters gets incremented. Then after all characters have been counted counters is converted back into a dict like your program is expecting.

Alden
  • 2,229
  • 1
  • 15
  • 21
  • It throws an IndexError: list index out of range. Is it because it changes the position of character and value, which messes up the use of that dictionary in other places – Jay Patel Apr 17 '17 at 21:05