I am writing some code to build a table of variable length (Huffman) codes, and I wanted to use the multiprocessing module for fun. The idea is to have each process try to get a node from the queue. They do work on the node, and either put that nodes two children back into the work queue, or they put the variable length code into result queue. They are also passing messages to a message queue, which should be printed by a thread in the main process. Here is the code so far:
import Queue
import multiprocessing as mp
from threading import Thread
from collections import Counter, namedtuple
Node = namedtuple("Node", ["child1", "child2", "weight", "symbol", "code"])
def _sort_func(node):
return node.weight
def _encode_proc(proc_number, work_queue, result_queue, message_queue):
while True:
try:
#get a node from the work queue
node = work_queue.get(timeout=0.1)
#if it is an end node, add the symbol-code pair to the result queue
if node.child1 == node.child2 == None:
message_queue.put("Symbol processed! : proc%d" % proc_number)
result_queue.put({node.symbol:node.code})
#otherwise do some work and add some nodes to the work queue
else:
message_queue.put("More work to be done! : proc%d" % proc_number)
node.child1.code.append(node.code + '0')
node.child2.code.append(node.code + '1')
work_queue.put(node.child1)
work_queue.put(node.child2)
except Queue.Empty: #everything is probably done
return
def _reporter_thread(message_queue):
while True:
try:
message = message_queue.get(timeout=0.1)
print message
except Queue.Empty: #everything is probably done
return
def _encode_tree(tree, symbol_count):
"""Uses multiple processes to walk the tree and build the huffman codes."""
#Create a manager to manage the queues, and a pool of workers.
manager = mp.Manager()
worker_pool = mp.Pool()
#create the queues you will be using
work = manager.Queue()
results = manager.Queue()
messages = manager.Queue()
#add work to the work queue, and start the message printing thread
work.put(tree)
message_thread = Thread(target=_reporter_thread, args=(messages,))
message_thread.start()
#add the workers to the pool and close it
for i in range(mp.cpu_count()):
worker_pool.apply_async(_encode_proc, (i, work, results, messages))
worker_pool.close()
#get the results from the results queue, and update the table of codes
table = {}
while symbol_count > 0:
try:
processed_symbol = results.get(timeout=0.1)
table.update(processed_symbol)
symbol_count -= 1
except Queue.Empty:
print "WAI DERe NO SYMBOLzzzZzz!!!"
finally:
print "Symbols to process: %d" % symbol_count
return table
def make_huffman_table(data):
"""
data is an iterable containing the string that needs to be encoded.
Returns a dictionary mapping symbols to codes.
"""
#Build a list of Nodes out of the characters in data
nodes = [Node(None, None, weight, symbol, bytearray()) for symbol, weight in Counter(data).items()]
nodes.sort(reverse=True, key=_sort_func)
symbols = len(nodes)
append_node = nodes.append
while len(nodes) > 1:
#make a new node out of the two nodes with the lowest weight and add it to the list of nodes.
child2, child1 = nodes.pop(), nodes.pop()
new_node = Node(child1, child2, child1.weight+child2.weight, None, bytearray())
append_node(new_node)
#then resort the nodes
nodes.sort(reverse=True, key=_sort_func)
top_node = nodes[0]
return _encode_tree(top_node, symbols)
def chars(fname):
"""
A simple generator to make reading from files without loading them
totally into memory a simple task.
"""
f = open(fname)
char = f.read(1)
while char != '':
yield char
char = f.read(1)
f.close()
raise StopIteration
if __name__ == "__main__":
text = chars("romeo-and-juliet.txt")
table = make_huffman_table(text)
print table
The current output of this is:
More work to be done! : proc0
WAI DERe NO SYMBOLzzzZzz!!!
Symbols to process: 92
WAI DERe NO SYMBOLzzzZzz!!!
Symbols to process: 92
WAI DERe NO SYMBOLzzzZzz!!!
Symbols to process: 92
It just repeats the last bit forever. After the first process adds work to the node, everything just stops. Why is that? Am I not understand/using queues properly? Sorry for all the code to read.