I'm implementing an image downloader with the producer-consumer model. One thread is responsible for generating (url, filename) pairs and put them in queue. I want MAX_THREADS threads to pick the pairs and start downloading. Here are my threads:
class Extractor(Thread):
def __init__(self, group=None, target=None, name=None,
args=(), kwargs=None, verbose=None, items=None):
super(Extractor, self).__init__()
self.target = target
self.name = name
self.items = items
def run(self):
while True:
for item in self.items:
if not QUEUE.full():
QUEUE.put_nowait(extract(item))
logging.debug('Putting ' + str(item) + ' : ' + str(QUEUE.qsize()) + ' items in queue')
class Downloader(Thread):
def __init__(self, group=None, target=None, name=None,
args=(), kwargs=None, verbose=None):
super(Downloader, self).__init__()
self.target = target
self.name = name
self.seen = set()
def run(self):
while True:
if not QUEUE.empty():
pair = QUEUE.get_nowait()
# I have seen the URL
if pair[0] in self.seen:
continue
else:
# Never seen it before
self.seen.add(pair[0])
logging.debug('Downloading ' + str(pair[1]) + ' : ' + str(QUEUE.qsize()) + ' items in queue')
download_one_pic(pair)
if __name__ == '__main__':
items = None
items = crawl('username__', items)
worker_threads = []
producer = Extractor(name='Extractor', items=items)
producer.daemon = True
producer.start()
consumer = Downloader(name='Downloader[1]')
consumer2 = Downloader(name='Downloader[2]')
worker_threads.append(consumer)
worker_threads.append(consumer2)
for thread in worker_threads:
thread.start()
thread.join()
The queue has the max size of 50 and I want Producer thread to run regardless of other threads so I have it demonized. One thing is weird is that the consumer2 thread never gets started and I don't have any idea why. As in my log, only Downloader[1]
does the job and the queue keeps fluctuating between 49 and 50 so I knew that the Downloader[2]
never gets started.