I have a list of URLs in article_urls: list[str]. I'm trying to create several producer workers that visit each URL, extract a different URL from the page, and put that URL on a second queue. Several other workers should then read from that second queue, download the file at each URL, and upload it to S3. The program seems to stall at entry = await queue.get() in the downloaders and never moves on. How do I fix this?
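To make the intended flow concrete, here is a minimal sketch of the pipeline I'm aiming for (extract_pdf_url and upload_to_s3 are just placeholder stand-ins for the Selenium scraping and the requests/boto3 upload in my real code below, not real helpers):

import asyncio

def extract_pdf_url(page_url):
    # placeholder for the Selenium scraping done in url_producer below
    return page_url + ".pdf"

def upload_to_s3(pdf_url):
    # placeholder for the requests download + boto3 upload done in pdf_downloader below
    print("uploaded", pdf_url)

async def producer(pages_queue, download_queue):
    while True:
        page_url = await pages_queue.get()
        await download_queue.put(extract_pdf_url(page_url))
        pages_queue.task_done()

async def consumer(download_queue):
    while True:
        pdf_url = await download_queue.get()
        upload_to_s3(pdf_url)
        download_queue.task_done()

My actual code: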
import asyncio
import re
import uuid

import keyboard
import requests
from selenium.webdriver.common.by import By

# wd (the per-worker webdrivers), keys, article_urls, output_metadata,
# s3, bucket_name and logger are set up elsewhere in the script.


def get_first_pass_or_none(inp, driver):
    # Return the result of the first extractor that succeeds, or None.
    for x in inp:
        try:
            return x(driver)
        except:
            pass
    return None
async def url_producer(download_queue, pages_queue, producer_id):
    first = True
    while True:
        try:
            article_url = await pages_queue.get()
            instance_driver = wd[producer_id]  # or any other webdriver
            instance_driver.get(article_url)
            article_id = str(uuid.uuid4())

            # Dismiss the OneTrust cookie banner on the first page load
            if first:
                await asyncio.sleep(2)
                instance_driver.find_element(
                    By.XPATH, '//*[@id="onetrust-close-btn-container"]/button').click()
                first = False

            # Scrape the article metadata
            article_title = get_first_pass_or_none([lambda instance_driver: instance_driver.find_element(
                By.XPATH, '//*[@id="documentTitle"]').text], instance_driver)
            author = get_first_pass_or_none([lambda instance_driver: re.sub(r"\([^()]*\)", "", instance_driver.find_element(
                By.XPATH, '//*[@id="authordiv"]/span[2]/span/a/strong').text)], instance_driver)
            publication_info = get_first_pass_or_none([lambda instance_driver: instance_driver.find_element(
                By.XPATH, '//*[@id="authordiv"]/span[2]/span').text], instance_driver)
            publication_location = get_first_pass_or_none([lambda publication_info: re.findall(
                r'\[(.*?)\]', publication_info)[0]], instance_driver)
            publication_date = publication_info

            output_metadata[article_id] = {
                "title": article_title,
                "author": author,
                "location": publication_location,
                "date": publication_date
            }

            # Grab the PDF link and hand it to the downloaders
            pdf_url = instance_driver.find_element(
                By.CLASS_NAME, 'pdf-download').get_attribute('href')
            await download_queue.put({
                "article_id": article_id,
                "pdf_url": pdf_url,
            })
            print(download_queue.qsize())
            pages_queue.task_done()
        except Exception as e:
            logger.debug(f"Error {e}")
            keyboard.wait(keys[producer_id])
async def pdf_downloader(queue, consumer_id):
    while True:
        try:
            print(f"pdf_downloader {consumer_id} waiting for queue")
            entry = await queue.get()
            print(f"pdf_downloader {consumer_id} got queue entry")

            article_id = entry['article_id']
            pdf_url = entry['pdf_url']

            response = requests.get(pdf_url)
            pdf_content = response.content

            object_key = f"{article_id}.pdf"
            s3.Bucket(bucket_name).put_object(Key=object_key, Body=pdf_content)
            queue.task_done()
        except Exception as e:
            logger.debug(f"Error {e}")
async def main():
    # Create the shared queues
    download_queue = asyncio.Queue()
    pages_queue = asyncio.Queue()
    for page_url in article_urls:
        pages_queue.put_nowait(page_url)

    # Create one producer and eight consumers
    producers = [asyncio.create_task(
        url_producer(download_queue, pages_queue, i)) for i in range(1)]
    consumers = [asyncio.create_task(
        pdf_downloader(download_queue, i)) for i in range(8)]

    # Wait for the producers to finish
    await asyncio.gather(*producers)

    # Cancel the consumers
    for consumer in consumers:
        consumer.cancel()
Nothing is output via logger.debug. Terminal output:
pdf_downloader 0 waiting for queue
pdf_downloader 1 waiting for queue
pdf_downloader 2 waiting for queue
pdf_downloader 3 waiting for queue
pdf_downloader 4 waiting for queue
pdf_downloader 5 waiting for queue
pdf_downloader 6 waiting for queue
pdf_downloader 7 waiting for queue
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16