
I have a list of URLs in article_urls: list[str] and I'm trying to create several workers that visit each URL, extract a different URL from the page, and put that URL onto another queue. Several other workers should then go through that other queue, download the files at those URLs, and upload them to S3. I think the program is stopping on entry = await queue.get() and never moving on. How do I fix this?

def get_first_pass_or_none(inp, driver):
    # Return the result of the first extractor in `inp` that does not raise;
    # return None if every extractor fails.
    for x in inp:
        try:
            return x(driver)
        except:
            pass
    return None


async def url_producer(download_queue, pages_queue, producer_id):
    first = True
    while True:
        try:
            article_url = await pages_queue.get()
            instance_driver = wd[producer_id]  # or any other webdriver
            instance_driver.get(article_url)
            article_id = str(uuid.uuid4())

            if first:
                await asyncio.sleep(2)
                instance_driver.find_element(
                    By.XPATH, '//*[@id="onetrust-close-btn-container"]/button').click()
                first = False

            article_title = get_first_pass_or_none(
                [lambda instance_driver: instance_driver.find_element(
                    By.XPATH, '//*[@id="documentTitle"]').text],
                instance_driver)
            author = get_first_pass_or_none(
                [lambda instance_driver: re.sub(
                    r"\([^()]*\)", "",
                    instance_driver.find_element(
                        By.XPATH, '//*[@id="authordiv"]/span[2]/span/a/strong').text)],
                instance_driver)
            publication_info = get_first_pass_or_none(
                [lambda instance_driver: instance_driver.find_element(
                    By.XPATH, '//*[@id="authordiv"]/span[2]/span').text],
                instance_driver)
            publication_location = get_first_pass_or_none(
                [lambda publication_info: re.findall(
                    r'\[(.*?)\]', publication_info)[0]],
                instance_driver)
            publication_date = publication_info

            output_metadata[article_id] = {
                "title": article_title,
                "author": author,
                "location": publication_location,
                "date": publication_date
            }
            pdf_url = instance_driver.find_element(
                By.CLASS_NAME, 'pdf-download').get_attribute('href')
            await download_queue.put({
                "article_id": article_id,
                "pdf_url": pdf_url,
            })
            print(download_queue.qsize())
            pages_queue.task_done()
        except Exception as e:
            logger.debug(f"Error {e}")
            keyboard.wait(keys[producer_id])


async def pdf_downloader(queue, consumer_id):
    while True:
        try:
            print(f"pdf_downloader {consumer_id} waiting for queue")
            entry = await queue.get()
            print(f"pdf_downloader {consumer_id} got queue entry")
            article_id = entry['article_id']
            pdf_url = entry['pdf_url']
            response = requests.get(pdf_url)
            pdf_content = response.content
            object_key = f"{article_id}.pdf"
            s3.Bucket(bucket_name).put_object(Key=object_key, Body=pdf_content)
            queue.task_done()
        except Exception as e:
            logger.debug(f"Error {e}")


async def main():
    # Create a shared queue
    download_queue = asyncio.Queue()
    pages_queue = asyncio.Queue()
    for page_url in article_urls:
        pages_queue.put_nowait(page_url)

    # Create one producer and eight consumers
    producers = [asyncio.create_task(
        url_producer(download_queue, pages_queue, i)) for i in range(1)]
    consumers = [asyncio.create_task(
        pdf_downloader(download_queue, i)) for i in range(8)]

    # Wait for the producers to finish
    await asyncio.gather(*producers)

    # Cancel the consumers
    for consumer in consumers:
        consumer.cancel()

Nothing is output to logger.debug. Terminal output:

pdf_downloader 0 waiting for queue
pdf_downloader 1 waiting for queue
pdf_downloader 2 waiting for queue
pdf_downloader 3 waiting for queue
pdf_downloader 4 waiting for queue
pdf_downloader 5 waiting for queue
pdf_downloader 6 waiting for queue
pdf_downloader 7 waiting for queue
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

  • What happens when you run out of URLs? The `await queue.get()` is going to hang forever. You need to give the workers some way of knowing they are done. Either add a "you're done" object to the queue, or have the workers wait for a small amount of time, and if they don't get an object, check whether a flag is set (see the sentinel sketch after these comments). – Frank Yellin May 04 '23 at 20:08
  • What happens if you add the `consumers` to your `asyncio.gather()` call? – Philip Wrage May 04 '23 at 20:32
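A minimal sketch of the sentinel idea from the first comment; the SENTINEL object and the generic worker() below are illustrative names, not part of the question's code:

import asyncio

SENTINEL = object()  # "you're done" marker; the producer puts one per worker
                     # on the queue when it runs out of URLs

async def worker(queue: asyncio.Queue, worker_id: int):
    while True:
        entry = await queue.get()
        if entry is SENTINEL:
            queue.task_done()
            break                # exit instead of waiting on get() forever
        print(f"worker {worker_id} processing {entry}")
        queue.task_done()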

1 Answer


I have used Queue to pass work between producers and consumers before, and my personal preference is to start the consumers first and then the producer(s), so that the consumers are ready and waiting when work arrives.

I would try the following:

async def main():
    # Create a shared queue
    download_queue = asyncio.Queue()
    pages_queue = asyncio.Queue()
    for page_url in article_urls:
        pages_queue.put_nowait(page_url)

    # Create eight consumers and one producer
    consumers = [asyncio.create_task(
        pdf_downloader(download_queue, i)) for i in range(8)]
    producers = [asyncio.create_task(
        url_producer(download_queue, pages_queue, i)) for i in range(1)]
    
    # Wait for the producers and consumers to finish
    await asyncio.gather(*(producers + consumers))

I removed the cancellation of the consumers, because I wasn't sure why you were doing that if you wanted them to run and process the PDF data. You can obviously modify this example if something like that is required again.
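If you do later need everything to stop once the work is finished, one common pattern (just a sketch, relying on the fact that your url_producer and pdf_downloader already call task_done() for every item they take) is to wait on Queue.join() for both queues and then cancel the now-idle tasks:

async def main():
    download_queue = asyncio.Queue()
    pages_queue = asyncio.Queue()
    for page_url in article_urls:
        pages_queue.put_nowait(page_url)

    consumers = [asyncio.create_task(
        pdf_downloader(download_queue, i)) for i in range(8)]
    producers = [asyncio.create_task(
        url_producer(download_queue, pages_queue, i)) for i in range(1)]

    # Block until every page URL has been processed, then until every
    # queued PDF has been downloaded and uploaded.
    await pages_queue.join()
    await download_queue.join()

    # The workers are now idle in queue.get(); cancel them and swallow
    # the resulting CancelledError instances.
    for task in producers + consumers:
        task.cancel()
    await asyncio.gather(*(producers + consumers), return_exceptions=True)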

I also noticed that you are performing a GET request with requests, which will block, regardless of how many async consumers you have running. If you want to run this using asyncio without blocking, you will have to run this function in a thread or process pool using loop.run_in_executor():

async def pdf_downloader(queue, consumer_id):
    loop = asyncio.get_event_loop()
    while True:
        try:
            print(f"pdf_downloader {consumer_id} waiting for queue")
            entry = await queue.get()
            print(f"pdf_downloader {consumer_id} got queue entry")
            article_id = entry['article_id']
            pdf_url = entry['pdf_url']
            request_future = loop.run_in_executor(None, requests.get, pdf_url)
            response = await request_future
            pdf_content = response.content
            object_key = f"{article_id}.pdf"
            s3.Bucket(bucket_name).put_object(Key=object_key, Body=pdf_content)
            queue.task_done()
        except Exception as e:
            logger.debug(f"Error {e}")

You may also have to do this with your call to put_object() from the Amazon Boto library.
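For example, the same executor trick applied to the upload as well (a sketch, assuming s3 and bucket_name are the boto3 resource and bucket name from your original code; functools.partial is used because run_in_executor only passes positional arguments to the callable):

import functools

async def pdf_downloader(queue, consumer_id):
    loop = asyncio.get_event_loop()
    while True:
        try:
            entry = await queue.get()
            article_id = entry['article_id']
            pdf_url = entry['pdf_url']
            # Blocking HTTP download runs in the default thread pool.
            response = await loop.run_in_executor(None, requests.get, pdf_url)
            object_key = f"{article_id}.pdf"
            # Wrap the keyword arguments with functools.partial so the
            # blocking S3 upload can also run in the thread pool.
            upload = functools.partial(
                s3.Bucket(bucket_name).put_object,
                Key=object_key, Body=response.content)
            await loop.run_in_executor(None, upload)
            queue.task_done()
        except Exception as e:
            logger.debug(f"Error {e}")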

Philip Wrage