concurrent.futures is not working properly with BeautifulSoup: not all the links are fetched

Question

In this code I want to extract content from newspaper links using BeautifulSoup, but it is not working properly. Each link in the list "filtered_Final_LIST" leads to a page that contains multiple articles. The function 'ext_url' does not return the results for all the pages when I use the concurrent.futures library.

A normal for loop, however, works properly. I used the concurrent.futures library to increase the extraction speed. Am I doing something wrong?

import concurrent.futures
import time

# Intended cap on the number of worker threads used for fetching.
MAX_THREADS = 30

# Article pages to scrape. Each entry must be a bare URL: a stray trailing
# '"' inside the literal (a copy-paste leftover) would be sent to the server
# as part of the request path.
filtered_Final_LIST = [
    'https://www.financialexpress.com/economy/finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
    'https://www.financialexpress.com/economy/uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
    'https://www.financialexpress.com/economy/economic-recovery-yet-to-attain-durability-says-report/2410690/',
    'https://www.financialexpress.com/economy/vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/',
]


def ext_url(url):
    """Download one article page and extract its text, headline, date and source.

    Every call collects its data in lists of its own, so calls running at the
    same time in a thread pool cannot overwrite one another's results.
    Resetting shared module-level lists at the start of each call is what
    made concurrent runs lose pages.

    Args:
        url: Address of the article page to download.

    Returns:
        A tuple ``(articles, headers, days, months, years, sources)``. Each
        item is a list holding what was found on this page; a list is empty
        when the page has no matching element (or no parsable date).

    Raises:
        urllib.error.URLError: If the page cannot be fetched (this includes
            HTTP error statuses and timeouts reported by ``urlopen``).
    """
    from datetime import datetime

    global List_articles, List_header, List_date, List_month, List_year, List_source

    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(url, timeout=10) as html:
        print(url)
        soup = BeautifulSoup(html, 'html.parser')

    # Body text of the article(s) on the page.
    articles = [
        block.get_text()
        for block in soup.find_all(['div'], class_='entry-content wp-block-post-content')
    ]

    # Publication date: taken from the YYYY-MM-DD part of the meta tag.
    # A page without the tag yields empty date lists instead of crashing,
    # matching how the other fields behave when nothing is found.
    days, months, years = [], [], []
    published = soup.find(itemprop="article:published_time")
    date_text = published.get("content") if published is not None else None
    match = re.search(r'\d{4}-\d{2}-\d{2}', date_text) if date_text else None
    if match is not None:
        dt = datetime.strptime(match.group(), '%Y-%m-%d').date()
        months.append(dt.month)
        days.append(dt.day)
        years.append(dt.year)

    # Headline(s) of the article.
    headers = [
        heading.get_text()
        for heading in soup.find_all(['h1'], class_='wp-block-post-title')
    ]

    # Source / author line(s) of the article.
    sources = [
        block.get_text()
        for block in soup.find_all(
            ['div'], class_='author-link ie_custom_theme_multiple_authors')
    ]

    # Legacy side effect kept for existing code that reads these globals:
    # they hold the data of whichever call finished last. Concurrent callers
    # should rely on the return value, which is private to each call.
    List_articles, List_header, List_date = articles, headers, days
    List_month, List_year, List_source = months, years, sources

    return articles, headers, days, months, years, sources
    
# Fetch every page on a bounded thread pool and keep what each call returns.
# Dropping the futures returned by submit() loses both the results and any
# exception raised inside a worker, so each future is resolved explicitly.
results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = {executor.submit(ext_url, url): url for url in filtered_Final_LIST}
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        try:
            results[url] = future.result()
        except Exception as exc:
            # Top-level boundary: report the failing URL and keep going so
            # one bad page does not discard the others.
            print(f'{url} failed: {exc!r}')

score 1 · Accepted Answer · answered Jan 19 '22 at 20:40

import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd

# pip install trio httpx

# Common prefix of every article address.
mainurl = "https://www.financialexpress.com/economy/"

# Article paths; each one is appended to ``mainurl`` to form the full URL.
news = [
    "finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/",
    "uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/",
    "economic-recovery-yet-to-attain-durability-says-report/2410690/",
    "vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/",
]

# One row per scraped article, filled in by the workers.
allin = list()


async def get_soup(content):
    """Parse ``content`` with BeautifulSoup's ``lxml`` parser and return the tree."""
    parsed = BeautifulSoup(content, features='lxml')
    return parsed


async def worker(receiver):
    """Consume ``(client, path)`` jobs from ``receiver`` and scrape each page.

    For every job the page at ``mainurl + path`` is fetched with the given
    client and parsed, and one row is appended to the module-level ``allin``
    list: title, author, the '-'-separated parts of the published date, and
    the list of paragraph texts.
    """
    async with receiver:
        async for job in receiver:
            client, new = job
            url = mainurl + new
            response = await client.get(url)
            page = await get_soup(response.text)

            # Paragraphs directly under the article body, except a last-child one.
            paragraphs = []
            for node in page.select('.entry-content > p:not(:last-child)'):
                paragraphs.append(node.text)

            title = page.select_one('.wp-block-post-title').text
            author = page.select_one('div.author-link a').text

            # Keep only the part before 'T', then split it on '-'.
            stamp = page.select_one('[itemprop="article:published_time"]')['content']
            date_parts = stamp.split('T')[0].split('-')

            allin.append([title, author, *date_parts, paragraphs])


async def main():
    """Start five workers and feed them every entry of ``news``.

    A single HTTP client is shared by all jobs. Jobs travel over a trio
    memory channel opened with a buffer size of 0; each worker gets its own
    clone of the receiving end.
    """
    async with httpx.AsyncClient(timeout=None) as client:
        async with trio.open_nursery() as nursery:
            send_channel, receive_channel = trio.open_memory_channel(0)

            async with receive_channel:
                for _ in range(5):
                    nursery.start_soon(worker, receive_channel.clone())

                # Leaving this block closes the sending end once all jobs
                # have been handed over.
                async with send_channel:
                    for new in news:
                        await send_channel.send([client, new])


if __name__ == "__main__":
    # Run the scrape to completion, then tabulate, show and save the rows.
    trio.run(main)
    df = pd.DataFrame(
        data=allin,
        columns=['Title', 'Author', 'Year', 'Month', 'Day', 'Paragraphs'],
    )
    print(df)
    df.to_csv(path_or_buf='data.csv', index=False)

Wow, amazing, thanks for showing me the right path to using async/await... — graj499, Jan 19 '22 at 21:36

concurrent.futures is not working properly with BeautifulSoup: not all the links are fetched

1 Answer