I'm trying to input zip codes taken from a predefined excel file into a website to fetch populated results and write them back to the same file in a new column. To be specific, each zip code should produce a household number, which is what I want to record and enter in the final column of the excel file.
With my current attempt, the script is doing fine. What I wish to do now is apply concurrent.futures
library within the script to make it faster.
import concurrent.futures as futures
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
'referer': 'https://www.unitedstateszipcodes.org/60634/'
}
link = 'https://www.unitedstateszipcodes.org/{}/'
def get_content(company,url):
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.text,"lxml")
household = int(soup.select_one("th:-soup-contains('Occupied Housing Units') + td").get_text(strip=True).replace(",",""))
company.offset(0,3).value = household
return household
if __name__ == '__main__':
wb = load_workbook('input.xlsx')
ws = wb['result']
for company in ws['B']:
if company.value == "Zip Code": continue
if not company.value: break
content_url = link.format(company.value)
print(get_content(company,content_url))
wb.save('input.xlsx')
# if __name__ == "__main__":
# wb = load_workbook('input.xlsx')
# ws = wb['result']
# with futures.ThreadPoolExecutor(max_workers=6) as executor:
# future_to_url = {
# executor.submit(get_content, company, link.format(company.value)): company for company in ws['B']
# }
# futures.as_completed(future_to_url)
# wb.save('input.xlsx')
Here is the excel file for your consideration and this is how it looks like.