3

I would like to create a web scraper that collects the specific holdings of an ETF. I found that Zacks.com provides a nice list of what I am looking for. I am trying to use BeautifulSoup; however, I am having a difficult time pinpointing the data in the "Symbol" column. What do I need to change or add to collect all the symbols as a list?

import requests
from bs4 import BeautifulSoup

tickers = ["XLU","XLRE"] #list of tickers whose financial data needs to be extracted
# NOTE: financial_dir is created here but never written to or read in this snippet.
financial_dir = {}


# For each ticker, download its Zacks holdings page and look for the holdings table.
for ticker in tickers:
 #getting holdings data from Zacks for the given ticker
 # NOTE: temp_dir is reset for every ticker but never populated in this snippet.
 temp_dir = {}
 url = 'https://www.zacks.com/funds/etf/'+ticker+'/holding'
 page = requests.get(url)
 page_content = page.content
 soup = BeautifulSoup(page_content,'html.parser')
 # Every <table> element whose id attribute is "etf_holding_table".
 tabl = soup.find_all("table", {"id" : "etf_holding_table"})
 for t in tabl:
     # Collects every <button> element inside the table; the result is
     # overwritten on each iteration and not used any further here.
     rows = t.find_all("button")
mshudoma
  • 41
  • 1
  • 4
  • 2
    The major ETF sponsors all provide CSV files with the holdings of each ETF. Why scrape outdated info from Zacks instead of getting up-to-date info directly from the ETF sponsor? – Stripedbass Nov 27 '20 at 16:30

2 Answers

4
import requests
import re

# Tickers substituted, one at a time, into the URL template passed to main().
keys = ['XLU', 'XLRE']


# Headers applied to the requests session in main(); the User-Agent is a
# desktop Firefox string.
# NOTE(review): presumably needed because the site rejects the default
# client User-Agent — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
}


def main(url):
    """Fetch each fund's holdings page and extract the holding symbols.

    For every entry of the module-level ``keys`` list, the page at
    ``url.format(key)`` is downloaded with the module-level ``headers``
    and its raw text is searched for holding symbols. Progress and the
    extracted list are printed, as before.

    Args:
        url: URL template containing one ``{}`` placeholder for the key.

    Returns:
        A dict mapping each key to the list of symbols found on its page
        (an empty list when nothing matched).
    """
    # Matches the literal text  etf\/<symbol>\  in the raw response and
    # captures <symbol>. Presumably the links sit inside an embedded,
    # backslash-escaped payload rather than plain HTML — confirm.
    symbol_pattern = re.compile(r'etf\\\/(.*?)\\')
    holdings = {}
    with requests.Session() as req:
        req.headers.update(headers)
        for key in keys:
            # Without a timeout, requests can wait indefinitely on an
            # unresponsive server.
            r = req.get(url.format(key), timeout=30)
            print(f"Extracting: {r.url}")
            goal = symbol_pattern.findall(r.text)
            print(goal)
            holdings[key] = goal
    return holdings


# Run the extraction; the {} placeholder is filled with each entry of keys.
main("https://www.zacks.com/funds/etf/{}/holding")

Output:

Extracting: https://www.zacks.com/funds/etf/XLU/holding
['NEE', 'DUK', 'D', 'SO', 'AEP', 'XEL', 'EXC', 'SRE', 'WEC', 'ES', 'PEG', 'AWK', 'ED', 'DTE', 'PPL', 'ETR', 'AEE', 'EIX', 'CMS', 'FE', 'LNT', 'AES', 'ATO', 'EVRG', 'CNP', 'PNW', 'NI', 'NRG']   
Extracting: https://www.zacks.com/funds/etf/XLRE/holding
['AMT', 'PLD', 'CCI', 'EQIX', 'DLR', 'PSA', 'SBAC', 'WELL', 'AVB', 'O', 'WY', 'SPG', 'ARE', 'EQR', 'VTR', 'CBRE', 'PEAK', 'EXR', 'DRE', 'MAA', 'ESS', 'BXP', 'UDR', 'HST', 'IRM', 'REG', 'VNO', 'AIV', 'FRT', 'KIM', 'SLG']
1

To answer Andrew Hick's comment, here is a version you can use to get weights for ETFs and mutual funds. You'll have to play with the formatting a bit.

    import requests
    import re
    
    # Tickers substituted into the URL template by main_etf().
    etf_keys = ['XLU', 'XLRE']
    # Tickers substituted into the URL template by main_mutual().
    mutual_fund_keys = ['VFTAX']
    
    
    # Headers applied to the requests session in both functions; the
    # User-Agent is a desktop Firefox string.
    # NOTE(review): presumably needed because the site rejects the default
    # client User-Agent — confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
    }
   
    
    def main_etf(url):
        """Fetch each ETF's holdings page and extract symbols and row details.

        For every entry of the module-level ``etf_keys`` list, the page at
        ``url.format(key)`` is downloaded with the module-level ``headers``
        and its raw text is searched with two regular expressions. Progress
        and both extracted lists are printed, as before.

        Args:
            url: URL template containing one ``{}`` placeholder for the key.

        Returns:
            A dict mapping each key to a ``(symbols, details)`` tuple, where
            both items are lists of the strings captured on that page
            (empty lists when nothing matched).
        """
        # Captures <symbol> from the literal text  etf\/<symbol>\  in the
        # raw response.
        symbol_pattern = re.compile(r'etf\\\/(.*?)\\')
        # Captures the text between the closing tags of a symbol link and
        # the following "report_document newwin" link. Per the accompanying
        # answer this is where the weights are — the captured string is
        # unformatted and needs further cleaning.
        details_pattern = re.compile(
            r'<\\\/span><\\\/span><\\\/a>",(.*?), "<a class=\\\"report_document newwin\\'
        )
        results = {}
        with requests.Session() as req:
            req.headers.update(headers)
            for key in etf_keys:
                # Without a timeout, requests can wait indefinitely on an
                # unresponsive server.
                r = req.get(url.format(key), timeout=30)
                print(f"Extracting: {r.url}")
                etf_stock_list = symbol_pattern.findall(r.text)
                print(etf_stock_list)
                etf_stock_details_list = details_pattern.findall(r.text)
                print(etf_stock_details_list)
                results[key] = (etf_stock_list, etf_stock_details_list)
        return results
    
    
    def main_mutual(url):
        """Fetch each mutual fund's holdings page and extract symbols and details.

        For every entry of the module-level ``mutual_fund_keys`` list, the
        page at ``url.format(key)`` is downloaded with the module-level
        ``headers`` and its raw text is searched with two regular
        expressions. Progress and both extracted lists are printed, as
        before.

        Args:
            url: URL template containing one ``{}`` placeholder for the key.

        Returns:
            A dict mapping each key to a ``(symbols, details)`` tuple, where
            both items are lists of the strings captured on that page
            (empty lists when nothing matched).
        """
        # Captures <symbol> from the literal text
        # \/mutual-fund\/quote\/<symbol>\  in the raw response.
        symbol_pattern = re.compile(r'\\\/mutual-fund\\\/quote\\\/(.*?)\\')
        # Captures the text between the closing tags of a symbol link and
        # the next  %", "  sequence. Per the accompanying answer this is
        # where the weights are — the captured string is unformatted and
        # needs further cleaning.
        details_pattern = re.compile(
            r'"sr-only\\\"><\\\/span><\\\/span><\\\/a>",(.*?)%", "'
        )
        results = {}
        with requests.Session() as req:
            req.headers.update(headers)
            for key in mutual_fund_keys:
                # Without a timeout, requests can wait indefinitely on an
                # unresponsive server.
                r = req.get(url.format(key), timeout=30)
                print(f"Extracting: {r.url}")
                mutual_stock_list = symbol_pattern.findall(r.text)
                print(mutual_stock_list)
                mutual_stock_details_list = details_pattern.findall(r.text)
                print(mutual_stock_details_list)
                results[key] = (mutual_stock_list, mutual_stock_details_list)
        return results
                
    
    # Run both extractions; the {} placeholder in each template is filled
    # with every entry of etf_keys and mutual_fund_keys respectively.
    main_etf("https://www.zacks.com/funds/etf/{}/holding")
    main_mutual("https://www.zacks.com/funds/mutual-fund/quote/{}/holding")