I'm trying to get the .xml data file from SEC filing index pages. Its link is in the second table on the page. However, if I reach a page that doesn't have an .xml file, I want the HTML version instead, which is in the first (and only) table. Could someone please help me understand how to skip the first table when there are two, and how to get the first a['href'] from the first table when only one is present?
from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
# Filing index pages to scan; each entry must be a quoted string.
linklist = [
    "https://www.sec.gov/Archives/edgar/data/1070789/000149315217011092/0001493152-17-011092-index.htm",
    "https://www.sec.gov/Archives/edgar/data/1592603/000139160917000254/0001391609-17-000254-index.htm",
]


def select_document_url(tables):
    """Return the href of the first link in the preferred table.

    When the page has two or more matching tables, the second one is used
    (that is where the .xml link is expected); when there is exactly one,
    that table is used instead.

    Args:
        tables: Sequence of table elements (e.g. the result of
            ``soup.find_all(class_="tableFile")``). Each element must expose
            its first anchor as ``.a`` (``None`` when it has no anchor).

    Returns:
        The ``href`` value of the first anchor in the chosen table, or
        ``None`` if there are no tables, the chosen table has no anchor, or
        the anchor has no ``href``.
    """
    if not tables:
        return None
    # Use len() on the result list: ``.count`` is a method, not a size, and
    # index 1 is only valid when more than one table was found.
    chosen = tables[1] if len(tables) > 1 else tables[0]
    anchor = chosen.a
    if anchor is None:
        return None
    return anchor.get("href")


def fetch_document_url(index_url):
    """Download one filing index page and return its preferred document href.

    Args:
        index_url: URL of the filing index page to fetch.

    Returns:
        Whatever ``select_document_url`` returns for the page's
        ``tableFile`` tables (an href string or ``None``).
    """
    response = urlopen(index_url)
    try:
        markup = response.read().decode("latin-1", "ignore")
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    soup = BeautifulSoup(markup, "lxml")
    return select_document_url(soup.find_all(class_="tableFile"))


if __name__ == "__main__":
    for link in linklist:
        print(fetch_document_url(link))