For the life of me I cannot figure this out. I have been scraping SEC filings in a loop and want to read the tables on each web page into a DataFrame. Most of the URLs work, but a few fail. I tried inspecting the actual HTML for differences between the pages that work and the ones that don't, but my understanding of HTML is not the greatest.
import pandas as pd
# One of the filing pages that triggers the IndexError.
url = 'https://www.sec.gov/Archives/edgar/data/0000892534/000117891307002012/zk74243.htm'

# Parse the page's tables with the lxml backend; read_html builds one frame
# per table it finds and returns them together, so `df` holds a collection
# of frames rather than a single DataFrame.
df = pd.read_html(io=url, flavor="lxml")
The error it raises is always the same for the problematic URLs:
IndexError Traceback (most recent call last)
<ipython-input-13-784175815486> in <module>
----> 1 df = pd.read_html(url, flavor = 'lxml')
C:\Python\Python38\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1088 )
1089 _validate_header_arg(header)
-> 1090 return _parse(
1091 flavor=flavor,
1092 io=io,
C:\Python\Python38\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
915 for table in tables:
916 try:
--> 917 ret.append(_data_to_frame(data=table, **kwargs))
918 except EmptyDataError: # empty table
919 continue
C:\Python\Python38\lib\site-packages\pandas\io\html.py in _data_to_frame(**kwargs)
791 # fill out elements of body that are "ragged"
792 _expand_elements(body)
--> 793 tp = TextParser(body, header=header, **kwargs)
794 df = tp.read()
795 return df
C:\Python\Python38\lib\site-packages\pandas\io\parsers.py in TextParser(*args, **kwds)
2221 """
2222 kwds["engine"] = "python"
-> 2223 return TextFileReader(*args, **kwds)
2224
2225
C:\Python\Python38\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
893 self.options["has_index_names"] = kwds["has_index_names"]
894
--> 895 self._make_engine(self.engine)
896
897 def close(self):
C:\Python\Python38\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1145 ' "python-fwf")'.format(engine=engine)
1146 )
-> 1147 self._engine = klass(self.f, **self.options)
1148
1149 def _failover_to_python(self):
C:\Python\Python38\lib\site-packages\pandas\io\parsers.py in __init__(self, f, **kwds)
2308 self.num_original_columns,
2309 self.unnamed_cols,
-> 2310 ) = self._infer_columns()
2311
2312 # Now self.columns has the set of columns that we will process.
C:\Python\Python38\lib\site-packages\pandas\io\parsers.py in _infer_columns(self)
2691 columns = [names]
2692 else:
-> 2693 columns = self._handle_usecols(columns, columns[0])
2694 else:
2695 try:
IndexError: list index out of range
Here are some other URLs that give me the same problem:
https://www.sec.gov/Archives/edgar/data/0001119774/000117891309002587/zk97422.htm
https://www.sec.gov/Archives/edgar/data/0001158780/000117891309002357/zk97328.htm