I'm new to Python and BeautifulSoup, so bear with me...
I'm trying to figure out how to remove the Doctype from an HTML file using BeautifulSoup4, but I can't work out exactly how to achieve this.
def saveToText(self):
    """Flatten the report HTML into plain text after asking where to save it.

    Prompts for a target filename (defaulting to ``<CharName>_report.txt``
    inside the parent's ``ReportPath``), makes sure it ends in ``.txt``,
    then parses ``self.reportHtml`` with BeautifulSoup and strips it down:
    the doctype, the first ``<font>`` and the ``<head>`` are removed,
    purely structural tags are unwrapped, and line-oriented tags are turned
    into newlines / rulers.

    Returns without doing anything if the dialog yields an empty filename.
    """
    # Function-scope import: Doctype is only needed here, and this keeps the
    # block independent of whatever the file imports from bs4 at the top.
    from bs4 import Doctype

    filename = os.path.join(self.parent.ReportPath, str(self.parent.CharName.text()) + "_report.txt")
    filename, _ = QFileDialog.getSaveFileName(self, "Save Report", filename, "Text (*.txt);;All Files (*.*)")
    if not filename:
        return

    filename = str(filename)
    if not filename.endswith('.txt'):
        filename += '.txt'

    try:
        soup = BeautifulSoup(self.reportHtml, "lxml")

        # The doctype is a top-level node of the soup, not a tag, so it is
        # found by type. Iterate over a copy because extract() mutates
        # soup.contents.
        for node in list(soup.contents):
            if isinstance(node, Doctype):
                node.extract()

        # Drop the first occurrence of these tags together with their content.
        # find() returns None when the tag is absent, so test for that instead
        # of catching AttributeError.
        for name in ('font', 'head'):
            tag = soup.find(name)
            if tag is not None:
                tag.extract()

        # Unwrap the document shell (first occurrence only), keeping children.
        for name in ('html', 'body'):
            tag = soup.find(name)
            if tag is not None:
                tag.unwrap()

        # Remove these tags everywhere but keep their content.
        for name in ('b', 'table', 'td'):
            for tag in soup.find_all(name):
                tag.unwrap()

        for br in soup.find_all('br'):
            br.replace_with('\n')

        # Block-like tags: keep the tag, add a line break after it.
        for name in ('center', 'dl', 'dt'):
            for tag in soup.find_all(name):
                tag.insert_after('\n')

        for hr in soup.find_all('hr'):
            hr.replace_with(('-' * 80) + '\n')

        for tr in soup.find_all('tr'):
            tr.insert_before(' ')
            tr.insert_after('\n')

        # NOTE(review): the result is only printed; nothing is written to
        # `filename` yet, so the IOError handler below is presumably a
        # placeholder for the eventual file write -- confirm.
        print(soup)
    except IOError:
        QMessageBox.critical(None, 'Error!', 'Error writing to file: ' + filename, 'OK')
I tried using:
# Attempted doctype removal. NOTE: `e` is never assigned anywhere in this
# fragment, which is why it is reported as an unresolved reference; the check
# presumably belongs inside a loop over the parsed document's nodes.
from bs4 import Doctype
if isinstance(e, Doctype):
e.extract()
but that complains that 'e' is an unresolved reference. I've searched through the documentation and Google, but I haven't found anything that works.
On a side note, is there a way to reduce this code?