I am trying to parse an HTML link into the code and take its source code as list of strings. As I have to use get some relevant data from it, I am decoding everything into UTF-8 scheme.
I am also using beautifulsoup4 which extracts the text in decoded form.
This is my code that I have used.
def do_underline(line,mistakes):
last = u'</u></font>'
first = u"<u><font color='red'>"
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
lenm = len(mistakes)
for i in range(lenm):
a.insert(mistakes[lenm-i-1][2],last)
a.insert(mistakes[lenm-i-1][1],first)
b = u''
return b.join(a)
def readURL(u):
"""
URL -> List
Opens a webpage's source code and extract it text
along with blank and new lines.
enumerate all lines.(including blank and new lines
"""
global line_dict,q
line_dict = {}
p = opener.open(u)
p1 = p.readlines()
q = [i.decode(encoding = 'UTF-8',errors='ignore') for i in p1]
q1 = [BeautifulSoup(i).get_text() for i in q]
q2 = list(enumerate(q1))
line_dict = {i:j for (i,j) in enumerate(q)}
return q2
def process_file(f):
"""
(.html file) -> List of Spelling Mistakes
"""
global line_dict
re = readURL(f)
de = del_blankempty(re)
fd = form_dict(de)
fflist = []
chklst = []
for i in fd:
chklst = chklst + list_braces(i,line_dict)
fflist = fflist + find_index_mistakes(i,fd)
final_list = list(set(is_inside_braces_or_not(chklst,fflist)))
final_dict = {i:sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])),key=lambda student: student[1]) for i in fd}
for i in line_dict:
if i in fd:
line_dict[i] = do_underline(line_dict[i],final_dict[i])
else:
line_dict[i] = line_dict[i]
create_html_file(line_dict)
print "Your Task is completed"
def create_html_file(a):
import io
fl = io.open('Spellcheck1.html','w', encoding='UTF-8')
for i in a:
fl.write(a[i])
print "Your HTML text file is created"
I am getting the following error every time i run the script.
Traceback (most recent call last):
File "checker.py", line 258, in <module>
process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
File "checker.py", line 243, in process_file
line_dict[i] = do_underline(line_dict[i],final_dict[i])
File "checker.py", line 89, in do_underline
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)
Any suggestions how i can remove this error. if there is a way which decodes evrything into UTF-8 coming from the given link, then i think it will solve the problem.