'ascii' codec can't encode character

Question

I am trying to parse an HTML link into the code and take its source code as list of strings. As I have to use get some relevant data from it, I am decoding everything into UTF-8 scheme.

I am also using beautifulsoup4 which extracts the text in decoded form.

This is my code that I have used.

def do_underline(line,mistakes):
    last = u'</u></font>'
    first = u"<u><font color='red'>"
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
    lenm = len(mistakes)
    for i in range(lenm):
        a.insert(mistakes[lenm-i-1][2],last)
        a.insert(mistakes[lenm-i-1][1],first)
    b = u''
    return b.join(a)

def readURL(u):
    """
    URL -> List

    Opens a webpage's source code and extract it text
    along with blank and new lines.
    enumerate all lines.(including blank and new lines

    """
    global line_dict,q
    line_dict = {}
    p = opener.open(u)
    p1 = p.readlines()
    q = [i.decode(encoding = 'UTF-8',errors='ignore') for i in p1]
    q1 = [BeautifulSoup(i).get_text() for i in q]
    q2 = list(enumerate(q1))
    line_dict = {i:j for (i,j) in enumerate(q)}
    return q2

def process_file(f):
    """
    (.html file) -> List of Spelling Mistakes
    """
    global line_dict
    re = readURL(f)
    de = del_blankempty(re)
    fd = form_dict(de)

    fflist = []
    chklst = []

    for i in fd:
        chklst = chklst + list_braces(i,line_dict)
        fflist = fflist + find_index_mistakes(i,fd)

    final_list = list(set(is_inside_braces_or_not(chklst,fflist)))

    final_dict = {i:sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])),key=lambda student: student[1]) for i in fd}

    for i in line_dict:
        if i in fd:
            line_dict[i] = do_underline(line_dict[i],final_dict[i])
        else:
            line_dict[i] = line_dict[i]

    create_html_file(line_dict)
    print "Your Task is completed"

def create_html_file(a):
    import io
    fl = io.open('Spellcheck1.html','w', encoding='UTF-8')
    for i in a:
        fl.write(a[i])
    print "Your HTML text file is created"

I am getting the following error every time i run the script.

Traceback (most recent call last):
  File "checker.py", line 258, in <module>
    process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
  File "checker.py", line 243, in process_file
    line_dict[i] = do_underline(line_dict[i],final_dict[i])
  File "checker.py", line 89, in do_underline
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
  File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
    return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)

Any suggestions how i can remove this error. if there is a way which decodes evrything into UTF-8 coming from the given link, then i think it will solve the problem.

From the exception, I'd say that `i` is *already* encoded (a byte string), so Python tries to **decode** it first so that it can then handle your request to encode back to UTF-8. — Martijn Pieters, Jun 18 '14 at 09:48
*Where is the definition of do_underline?* It was there in the text above, I've put it in the tags properly now. — SiHa, Jun 18 '14 at 10:56
This has helpful information: http://nedbatchelder.com/text/unipain.html — Ned Batchelder, Jun 18 '14 at 11:11

'ascii' codec can't encode character

0 Answers0