1

I was trying to write the code for processing an html webpage and create an excel sheet out of it. There is the error throwing up. Please let me know if anyone can help. A lot of code which i ran is commented. I wanted to include that too.

book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Product List")

i=0
for row in soup.findAll('li', { "class" : "product-link" }):
    link = row.a['href']
    #print(link)
    #print(link[23:])

    conn = http.client.HTTPSConnection("www.sanjamar.com")
    conn.request("GET", link[23:])
    req = conn.getresponse()
    #print(req.status, req.reason) 
    data2 = req.read()
    soup2=BeautifulSoup(data2)

    Name=soup2.find('title')
    Name_text = Name.text
    words = Name_text.split('|')
    Name = words[0]
    print(Name)

    if len(soup2.find_all("div", {"id":"productDescription"})) > 0:
        Desc = soup2.find('div', {"id":"productDescription"})
        ProdDesc = Desc.p
        #print(ProdDesc)

        if ProdDesc != None:
            Desc == ProdDesc.text

    if len(soup2.find_all("div", {"id":"productBenefits"})) > 0:
        Feat = soup2.find('div',{"id":"productBenefits"})
        ProdFeat = Feat.ul
        #print(ProdFeat)

        if ProdFeat != None:
            Feat == ProdFeat.text

    if len(soup2.find_all("table", {"class":"mceItemTable"})) > 0:
        Spec = soup2.find('table',{"class":"mceItemTable"})
        #print(Spec)
        if Spec != None:
            specrow = ''
            for row in Spec.findAll('tr'):
                specrow = specrow + ',' + row.text

        if specrow != None:
            Spec = specrow[1:]

        words = Spec.partition(",")[2]        


    record = (Name,Desc.text,Feat.text[20:],words)
    print(record)

    for col_index, item in enumerate(record):
        sheet1.write(i, col_index, item)
    i += 1

book.save("Sanjamar1.xls")

'''
    if len(soup2.find_all("table", {"class":"variations"})) > 0:
    options = soup2.find('table',{"class":"variations"})
    Prodoptions = options.select
    print('options')
    print(Prodoptions)

if len(soup2.find_all("div", {"id":"availableColorsWrapper"})) > 0:
    options = soup2.find('div',{"id":"availableColorsWrapper"})
    ProdColors = options.ul
    print('Colors')
    print(ProdColors)

if len(soup2.find_all("a", {"class":"fancybox-media"})) > 0:
    options = soup2.find('a',{"class":"fancybox-media"})
    ProdVideos = options['href']
    print('Videos')
    print(ProdVideos)

'''
'''
j = 0
if len(soup2.find_all("a", {"class":"lit-link"})) > 0:
    for row1 in soup2.findAll('a', {"class":"lit-link"}):
        file_download1 = row1['href'] 
        words = file_download1.split('/')
        print (words[-1])
        if words[-1][-3:]=="pdf":
            print (file_download1)
            if file_download1 != None:
                if file_download1 != '': 
                    try:
                        resource = urllib.request.urlopen(file_download1)
                        file_name1 = words[-1]
                        output = open(file_name1,"wb")
                        output.write(resource.read())
                        output.close()
                    except urllib.request.HTTPError:
                        print('Nthn')


j += 1
   '''

#i += 1

The error its showing up is:

TypeError                                 Traceback (most recent call last)
<ipython-input-17-468fc8825863> in <module>()
     63     i += 1
     64 
---> 65 book.save('Sanjamar.xls')
     66 
     67 '''

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in save(self, 
filename_or_stream)
    708 
    709         doc = CompoundDoc.XlsDoc()
--> 710         doc.save(filename_or_stream, self.get_biff_data())
    711 
    712 

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in 
get_biff_data(self)
    672         all_links          = self.__all_links_rec()
    673 
--> 674         shared_str_table   = self.__sst_rec()
    675         after = country + all_links + shared_str_table
    676 

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in _ 
_sst_rec(self)
    634 
    635     def __sst_rec(self):
--> 636         return self.__sst.get_biff_record()
    637 
    638     def __ext_sst_rec(self, abs_stream_pos):

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in 
get_biff_record(self)
     77                 self._add_to_sst(s)
     78             else:
---> 79                 self._add_rt_to_sst(s)
     80         del data
     81         self._new_piece()

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in 
_add_rt_to_sst(self, rt)
    106 
    107     def _add_rt_to_sst(self, rt):
--> 108         rt_str, rt_fr = upack2rt(rt, self.encoding)
    109         is_unicode_str = rt_str[2] == b'\x09'[0]
    110         if is_unicode_str:

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\UnicodeUtils.py in 
upack2rt(rt, encoding)
     84             # code in Rows.py ensures that
     85             # fontx can be None only for the first piece
---> 86             fr += pack('<HH', offset, fontx)
     87         # offset is the number of MS C wchar characters.
     88         # That is 1 if c <= u'\uFFFF' else 2

TypeError: must be str, not bytes
Santosh
  • 103
  • 2
  • 4
  • 13
  • 1
    that can't be the entire error message, please show the entire traceback message as it'd be relevant to aiding the issue. – Tadhg McDonald-Jensen Jun 02 '17 at 15:55
  • This might be helpful https://stackoverflow.com/questions/6224052/what-is-the-difference-between-a-string-and-a-byte-string – Will Da Silva Jun 02 '17 at 15:55
  • @Tadhg McDonald-Jensen Added the whole error message. Please have a look – Santosh Jun 02 '17 at 16:00
  • right before the line `sheet1.write(i, col_index, item)` I'd like you to do `print(type(item))` and tell me if they all say `str` /`bytes` or if it's mixed. – Tadhg McDonald-Jensen Jun 02 '17 at 16:05
  • @Tadhg McDonald-Jensen There are combinations of `str` and `list` types. And not always. One row could be all `str`. And some of the rows could be 3 `str`and 1 `list` – Santosh Jun 02 '17 at 16:09
  • not sure passing a `list` as the label is valid, take a look at [the docs for the write method](http://xlwt.readthedocs.io/en/latest/api.html#xlwt.Worksheet.Worksheet.write). but if that isn't the issue this looks like either a version incompatibility or a bug. – Tadhg McDonald-Jensen Jun 02 '17 at 16:13
  • Thanks, The issue is with `list`. I used the same variable called `words` in two places. That was causing to retain a list value in some cases. resolved. – Santosh Jun 02 '17 at 16:52
  • @TadhgMcDonald-Jensen Please look into my new question – Santosh Jun 06 '17 at 16:10
  • for future reference, adding a link to the new question you want me to look at would be helpful. – Tadhg McDonald-Jensen Jun 09 '17 at 03:17

0 Answers0