I was trying to write the code for processing an html webpage and create an excel sheet out of it. There is the error throwing up. Please let me know if anyone can help. A lot of code which i ran is commented. I wanted to include that too.
book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Product List")
i=0
for row in soup.findAll('li', { "class" : "product-link" }):
link = row.a['href']
#print(link)
#print(link[23:])
conn = http.client.HTTPSConnection("www.sanjamar.com")
conn.request("GET", link[23:])
req = conn.getresponse()
#print(req.status, req.reason)
data2 = req.read()
soup2=BeautifulSoup(data2)
Name=soup2.find('title')
Name_text = Name.text
words = Name_text.split('|')
Name = words[0]
print(Name)
if len(soup2.find_all("div", {"id":"productDescription"})) > 0:
Desc = soup2.find('div', {"id":"productDescription"})
ProdDesc = Desc.p
#print(ProdDesc)
if ProdDesc != None:
Desc == ProdDesc.text
if len(soup2.find_all("div", {"id":"productBenefits"})) > 0:
Feat = soup2.find('div',{"id":"productBenefits"})
ProdFeat = Feat.ul
#print(ProdFeat)
if ProdFeat != None:
Feat == ProdFeat.text
if len(soup2.find_all("table", {"class":"mceItemTable"})) > 0:
Spec = soup2.find('table',{"class":"mceItemTable"})
#print(Spec)
if Spec != None:
specrow = ''
for row in Spec.findAll('tr'):
specrow = specrow + ',' + row.text
if specrow != None:
Spec = specrow[1:]
words = Spec.partition(",")[2]
record = (Name,Desc.text,Feat.text[20:],words)
print(record)
for col_index, item in enumerate(record):
sheet1.write(i, col_index, item)
i += 1
book.save("Sanjamar1.xls")
'''
if len(soup2.find_all("table", {"class":"variations"})) > 0:
options = soup2.find('table',{"class":"variations"})
Prodoptions = options.select
print('options')
print(Prodoptions)
if len(soup2.find_all("div", {"id":"availableColorsWrapper"})) > 0:
options = soup2.find('div',{"id":"availableColorsWrapper"})
ProdColors = options.ul
print('Colors')
print(ProdColors)
if len(soup2.find_all("a", {"class":"fancybox-media"})) > 0:
options = soup2.find('a',{"class":"fancybox-media"})
ProdVideos = options['href']
print('Videos')
print(ProdVideos)
'''
'''
j = 0
if len(soup2.find_all("a", {"class":"lit-link"})) > 0:
for row1 in soup2.findAll('a', {"class":"lit-link"}):
file_download1 = row1['href']
words = file_download1.split('/')
print (words[-1])
if words[-1][-3:]=="pdf":
print (file_download1)
if file_download1 != None:
if file_download1 != '':
try:
resource = urllib.request.urlopen(file_download1)
file_name1 = words[-1]
output = open(file_name1,"wb")
output.write(resource.read())
output.close()
except urllib.request.HTTPError:
print('Nthn')
j += 1
'''
#i += 1
The error its showing up is:
TypeError Traceback (most recent call last)
<ipython-input-17-468fc8825863> in <module>()
63 i += 1
64
---> 65 book.save('Sanjamar.xls')
66
67 '''
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in save(self,
filename_or_stream)
708
709 doc = CompoundDoc.XlsDoc()
--> 710 doc.save(filename_or_stream, self.get_biff_data())
711
712
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in
get_biff_data(self)
672 all_links = self.__all_links_rec()
673
--> 674 shared_str_table = self.__sst_rec()
675 after = country + all_links + shared_str_table
676
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in _
_sst_rec(self)
634
635 def __sst_rec(self):
--> 636 return self.__sst.get_biff_record()
637
638 def __ext_sst_rec(self, abs_stream_pos):
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in
get_biff_record(self)
77 self._add_to_sst(s)
78 else:
---> 79 self._add_rt_to_sst(s)
80 del data
81 self._new_piece()
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in
_add_rt_to_sst(self, rt)
106
107 def _add_rt_to_sst(self, rt):
--> 108 rt_str, rt_fr = upack2rt(rt, self.encoding)
109 is_unicode_str = rt_str[2] == b'\x09'[0]
110 if is_unicode_str:
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\UnicodeUtils.py in
upack2rt(rt, encoding)
84 # code in Rows.py ensures that
85 # fontx can be None only for the first piece
---> 86 fr += pack('<HH', offset, fontx)
87 # offset is the number of MS C wchar characters.
88 # That is 1 if c <= u'\uFFFF' else 2
TypeError: must be str, not bytes