I have created a crawler with Scrapy for a dynamic JavaScript website. Sometimes it works, sometimes it doesn't, which made me wonder whether the failures are caused by an unstable internet connection.
import csv

import scrapy
from scrapy_splash import SplashRequest
wait_script = """
function main(splash)
    splash:set_user_agent(splash.args.ua)
    assert(splash:go(splash.args.url))
    splash:wait(5)
    -- splash:select requires Splash 2.3; poll until the product tiles exist
    while not splash:select('.shelfProductTile-descriptionLink') do
        splash:wait(1)
    end
    return {html=splash:html()}
end
"""
class IceTeaSpider(scrapy.Spider):
    name = "icetea"

    def start_requests(self):
        url = "<my-url>"
        # lua_source only runs against the 'execute' endpoint, and the
        # script expects a 'ua' argument for splash:set_user_agent
        yield SplashRequest(
            url,
            endpoint='execute',
            args={'lua_source': wait_script, 'ua': 'Mozilla/5.0'},
        )
    def parse(self, response):
        breadcrumbs = response.xpath(
            '//*[@id="search-content"]/div/shared-breadcrumb/div/ul'
        )
        products = response.css("a.shelfProductTile-descriptionLink::text").getall()
        for html in breadcrumbs:
            yield {
                "category": html.css("a.ng-star-inserted::text").getall()
                + html.css("span.ng-star-inserted::text").getall(),
                "products": products,
            }
            with open('breadcrumbs_result.csv', 'w') as f:
                write = csv.writer(f)
                write.writerows([['Results'], [str(html.css("a.ng-star-inserted::text").getall())]])
        with open('products_result.csv', 'w') as f:
            write = csv.writer(f)
            write.writerow(['Product Name'])
            # one product name per row, not all names on a single row
            write.writerows([p] for p in products)
Is there something simple I can do, like checking whether products actually returned anything and, if not, running the crawler again? Or am I missing something?
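To make the question concrete, here is roughly what I have in mind (just a sketch; the retry_count meta key and the cap of 3 attempts are things I made up): re-yield the same SplashRequest from parse whenever products comes back empty:

    def parse(self, response):
        products = response.css("a.shelfProductTile-descriptionLink::text").getall()
        if not products:
            # The page apparently never rendered, so retry the same URL.
            # 'retry_count' is my own meta key, capped so this can't loop forever;
            # dont_filter=True stops Scrapy's dupefilter from dropping the retry.
            retries = response.meta.get('retry_count', 0)
            if retries < 3:
                yield SplashRequest(
                    response.url,
                    endpoint='execute',
                    args={'lua_source': wait_script, 'ua': 'Mozilla/5.0'},
                    meta={'retry_count': retries + 1},
                    dont_filter=True,
                )
            return
        # ... otherwise continue with the normal parsing/CSV writing above ...

Would something like that be considered good practice, or does Scrapy have a built-in mechanism that handles this better?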