We use Scrapy to crawl a website that requires a login. It is a single website with several different pages to crawl, so we have, for example, 3 different spiders but only need to log in once. We therefore tried to share one Selenium driver across all spiders and to run the spiders sequentially:
#...
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import defer, reactor
#...

class LoginAndCrawl:
    login_url = "https://example.com"
    retry_count = 0
    max_retries = 10
    webdriver_timeout = 30
    crawler_delay = 1

    def __init__(self):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        self.prerender()
        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())
        self.crawl()
        reactor.run()
        self.driver.close()

    @defer.inlineCallbacks
    def crawl(self):
        # run the spiders one after another, all sharing the same logged-in driver
        yield self.runner.crawl(MySpider1, driver=self.driver)
        yield self.runner.crawl(MySpider2, driver=self.driver)
        yield self.runner.crawl(MySpider3, driver=self.driver)
        reactor.stop()

    def prerender(self):
        # log in, retrying up to max_retries times
        try:
            self.log_in()
        except Exception:
            self.retry_count += 1
            if self.retry_count > self.max_retries:
                self.driver.close()
                self.driver = None
            else:
                self.prerender()

    def log_in(self):
        # ... login code
        pass

helper = LoginAndCrawl()

class MySpider1(AbstractSpider):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(**kwargs)
        self.driver = driver
        # do some stuff

class MySpider2(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)
        # do some stuff

class MySpider3(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)
        # do some stuff
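For completeness: the items are written to Elasticsearch by an item pipeline enabled in settings.py. The snippet below is only a rough sketch of that wiring; the class path and setting names are the ones used by the scrapy-elasticsearch package and may not match our exact configuration:

ITEM_PIPELINES = {
    # scrapy-elasticsearch pipeline that ships crawled items to Elasticsearch
    'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline': 500,
}
ELASTICSEARCH_SERVERS = ['localhost']
ELASTICSEARCH_INDEX = 'my-index'    # placeholder index name
ELASTICSEARCH_UNIQ_KEY = 'url'      # optional, placeholder field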
If we run just one spider, everything is fine. But as soon as we run more than one, the crawled documents are stored multiple times in our Elasticsearch index: every document from MySpider1 is stored 3 times, every document from MySpider2 twice, and every document from MySpider3 once.
We checked whether the duplicates already appear in our own pipeline before the items are passed to Elasticsearch, but no duplicates are passed from our side (roughly checked as sketched below).
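The check was essentially an extra debug pipeline stage that remembers every item it sees and warns on repeats. A minimal sketch, assuming a hypothetical unique field item_id on the items (the field name is a placeholder):

class DuplicateCheckPipeline:
    # debug-only stage: warn if the same item id passes through twice for one spider
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        key = (spider.name, item.get('item_id'))  # 'item_id' is a placeholder field
        if key in self.seen:
            spider.logger.warning("duplicate item reached the pipeline: %s", key)
        self.seen.add(key)
        return item

This never reported a duplicate, which is why we think each spider hands every document to the Elasticsearch stage only once.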
Our impression is that the Elasticsearch pipeline somehow keeps the documents from the spiders that have already finished and stores them again for every spider that runs afterwards.
Is there a known issue with this implementation? Can someone confirm this behavior? Is there any way to fix the problem?