I want to use Scrapy and Django to scrape some data, and I implemented it following the scrapy-django-dashboard documentation, but when I run this command:
scrapy crawl post_spider -a id=1 -a do_action=yes
I get this error:
django.core.exceptions.SynchronousOnlyOperation: You cannot call this from an async context - use a thread or sync_to_async.
These are the classes in my model.py in Django:
@python_2_unicode_compatible
class NewsWebsite(models.Model):
    """Django model for a news website.

    Stores a name and a URL, plus optional references to a ``Scraper`` and
    a ``SchedulerRuntime``.
    """

    name = models.CharField(max_length=200)
    url = models.URLField()
    # Both relations are optional (blank/null allowed); with SET_NULL the
    # column is set to NULL when the referenced row is deleted.
    scraper = models.ForeignKey(
        Scraper, blank=True, null=True, on_delete=models.SET_NULL)
    scraper_runtime = models.ForeignKey(
        SchedulerRuntime, blank=True, null=True, on_delete=models.SET_NULL)

    def __str__(self):
        """Return the website's name."""
        return self.name
@python_2_unicode_compatible
class Article(models.Model):
    """Django model for a single article.

    Stores the title, description, URL and thumbnail, plus optional
    references to the owning ``NewsWebsite`` and a ``SchedulerRuntime``.
    """

    title = models.CharField(max_length=200)
    # https://stackoverflow.com/a/44026807
    news_website = models.ForeignKey(
        NewsWebsite, blank=True, null=True, on_delete=models.SET_NULL)
    description = models.TextField(blank=True)
    url = models.URLField(blank=True)
    thumbnail = models.CharField(max_length=200, blank=True)
    # Optional; set to NULL when the referenced runtime row is deleted.
    checker_runtime = models.ForeignKey(
        SchedulerRuntime, blank=True, null=True, on_delete=models.SET_NULL)

    def __str__(self):
        """Return the article's title."""
        return self.title
class ArticleItem(DjangoItem):
    """``DjangoItem`` subclass whose ``django_model`` is ``Article``."""

    django_model = Article
@receiver(pre_delete)
def pre_delete_handler(sender, instance, using, **kwargs):
    """Delete the runtime object attached to an instance being deleted.

    The handler is registered without a sender filter, so the instance type
    is checked here: a ``NewsWebsite`` has its ``scraper_runtime`` deleted,
    an ``Article`` has its ``checker_runtime`` deleted. Instances of any
    other type, or with no runtime set, are left alone.

    Args:
        sender: Passed by the signal; not used.
        instance: The object about to be deleted.
        using: Passed by the signal; not used.
        **kwargs: Any further signal arguments; not used.
    """
    if isinstance(instance, NewsWebsite) and instance.scraper_runtime:
        instance.scraper_runtime.delete()

    if isinstance(instance, Article) and instance.checker_runtime:
        instance.checker_runtime.delete()


# NOTE(review): the @receiver(pre_delete) decorator above presumably already
# connects this handler, which would make this explicit call redundant --
# confirm before removing it.
pre_delete.connect(pre_delete_handler)
And this is my spider:
class ArticleSpider(DjangoSpider):
    """Spider that takes its configuration from a ``NewsWebsite`` object."""

    name = 'article_spider'

    def __init__(self, *args, **kwargs):
        """Set the spider's configuration attributes, then run the parent init.

        Args:
            *args: Forwarded to the parent initializer.
            **kwargs: Passed to ``_set_ref_object`` and forwarded to the
                parent initializer.
        """
        # Must run first: the assignments below read self.ref_object
        # (presumably set by this call -- verify against the base class).
        self._set_ref_object(NewsWebsite, **kwargs)
        self.scraper = self.ref_object.scraper
        self.scrape_url = self.ref_object.url
        self.scheduler_runtime = self.ref_object.scraper_runtime
        self.scraped_obj_class = Article
        self.scraped_obj_item_class = ArticleItem
        # NOTE(review): `self` is passed explicitly here on top of the bound
        # call, so the parent receives it as the first positional argument.
        # Left unchanged to preserve behavior -- confirm whether intended.
        super(ArticleSpider, self).__init__(self, *args, **kwargs)
Also, this is my pipeline:
class DjangoWriterPipeline(object):
    """Item pipeline that saves items when the spider's DO_ACTION is set."""

    def process_item(self, item, spider):
        """Save ``item`` if ``spider.conf['DO_ACTION']`` is truthy.

        When saving, the item is linked to ``spider.ref_object`` and to a
        newly saved ``SchedulerRuntime(runtime_type='C')``, then saved, and
        ``spider.action_successful`` is set to ``True``.

        Args:
            item: The scraped item; modified in place when saving.
            spider: The running spider; supplies ``conf``, ``ref_object``,
                ``logger`` and ``bcolors``.

        Returns:
            The same ``item`` object.

        Raises:
            DropItem: If saving raises ``IntegrityError``.
        """
        if spider.conf['DO_ACTION']:
            try:
                item['news_website'] = spider.ref_object

                # The runtime row is saved before the item that references it.
                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                spider.logger.info("{cs}Item {id} saved to Django DB.{ce}".format(
                    id=item._id_str,
                    cs=spider.bcolors['OK'],
                    ce=spider.bcolors['ENDC']))
            except IntegrityError as e:
                # Log the database error, then drop the item.
                spider.logger.error(str(e))
                raise DropItem("Missing attribute.")

        # Reached on every non-dropped path, including when DO_ACTION is off.
        return item
As you can see, I implemented it just like the documentation, but when I run the scrapy command I get the sync_to_async error. I even put @sync_to_async on top of the def process_item(self, item, spider): function, but it still doesn't work.
My database is PostGIS.
This is the link to the documentation: https://scrapy-django-dashboard.readthedocs.io/en/latest/
More about the error:
[2022-01-10 15:29:21,617] engine: INFO - Closing spider (finished)
2022-01-10 15:29:21 [scrapy.core.engine] INFO: Closing spider (finished)
[2022-01-10 15:29:21,619] log: INFO - Closing Django DB connection.
2022-01-10 15:29:21 [post_spider] INFO: Closing Django DB connection.
[2022-01-10 15:29:21,623] signal: ERROR - Error caught on signal handler: <bound method DjangoBaseSpider.spider_closed of <PostSpider 'post_spider' at 0x7f4bd8148fa0>>
Traceback (most recent call last):
File "/home/mehdi/myenv/lib/python3.8/site-packages/scrapy/utils/defer.py", line 161, in maybeDeferred_coro
result = f(*args, **kw)
File "/home/mehdi/myenv/lib/python3.8/site-packages/pydispatch/robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "/home/mehdi/myenv/lib/python3.8/site-packages/scrapy_django_dashboard/spiders/django_base_spider.py", line 299, in spider_closed
connection.close()
File "/home/mehdi/myenv/lib/python3.8/site-packages/django/utils/asyncio.py", line 24, in inner
raise SynchronousOnlyOperation(message)
django.core.exceptions.SynchronousOnlyOperation: You cannot call this from an async context - use a thread or sync_to_async.