Using Scrapy, I want to scrape a single page (run from a script, not from the console) and check whether every link on that page is allowed by the site's robots.txt file.
In the scrapy.robotstxt.RobotParser abstract base class I found the method allowed(url, user_agent), but I don't see how to use it.
import scrapy
from scrapy.linkextractors import LinkExtractor


class TestSpider(scrapy.Spider):
    name = "TestSpider"

    def __init__(self):
        super(TestSpider, self).__init__()

    def start_requests(self):
        yield scrapy.Request(url='http://httpbin.org/', callback=self.parse)

    def parse(self, response):
        if 200 <= response.status < 300:
            links = LinkExtractor().extract_links(response)
            for idx, link in enumerate(links):
                # How can I check whether each link is allowed by the robots.txt file?
                # => allowed(link.url, '*')

                # I tried self.crawler.engine.downloader.middleware.middlewares, but got:
                # AttributeError: 'TestSpider' object has no attribute 'crawler'
                pass
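For reference, allowed(url, user_agent) is implemented by the concrete parser classes in scrapy.robotstxt (ProtegoRobotParser by default). If you already have the raw robots.txt body, such a parser can be built and queried standalone; a minimal sketch (the robots.txt content and URLs are made up for illustration):

# Minimal sketch: build Scrapy's default robots.txt parser from a raw
# robots.txt body and query it directly. The body and URLs below are
# illustrative only.
from scrapy.robotstxt import ProtegoRobotParser

robotstxt_body = b"User-agent: *\nDisallow: /deny\n"
parser = ProtegoRobotParser.from_crawler(None, robotstxt_body)

print(parser.allowed("http://httpbin.org/", "*"))      # True
print(parser.allowed("http://httpbin.org/deny", "*"))  # False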
To run the TestSpider spider, set the following in settings.py:

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

Then go to the project's top-level directory and run:

scrapy crawl TestSpider
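Since the goal is to run from a script rather than from the console, the spider can also be launched with CrawlerProcess instead of the scrapy command; a minimal sketch, assuming the script sits inside the Scrapy project so that get_project_settings() picks up ROBOTSTXT_OBEY (the spider's import path below is hypothetical):

# run_spider.py -- sketch for starting the spider from a plain Python script.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from myproject.spiders.test_spider import TestSpider  # hypothetical import path

process = CrawlerProcess(get_project_settings())
process.crawl(TestSpider)
process.start()  # blocks until the crawl finishes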
Appreciate any help.
My solution:
import scrapy
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from scrapy.utils.httpobj import urlparse_cached
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = "TestSpider"
    allowed_domains = ["httpbin.org"]  # needed by the LinkExtractor below

    def __init__(self):
        super(TestSpider, self).__init__()
        self.le = LinkExtractor(unique=True, allow_domains=self.allowed_domains)
        self._rules = [
            Rule(self.le, callback=self.parse)
        ]

    def start_requests(self):
        # The crawler (and its engine) is available here, unlike in __init__,
        # so we can look up the RobotsTxtMiddleware instance.
        self._robotstxt_middleware = None
        for middleware in self.crawler.engine.downloader.middleware.middlewares:
            if isinstance(middleware, RobotsTxtMiddleware):
                self._robotstxt_middleware = middleware
                break

        yield scrapy.Request(url='http://httpbin.org/', callback=self.parse_robotstxt)

    def parse_robotstxt(self, response):
        # _parsers is a private detail of RobotsTxtMiddleware: once robots.txt
        # has been fetched it maps the netloc to a parser instance (or None on failure).
        url = urlparse_cached(response)
        netloc = url.netloc

        self._robotsTxtParser = None
        if self._robotstxt_middleware and netloc in self._robotstxt_middleware._parsers:
            self._robotsTxtParser = self._robotstxt_middleware._parsers[netloc]

        return self.parse(response)

    def parse(self, response):
        if 200 <= response.status < 300:
            links = self.le.extract_links(response)
            for idx, link in enumerate(links):
                # Check whether the link target is forbidden by robots.txt
                if self._robotsTxtParser:
                    if not self._robotsTxtParser.allowed(link.url, "*"):
                        print(link.url, 'disallowed by robots.txt')
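For comparison, an alternative sketch that avoids reaching into the middleware's private _parsers attribute: fetch /robots.txt explicitly and parse it with Protego, the backend Scrapy uses by default (the spider name here is made up for illustration):

import scrapy
from protego import Protego
from scrapy.linkextractors import LinkExtractor


class RobotsCheckSpider(scrapy.Spider):  # hypothetical spider, not the solution above
    name = "RobotsCheckSpider"

    def start_requests(self):
        # Fetch robots.txt first, then the page whose links we want to check.
        yield scrapy.Request('http://httpbin.org/robots.txt', callback=self.parse_robots)

    def parse_robots(self, response):
        self.robots = Protego.parse(response.text)
        yield scrapy.Request('http://httpbin.org/', callback=self.parse)

    def parse(self, response):
        for link in LinkExtractor(unique=True).extract_links(response):
            if not self.robots.can_fetch(link.url, '*'):
                print(link.url, 'disallowed by robots.txt')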