To be clear I try to crawl forums about casino, for now, I have succeeded to do so using the same scheme as below :
class test_spider(scrapy.Spider):
count=0
name = "test_spyder"
start_urls = [
'https://casinogrounds.com/forum/search/?&q=Casino&search_and_or=or&sortby=relevancy',
]
rules = ( Rule(LinkExtractor(restrict_css=('a:contains("Next")::attr(href)')), callback='parse') )
def parse(self, response) :
print(self.count)
for href in response.css("span.ipsType_break.ipsContained a::attr(href)") :
new_url = response.urljoin(href.extract())
#print(new_url)
yield scrapy.Request(new_url, callback = self.parse_review)
next_page = response.css('a:contains("Next")::attr(href)').extract_first()
print(next_page)
if next_page is not None:
yield scrapy.Request(next_page, callback = self.parse)
def parse_review(self, response):
parsed_uri = urlparse(response.url)
domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
for review in response.css('article.cPost.ipsBox.ipsComment.ipsComment_parent.ipsClearfix.ipsClear.ipsColumns.ipsColumns_noSpacing.ipsColumns_collapsePhone') :
yield {
'name': review.css('strong a.ipsType_break::text').extract_first(),
'date': review.css('time::attr(title)').extract_first(),
'review': review.css('p::text').extract(),
'url' : response.url
}
next_page = response.css('li.ipsPagination_next a::attr(href)').extract_first()
if next_page is not None:
yield response.follow(next_page, callback=self.parse_review)
So when I execute that spider within a python scripts, normally (I mean for other forums) it crawled all the threads of all the pages from the start url.
But for that one it does not, it scraps only all the threads of the first page, it gets the right URL for going onto the second page, but do call another time the parse function.
And of course, if I put all the URL of the pages in the start_urls list it scraps all pages...
Thank you for the help.