I've created a script using Scrapy to recursively parse all the links from the left-side area of this webpage. Recursion is necessary because most of the links there have sublinks of their own, and so on.
The following script appears to scrape all the links accordingly. However, what I can't figure out is how to reuse the links collected in unique_links within the parse_content method. If I try to use the links while the recursion is still going on, the script ends up sending lots of duplicate links to parse_content. I've added an imaginary block of code after the two comment lines within the parse method to represent what I wish to do.
import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess

class mySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ["https://www.amazon.de/-/en/gp/bestsellers/automotive/ref=zg_bs_nav_0"]
    unique_links = set()

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        link_list = []
        for item in soup.select("li:has(> span.zg_selected) + ul > li > a[href]"):
            item_link = item.get("href")
            link_list.append(item_link)
            self.unique_links.add(item_link)

        # THE FOLLOWING IS SOMETHING I WANTED TO DO WITH THE `unique_links` IF I COULD EXECUTE THE FOLLOWING BLOCK
        # AFTER ALL THE LINKS ARE STORED IN `unique_links`
        for new_link in self.unique_links:
            yield scrapy.Request(new_link, callback=self.parse_content, dont_filter=True)

    def parse_content(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        for item in soup.select("span.a-list-item > .a-section a.a-link-normal"):
            print(item.get("href"))

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(mySpider)
    c.start()
How can I reuse the links in unique_links within the parse_content method?
EDIT: I'm terribly sorry if I still haven't been able to clarify what I wanted to achieve. However, this is how I solved it. Any better approach is welcome.
class mySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ["https://www.amazon.de/-/en/gp/bestsellers/automotive/ref=zg_bs_nav_0"]
    unique_links = set()

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        link_list = []
        for item in soup.select("li:has(> span.zg_selected) + ul > li > a[href]"):
            item_link = item.get("href")
            link_list.append(item_link)
            if item_link not in self.unique_links:
                yield scrapy.Request(item_link, callback=self.parse_content, dont_filter=True)
            self.unique_links.add(item_link)

        for new_link in link_list:
            yield scrapy.Request(new_link, callback=self.parse, dont_filter=True)

    def parse_content(self, response):
        # soup = BeautifulSoup(response.text, "lxml")
        # for item in soup.select("span.a-list-item > .a-section a.a-link-normal"):
        #     print(item.get("href"))
        print("------>", response.url)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(mySpider)
    c.start()
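For comparison, here is a rough, untested sketch of how the parse_content requests could actually be deferred until the recursive crawl is finished and unique_links is complete, using Scrapy's spider_idle signal: the signal fires once no requests are left, and raising DontCloseSpider keeps the spider alive while the new requests are scheduled. It assumes a recent Scrapy (2.10 or later), where crawler.engine.crawl() takes only the request; older versions also expect the spider as a second argument. The start URL and selectors are the same as above.

import scrapy
from bs4 import BeautifulSoup
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DontCloseSpider

class mySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ["https://www.amazon.de/-/en/gp/bestsellers/automotive/ref=zg_bs_nav_0"]
    unique_links = set()
    content_scheduled = False

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # run a callback whenever the scheduler runs out of requests
        crawler.signals.connect(spider.on_idle, signal=signals.spider_idle)
        return spider

    def parse(self, response):
        # recursive crawl of the left-side links: only collect them here
        soup = BeautifulSoup(response.text, "lxml")
        for item in soup.select("li:has(> span.zg_selected) + ul > li > a[href]"):
            item_link = item.get("href")
            if item_link not in self.unique_links:
                self.unique_links.add(item_link)
                yield scrapy.Request(item_link, callback=self.parse, dont_filter=True)

    def on_idle(self, spider):
        # fires when no requests are pending, i.e. unique_links is complete
        if not self.content_scheduled:
            self.content_scheduled = True
            for link in self.unique_links:
                # Scrapy >= 2.10; older versions: self.crawler.engine.crawl(request, spider)
                self.crawler.engine.crawl(
                    scrapy.Request(link, callback=self.parse_content, dont_filter=True))
            # keep the spider open so the newly scheduled requests get processed
            raise DontCloseSpider

    def parse_content(self, response):
        print("------>", response.url)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(mySpider)
    c.start()

As in the original, the hrefs are assumed to be absolute; if they turn out to be relative, they would need to go through response.urljoin() before being stored.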