
I am trying to follow all internal links of a website while recording both its internal and external links. I have just started working with Scrapy and I can't figure out how to crawl so that every internal link is followed.

It just fetches the links at depth one but doesn't follow them.

class BRS(CrawlSpider):
    name = "brs"
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)
    def __init__(self):
        global start_urls
        #settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path,"urls.txt"), "rt") as f:
            self.start_urls = filter(None,[url.strip() for url in f.readlines()])
        start_urls = self.start_urls


    def parse(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url
        internal = LinkExtractor(allow_domains=[response.url])
        external = LinkExtractor(deny_domains=[response.url])
        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt','a+')
        for link in links:
            internal.append(link.url)

        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)
        for link in internal:
            fd.write(link+"\tinternal\n")

        for link in external:
            fd.write(link+"\texternal\n")

        return brsitem

At the moment my urls.txt contains only: http://www.stackoverflow.com

Any help is appreciated.

1 Answer

Got it working using this link as a reference, and I also got my IP blocked on Stack Overflow when I forgot to set the DEPTH_LIMIT setting. Some things are learnt the hard way.

import os
import scrapy
from scrapy.conf import settings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from urlparse import urlparse
from brs.items import BrsItem


class BRS(CrawlSpider):
    name = "brs"

    def __init__(self):
        # Without a depth limit the crawl never stops (and got my IP blocked).
        settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt")) as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        # The rules are built here instead of as a class attribute, so they
        # also have to be assigned to _rules for CrawlSpider to apply them.
        self.rules = (Rule(SgmlLinkExtractor(allow=()), callback=self.parse_items, follow=True),)
        self._rules = self.rules

    def extract_domain(self, url):
        return urlparse(url).netloc

    def parse_items(self, response):
        # Split the links on this page into internal and external ones,
        # relative to the domain of the page itself.
        domain = self.extract_domain(response.url)
        internal = [link.url for link in
                    LinkExtractor(allow_domains=[domain]).extract_links(response)]
        external = [link.url for link in
                    LinkExtractor(deny_domains=[domain]).extract_links(response)]
        with open('output.txt', 'a') as fd:
            for link in internal:
                fd.write(link + "\tinternal\n")
            for link in external:
                fd.write(link + "\texternal\n")
        # Request the internal links explicitly so each one yields an item.
        for link in internal:
            yield scrapy.Request(link.strip(), callback=self.parse_attr)

    def parse_attr(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url.strip()
        return brsitem
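
For anyone reading this on a newer Scrapy release: SgmlLinkExtractor, scrapy.conf and settings.overrides were removed later on, so the code above only runs on the old versions it was written for. Below is just a rough sketch of the same idea, assuming Scrapy >= 1.0 on Python 3 and untested on my side; it uses custom_settings for the depth cap and yields a plain dict instead of BrsItem so it stays self-contained.

import os
from urllib.parse import urlparse

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BrsModern(CrawlSpider):
    name = "brs_modern"
    # Depth cap declared on the spider instead of settings.overrides.
    custom_settings = {"DEPTH_LIMIT": 10}
    # Follow every link; parse_items classifies and records them.
    rules = (Rule(LinkExtractor(), callback="parse_items", follow=True),)

    def __init__(self, *args, **kwargs):
        # CrawlSpider.__init__ compiles the rules, so call it first.
        super().__init__(*args, **kwargs)
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt")) as f:
            self.start_urls = [url.strip() for url in f if url.strip()]

    def parse_items(self, response):
        # Classify links as internal or external to this page's domain.
        domain = urlparse(response.url).netloc
        internal = LinkExtractor(allow_domains=[domain]).extract_links(response)
        external = LinkExtractor(deny_domains=[domain]).extract_links(response)
        with open("output.txt", "a") as fd:
            for link in internal:
                fd.write(link.url + "\tinternal\n")
            for link in external:
                fd.write(link.url + "\texternal\n")
        yield {"url": response.url}

The important part in both versions is that parse() itself is not overridden: CrawlSpider uses its own parse() internally to apply the rules, so overriding it (as in the question) is exactly what stops the links from being followed. Using a differently named callback such as parse_items keeps the rule-driven crawling intact.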