
I'm trying to scrape Glassdoor with the Scrapy library. I've got all the links from which I want to extract the info stored in a MongoDB database.

The error I'm getting is:

2019-09-02 13:54:56 [scrapy.core.engine] ERROR: Error while obtaining start requests
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/lib/python3.7/site-packages/scrapy/core/engine.py", line 127, in _next_request
    request = next(slot.start_requests)
  File "/home/ubuntu/miniconda3/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 73, in start_requests
    yield Request(url, dont_filter=True)
  File "/home/ubuntu/miniconda3/lib/python3.7/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
    self._set_url(url)
  File "/home/ubuntu/miniconda3/lib/python3.7/site-packages/scrapy/http/request/__init__.py", line 69, in _set_url
    raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: 
2019-09-02 13:54:56 [scrapy.core.engine] INFO: Closing spider (finished)

The code I have is:

# Importing libraries.
import scrapy
from scrapy.http import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
import json
import re

# Importing files.
import mongo_db as db

# Glassdoor Scraper
class GlassdoorScrapySpider(CrawlSpider):

    # Spider name, domain, and headers.
    name = 'glassdoor_scraper'
    allowed_domain = ['https://www.glassdoor.com']

    # The first method called by GlassdoorScrapySpider.
    def start_requests(self):
        # Connecting to MongoDB.
        connection = db.connect_to_database()

        # Reading all links in database.
        db_links = db.read_crawled_urls(client=connection)

        # Calling the parse function to scrape all data.
        for link in db_links:
            yield Request(url=link, callback=self.parse, headers=self.headers)

        # Closing connection with MongoDB.
        db.close_connection(connection)

    # This method gets all the job_posting json data inside the urls.
    def parse(self, response):

        text = response.xpath('//*[@id="JobContent"]/script/text()') # Extracting the tag with the JSON.
        text = text.extract()[0].strip() # Extracting the text and removing the leading/trailing spaces.
        text = re.sub(r'<.*?>', '', text) # Deleting the HTML inside the description.
        text = text.replace('\r', '') # Removing unnecessary end lines.
        text = text.replace('\n', '') # Removing unnecessary end lines.
        text = text.replace('\t', '') # Removing unnecessary tabs.
        text = text.replace('\\', '') # Removing unnecessary characters.

        try:
            loaded_json = json.loads(text)
            db.save_scraped(client=connection, new_data=loaded_json, task_number=self.task_number, broken=False)
        except:
            print('\nReturned JSON is broken.\n')
            if loaded_json:
                db.save_scraped(client=connection, new_data=loaded_json, task_number=self.task_number, broken=True)

I've tried with self.start_urls = [] and with self.start_urls = db_links (because db_links is a list I get from Mongo), and of course I put that inside an __init__ method.
None of that works.
I don't know what else to try.
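
For reference, this is roughly what that __init__ attempt looked like (a simplified sketch; db, connect_to_database, read_crawled_urls, and close_connection are the same mongo_db helpers used above):

    # What I tried (inside GlassdoorScrapySpider):
    def __init__(self, **kwargs):
        super(GlassdoorScrapySpider, self).__init__(**kwargs)
        # Reading all links from MongoDB and using them as start_urls.
        connection = db.connect_to_database()
        self.start_urls = db.read_crawled_urls(client=connection)
        db.close_connection(connection)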


EDIT:

I'm trying to change the code to see if I can find a solution, but it still fails.
I've checked the "db_links" variable and it's fine; it's a list with all the links. I've also put the connection, db_close, etc. inside the __init__ method.

# Importing libraries.
import scrapy
from scrapy.http import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
import json
import re

# Importing files.
import mongo_db as db

# Glassdoor Scraper
class GlassdoorScrapySpider(CrawlSpider):

    # Spider name, domain, and headers.
    name = 'glassdoor_scraper'
    allowed_domain = ['https://www.glassdoor.com']
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/32.0.1700.102 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
               'Accept-Encoding': 'none',
               'Accept-Language': 'en-US,en;q=0.8',
               'Connection': 'keep-alive'
               }

    # Connecting to MongoDB.
    connection = db.connect_to_database()
    # Reading all links in database.
    db_links = db.read_crawled_urls(client=connection)
    # Closing connection with MongoDB.
    db.close_connection(connection)

    # The first method called by GlassdoorScrapySpider.
    def __init__(self, **kwargs):
        super(GlassdoorScrapySpider, self).__init__(**kwargs)
        self.start_urls = [db_links]

    # The second method called by GlassdoorScrapySpider.
    def start_requests():
        for link in db_links:
            # Calling the parse function with the requested html to scrape all data.
            yield Request(url=link, callback=self.parse, headers=self.headers)

    # This method gets all the job_posting json data inside the urls.
    def parse(self, response):

        text = response.xpath('//*[@id="JobContent"]/script/text()') # Extracting the tag with the JSON.
        text = text.extract()[0].strip() # Extracting the text and removing the leading/trailing spaces.
        text = re.sub(r'<.*?>', '', text) # Deleting the HTML inside the description.
        text = text.replace('\r', '') # Removing unnecessary end lines.
        text = text.replace('\n', '') # Removing unnecessary end lines.
        text = text.replace('\t', '') # Removing unnecessary tabs.
        text = text.replace('\\', '') # Removing unnecessary characters.

        try:
            loaded_json = json.loads(text)
            db.save_scraped(client=connection, new_data=loaded_json, task_number=self.task_number, broken=False)
        except:
            print('\nReturned JSON is broken.\n')
            if loaded_json:
                db.save_scraped(client=connection, new_data=loaded_json, task_number=self.task_number, broken=True)
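
This is the kind of sanity check I'm using to verify db_links (just a debugging sketch; urlparse comes from Python's standard library, and it flags any entry that is empty or missing an http/https scheme, which is what the ValueError complains about):

from urllib.parse import urlparse

for link in db_links:
    # Flag anything that would trigger "Missing scheme in request url".
    if not link or not urlparse(link).scheme:
        print('Bad link:', repr(link))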


EDIT 2:

If you would like to see the implementation of "read_crawled_urls", here it is:

def read_crawled_urls(client):
    # The actual database
    db = client.TaskExecution

    # Selecting the collection of the database.
    collection = db.Urls

    url_list = []
    for entry in collection.find():
        url_list.append(entry['link'])

    return url_list
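
Calling it on its own (outside Scrapy) returns a populated list of full URLs; this is roughly the check I ran (a sketch using the same mongo_db helpers):

connection = db.connect_to_database()
db_links = db.read_crawled_urls(client=connection)
# Prints the number of links and the first few entries; they all look like
# complete https://... Glassdoor URLs.
print(len(db_links), db_links[:3])
db.close_connection(connection)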

When I run this spider from a main.py file doing:
os.system('scrapy runspider gs_scraper.py')
the code throws the error. But if I run it from the terminal, it apparently works fine.
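
One thing I haven't tried yet is launching the spider in-process instead of shelling out with os.system; based on Scrapy's CrawlerProcess API, it would look roughly like this (a sketch I haven't run, assuming the spider class can be imported from gs_scraper.py):

from scrapy.crawler import CrawlerProcess
from gs_scraper import GlassdoorScrapySpider

process = CrawlerProcess()
process.crawl(GlassdoorScrapySpider)
process.start()  # Blocks until the crawl finishes.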

