I am trying to scrape the following website for basic real estate listing information:
https://www.propertyfinder.ae/en/search?c=2&fu=0&l=50&ob=nd&page=1&rp=y
Parts of the website are dynamically loaded from a back end API when the page is scrolled down using javascript. To get around this I have tried using Scrapy with Splash to render the javascript. The issue I am having is that while instead of returning all the listings it only returns the first 8. I thought the problem was the page wasn't scrolled down and so the page wasnt populated and the divs I needed weren't rendered. I then tried adding some Lua code (which I have no experience with) to scroll the page down in hope it would be populated, however it hasn't worked. Below is my spider:
import scrapy
from scrapy.shell import inspect_response
import pandas as pd
import functools
import time
import requests
from lxml.html import fromstring
import math
from scrapy_splash import SplashRequest
import scrapy_splash
class pfspider(scrapy.Spider):
name = 'property_finder_spider'
start_urls = ["https://www.propertyfinder.ae/en/search?c=2&fu=0&l=50&ob=nd&page=1&rp=y"]
script1 = """function main(splash)
local num_scrolls = 10
local scroll_delay = 1.0
local scroll_to = splash:jsfunc("window.scrollTo")
local get_body_height = splash:jsfunc(
"function() {return document.body.scrollHeight;}"
)
assert(splash:go(splash.args.url))
splash:wait(splash.args.wait)
for _ = 1, num_scrolls do
scroll_to(0, get_body_height())
splash:wait(scroll_delay)
end
return splash:html()
end"""
def start_requests(self):
for urll in self.start_urls:
# yield scrapy_splash.SplashRequest(url=urll, callback=self.parse, endpoint='execute', args={'wait':2, 'lua_source': script1})
yield scrapy_splash.SplashRequest(url=urll, endpoint='render.html', callback=self.parse)
def parse(self, response):
inspect_response(response, self)
containers = response.xpath('//div[@class="column--primary"]/div[@class="card-list__item"]')
Listing_names_pf = containers[0].xpath('//h2[@class="card__title card__title-link"]/text()').extract()
Currency_pf = ['AED'] * len(Listing_names_pf)
Prices_pf = containers[0].xpath('//span[@class="card__price-value"]/text()').extract()
type_pf = containers[0].xpath('//p[@class="card__property-amenity card__property-amenity--property-type"]/text()').extract()
Bedrooms_pf = containers[0].xpath('//p[@class="card__property-amenity card__property-amenity--bedrooms"]/text()').extract()
Bathrooms_pf = containers[0].xpath('//p[@class="card__property-amenity card__property-amenity--bathrooms"]/text()').extract()
SQF_pf = containers[0].xpath('//p[@class="card__property-amenity card__property-amenity--area"]/text()').extract()
Location_pf = containers[0].xpath('//span[@class="card__location-text"]/text()').extract()
Links_pf = containers[0].xpath('//div[@class="card-list__item"]/a/@href').extract()
Links_pf_full = []
for link in Links_pf:
Links_pf_full.append('https://www.propertyfinder.ae/'+link)
Another thing I noticed was when the is page rendered in splash, in the html output file there is a script called Tealium that does have the listing data for all items in lists but not under the divs in the page.
any and all help or suggestions would be greatly appreciated.