I am programming a web crawler in python with Scrapy. The purpose is to monitor changes to a webpage at pre-determined time intervals. After logging in to the website, the spider requests a web page every X minutes and certain data is extracted from the page and saved to a text file. It turns out that the text file is written only when the spider closes, and the lines in the text files are not in chronological order. I can't figure it out what is happening. Maybe it's a specific way of working of the Scrapy module? Any ideas?
import scrapy
from scrapy.http import Request
from scrapy.http import FormRequest
from scraping_example.loginform import fill_login_form
from datetime import datetime
import time
class ExampleSpiderSpider(scrapy.Spider):
name = 'example_spider'
allowed_domains = ['example.com']
start_urls = ['http:/www.example.com/login']
login_user = 'edging780'
login_pass = ''
def parse(self, response):
(args, url, method) = fill_login_form(response.url,
response.body, self.login_user, self.login_pass)
return FormRequest(url, method=method, formdata=args,
callback=self.after_login)
def after_login(self, response):
for i in range(0,6):
request = Request('https://www.example.com/page_to_scrape', callback=self.get_table, dont_filter = True)
request.meta['dateTime'] = str(datetime.now())
request.meta['order'] = str(i)
yield request
time.sleep(600)
return
def get_table(self, response):
table = response.xpath('//table[@class="example_table"]/tbody/tr[not(contains(@class,"thead"))]')
Data=[]
for n_row in range(0,len(table)):
row = table[n_row]
Data.append(row.xpath('td[1]/text()').extract())
dictionary = {'Time': response.meta['dateTime'],
'Order': response.meta['order'],
'Data': Data}
with open('output.txt', 'a') as f:
f.write(str(dictionary) + '\n')
return