The page I want to scrape is: http://pcdtattoo.en.alibaba.com/productlist-1.html
There are 7 pages of products, and I want to capture each item's name, image URL and link. Here is my code:
// Scrapes the seven product-list pages, collecting each product's title, link,
// image URL and (from the product's own page) its rich-text description, then
// writes everything to test.js.
// NOTE: `casper` and `fs` are not defined in this section; they are expected to
// be set up before this code runs.
var products = [];

casper.start();

// `page` is declared explicitly so the loop counter is not an implicit global.
for (var page = 1; page < 8; page++) {
    // The URL string is built immediately, so each queued step gets its own page number.
    casper.thenOpen('http://pcdtattoo.en.alibaba.com/productlist-' + page + '.html');

    casper.wait(4000, function getItems() {
        // Runs inside the page context: returns one plain object per product <li>.
        var itemArr = this.evaluate(function () {
            var $ = jQuery;
            var items = [];
            $('.app-productsGalleryView li').each(function () {
                var $titleLink = $(this).find('.product-title>a');
                items.push({
                    title: $titleLink.text().trim(),
                    link: $titleLink.attr('href'),
                    // NOTE(review): reads the current `src` attribute as-is; if the page
                    // swaps images in lazily this may still be a placeholder.
                    img: $(this).find('img').attr('src')
                });
            });
            return items;
        });

        // Guard against a non-array result (e.g. the page-side function did not
        // run to completion) instead of throwing on `.forEach`.
        if (!Array.isArray(itemArr)) {
            this.echo('No product items could be read from ' + this.getCurrentUrl());
            return;
        }

        itemArr.forEach(function (item) {
            // To capture product description (has nothing to do with this question).
            // The forEach callback already gives each `item` its own closure, so the
            // queued step can fill in `item.intro` directly.
            casper.thenOpen('http://pcdtattoo.en.alibaba.com/' + item.link, function () {
                item.intro = this.evaluate(function () {
                    var $ = jQuery;
                    return $('#richTextContainer').prop('innerHTML');
                });
            });
            // The same object is stored in `products`, so `intro` set later by the
            // queued step above is visible in the final output.
            products.push(item);
        });
    });
}

casper.run(function () {
    // Use the phantom fs module to write what is captured to a js file for future use.
    fs.write('test.js', 'Products=' + JSON.stringify(products), 'w');
    this.exit();
});
The scraping goes well, except that for each of the 7 pages, only the first 2 or 3 items' image URLs are scraped correctly; all the others are the loading GIF. The result can be seen on this page: http://nycweb.io/pcd/prodcuts.php (every product list page has 36 items, and you can see that only the image URLs of items[0, 1, 36, 37, 72, 73, ...] are correct; the rest are all the loading GIF).
I guess this is because there is a "waterfall" effect on the product list page, you have to scroll down to have the images' URL, which is currently loading gif, replaced by true image URLs.
But how do I trigger this from within my casper code? I have tried casper.wait()
with 4000 ms, but that apparently doesn't work; I also tried putting
// Scroll the window down in 500px steps, from 0 up to (but not including) 5000.
// `k` is declared locally so it does not leak as an implicit global.
for (var k = 0; k < 5000; k += 500) {
    window.scroll(0, k);
}
inside the evaluate
function, but that doesn't work either. I guess this is because evaluate
doesn't really trigger the window.scroll
event on the page. So what should I do?
UPDATE
I changed my code to scroll the page:
casper.wait(4000,function getItems(){
// Scroll down in 250px steps and, after each step, report where the 36th
// product image (index 35) sits in the viewport and what its `src` currently is.
// `k` is declared locally so it does not leak as an implicit global.
for (var k = 0; k < 5000; k += 250) {
    this.scrollTo(0, k);
    var pos = this.evaluate(function () {
        var $ = jQuery;
        // Getting the last item image's position and src; look the element up once.
        var $lastImg = $('form ul>li img').eq(35);
        return {
            top: $lastImg.get(0).getBoundingClientRect().top,
            src: $lastImg.attr('src')
        };
    });
    this.echo("top:" + pos.top);
    this.echo("src:" + pos.src);
}
The output is something like this:
src://u.alicdn.com/js/5v/esite/img/loading.gif top:1005 src://u.alicdn.com/js/5v/esite/img/loading.gif top:755 src://u.alicdn.com/js/5v/esite/img/loading.gif top:505 src://u.alicdn.com/js/5v/esite/img/loading.gif top:255 src://u.alicdn.com/js/5v/esite/img/loading.gif top:5 src://u.alicdn.com/js/5v/esite/img/loading.gif top:-245 src://u.alicdn.com/js/5v/esite/img/loading.gif top:-263
Clearly the window has scrolled past the image, but the loading.gif
isn't being replaced. On the other hand, in the browser console, if I run window.scroll(0,k+=250)
, the link is replaced. So what is the problem? Should I wait a while for the image's link to be replaced?
Update
I changed my code to wait for a few seconds:
// Jump straight to y=4250, wait five seconds, then report the viewport position
// and current `src` of the image at index 35.
this.scrollTo(0, 4250);
casper.wait(5000, function () {
    var report = this.evaluate(function () {
        var $ = jQuery;
        var $target = $('form ul>li img').eq(35);
        return {
            top: $target.get(0).getBoundingClientRect().top,
            src: $target.attr('src')
        };
    });
    this.echo("top:" + report.top);
    this.echo("src:" + report.src);
});
the output is:
top:5
src://u.alicdn.com/js/5v/esite/img/loading.gif
Looks like a dead end to me.