0

The page I want to scrape is: http://pcdtattoo.en.alibaba.com/productlist-1.html

There are 7 pages of products, and I want to capture each item's name, image URL and link. Here is my code:

// Collects one record per product ({title, link, img} plus `intro` once the
// product's detail page has been visited) across all listing pages.
var products = [];
casper.start();
// Declare the counter with `var`; the original `i=1` created an implicit global.
for (var i = 1; i < 8; i++) {
  casper.thenOpen('http://pcdtattoo.en.alibaba.com/productlist-' + i + '.html');
  casper.wait(4000, function getItems() {
    // Runs inside the page: read title, link and image src of every gallery item.
    var itemArr = this.evaluate(function () {
      var $ = jQuery;
      var items = [];
      $('.app-productsGalleryView li').each(function () {
        var entry = $(this);
        // Look the title link up once; it supplies both the text and the href.
        var titleLink = entry.find('.product-title>a');
        items.push({
          title: titleLink.text().trim(),
          link: titleLink.attr('href'),
          img: entry.find('img').attr('src')
        });
      });
      return items;
    });

    // Guard against an empty result (presumably what evaluate() hands back when
    // the page-side function fails, e.g. jQuery is not loaded) so that the
    // forEach below cannot throw and abort the whole run.
    if (!itemArr) {
      this.echo('No product items could be read from this page');
      return;
    }

    //To capture product description (has nothing to do with this question)
    itemArr.forEach(function (item) {
      // forEach gives every callback its own `item` binding, so the step
      // function can close over it directly without a wrapping IIFE.
      casper.thenOpen('http://pcdtattoo.en.alibaba.com/' + item.link, function () {
        item.intro = this.evaluate(function () {
          var $ = jQuery;
          return $('#richTextContainer').prop('innerHTML');
        });
      });
      // The same object is stored, so `intro` shows up in `products` once the
      // step scheduled above has run.
      products.push(item);
    });
  });
}


casper.run(function writeResults(){
  // Persist everything that was captured to a js file (via the phantom fs
  // module) so it can be reused later, then shut the script down.
  var serialized = 'Products=' + JSON.stringify(products);
  fs.write('test.js', serialized, 'w');
  this.exit();
});

The scraping goes well, except that for each of the 7 pages only the first 2 or 3 items' image URLs are scraped correctly; all the others are the loading GIF. The result can be seen on this page: http://nycweb.io/pcd/prodcuts.php (Every product list page has 36 items; you can see that only the image URLs of items[0, 1, 36, 37, 72, 73, ...] are correct, and the rest are all the loading GIF.)

I guess this is because there is a "waterfall" effect on the product list page, you have to scroll down to have the images' URL, which is currently loading gif, replaced by true image URLs.

But how do I trigger this from within my Casper code? I have tried casper.wait() with 4000 ms, but that apparently doesn't work; I also tried putting

  // `var` keeps the counter local instead of leaking an implicit global `k`.
  for (var k = 0; k < 5000; k += 500) {
    // Scroll to vertical offset k, stepping down the page 500px at a time.
    window.scroll(0, k);
  }

inside the evaluate function, that doesn't work either. I guess this is because evaluate doesn't really trigger window.scroll event on the page. So what should I do?

UPDATE

I changed my code to scroll the page:

 casper.wait(4000,function getItems(){

// Scroll down in 250px steps and, after each step, report where the image at
// index 35 sits relative to the viewport and what its src currently is.
// `k` is declared with `var` so it no longer leaks as an implicit global.
for (var k = 0; k < 5000; k += 250) {
  this.scrollTo(0, k);
  var pos = this.evaluate(function () {
    var $ = jQuery;
    // Query the images once and reuse the result for both readings.
    var imgs = $('form ul>li img');

    //getting last item image's position and src
    return {
      top: imgs.get(35).getBoundingClientRect().top,
      src: imgs.eq(35).attr('src')
    };
  });
  this.echo("top:" + pos.top);
  this.echo("src:" + pos.src);
}

The output is something like this:

src://u.alicdn.com/js/5v/esite/img/loading.gif
top:1005
src://u.alicdn.com/js/5v/esite/img/loading.gif
top:755
src://u.alicdn.com/js/5v/esite/img/loading.gif
top:505
src://u.alicdn.com/js/5v/esite/img/loading.gif
top:255
src://u.alicdn.com/js/5v/esite/img/loading.gif
top:5
src://u.alicdn.com/js/5v/esite/img/loading.gif
top:-245
src://u.alicdn.com/js/5v/esite/img/loading.gif
top:-263

Clearly the window has scrolled past the image, but the loading.gif isn't being replaced. On the other hand, in the browser console, if I do window.scroll(0,k+=250), the link is replaced. So what is the problem? Should I wait a while for the image's link to be replaced?

Update

I changed my code to wait for a few seconds:

  // Jump to the offset used for the last item, then give the page five
  // seconds before inspecting the image at index 35.
  this.scrollTo(0,4250);
  casper.wait(5000, function reportLastImage(){
    var pos = this.evaluate(function(){
      var $ = jQuery;
      var imgs = $('form ul>li img');
      // Viewport-relative top edge and current src of the image at index 35.
      return {
        top: imgs.get(35).getBoundingClientRect().top,
        src: imgs.eq(35).attr('src')
      };
    });
    this.echo("top:" + pos.top);
    this.echo("src:" + pos.src);
  })

the output is:

top:5

src://u.alicdn.com/js/5v/esite/img/loading.gif

Looks like a dead end to me.

shenkwen
  • 3,536
  • 5
  • 45
  • 85
  • 2
    There are many ways to solve this: [1](http://stackoverflow.com/a/32532898/1816580), [2](http://stackoverflow.com/q/17521065/1816580), [3](http://stackoverflow.com/a/28927464/1816580). There are many more. See if one of them solves your problem. – Artjom B. Jun 29 '16 at 19:19
  • @ArtjomB. I changed my code based on your suggestion and updated the question. It is still not working, what could possibly be the cause? – shenkwen Jun 29 '16 at 21:43
  • You probably need to wait between scroll operations. Remember that `casper.wait` is asynchronous, so it's a bit tricky to combine it with a loop. – Artjom B. Jun 29 '16 at 22:40
  • Thanks. That is the direction I am heading. – shenkwen Jun 29 '16 at 22:42
  • @ArtjomB. I updated the question, although I wait for a while the link is still not being replaced. Seems to me this is not the right direction. Maybe to scrape this I need to know more about how the link replacing action is triggered on the page?? But it seems to me is clearly triggered by window scrolling... – shenkwen Jun 29 '16 at 23:05
  • @shenkwen There is so many updates in your post and make your question a total mess. In my opinion, it would be better to post another question which contains clean code and questions... – Sayakiss Jun 30 '16 at 16:39

0 Answers0