0
<?php

    // Requests the Amazon India electronics bestsellers URL five times and
    // echoes the markup of every element whose class contains "zg_title".
    $i=1;
    while ($i<=5) {
      // The counter is appended after '#', which makes it a URL fragment.
      // A fragment is resolved on the client and is not part of the HTTP
      // request, so each of the five iterations asks for the same document.

      $url = 'http://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0#'.$i;
      echo $url;
            // Download the page source; the return value is not checked here.
            $html= file_get_contents($url);
            $dom = new DOMDocument();
            // '@' silences the warnings the HTML parser emits while loading.
            @$dom->loadHTML($html);
            $xPath = new DOMXPath($dom);
            $classname="zg_title";
            // contains() is a substring test on the class attribute, so any
            // class value that includes "zg_title" matches.
            $elements = $xPath->query("//*[contains(@class, '$classname')]");
                foreach ($elements as $e)
              {
                // Prefix the site origin to the element's href. getAttribute()
                // yields '' when there is no href, so such elements end up
                // with the bare origin as their href.
                $lnk = $e->getAttribute('href');
                $e->setAttribute("href", "http://www.amazon.in".$lnk);
                // Copy the element into a fresh, empty document so saveHTML()
                // serialises only that element rather than the whole page.
                $newdoc = new DOMDocument;
                $e = $newdoc->importNode($e, true);
                $newdoc->appendChild($e);
                // $html is reused here; the page source it held is no longer
                // needed in this iteration.
                $html = $newdoc->saveHTML();
                echo $html;
            }
            $i++;
           }
?>

I am trying to crawl the Amazon bestsellers page, which lists the top 100 bestselling items, 20 per page. On every loop iteration the value of $i is incremented and appended to the URL, but the same first 20 items are displayed five times. I think this has something to do with the AJAX pagination, but I am not able to figure out what it is.

Adam Azad
  • 11,171
  • 5
  • 29
  • 70
Pramod S
  • 94
  • 11

1 Answer

1

Try this:

<?php

    // Walk the five pages of the Amazon India electronics bestsellers list
    // and echo the markup of every element whose class contains "zg_title".
    //
    // The page number is sent in the query string (pg=N) so that it is part
    // of the HTTP request; a value placed after '#' would be a fragment and
    // would never reach the server.

    $baseUrl   = 'http://www.amazon.in';
    $classname = 'zg_title';

    // Collect HTML parser errors internally instead of muting them with '@';
    // the previous setting is restored once the loop is done.
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    for ($i = 1; $i <= 5; $i++) {
        $url = $baseUrl.'/gp/bestsellers/electronics/ref=zg_bs_electronics_pg_'.$i.'?ie=UTF8&pg='.$i;
        echo $url;

        // file_get_contents() returns false on failure; an empty body cannot
        // be parsed either, so skip the page rather than feed it to loadHTML().
        $pageSource = file_get_contents($url);
        if ($pageSource === false || $pageSource === '') {
            error_log('Could not fetch bestsellers page '.$i.': '.$url);
            continue;
        }

        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($pageSource);
        // Discard the buffered parser errors so they do not pile up across pages.
        libxml_clear_errors();
        if ($loaded === false) {
            error_log('Could not parse bestsellers page '.$i.': '.$url);
            continue;
        }

        $xPath = new DOMXPath($dom);
        // contains() is a substring test on the class attribute.
        $elements = $xPath->query("//*[contains(@class, '$classname')]");

        foreach ($elements as $e) {
            // Only root-relative links ("/path", but not protocol-relative
            // "//host/path") need the origin prepended. Absolute links are
            // left alone, and elements without an href do not gain one.
            $lnk = trim($e->getAttribute('href'));
            if ($lnk !== '' && strpos($lnk, '/') === 0 && strpos($lnk, '//') !== 0) {
                $e->setAttribute('href', $baseUrl.$lnk);
            }

            // Copy the element into an empty document so that saveHTML()
            // serialises just this element instead of the whole page.
            $newdoc = new DOMDocument();
            $newdoc->appendChild($newdoc->importNode($e, true));
            echo $newdoc->saveHTML();
        }
    }

    libxml_use_internal_errors($previousLibxmlSetting);
?>

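Change your $url. Everything after `#` is a URL fragment, which is never sent to the server, so every request returned the same first page. Pass the page number in the query string (`pg=`) instead, as in the code above.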

Happy Coding
  • 2,517
  • 1
  • 13
  • 24