I am using Goutte (https://github.com/FriendsOfPHP/Goutte). I keep getting the wrong URL when clicking the pagination link inside a while loop.
Calling selectLink on the crawler object returns the right URL on the first iteration of the while loop, but on the second iteration selectLink appears to return the wrong value.
Here is the code.
/**
 * Keep a reference to the Goutte client that the other methods use
 * to issue requests and follow links.
 *
 * @param Goutte\Client $client HTTP/scraping client stored on the instance.
 */
public function __construct(Goutte\Client $client)
{
    $this->client = $client;
}
/**
 * Walk the paginated listing starting from a fixed URL and collect the
 * pagination hrefs of every batch of pages.
 *
 * Each pass reads the links of the current pagination bar, stores them as
 * one batch, and - while getPaginationLinks() reports a "next batch"
 * marker (e.g. '11+', '21+') - clicks the link with that text to load the
 * next pagination bar. The collected batches are finally passed to dd().
 */
public function parse()
{
    $startUrl = "https://www.nextag.com/Arts-Entertainment--zz2702147z0z1zB6c4z5---html";

    // Fetch the first page and narrow it down to its pagination bar.
    $pagination = $this->paginationCrawler(
        $this->client->request('GET', $startUrl)
    );

    $batches = array();

    while (true) {
        // Element 0: hrefs found in this bar; element 1: next-batch marker.
        $result = $this->getPaginationLinks($pagination);
        $batches[] = $result[0];
        $nextMarker = $result[1];

        // The string 'false' is the sentinel for "no further batch".
        if ($nextMarker == 'false') {
            break;
        }

        // Follow the link whose text is the marker, then re-read the bar.
        $nextLink = $pagination->selectLink($nextMarker)->link();
        $pagination = $this->paginationCrawler(
            $this->client->click($nextLink)
        );
    }

    dd($batches);
}
/**
 * Narrow a page crawler down to the element with id "pagination".
 *
 * @param mixed $crawler Crawler for a fetched page (must offer filter()).
 *
 * @return mixed The result of filtering on '#pagination'.
 */
public function paginationCrawler($crawler)
{
    $paginationBar = $crawler->filter('#pagination');

    return $paginationBar;
}
/**
 * Extract the hrefs of the numbered pagination links and detect whether a
 * further batch of pages is available.
 *
 * @param mixed $links Crawler positioned on the pagination bar.
 *
 * @return array Two elements: [0] the list of href values of every
 *               '#numbers a' link; [1] the trimmed text of the
 *               '#numbers :last-child' match when it contains a '+'
 *               (e.g. '11+'), otherwise the string 'false'.
 */
public function getPaginationLinks($links)
{
    $hrefs = $links->filter('#numbers a')->each(function (Crawler $a) {
        return $a->attr('href');
    });

    $lastChild = $links->filter('#numbers :last-child');

    // text() throws InvalidArgumentException on an empty node list, which
    // happens when the page has no pagination numbers at all. Treat that
    // as "no further batch" instead of crashing the crawl.
    if (count($lastChild) === 0) {
        return array($hrefs, 'false');
    }

    $lastPage = trim($lastChild->text());

    // Only a label containing '+' (such as '11+') marks a next batch.
    if (strpos($lastPage, '+') === false) {
        $lastPage = 'false';
    }

    return array($hrefs, $lastPage);
}
Here is the output: