I hope you're doing well. I have a question about scraping a paginated website with Node.js and the cheerio library. I have some code written, but the trouble is that it only scrapes one page. I've been searching for a solution for a few hours and tried to follow several guides, but the result is always the same: only one page gets scraped. I'd appreciate any answer. Here is the code:
const request = require('request-promise')
const cheerio = require('cheerio')
const fs = require('fs')
const baseUrl = 'https://indotrading.com/company_hdpe_620' // the site I want to scrape
const outputFile = 'data.csv'
const parsedResults = []
var indexPage = 1
var totalPage = 0
const getWebsiteContent = async (url) => {
  try {
    request(url).then(function (body) {
      const $ = cheerio.load(body)
      // grab the last pagination button in the footer
      let page = $('.footer-page').children().children().last().children().prop('href')
      page = page.split('/')
      totalPage = page[page.length - 1] // total number of pages the site has
      // pull the company data out of the HTML attributes
      $('#products_container #catcom-container').each((key, element) => {
        const linkImage = $(element).find('.swiper-wrapper').children().children().children().prop('data-src')
        const companyName = $(element).find('.product_title').text().replace(/\n+/g, '')
        const companyAddress = $(element).find('i.fa.fa-map-marker.fs-18.mr-5').parent().find('p.d-flex.a-center').text().replace(/\s/, '')
        const splitLinkImage = linkImage.split('/')
        const companyID = splitLinkImage[splitLinkImage.indexOf('webp') + 1]
        // fetch the phone data for this company ID
        const getdataPhone = getPhoneData(companyID)
        getdataPhone.then(function (result) { // please check this part: is the promise handled correctly?
          const listCompanyPhone = JSON.parse(result.d)
          const companyPhone = listCompanyPhone.Phone + ' , ' + listCompanyPhone.Phone2
          const Company = {
            Name: companyName,
            Phone: companyPhone,
            Address: companyAddress
          }
          parsedResults.push(Company)
          exportResults(parsedResults)
        })
      })
    })
    const nextPageLink = baseUrl + '/' + (++indexPage) // build the next page URL
    indexPage++
    if (indexPage == totalPage) {
      exportResults(parsedResults) // export to CSV, but this never runs
      return false
    }
    getWebsiteContent(nextPageLink) // this never recurses the way I expect
  } catch (error) {
    console.log(error)
  }
}
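// (Notes from my own debugging, in case it helps: the request(url).then(...)
// call above is never awaited, so I believe the pagination check below it
// runs while totalPage is still 0, and indexPage is incremented twice per
// call, once by ++indexPage and once by indexPage++. Is that the problem?)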
// fetch a company's phone data by calling the site's API; resolves with JSON
function getPhoneData(data) {
  try {
    const options = {
      method: 'POST',
      uri: 'https://www.indotrading.com/AjaxMethod.asmx/UpdateCompanyPhoneLeads',
      body: {
        Token: 'EAAAAKTheWTVifIaYce5HmctJuDKNQO5nbySwS3GGi14hbcy0oGq3yqxMhd5sE6349byCw==',
        EncCompanyID: data,
        ProductID: 'undefined'
      },
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'Content-Type': 'application/json'
      },
      json: true
    }
    return request(options).then(function (body) {
      return body
    }).catch(function (error) {
      console.log(error)
    })
  } catch (error) {
    console.log('get phone data error: ' + error)
  }
}
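// (One more thing I noticed while posting: the .catch above only logs, so if
// the API call fails this promise resolves to undefined and result.d in the
// caller would throw.)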
// export the results to the output file (note: this writes JSON, even though
// the file is named .csv)
const exportResults = (parsedResults) => {
  fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
    if (err) {
      console.log(err)
    }
    console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`)
  })
}

getWebsiteContent(baseUrl)
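From what I can tell, the core problem might be that nothing in getWebsiteContent is awaited, so the totalPage check runs before the first request has even finished. Below is a rough sketch of the sequential, awaited loop I think I need. It reuses the same selectors plus my getPhoneData and exportResults functions above; scrapeAllPages is just a name I made up, and I swapped .prop for .attr since I believe that is the more standard cheerio call. I have not verified that this is correct:

const scrapeAllPages = async () => {
  const results = []

  // load the first page once so the total page count can be read from the footer
  const firstBody = await request(baseUrl)
  let $ = cheerio.load(firstBody)
  const lastHref = $('.footer-page').children().children().last().children().attr('href')
  const totalPages = parseInt(lastHref.split('/').pop(), 10)

  for (let page = 1; page <= totalPages; page++) {
    // reuse the body already fetched for page 1
    const body = page === 1 ? firstBody : await request(baseUrl + '/' + page)
    $ = cheerio.load(body)

    const pagePromises = []
    $('#products_container #catcom-container').each((key, element) => {
      const linkImage = $(element).find('.swiper-wrapper').children().children().children().attr('data-src')
      if (!linkImage) return // guard I added: skip cards without an image
      const companyName = $(element).find('.product_title').text().replace(/\n+/g, '')
      const companyAddress = $(element).find('i.fa.fa-map-marker.fs-18.mr-5').parent().find('p.d-flex.a-center').text().replace(/\s/, '')
      const splitLinkImage = linkImage.split('/')
      const companyID = splitLinkImage[splitLinkImage.indexOf('webp') + 1]

      // collect the phone lookups as promises so the whole page can be awaited
      pagePromises.push(getPhoneData(companyID).then((result) => {
        const listCompanyPhone = JSON.parse(result.d)
        results.push({
          Name: companyName,
          Phone: listCompanyPhone.Phone + ' , ' + listCompanyPhone.Phone2,
          Address: companyAddress
        })
      }))
    })

    // wait for every company on this page before moving to the next page
    await Promise.all(pagePromises)
  }

  exportResults(results) // export once, after the last page
}

scrapeAllPages().catch(console.log)

Does this look like the right pattern, or is there a better way to paginate with cheerio?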
In short, how do I solve this? All I want is to scrape every page, for as long as one exists.
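One more question while I am at it: I noticed exportResults writes JSON.stringify output into a file named data.csv, which is probably why it never looks like working CSV. Would hand-rolling the CSV like this be reasonable (a sketch only; toCsv is a name I made up), or should I use a library?

const toCsv = (rows) => {
  // quote every field and escape embedded double quotes
  const escape = (value) => '"' + String(value).replace(/"/g, '""') + '"'
  const header = ['Name', 'Phone', 'Address'].join(',')
  const lines = rows.map((row) => [row.Name, row.Phone, row.Address].map(escape).join(','))
  return [header].concat(lines).join('\n')
}

// drop-in replacement for the fs.writeFile call inside exportResults
fs.writeFile(outputFile, toCsv(parsedResults), (err) => {
  if (err) console.log(err)
})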