0

i hope you well in this situation. Am has question about web scraping (paginate) with nodejs and library cheerio. has some code done but there is trouble it only scrape one page, am already searching for solutions few hours, try to follow some steps but same the result is it only scrapes one page. I am appreciate any answer from you, here is the code:

const request = require('request-promise')
const cheerio = require('cheerio')
const fs      = require('fs')

const baseUrl = 'https://indotrading.com/company_hdpe_620' // the website i want to scrape
const outputFile = 'data.csv'
const parsedResults = []
var indexPage = 1
var totalPage = 0


const getWebsiteContent = async (url) => {
  try {
    request(url).then(function(body){
        const $ = cheerio.load(body)

        let page = $('.footer-page').children().children().last().children().prop("href")   //get last page navigation button
        page = page.split("/")
        totalPage = page[page.length-1]     //total page that website has

        //get some data from HTML attribute
        $('#products_container #catcom-container').each((key,element) => {
            const linkImage = $(element).find('.swiper-wrapper').children().children().children().prop('data-src')
            const companyName = $(element).find('.product_title').text().replace(/\n+/g,'')
            const companyAddress = $(element).find('i.fa.fa-map-marker.fs-18.mr-5').parent().find('p.d-flex.a-center').text().replace(/\s/,'')

            const splitLinkImage = linkImage.split("/")

            const companyID = splitLinkImage[splitLinkImage.indexOf("webp")+1]

            //calling function phone data based on company id
            const getdataPhone = getPhoneData(companyID)
            getdataPhone.then(function(result) {        // please check this one, is the promise correct?
                const listCompanyPhone = JSON.parse(result.d)
                const companyPhone = listCompanyPhone.Phone+" , "+listCompanyPhone.Phone2
                const Company = {
                    Name : companyName,
                    Phone: companyPhone,
                    Address: companyAddress
                }
                parsedResults.push(Company)
                exportResults(parsedResults)
            })
        })
    })

    const nextPageLink = baseUrl+'/'+(++indexPage)      // get next page
    indexPage++
    if(indexPage == totalPage){
        exportResults(parsedResults)    // exports to csv but not work
        return false
    }

    getWebsiteContent(nextPageLink) //it will not recursive
} catch (error) {
    console.log(error)
}

}

//function for get data by calling api and it returns json
function getPhoneData(data) {
  try {
    var options = {
        method : 'POST',
        uri : 'https://www.indotrading.com/AjaxMethod.asmx/UpdateCompanyPhoneLeads',
        body : {
            Token : "EAAAAKTheWTVifIaYce5HmctJuDKNQO5nbySwS3GGi14hbcy0oGq3yqxMhd5sE6349byCw==",
            EncCompanyID : data,
            ProductID : "undefined"
        },
        headers : {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
            'Content-Type': 'application/json'
        },
        json: true
    }

    return request(options).then(function(body){
        return body
    }).catch(function(error){
        console.log(error)
    })
} catch (error) {
    console.log("get phone data error : "+error)
}

}

//function for export to csv file
const exportResults = (parsedResults) => {
  fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
    if (err) {
      console.log(err)
    }
    console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`)
  })
}


getWebsiteContent(baseUrl)

how do i solve this? all i want to is scraping all pages as long as it exists

William
  • 13
  • 1
  • 4

1 Answers1

2

I was doing something similar and I would suggest you do it like this....

Request is now deprecated use axios instead

const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')

const baseUrl = 'https://example.com' // the website url to start scraping from
var parsedResults = [];
const outputFile = 'data.csv'
var saved = false // Added this for monitoring if the scraped data was saved if an error is thrown
var indexPage = 1
var totalPages = 1;


const getWebsiteContent = async (url) => {
    try {

        axios.get(url).then( res => {

            const $ = cheerio.load(res.data)

            totalPages = getTotalpages($);  // Get the pagination


            // Now we have the total pages for the url you want to scrap
            // Next we scrape all the data on the respective pages

            // Add your code here that scrapes the data

            });
        })
        .catch(err => {
            throw(err);
        });

        indexPage++; // Increment to the next page

        if (indexPage == totalPages) {
            exportResults(parsedResults)    // If we have surpassed the total pages we export the result to CSV
            return false
        }

        const nextPageLink = baseUrl + '......' + indexPage;      // get next page
    
        // Add a little  timeout to avoid getting banned by the server
        setTimeout(() => {
            getWebsiteContent(nextPageLink); // Call itself
          }, 3000);
        

    }
    catch (error) {
        console.log(error)
    }
    finally{

        // If results were written successfully to file the end else write whats in memory
        if(!saved){
            exportResults(parsedResults) ;
        }
    }
}


// Get the pagination
function getTotalpages(data){

    // Extract the total number of pages available and return it as an integer
}

//function for export to csv file
const exportResults = (parsedResults) => {
    fs.appendFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
        if (err) {
            console.log(err)
        }
        console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`);
        saved = true;
    })
}


getWebsiteContent(baseUrl);



MuturiAlex
  • 338
  • 1
  • 8