0

I'm working on a server side (self) project with node js (for the first time), and i ran into some difficulties.

My goal is the following:

first part - Im using "/uploads/processData" URL in my server to get URL(s) from the user request.

Now i want to access the user request URL(s) and get their HTML(s) file(s), to do so i'm using the "request" npm package (code below).

second part - I want access the body that I get back from the request package (from the first part), so I'm using cheerio npm package to do so.

Now to my problem - lets say that i'm trying to get the body of the url:

https://www.amazon.com/NIKE-Mens-Lunarconverge-Running-Shoes/dp/B06VVFGZHL?pd_rd_wg=6humg&pd_rd_r=61904ea4-c78e-43b6-8b8d-6b5ee8417541&pd_rd_w=Tue7n&ref_=pd_gw_simh&pf_rd_r=VGMA24803GJEV6DY7458&pf_rd_p=a670abbe-a1ba-52d3-b360-3badcefeb448&th=1

From some reason that i cant understand (probably because of lack of knowledge at web development), I dont always get the same body that i see when I review the above page (link) using F12, with my first part code. Hence sometimes my cheerio extraction (the second part) works as i expect and sometime does not (because some element from the full/original HTML file are missing). At first I thought it might be cache thing, so I added a middleware to set "nocache" flag.

What am I missing here? Does the way I try to operate wrong? Is there any way to ensure i get the same full/original HTML everytime?

Here is my code so far - nocache middleware

function nocache(req, res, next) {
res.header("Cache-Control", "private, no-cache, no-store, must-revalidate");
  res.header("Expires", "-1");
  res.header("Pragma", "no-cache");
  next();
}

EDIT

uploadRoutes.post("/processGoogleSearchData", nocache, (req, res) => {
  //Assuming getting in req.body the google result JSON as "googleSearchResult"
  var itemsArr = [];
  var linksArr = [];
  var bodysArr = [];
  itemsArr = req.body.googleSearchResult.items;
  if (itemsArr.length === 0) {
    //return appropriate message
    return res.status(400).send({ message: "No data sent to server" });
  }
  var linksArr = itemsArr.map(item => item.link);

  //Get the needed info from every link
  linksArr.forEach(link => {
    request(link, (err, response, body) => {
      if (!err && response.statusCode === 200) {
        var $ = cheerio.load(body);
        var tr = $(".a-lineitem").children();
        var priceTd = tr.find(".a-span12");
        var priceSpan = priceTd.find("#priceblock_ourprice");
        console.log(priceSpan.text());
        //when trying to build array of bodys the extraction doesnt work at all
        bodysArr.push(body);
      }
    });
  });
  res.send(bodysArr);
});

I changed my code to the above, and it seems like the data extraction works more often. Can anyone explain why the extraction still sometimes doesnt work? I tried return bodysArr for debbug purposes but when i do that the extraction does not work at all and my path response is always an empty array, why is that?

DXR
  • 25
  • 5

2 Answers2

0

The problem is that:

res.send(bodysArr);

is executed straight after the call to

linksArr.forEach(link => {

The callbacks

(err, response, body) => {
  if (!err && response.statusCode === 200) {
    var $ = cheerio.load(body);
    var tr = $(".a-lineitem").children();
    var priceTd = tr.find(".a-span12");
    var priceSpan = priceTd.find("#priceblock_ourprice");
    console.log(priceSpan.text());
    //when trying to build array of bodys the extraction doesnt work at all
    bodysArr.push(body);
  }

won't be guaranteed to have fired yet. What you want is ensure that res.send(bodysArr) runs after all the requests have happened

There are a few ways to handle this, one is with the excellent async library.

Hopefully you can get the gist of it with this example.

var array = [1,2,3]

function asyncRequest(input, callback){
  //Do your fetch request here and call callback when done
  setTimeout(callback, 10); //using setTiemout as an example
}


async.each(array, asyncRequest, (err) => {
  if(err){
    throw err;
  }
  console.log("All Finished");
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/async/2.6.1/async.min.js"></script>
Sudsy
  • 931
  • 7
  • 16
0

After reviewing Sudsy explanation, I came across loops of asynchronous methods.

While playing with this subject I could not figure out whats wrong with my following code:

This works fine - so i ended up using it

async function getItemsInfo(itemsArr) {
  return itemsArr.map(async item => {
    try {
      var body = await axios(item.link);
      var $ = await cheerio.load(body.data);
      var tr = await $(".a-lineitem").children();
      var priceTd = await tr.find(".a-span12");
      var priceSpan = await priceTd.find("#priceblock_ourprice");
      return priceSpan.text();
    } catch (err) {
      return err.message;
    }
  });
}

getItemsInfo(linksArr)
    .then(res => Promise.all(res))
    .then(res => console.log(res))
    .catch(err => console.error(err));

Can someone explain to me what's wrong with the following codes?

async function getItemsInfo(itemsArr) {
  await Promise.all(
    itemsArr.map(async item => {
      try {
        var body = await axios(item.link);
        var $ = await cheerio.load(body.data);
        var tr = await $(".a-lineitem").children();
        var priceTd = await tr.find(".a-span12");
        var priceSpan = await priceTd.find("#priceblock_ourprice");
        return priceSpan.text();
      } catch (err) {
        throw err.message;
      }
    })
  )
    .then(resulst => {
      return results;
    })
    .catch(err => {
      throw err.message;
    });
}
//the caller function
try {
    getItemsInfo(linksArr).then(results => {
      res.status(200).send(results);
    });
  } catch (err) {
    res.status(400).send(err.message);
  }

or

async function getItemsInfo(itemsArr) {
  const promises = itemsArr.map(async item => {
    try {
      var body = await axios(item.link);
      var $ = await cheerio.load(body.data);
      var tr = await $(".a-lineitem").children();
      var priceTd = await tr.find(".a-span12");
      var priceSpan = await priceTd.find("#priceblock_ourprice");
      return priceSpan.text();
    } catch (err) {
      return err.message;
    }
  });

  var results = await Promise.all(promises)
    .then(results => {
      return results;
    })
    .catch(err => {
      return err.message;
    });
  }

    //the caller function
    try {
            getItemsInfo(linksArr).then(results => {
              res.status(200).send(results);
            });
          } catch (err) {
            res.status(400).send(err.message);
          }
DXR
  • 25
  • 5