1

I want to scrape Google search results for a given query, but I can't get the CSS selectors (the `css_identifier_*` constants) to match anything in this code:

const axios = require("axios");
const cheerio = require("cheerio");

/**
 * Express-style handler: runs a Google search for `req.params.query`, parses the
 * returned HTML with cheerio, and responds with the extracted results as JSON.
 *
 * Responses:
 *   - 200 `{ message, output }`, where `output` is an array of
 *     `{ title, link, text }` objects (empty when nothing matched).
 *   - 400 when `req.params.query` is missing or blank.
 *   - 500 when the request or the parsing throws.
 *
 * @param {object} req - Request object; only `req.params.query` is read.
 * @param {object} res - Response object; `res.status(code).json(body)` is called once.
 * @returns {Promise<void>}
 */
async function scrape (req, res) {
    try {
        const query = req.params.query;
        if (typeof query !== "string" || query.trim() === "") {
            // Without this guard a missing query is sent to Google as the text "undefined".
            res.status(400).json({
                message: "Missing search query.",
                error: "Bad Request",
            });
            return;
        }

        // Set the number of search results you want (e.g., 10 in this case)
        const numResults = 10;

        // `num` requests that many results. (`start` is a pagination offset, so
        // `start=10` would skip the first ten results instead of limiting the count.)
        // NOTE(review): Google may cap or ignore `num` - confirm against live responses.
        const searchUrl = new URL("https://www.google.com.mx/search");
        searchUrl.searchParams.set("q", query);
        searchUrl.searchParams.set("num", String(numResults));

        const response = await axios.get(searchUrl.toString());
        const html = response.data;

        // These selectors must match the HTML that axios actually receives, which can
        // differ from the DOM shown in a browser (no JS execution, different markup).
        const css_identifier_result = ".tF2Cxc";
        const css_identifier_title = "h3";
        const css_identifier_link = ".yuRUbf a";
        const css_identifier_text = ".IsZvec";

        const $ = cheerio.load(html);
        const results = $(css_identifier_result);

        if (results.length === 0) {
            // Makes a selector mismatch visible instead of silently returning nothing.
            console.warn(`No elements matched "${css_identifier_result}"; check the selectors against the raw HTML.`);
        }

        const output = results.toArray().map((element) => {
            const node = $(element);
            return {
                title: node.find(css_identifier_title).text().trim(),
                link: node.find(css_identifier_link).attr("href"),
                text: node.find(css_identifier_text).text().trim(),
            };
        });

        // Return the parsed results rather than the raw page.
        res.status(200).json({
            message: "Scraping successful",
            output,
        });
    } catch (error) {
        // Handle any errors that occurred during the request
        console.error('Error while scraping website:', error.message);
        res.status(500).json({
            message: "Error while scraping website. Contact support.",
            error: "Internal Server Error",
        });
    }
}

// Public API of this module: the `scrape` request handler.
module.exports = { scrape };

This is my backend function, but the loop over the results never runs, so nothing from it is logged. I don't know how to make this work; I've tried several approaches.

The provided code is using the Cheerio library and axios.

1 Answer

0

Those selectors don't appear to match the HTML from axios. Try:

const axios = require("axios"); // 1.4.0
const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "<Your URL>";

// Fetch the page, then collect the title and link of every ".egMi0" element.
(async () => {
  try {
    // Decode the body as latin1 rather than axios' default encoding, for pages
    // that are not served as UTF-8.
    const response = await axios.get(url, {responseEncoding: "latin1"});
    const $ = cheerio.load(response.data);

    const data = Array.from($(".egMi0"), (node) => {
      const entry = $(node);
      const title = entry.find("h3").text().trim();
      const href = entry.find("a").attr("href");
      return {title, href};
    });

    console.log(data);
  } catch (err) {
    console.error(err);
  }
})();

To see what axios sees, I suggest writing the html string it returns to a file, then opening the file in a browser with JS disabled or using view-source:. Looking at sites in your browser to locate elements can be misleading due to JS execution as well as server blocks and HTML responses that may be different in axios.

ggorlen
  • 44,755
  • 7
  • 76
  • 106
  • I’ll give it a try. Thank you. – La Bola Al Riel Sep 02 '23 at 14:22
  • It seems to work, the thing here is that the response's content-type is `text/html; charset=ISO-8859-1` instead of UTF-8 which is a downside for me because I also need Spanish results. I've already tried adding the header and the specific content-type but it didn't work. – La Bola Al Riel Sep 02 '23 at 17:10
  • Updated to handle it, so I'll mark your [follow-up](https://stackoverflow.com/questions/77029602/set-axios-response-type-to-utf-8-for-web-scraping-in-node-js?noredirect=1#comment135793876_77029602) as a dupe of this one. – ggorlen Sep 02 '23 at 22:52