I'm scraping Google Search results for a given query, but the titles I scrape come back in ISO-8859-1 and I need them in UTF-8 because they are in Spanish. I get the first 10 titles, but some of them show up like this: Software Java | Oracle M�xico
The axios request returns the text/html of the Google search page, and I scrape the titles and the URLs from it.
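To be clear about the conversion I'm after, here is a minimal sketch (the byte values below are just my assumption of how "México" arrives in ISO-8859-1): decoding the raw bytes with the right charset gives a proper JavaScript string, while reading the same bytes as UTF-8 reproduces the broken output I'm seeing.

const { TextDecoder } = require("util");

// "México" encoded as ISO-8859-1 (0xE9 is "é") -- an assumed example payload
const latin1Bytes = Buffer.from([0x4d, 0xe9, 0x78, 0x69, 0x63, 0x6f]);

// Decoding with the actual charset yields the correct Unicode string
console.log(new TextDecoder("iso-8859-1").decode(latin1Bytes)); // "México"

// Mis-reading the same bytes as UTF-8 is what produces the broken title
console.log(latin1Bytes.toString("utf8")); // "M�xico"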
I've tried the following:
const axios = require("axios");
const cheerio = require("cheerio");
const fs = require("fs");

async function scrape(req, res) {
  try {
    const query = req.params.query;
    const encodedQuery = encodeURIComponent(query);
    // Set the number of search results you want (e.g., 10 in this case)
    const numResults = 10;
    const url = `https://www.google.com.mx/search?q=${encodedQuery}&start=${numResults}`;
    await axios.get(url, {
      responseType: "arraybuffer",
      headers: {
        "Content-Type": "text/html; charset=UTF-8"
      }
    }).then((response) => {
      console.log(response.headers['content-type']);
      const $ = cheerio.load(response.data, { decodeEntities: false });
      const data = [...$(".egMi0")]
        .map(e => ({
          title: $(e).find("h3").text().trim(),
          href: $(e).find("a").attr("href"),
        }));
      console.log(data);
      fs.writeFileSync("test.html", response.data);
    });
    res.status(200).json({
      message: "Scraping successful",
      output: 10,
    });
  } catch (error) {
    // Handle any errors that occurred during the request
    console.error('Error while scraping website:', error.message);
    res.status(500).json({
      message: "Error while scraping website. Contact support.",
      error: "Internal Server Error",
    });
  }
}

module.exports = {
  scrape,
};
Even when I force the Content-Type request header, response.headers['content-type'] still prints text/html; charset=ISO-8859-1.
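This is roughly what I was planning to try next, as a sketch only: decode the raw bytes as ISO-8859-1 before handing the HTML to cheerio. The function name is mine, and hard-coding the charset is an assumption on my part (I haven't confirmed Google always responds with ISO-8859-1).

const axios = require("axios");
const cheerio = require("cheerio");
const { TextDecoder } = require("util");

// Sketch: fetch the raw bytes, decode them with the charset Google reports,
// and only then parse the resulting string with cheerio.
async function fetchTitles(url) {
  const response = await axios.get(url, { responseType: "arraybuffer" });
  const html = new TextDecoder("iso-8859-1").decode(response.data);
  const $ = cheerio.load(html, { decodeEntities: false });
  return [...$(".egMi0")].map(e => ({
    title: $(e).find("h3").text().trim(),
    href: $(e).find("a").attr("href"),
  }));
}

Is this the right way to handle the charset, or is there a way to make axios/cheerio do the conversion for me?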