I have a Puppeteer script that iterates through a list of URLs saved in urls.txt and scrapes each one. I have two issues:
If one of the URLs in the list times out, it stops the whole process. I would like it to skip any URLs that don't work or time out and just move on to the next URL. I have tried putting in a catch(err), but I'm not placing it correctly and it fails.
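What I'm aiming for is roughly this (a simplified sketch of the behaviour I want for each URL, not my actual attempt):

// Desired behaviour: if this URL fails or times out, log it and carry on
scrapePage(lines[i], i + 1).catch((err) => {
  console.error('Skipping ' + lines[i] + ': ' + err.message);
});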
If the list of URLs has more than about 5 entries, it freezes my server and I have to reboot. I think maybe it's waiting to iterate through all the URLs before saving, and that's overloading the server? Or is there something else in my code that is causing the problem? (There is an untested sketch of the sequential loop I'm considering after the code below.)
const puppeteer = require('puppeteer');
const fs = require('fs');
const axios = require('axios');
process.setMaxListeners(Infinity); // <== Important line
async function scrapePage(url, index) {
  // Launch a new browser
  const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
  // Open a new page
  const page = await browser.newPage();
  // Set the user agent
  await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36');
  // Navigate to the desired webpage
  await page.goto(url, {
    waitUntil: "domcontentloaded",
  });
  // Wait for selector
  await (async () => {
    await page.waitForSelector("#root > section > section > main > div.py-6.container > div.columns.mt-4 > div.column.is-flex-grow-2 > div:nth-child(3) > div.ant-card-body > div > div > div > canvas", { visible: true });
  })();
  // Get the HTML content of the page
  const html = await page.content();
  // Generate the file name using the index value
  const htmlFileName = `${index.toString().padStart(4, '0')}.html`;
  const screenshotFileName = `${index.toString().padStart(4, '0')}.png`;
  // Check if the HTML file exists
  const filePath = '/root/Dropbox/scrapes/' + htmlFileName;
  if (fs.existsSync(filePath)) {
    // If the file exists, rewrite the content with the new scraped HTML
    fs.writeFileSync(filePath, html);
  } else {
    // If the file doesn't exist, create the file
    fs.closeSync(fs.openSync(filePath, 'w'));
    // Save the scraped content to the newly created file
    fs.writeFileSync(filePath, html);
  }
  // Capture a screenshot of the page
  await page.screenshot({ path: '/root/scrapes/' + screenshotFileName });
  // Close the browser
  await browser.close();
}
// Read the lines of the file
const lines = fs.readFileSync('/root/Dropbox/urls.txt', 'utf-8').split('\n');
// Iterate through each URL in the file
for (let i = 0; i < lines.length; i++) {
  // Scrape the page
  scrapePage(lines[i], i + 1);
}
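For the second issue, this is the direction I'm considering (an untested sketch that would replace the loop at the bottom): wrap the run in an async IIFE so each URL is awaited one at a time, and catch failures so a bad URL is skipped. Would running them one at a time stop the server from being overloaded, or is the problem elsewhere in the script?

// Untested sketch: scrape one URL at a time instead of firing them all at once
(async () => {
  const lines = fs.readFileSync('/root/Dropbox/urls.txt', 'utf-8').split('\n');
  for (let i = 0; i < lines.length; i++) {
    try {
      // Wait for this page to finish completely before starting the next one
      await scrapePage(lines[i], i + 1);
    } catch (err) {
      // Log the failure and carry on with the remaining URLs
      console.error('Skipping ' + lines[i] + ': ' + err.message);
    }
  }
})();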