I made a cluster of Puppeteer workers using puppeteer-cluster:
// Cluster setup from the question (same configuration as inside save() below).
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE, // one shared browser, one tab per worker
puppeteerOptions: {
userDataDir: path.join(__dirname,'user_data/1'), // NOTE(review): all tabs share this one profile
headless: false,
args: ['--no-sandbox']
},
maxConcurrency: maxCon, // up to `maxCon` tasks in flight at once
monitor: true, // prints a live progress monitor to the terminal
skipDuplicateUrls: true,
timeout:40000000, // per-task timeout in ms (~11 hours)
retryLimit:5, // failed tasks are re-queued up to 5 times
});
I then pass some URLs to the cluster's queue
through a for loop iterating over an array of URLs.
The task is to capture screenshots of some websites. When I launch the script it works as intended, but instead of working in parallel, it seems to work serially.
When capturing screenshots I can see the browser go tab by tab: it takes a screenshot, then moves to the next tab, and so on.
What can I do to make it work in parallel?
Full Code :
const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs');
const path = require('path');
var pdfkit = require('pdfkit');
//const zip = require('./zip_files');
//const cfolder = require('./create_folders');
// Target site and the document types its search filter supports.
const site = 'scribd.com';
const docType = ['pdf', 'word', 'spreadsheet'];
// Default timeout (ms) for waitForXPath calls.
const t_out = 10000;
// Promise-based sleep helper: await wait(ms).
const wait = ms => new Promise(res => setTimeout(res, ms));
// Output directories (relative to the working directory).
const scrnDir = 'screenshots';
const docDir = 'documents';
const zipDir = 'zips';
// Search terms to process; data_2 is reserved for the commented-out inner loop.
const data_1 = ['Exporter'];
const data_2 = [];
(async() => {
  // Main driver: walks the scribd search result pages for each term in data_1
  // and hands the result tiles to save() for screenshotting.
  const browser = await puppeteer.launch({
    headless: false,
    userDataDir: path.join(__dirname, 'user_data/main'),
  });
  const page = (await browser.pages())[0];
  for (let i = 0; i < data_1.length; i++) {
    // for(let j = 0; j < data_2.length; j++){
    let numFiles = 1000000; // placeholder until the real count is read on page 1
    let folder = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
    let searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
    for (let pageNum = 1; pageNum < 2/*(Math.ceil(numFiles/42) +1)*/ && pageNum < 236; pageNum++) {
      //maxPageNum = 235
      let docType = 'pdf';
      // encodeURIComponent guards against spaces/special characters in the term.
      let query = 'https://www.scribd.com/search?query=' + encodeURIComponent(searchTerm) +
        '&content_type=documents&page=' + pageNum + '&filetype=' + docType;
      await page.goto(query, { waitUntil: 'networkidle2' });
      //await cfolder.createFolder(docDir, searchTerm);
      fs.appendFileSync('progress/query.txt', query + '\n');
      if (pageNum === 1) {
        // BUG FIX: the original wrote `let numFiles = ...` here, declaring a new
        // block-scoped variable that shadowed the outer one, so the fetched
        // count was immediately discarded.
        numFiles = await fileCount(page);
      }
      let docPages = await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out }).then(async() => {
        let searchResults = await page.$x('//section[@data-testid="search-results"]');
        // BUG FIX: './/' keeps the XPath scoped to the ElementHandle; a leading
        // '//' is evaluated against the whole document.
        await searchResults[0].waitForXPath('.//div/ul/li');
        let docPages = await searchResults[0].$x('.//div/ul/li');
        return docPages;
      }).catch(e => {
        console.log('getLinks Error');
        console.log(e);
      });
      await save(browser, searchTerm, docPages);
    }
    //await zip.zipFolder(docDir + '/' + folder, zipDir + '/' + searchTerm + '.zip');
    // }
  }
  // Release the driver browser once all terms are processed.
  await browser.close();
})();
async function save(browser, searchTerm, docPages){
  // Screenshots every page of up to 6 documents taken from `docPages`
  // (search-result <li> ElementHandles) using a puppeteer-cluster of tabs.
  // NOTE(review): a fresh cluster is launched (and now closed) on every call;
  // hoisting it to module scope would avoid repeated browser start-up cost.
  const maxCon = 3;
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    puppeteerOptions: {
      userDataDir: path.join(__dirname, 'user_data/1'),
      headless: false,
      args: ['--no-sandbox']
    },
    maxConcurrency: maxCon,
    monitor: true,
    skipDuplicateUrls: true,
    timeout: 40000000,
    retryLimit: 5,
  });
  await cluster.task(async ({ page, data: { url, title } }) => {
    const docPage = page;
    await docPage.goto(url, { waitUntil: 'networkidle2' });
    //await cfolder.createFolder(scrnDir, title);
    // Optional chaining so a missing banner/recommender no longer throws and
    // fails the whole task.
    await docPage.evaluate('document.querySelector(".nav_and_banners_fixed")?.remove()');
    await docPage.evaluate('document.querySelector(".recommender_list_wrapper")?.remove()');
    await docPage.evaluate('document.querySelector(".auto__doc_page_app_page_body_fixed_viewport_bottom_components")?.remove()');
    //await autoScroll(docPage);
    //await docPage.evaluate('document.querySelector(".wrapper__doc_page_webpack_doc_page_body_document_useful").remove()');
    await docPage.addStyleTag({ content: '.wrapper__doc_page_webpack_doc_page_body_document_useful{visibility: hidden}' });
    await docPage.waitForXPath('//span[@class="page_of"]');
    const numOfPagesR = await docPage.$x('//span[@class="page_of"]');
    // "Page 1 of 12" -> 12 (radix 10 made explicit).
    const numOfPages = parseInt((await (await numOfPagesR[0].getProperty('textContent')).jsonValue()).split('of ').pop(), 10);
    console.log(numOfPages);
    //const pages = await docPage.$x('//*[@class="newpage"]');
    const imgs = [];
    for (let j = 0; j < numOfPages; j++) {
      const sel = '//*[@id="page' + (j + 1) + '"]';
      const pages = await docPage.$x(sel);
      await pages[0].screenshot({
        path: scrnDir + '/' + title + j + '.jpg'
      });
      imgs[j] = title + j + '.jpg';
    }
    //await createPdf(searchTerm, title, imgs);
  });
  cluster.on('taskerror', (err, data) => {
    console.log(` Error crawling ${data}: ${err.message}`);
  });
  for (let i = 0; i < 6/**docPages.length*/; i++) {
    // BUG FIX: an XPath beginning with '//' is evaluated against the whole
    // document even when called on an ElementHandle, so every iteration used
    // to match the FIRST result's link and title. './/' scopes the search to
    // docPages[i], giving each queued task its own url/title.
    await docPages[i].waitForXPath('.//article/a');
    const urlR = await docPages[i].$x('.//article/a');
    const url = await (await urlR[0].getProperty('href')).jsonValue();
    await docPages[i].waitForXPath('.//p[@data-e2e="title"]');
    const titleR = await docPages[i].$x('.//p[@data-e2e="title"]');
    const title = await (await titleR[0].getProperty('textContent')).jsonValue();
    cluster.queue({ url: url, title: title });
    //console.log(title);
  }
  await cluster.idle(); //docPage.close();
  // BUG FIX: the cluster (and its browser) was never closed, leaking one
  // browser process per call to save().
  await cluster.close();
}
async function fileCount(page){
  // Reads the total result count from the search page (e.g. "1 - 42 of 1,234
  // results") and returns it as a number, or undefined on failure.
  try {
    await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out });
    const fileCountR = await page.$x('//div[@class="_7a1igU"]');
    const fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
    // "… of 1,234 results" -> 1234 (strip thousands separators, radix 10).
    const numFiles = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''), 10);
    console.log('Total Files : ' + numFiles);
    // BUG FIX: the original `return numFiles` sat inside a .then() callback
    // whose promise value was never returned from fileCount, so callers always
    // received undefined.
    return numFiles;
  } catch (e) {
    console.log('File Count Error');
    console.log(e);
  }
}
// Unused stub: link extraction currently happens inline in the main IIFE
// (the waitForXPath/.then block) rather than here.
async function getLinks(page){
}
async function createPdf(searchTerm, title, images){
  // Assembles the per-page screenshot images into a single PDF at
  // documents/<searchTerm>/<title>.pdf, one image per page sized to the image.
  //await cfolder.createFolder(docDir, searchTerm);
  const pdf = new pdfkit({
    autoFirstPage: false
  });
  const writeStream = fs.createWriteStream(docDir + '/' + searchTerm + '/' + title + '.pdf');
  pdf.pipe(writeStream);
  for (let i = 0; i < images.length; i++) {
    const img = pdf.openImage('./' + scrnDir + '/' + title + '/' + images[i]);
    pdf.addPage({ size: [img.width, img.height] });
    pdf.image(img, 0, 0);
  }
  pdf.end();
  // Wait until the file is fully flushed to disk.
  // BUG FIX: the original used the async-executor anti-pattern
  // (new Promise(async (resolve) => ...)) and never listened for stream
  // errors, so a write failure would hang this await forever.
  await new Promise((resolve, reject) => {
    writeStream.on('close', () => {
      console.log('PDF Created succesfully');
      resolve();
    });
    writeStream.on('error', reject);
  });
}
const zip = require('./zip_files');
const cfolder = require('./create_folders');
Both requires are needed for the final code, but not for this problem.