0

I made an cluster of puppeteer workers using puppeteer cluster,

const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    puppeteerOptions: {
        userDataDir: path.join(__dirname,'user_data/1'),    
        headless: false,
        args: ['--no-sandbox']
    },
    maxConcurrency: maxCon,
    monitor: true,
    skipDuplicateUrls: true,
    timeout:40000000,
    retryLimit:5,
});

I then passes some urls using queue through a for loop iterating over an array of urls.

The task is to capture screenshots of some websites. When I launch the script it works as intended, but instead of working parallelly, it seems it works serially.

When capturing screenshots I can see browser goes through tab by tab, takes a SS then to next tab and so on.

What can I do to make it work parallelly?

Full Code :

const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs');
const path = require('path');
var pdfkit = require('pdfkit');

//const zip = require('./zip_files');
//const cfolder = require('./create_folders');

const site = 'scribd.com';

const docType = ['pdf', 'word', 'spreadsheet'];

const t_out = 10000;

const wait = ms => new Promise(res => setTimeout(res, ms));

const scrnDir = 'screenshots';
const docDir = 'documents';
const zipDir = 'zips';

var data_1 = ['Exporter'];
var data_2 = [];

(async() => {
    const browser = await puppeteer.launch({
        headless: false,
        userDataDir: path.join(__dirname,'user_data/main'),
    });
    const page = (await browser.pages())[0];
    
    for(let i = 0; i < data_1.length; i++){
    //  for(let j = 0; j < data_2.length; j++){
            var numFiles = 1000000;
            let folder = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
            let searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
            for(let pageNum = 1; pageNum < 2/*(Math.ceil(numFiles/42) +1)*/ && pageNum < 236; pageNum++){
                //maxPageNum = 235
                let docType = 'pdf';
                let query = 'https://www.scribd.com/search?query='+searchTerm+'&content_type=documents&page='+pageNum+'&filetype='+docType;
                await page.goto(query, {waitUntil : 'networkidle2'});
                
                //await cfolder.createFolder(docDir, searchTerm);
                fs.appendFileSync('progress/query.txt', query + '\n');

                if(pageNum == 1){
                    let numFiles = await fileCount(page);
                }
                
                let docPages = await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out }).then(async() => {
                    let searchResults = await page.$x('//section[@data-testid="search-results"]');
                    await searchResults[0].waitForXPath('//div/ul/li');
                    let docPages = await searchResults[0].$x('//div/ul/li');

                    return docPages;
                }).catch( e => { 
                    console.log('getLinks Error');
                    console.log(e);
                });

                await save(browser, searchTerm, docPages);                  
            }
            
            //await zip.zipFolder(docDir + '/' + folder, zipDir + '/' + searchTerm + '.zip');
    //  }
    }
})();

async function save(browser, searchTerm, docPages){
    //let docPage = await browser.newPage();
    let maxCon = 3;
    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_PAGE,
        puppeteerOptions: {
            userDataDir: path.join(__dirname,'user_data/1'),    
            headless: false,
            args: ['--no-sandbox']
        },
        maxConcurrency: maxCon,
        monitor: true,
        skipDuplicateUrls: true,
        timeout:40000000,
        retryLimit:5,
    });
    
    await cluster.task(async ({ page, data: {url, title} }) => {
        let docPage = page;
        
        await docPage.goto(url, {waitUntil: 'networkidle2'});
        
        //await cfolder.createFolder(scrnDir, title);

        await docPage.evaluate('document.querySelector(".nav_and_banners_fixed").remove()');
        await docPage.evaluate('document.querySelector(".recommender_list_wrapper").remove()');
        await docPage.evaluate('document.querySelector(".auto__doc_page_app_page_body_fixed_viewport_bottom_components").remove()');

        //await autoScroll(docPage);
        //await docPage.evaluate('document.querySelector(".wrapper__doc_page_webpack_doc_page_body_document_useful").remove()');    
        await docPage.addStyleTag({content: '.wrapper__doc_page_webpack_doc_page_body_document_useful{visibility: hidden}'})

        await docPage.waitForXPath('//span[@class="page_of"]');
        let numOfPagesR = await docPage.$x('//span[@class="page_of"]');
        let numOfPages = parseInt((await (await numOfPagesR[0].getProperty('textContent')).jsonValue()).split('of ').pop());
        
        console.log(numOfPages);
            
        //const pages = await docPage.$x('//*[@class="newpage"]');

        let imgs = [];
        for(let j = 0; j < numOfPages; j++){
            let sel = '//*[@id="page' + (j+1) + '"]';
            let pages =  await docPage.$x(sel);
            await pages[0].screenshot({
                path: scrnDir + '/' + title +j+'.jpg'
            });
            imgs[j] = title + j +'.jpg';            
        }
        
        //await createPdf(searchTerm, title, imgs);
        
    });
    cluster.on('taskerror', (err, data) => {
        console.log(`  Error crawling ${data}: ${err.message}`);
    });

    for(let i = 0; i < 6/**docPages.length*/; i++){
        await docPages[i].waitForXPath('//article/a');
        let urlR = await docPages[i].$x('//article/a');
        let url = await (await urlR[0].getProperty('href')).jsonValue();

        await docPages[i].waitForXPath('//p[@data-e2e="title"]');
        let titleR = await docPages[i].$x('//p[@data-e2e="title"]');
        let title = await (await titleR[0].getProperty('textContent')).jsonValue();
        
        cluster.queue({url : url, title : title});
        //console.log(title);
    }
    await cluster.idle();   //docPage.close();
}

async function fileCount(page){
    await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out }).then(async() => {
        let fileCountR = await page.$x('//div[@class="_7a1igU"]');
        let fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
        let numFiles = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''));
        console.log('Total Files  : ' + numFiles);
        return numFiles;
    }).catch( e => { 
        console.log('File Count Error');
        console.log(e);
    });
}

async function getLinks(page){

}

async function createPdf(searchTerm, title, images){
    //await cfolder.createFolder(docDir, searchTerm);
    let pdf = new pdfkit({
        autoFirstPage: false
    });
    let writeStream = fs.createWriteStream(docDir+ '/' + searchTerm + '/' + title + '.pdf');
    pdf.pipe(writeStream);

    for(let i = 0; i < images.length; i++){
        let img = pdf.openImage('./' + scrnDir + '/' + title + '/' + images[i]);
        pdf.addPage({size: [img.width, img.height]});
        pdf.image(img, 0, 0);
    }
    pdf.end();
    await new Promise(async (resolve) => {
        writeStream.on('close', ()=>{
            console.log('PDF Created succesfully');
            resolve();
        }); 
    });
}

const zip = require('./zip_files'); const cfolder = require('./create_folders'); both require for the final code. But does not needed for the problem.

DarkZeus
  • 61
  • 8
  • 1
    @ggorlen I added a working code example. `function save` is the function used to iterate over urls and get screenshots – DarkZeus Feb 17 '23 at 15:53
  • Thanks, although it's not exactly a _minimal_ complete example. Removing any unnecessary code and dependencies makes it easier to answer (I need to be able to run the code locally to repro the issue) and makes the question more useful to future visitors because it shows the simplest version of the failing code, without distractions. I assume `for(let i = 0; i < 6/**docPages.length*/; i++){` is the loop you're expecting to be parallel, but is actually serial? – ggorlen Feb 17 '23 at 16:01
  • yes, it is the loop used to queue the cluster. The cluster itself seems to be serial. (regarding _minimal complete example_ I cannot trim down further because it may related to the problem.) – DarkZeus Feb 17 '23 at 16:04

1 Answers1

0

The CONCURRENCY_PAGE option, seem to wait for some events that require the window to be focused (like typing) before pursuing the tasks in the blocking tabs. Use CONCURRENCY_CONTEXT (it works perfectly) if you don't mind using incognito mode.

Tyler2P
  • 2,324
  • 26
  • 22
  • 31