puppeteer-cluster seems to act in serial instead of parallel

733 Views Asked by At

I created a cluster of Puppeteer workers using puppeteer-cluster:

// Launch the puppeteer-cluster worker pool used to take the screenshots.
// `maxCon` and `path` are defined elsewhere in the full script below.
const cluster = await Cluster.launch({
    // Concurrency model: CONCURRENCY_PAGE runs each job in its own page (tab).
    // NOTE(review): the tabs presumably share one browser window — confirm
    // against the puppeteer-cluster documentation.
    concurrency: Cluster.CONCURRENCY_PAGE,
    // Options forwarded to the underlying Puppeteer browser launch.
    puppeteerOptions: {
        userDataDir: path.join(__dirname,'user_data/1'),    
        headless: false,
        args: ['--no-sandbox']
    },
    // Upper bound on the number of jobs running at the same time.
    maxConcurrency: maxCon,
    // Print the cluster's progress monitor to the terminal.
    monitor: true,
    // Do not run a second job for a URL that has already been queued.
    skipDuplicateUrls: true,
    // Per-task timeout; presumably milliseconds (about 11 hours) — TODO confirm.
    timeout:40000000,
    // How many times a failed job is retried.
    retryLimit:5,
});

I then pass some URLs to the cluster with `cluster.queue()`, inside a for loop that iterates over an array of URLs.

The task is to capture screenshots of some websites. When I launch the script it works as intended, but instead of working in parallel, it seems to work serially.

While it is capturing screenshots, I can see the browser going through the tabs one by one: it takes a screenshot in one tab, then moves on to the next tab, and so on.

What can I do to make it work in parallel?

Full Code :

const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs');
const path = require('path');
var pdfkit = require('pdfkit');

//const zip = require('./zip_files');
//const cfolder = require('./create_folders');

// Target site name. Not referenced by the code visible in this file; the
// search URL is hard-coded separately in the main loop.
const site = 'scribd.com';

// Document types of interest. The main loop declares its own local `docType`
// ('pdf'), which shadows this array there.
const docType = ['pdf', 'word', 'spreadsheet'];

// Timeout passed to the waitForXPath calls in the main loop and in fileCount
// (presumably milliseconds — TODO confirm against the Puppeteer docs).
const t_out = 10000;

// Promise-based sleep: resolves after `ms` milliseconds.
// Not called anywhere in the visible code.
const wait = ms => new Promise(res => setTimeout(res, ms));

// Output directories: per-page screenshots, generated PDFs, and zip archives.
// zipDir is only referenced from commented-out code in this file.
const scrnDir = 'screenshots';
const docDir = 'documents';
const zipDir = 'zips';

// Search terms iterated by the main loop.
var data_1 = ['Exporter'];
// Second data set; only referenced by a commented-out inner loop.
var data_2 = [];

/*
 * Script entry point.
 *
 * Opens a (headful) browser, runs a document search for every term in
 * `data_1`, collects the result list items of each result page and hands
 * them to `save()` so the documents can be screenshotted.
 *
 * Fixes compared to the previous version:
 *  - the result count returned by fileCount() is stored in the loop-level
 *    `numFiles` instead of a shadowing block-scoped variable;
 *  - a result page whose list could not be read is skipped instead of
 *    passing `undefined` to save();
 *  - the search term is URL-encoded before being put into the query string;
 *  - the `progress` directory is created before the log file is appended to;
 *  - the browser is always closed and failures are reported.
 */
(async () => {
    const browser = await puppeteer.launch({
        headless: false,
        userDataDir: path.join(__dirname, 'user_data/main'),
    });

    try {
        const page = (await browser.pages())[0];

        // appendFileSync below throws if the directory does not exist.
        fs.mkdirSync('progress', { recursive: true });

        for (let i = 0; i < data_1.length; i++) {
            // Total number of results for the term. Only needed once the
            // page-count bound commented out in the loop header is enabled.
            let numFiles = 1000000;
            const searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');

            // 235 is the highest page number that is requested.
            for (let pageNum = 1; pageNum < 2 /*(Math.ceil(numFiles/42) +1)*/ && pageNum < 236; pageNum++) {
                const docType = 'pdf';
                const query = 'https://www.scribd.com/search?query=' + encodeURIComponent(searchTerm)
                    + '&content_type=documents&page=' + pageNum + '&filetype=' + docType;
                await page.goto(query, { waitUntil: 'networkidle2' });

                //await cfolder.createFolder(docDir, searchTerm);
                fs.appendFileSync('progress/query.txt', query + '\n');

                if (pageNum === 1) {
                    const counted = await fileCount(page);
                    // Keep the default when the count could not be determined.
                    if (Number.isFinite(counted)) {
                        numFiles = counted;
                    }
                }

                let docPages;
                try {
                    await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out });
                    const searchResults = await page.$x('//section[@data-testid="search-results"]');
                    // The leading "." keeps the lookup relative to the results section.
                    await searchResults[0].waitForXPath('.//div/ul/li');
                    docPages = await searchResults[0].$x('.//div/ul/li');
                } catch (e) {
                    console.log('getLinks Error');
                    console.log(e);
                    // Nothing to save for this result page.
                    continue;
                }

                await save(browser, searchTerm, docPages);
            }

            //await zip.zipFolder(docDir + '/' + searchTerm, zipDir + '/' + searchTerm + '.zip');
        }
    } finally {
        // Release the browser even when a search or save step failed.
        await browser.close();
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

/**
 * Screenshots every page of the documents linked from a list of search
 * result items, using a puppeteer-cluster worker pool.
 *
 * @param {object} browser    Browser that owns `docPages` (currently unused).
 * @param {string} searchTerm Search term the results belong to (currently
 *                            only needed by the disabled PDF step).
 * @param {Array}  docPages   Element handles of the search result list items.
 * @returns {Promise<void>}   Resolves when all queued documents are processed
 *                            and the cluster has been shut down.
 */
async function save(browser, searchTerm, docPages){
    const maxCon = 3;
    // At most this many results are processed per call.
    const maxDocs = 6;

    if (!docPages || docPages.length === 0) {
        return;
    }

    // NOTE(review): with CONCURRENCY_PAGE the jobs run as tabs; in a headful
    // browser screenshots may need the tab to be focused, which would make
    // the jobs appear to run one after another. Cluster.CONCURRENCY_CONTEXT
    // or Cluster.CONCURRENCY_BROWSER may parallelise better - confirm, and
    // check how they interact with `userDataDir`.
    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_PAGE,
        puppeteerOptions: {
            userDataDir: path.join(__dirname,'user_data/1'),
            headless: false,
            args: ['--no-sandbox']
        },
        maxConcurrency: maxCon,
        monitor: true,
        skipDuplicateUrls: true,
        timeout:40000000,
        retryLimit:5,
    });

    try {
        await cluster.task(async ({ page, data: {url, title} }) => {
            await page.goto(url, {waitUntil: 'networkidle2'});

            //await cfolder.createFolder(scrnDir, title);

            // Remove overlays that would end up in the screenshots. A missing
            // element is not an error and must not fail (and retry) the job.
            await page.evaluate((selectors) => {
                for (const selector of selectors) {
                    const node = document.querySelector(selector);
                    if (node) {
                        node.remove();
                    }
                }
            }, [
                '.nav_and_banners_fixed',
                '.recommender_list_wrapper',
                '.auto__doc_page_app_page_body_fixed_viewport_bottom_components',
            ]);

            await page.addStyleTag({content: '.wrapper__doc_page_webpack_doc_page_body_document_useful{visibility: hidden}'});

            // The "page_of" label ends with "of <N>"; N is the page count.
            await page.waitForXPath('//span[@class="page_of"]');
            const [pageOfLabel] = await page.$x('//span[@class="page_of"]');
            const pageOfText = await (await pageOfLabel.getProperty('textContent')).jsonValue();
            const numOfPages = Number.parseInt(pageOfText.split('of ').pop(), 10);
            if (Number.isNaN(numOfPages)) {
                throw new Error('Could not read the page count of ' + url);
            }

            console.log(numOfPages);

            // File names of the screenshots, in page order; input for the
            // (currently disabled) createPdf step.
            const imgs = [];
            for (let j = 0; j < numOfPages; j++) {
                const [pageElement] = await page.$x('//*[@id="page' + (j + 1) + '"]');
                if (!pageElement) {
                    throw new Error('Page element #page' + (j + 1) + ' not found in ' + url);
                }
                await pageElement.screenshot({
                    path: scrnDir + '/' + title + j + '.jpg'
                });
                imgs[j] = title + j + '.jpg';
            }

            //await createPdf(searchTerm, title, imgs);
        });

        cluster.on('taskerror', (err, data) => {
            // `data` is the queued {url, title} object; log its URL.
            const target = data && data.url ? data.url : data;
            console.log(`  Error crawling ${target}: ${err.message}`);
        });

        const count = Math.min(maxDocs, docPages.length);
        for (let i = 0; i < count; i++) {
            // The leading "." keeps each lookup inside its own result item;
            // a document-wide "//" could return the same link for every item,
            // which skipDuplicateUrls would then drop.
            await docPages[i].waitForXPath('.//article/a');
            const [link] = await docPages[i].$x('.//article/a');
            const url = await (await link.getProperty('href')).jsonValue();

            await docPages[i].waitForXPath('.//p[@data-e2e="title"]');
            const [titleNode] = await docPages[i].$x('.//p[@data-e2e="title"]');
            const title = await (await titleNode.getProperty('textContent')).jsonValue();

            await cluster.queue({url : url, title : title});
        }

        await cluster.idle();
    } finally {
        // Shut the cluster's browser down so the next call can launch a new
        // one on the same userDataDir.
        await cluster.close();
    }
}

/**
 * Reads the total number of search results from the result counter on the
 * current search page.
 *
 * The counter text is expected to end with "of <N> results", where <N> may
 * contain thousands separators (e.g. "1-42 of 1,234 results").
 *
 * @param {object} page Puppeteer page showing the search results.
 * @returns {Promise<number|undefined>} The parsed result count, or
 *          `undefined` when the counter could not be found or read (the
 *          error is logged, not thrown).
 */
async function fileCount(page){
    try {
        await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out });
        const [counter] = await page.$x('//div[@class="_7a1igU"]');
        const label = await (await counter.getProperty('textContent')).jsonValue();
        const digits = label.split('of ').pop().split(' results').shift().replace(/,/g, '');
        const numFiles = Number.parseInt(digits, 10);
        console.log('Total Files  : ' + numFiles);
        // Previously the value was only returned from an inner callback and
        // never reached the caller.
        return numFiles;
    } catch (e) {
        console.log('File Count Error');
        console.log(e);
        return undefined;
    }
}

/**
 * Placeholder for extracting the result links from a search page.
 * Not implemented: the function has no body and resolves to `undefined`.
 *
 * @param {object} page Puppeteer page (currently unused).
 * @returns {Promise<undefined>}
 */
async function getLinks(page) {
    // Intentionally empty.
}

/**
 * Builds a PDF from a list of screenshot images, one image per page, and
 * writes it to `<docDir>/<searchTerm>/<title>.pdf`.
 *
 * Images are read from `./<scrnDir>/<title>/<image>`.
 * NOTE(review): the screenshot step in this file writes to
 * `<scrnDir>/<title><n>.jpg` (no per-title folder) - confirm which layout is
 * intended before enabling this function.
 *
 * @param {string}   searchTerm Sub-folder of `docDir` the PDF is written to.
 * @param {string}   title      Document title; used as PDF file name.
 * @param {string[]} images     Image file names, in page order.
 * @returns {Promise<void>} Resolves once the PDF file has been closed.
 * @throws Rejects when an image cannot be loaded or the file cannot be written.
 */
async function createPdf(searchTerm, title, images){
    //await cfolder.createFolder(docDir, searchTerm);
    const pdf = new pdfkit({
        autoFirstPage: false
    });
    const writeStream = fs.createWriteStream(docDir + '/' + searchTerm + '/' + title + '.pdf');
    pdf.pipe(writeStream);

    try {
        for (const image of images) {
            const img = pdf.openImage('./' + scrnDir + '/' + title + '/' + image);
            // Each page gets exactly the size of its image.
            pdf.addPage({size: [img.width, img.height]});
            pdf.image(img, 0, 0);
        }
    } catch (err) {
        // Do not leave the output stream open when an image fails to load.
        writeStream.destroy();
        throw err;
    }

    pdf.end();

    // Stream events are emitted asynchronously, so attaching the listeners
    // here (synchronously after end()) cannot miss them.
    await new Promise((resolve, reject) => {
        let failed = false;
        writeStream.on('error', (err) => {
            failed = true;
            reject(err);
        });
        writeStream.on('close', () => {
            // 'close' also fires after an error; only report real success.
            if (failed) {
                return;
            }
            console.log('PDF Created succesfully');
            resolve();
        });
    });
}

`const zip = require('./zip_files');` and `const cfolder = require('./create_folders');` are both required for the final code, but they are not needed to reproduce the problem.

1

There is 1 best solution below

0
montacer.mit On

The CONCURRENCY_PAGE option seems to wait for some events that require the window to be focused (like typing) before continuing with the tasks in the blocked tabs. Use CONCURRENCY_CONTEXT instead (it works perfectly) if you don't mind using incognito mode.