
I need to generate PDFs of 2000+ pages from HTML with Puppeteer/Chromium.

Currently, I have the following config:

browser.js:

const p = require("puppeteer");

const isLinux = process.platform === "linux";
const LINUX_CHROMIUM = "/usr/bin/chromium-browser";
const WINDOWS_CHROME = `C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe`;

// port/host are accepted but currently unused: the browser is always
// launched locally rather than connected to over remote debugging
module.exports = async ({ port, host }) => {
    const options = isLinux
        ? {
            headless: true,
            executablePath: LINUX_CHROMIUM,
            args: [
                "--no-sandbox",
                "--disable-gpu",
                "--window-size=1200,1200",
                "--disable-dev-shm-usage",
                "--unlimited-storage",
                "--full-memory-crash-report"
            ],
            userDataDir: "/usr/cache",
        }
        : {
            headless: true,
            executablePath: WINDOWS_CHROME,
            args: [
                "--window-size=1200,1200",
                "--disable-dev-shm-usage",
                "--unlimited-storage",
                "--full-memory-crash-report"
            ],
        };

    return await p.launch(options);
};

server.js:

const express = require("express");
const uuid = require("uuid");
const fs = require("fs");
const path = require("path");

module.exports = async function start({ browser = null, port = 80 } = {}) {
    if (!browser) {
        throw new Error(`no browser`);
    }

    // smoke-test the browser before accepting requests
    try {
        const page = await browser.newPage();
        await page.close();
    } catch {
        throw new Error(`browser not working`);
    }

    const pdfFolder = path.resolve(__dirname, "./pdf");
    fs.mkdirSync(pdfFolder, { recursive: true }); // no-op if the folder already exists

    const app = express();
    // accept raw HTML bodies as text; `extended` is an urlencoded() option
    // and has no effect on text(), so only the size limit matters here
    app.use(require("body-parser").text({ limit: "60mb" }));
    app.use("/static", express.static(path.join(__dirname, "static")));

    app.post("/print", async (req, res) => {
        try {
            const id = uuid.v1();
            console.time(`${id} print`);
            const html = req.body;

            const page = await browser.newPage();
            page.setDefaultNavigationTimeout(0); // 0 disables the navigation timeout
            console.time(`${id} goto page`);
            await page.setContent(html, { waitUntil: "networkidle2" });
            console.timeEnd(`${id} goto page`);
            console.time(`${id} pdf`);
            await page.pdf({ path: path.resolve(pdfFolder, `${id}.pdf`), format: "A4" });
            console.timeEnd(`${id} pdf`);
            await page.close();

            console.timeEnd(`${id} print`);
            res.redirect(`/pdf/${id}.pdf`);
        } catch (error) {
            res.status(500);
            res.json({ message: error.message, stack: error.stack });
        }
    });

    app.get("/pdf/:file", (req, res) => {
        const id = req.params.file;
        const file = path.resolve(pdfFolder, id);
        if (fs.existsSync(file)) {
            console.time(`${id} download`);
            res.status(200);
            res.download(file);
            console.timeEnd(`${id} download`);
            res.once("finish", () => fs.unlinkSync(file));
        } else {
            res.status(404);
            res.json({ message: `no file found` });
        }
    });

    console.log(`starting server on port ${port}`);

    return app.listen(port);
};
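
For reference, a hypothetical client call against this service; it assumes Node 18+ (for the global fetch) and the server listening on port 80, and requestPdf and out.pdf are illustrative names, not part of the project:

const fs = require("fs/promises");

async function requestPdf(html) {
    // POST the raw HTML; fetch follows the 302 redirect to /pdf/<id>.pdf,
    // and the second request streams the finished file back
    const res = await fetch("http://localhost/print", {
        method: "POST",
        headers: { "Content-Type": "text/plain" },
        body: html,
    });
    if (!res.ok) throw new Error(`print failed: ${res.status}`);
    await fs.writeFile("out.pdf", Buffer.from(await res.arrayBuffer()));
}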

init.js:

const yargs = require("yargs");

const createBrowser = require("./browser");
const createServer = require("./server");

(async function init() {
    const { chromeHost, chromePort, serverPort } = yargs
        .string("chromeHost")
        .number("chromePort")
        .number("serverPort")
        .default("chromeHost", "127.0.0.1")
        .default("chromePort", 9222)
        .default("serverPort", 80)
        .argv;

    const browser = await createBrowser({ host: chromeHost, port: chromePort });
    const server = await createServer({ browser, port: serverPort });

    process.on("beforeExit", () => {
        browser.close();
        server.close();
    })

})();

For a PDF of ~650 pages, generation took 3 minutes (HTML input: 14.5 MB; setContent took 30s, page.pdf took 2m30s). I need to generate PDFs of up to 2000 pages, but Puppeteer/Chromium hangs. I also need to improve performance for the 650-page case; 3 minutes is too long. Everything runs in Docker.

Which memory/CPU configuration should be tuned for better performance? It can run in the cloud (currently it runs on a local machine or a VPS).

I saw Converting HTML to PDF for large files using Google Puppeteer, but it offers no solution for speeding up PDF generation.
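
For context on the Docker side: Chromium keeps shared memory in /dev/shm, and containers default to a 64 MB /dev/shm, which is what the --disable-dev-shm-usage flag above works around; the alternative is starting the container with a larger segment (docker run --shm-size=1g). Beyond that, a speculative launch sketch; the --js-flags value is an assumption to experiment with for very large documents, not a verified fix for the hang:

const puppeteer = require("puppeteer");

async function launchForLargeDocs() {
    return puppeteer.launch({
        headless: true,
        args: [
            "--no-sandbox",
            // either keep this or raise /dev/shm via `docker run --shm-size=1g`
            "--disable-dev-shm-usage",
            // assumption: give the renderer's V8 heap more headroom (value in MB)
            "--js-flags=--max-old-space-size=4096",
        ],
    });
}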

bmilczarek
  • I'm facing the same problem. Any chance you found a solution? As a workaround, we are generating multiple small PDFs of 25 pages each and then merging them back into one large PDF using pdf-lib (https://www.npmjs.com/package/pdf-lib). But that messes up accessibility tags, and the Adobe screen reader doesn't work. – Arvydas Sep 03 '21 at 07:31
  • Unfortunately, I did not find a proper solution. I used the same workaround as you: small PDFs, merged back together. – bmilczarek Sep 04 '21 at 11:20
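
A minimal sketch of the chunk-and-merge workaround described in the comments above, assuming pdf-lib v1.x; printInChunks is an illustrative name, the pre-split htmlChunks input is assumed to exist, and the accessibility-tag caveat from the comments still applies:

const { PDFDocument } = require("pdf-lib");

// htmlChunks: the source HTML pre-split into self-contained fragments
// of roughly 25 printed pages each
async function printInChunks(browser, htmlChunks) {
    const parts = [];
    for (const html of htmlChunks) {
        const page = await browser.newPage();
        await page.setContent(html, { waitUntil: "networkidle2" });
        // without a `path` option, page.pdf() resolves with a Buffer
        parts.push(await page.pdf({ format: "A4" }));
        await page.close();
    }

    const merged = await PDFDocument.create();
    for (const buf of parts) {
        const src = await PDFDocument.load(buf);
        const pages = await merged.copyPages(src, src.getPageIndices());
        pages.forEach((p) => merged.addPage(p));
    }
    return merged.save(); // Uint8Array of the combined PDF
}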

0 Answers