I need to generate PDFs of 2000+ pages from HTML with Puppeteer/Chromium.
Currently, I have the following setup:
browser.js:
const p = require("puppeteer");

const isLinux = process.platform === "linux";
const LINUX_CHROMIUM = "/usr/bin/chromium-browser";
const WINDOWS_CHROME = `C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe`;

module.exports = async ({ port, host }) => {
  const options = isLinux
    ? {
        headless: true,
        executablePath: LINUX_CHROMIUM,
        args: [
          "--no-sandbox",
          "--disable-gpu",
          "--window-size=1200,1200",
          "--disable-dev-shm-usage",
          "--unlimited-storage",
          "--full-memory-crash-report"
        ],
        userDataDir: "/usr/cache",
      }
    : {
        headless: true,
        executablePath: WINDOWS_CHROME,
        args: [
          "--window-size=1200,1200",
          "--disable-dev-shm-usage",
          "--unlimited-storage",
          "--full-memory-crash-report"
        ],
      };
  return await p.launch(options);
};
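Side note: browser.js currently ignores the { port, host } it receives from init.js. If I instead attach to an already running Chromium on that remote-debugging port, a minimal sketch would be (assuming the endpoint is reachable at http://host:port):

// Minimal sketch: attach to an existing Chromium that exposes a remote-debugging
// port instead of launching a new one (host/port come from init.js).
const puppeteer = require("puppeteer");

module.exports = async ({ port, host }) =>
  puppeteer.connect({ browserURL: `http://${host}:${port}` });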
server.js:
const express = require("express");
const uuid = require("uuid");
const fs = require("fs");
const path = require("path");

module.exports = async function start({ browser = null, port = 80 } = {}) {
  if (!browser) {
    throw new Error(`no browser`);
  }

  // sanity check: make sure the browser can actually open a page
  try {
    const page = await browser.newPage();
    await page.close();
  } catch {
    throw new Error(`browser not working`);
  }

  const pdfFolder = path.resolve(__dirname, "./pdf");
  fs.existsSync(pdfFolder) || fs.mkdirSync(pdfFolder);

  const app = express();
  app.use(require("body-parser").text({ limit: "60mb" }));
  app.use("/static", express.static(path.join(__dirname, "static")));

  app.post("/print", async (req, res) => {
    try {
      const id = uuid.v1();
      console.time(`${id} print`);
      const html = req.body;
      const page = await browser.newPage();
      page.setDefaultNavigationTimeout(0);
      console.time(`${id} goto page`);
      await page.setContent(html, { waitUntil: "networkidle2" });
      console.timeEnd(`${id} goto page`);
      console.time(`${id} pdf`);
      await page.pdf({ path: path.resolve(pdfFolder, `${id}.pdf`), format: "A4" });
      console.timeEnd(`${id} pdf`);
      await page.close();
      console.timeEnd(`${id} print`);
      res.redirect(`/pdf/${id}.pdf`);
    } catch (error) {
      res.status(500);
      res.json({ message: error.message, stack: error.stack });
    }
  });

  app.get("/pdf/:file", (req, res) => {
    const id = req.params.file;
    const file = path.resolve(pdfFolder, id);
    if (fs.existsSync(file)) {
      console.time(`${id} download`);
      res.status(200);
      res.download(file);
      console.timeEnd(`${id} download`);
      // delete the file once it has been sent
      res.once("finish", () => fs.unlinkSync(file));
    } else {
      res.status(404);
      res.json({ message: `no file found` });
    }
  });

  console.log(`starting server on port ${port}`);
  return app.listen(port);
};
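For reference, a client call to /print looks roughly like this (a sketch assuming Node 18+ so fetch is global and the server listens on localhost:80):

// Sketch of a client call: POST raw HTML to /print, follow the redirect to
// /pdf/<id>.pdf and save the result (assumes Node 18+ global fetch).
const fs = require("fs");

async function printHtml(html) {
  const res = await fetch("http://localhost:80/print", {
    method: "POST",
    headers: { "Content-Type": "text/plain" },
    body: html,
  });
  fs.writeFileSync("out.pdf", Buffer.from(await res.arrayBuffer()));
}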
init.js:
const yargs = require("yargs");
const createBrowser = require("./browser");
const createServer = require("./server");

(async function init() {
  const { chromeHost, chromePort, serverPort } = yargs
    .string("chromeHost")
    .number("chromePort")
    .number("serverPort")
    .default("chromeHost", "127.0.0.1")
    .default("chromePort", 9222)
    .default("serverPort", 80)
    .argv;

  const browser = await createBrowser({ host: chromeHost, port: chromePort });
  const server = await createServer({ browser, port: serverPort });

  process.on("beforeExit", () => {
    browser.close();
    server.close();
  });
})();
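One detail, since this runs in Docker: "beforeExit" is not emitted when the process is killed by a signal (docker stop sends SIGTERM), so a cleanup sketch with explicit signal handlers would look like this (same browser/server variables as in init.js):

// Sketch: explicit cleanup on container stop; "beforeExit" does not fire
// when the process receives SIGTERM/SIGINT.
const shutdown = async () => {
  await browser.close();
  server.close();
  process.exit(0);
};
process.on("SIGTERM", shutdown);
process.on("SIGINT", shutdown);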
For a PDF of ~650 pages, generation took 3 minutes (HTML input: 14.5 MB; setContent took 30 s, page.pdf took 2 min 30 s). I need to generate PDFs with up to 2000 pages, but Puppeteer/Chromium hangs. I also need to improve performance for the 650-page case; 3 minutes is too long. Everything runs in Docker.
Which memory/CPU settings should be tuned for better performance? It can run in the cloud (currently it runs on a local machine or a VPS).
I saw Converting HTML to PDF for large files using Google Puppeteer, but it offers no solution for speeding up PDF generation.