13

I am looking to use the Chrome headless browser to replicate the 'Save As' functionality, i.e. to save the complete web page together with all of its resources. Is there a way to do this? I have already tried the --print-to-pdf and --screenshot options, and am now looking to explore a 'Save As' option as well.

uday kiran
  • 299
  • 1
  • 2
  • 9

1 Answer

1

This is totally possible, but it isn't easy. You've got to do the heavy lifting yourself. This means:

  1. Save all linked assets to a local directory.
  2. Rewrite all links to those assets to be relative.
  3. Save the rewritten HTML file to the same local directory.

Here's an example using Playwright. (Note that this code is cut from an existing project and cleaned up for this answer. It may not work perfectly.)

const { webkit } = require('playwright');
const { parse } = require('node-html-parser');
const fs = require('fs-extra');
const path = require('path');
const url = require('url');

/**
 * Map a raw `src`/`href` value to the URL to download and the place to store it.
 *
 * @param {string} rawUrl - Attribute value exactly as found in the HTML.
 * @param {string} baseUrl - Absolute URL that relative values resolve against.
 * @returns {{absoluteUrl: string, relativePath: string, fragment: string} | null}
 *   `absoluteUrl` is the fragment-less URL to fetch, `relativePath` is the
 *   '/'-separated path below the output directory, `fragment` is the original
 *   `#...` suffix (or ''). Returns null when the value cannot be saved.
 */
function resolveAssetLocation(rawUrl, baseUrl) {
  // Empty values and same-page fragment links have nothing to download.
  if (!rawUrl || rawUrl.startsWith('#')) return null;

  let resolved;
  try {
    resolved = new URL(rawUrl, baseUrl);
  } catch {
    // Malformed URL: leave the attribute untouched.
    return null;
  }

  // Only HTTP(S) resources can be fetched; this skips data:, javascript:,
  // mailto:, blob: and similar schemes.
  if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return null;

  // A path ending in '/' names a directory, which cannot be written as a file.
  const pathname = resolved.pathname.endsWith('/')
    ? `${resolved.pathname}index.html`
    : resolved.pathname;

  // The fragment is irrelevant for downloading (and for de-duplication) but
  // must survive in the rewritten link.
  const fragment = resolved.hash;
  resolved.hash = '';

  return {
    absoluteUrl: resolved.href,
    // POSIX join keeps '/' separators so the value is a valid relative URL on
    // every platform. The query string is not part of the path.
    relativePath: path.posix.join(resolved.host, pathname),
    fragment,
  };
}

/**
 * Save a web page and the resources it references to a local directory.
 *
 * The rendered HTML is written to `<outputDir>/index.html`. Every element with
 * a `src` or `href` attribute (this includes anchors) has the attribute
 * rewritten to a relative path of the form `<host>/<pathname>`, and the
 * referenced resource is downloaded to that path below `outputDir`.
 * Failures for individual assets are logged and do not abort the save.
 *
 * @param {string} urlToSave - Absolute URL of the page to save.
 * @param {string} outputDir - Directory that receives index.html and the assets.
 * @returns {Promise<void>} Resolves once the HTML and all assets are processed.
 */
async function saveWebpage(urlToSave, outputDir) {
  const browser = await webkit.launch();

  // try/finally guarantees the browser process is shut down even when
  // navigation or a file write throws.
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(urlToSave);
    const parsedHtml = parse(await page.content());

    // Relative links resolve against the final (post-redirect) page URL, or
    // against <base href> when the document declares one.
    const pageUrl = page.url();
    let baseUrl = pageUrl;
    const baseTag = parsedHtml.querySelector('base');
    const baseHref = baseTag?.getAttribute('href');
    if (baseHref) {
      try {
        baseUrl = new URL(baseHref, pageUrl).href;
      } catch (error) {
        console.error(`Ignoring invalid <base href>: ${baseHref} - ${error.message}`);
      }
      // Drop the base href so the rewritten relative links resolve against
      // the saved file; this also keeps <base> out of the href pass below.
      baseTag.removeAttribute('href');
    }

    const seenUrls = new Set();
    const downloads = [];

    // Download one asset and write it to disk. Never rejects: failures are
    // logged so a single broken asset does not abort the whole save.
    async function downloadAsset(absoluteUrl, localPath) {
      try {
        // The context's request API shares cookies with the page and avoids
        // opening (and leaking) a browser page per asset.
        const response = await context.request.get(absoluteUrl);
        if (!response.ok()) {
          console.error(`Error fetching asset: ${absoluteUrl} - HTTP ${response.status()}`);
          return;
        }
        // Playwright responses expose body(); buffer() is Puppeteer's API.
        await fs.outputFile(localPath, await response.body());
      } catch (error) {
        console.error(`Error fetching asset: ${absoluteUrl} - ${error.message}`);
      }
    }

    // Rewrite 'src' and 'href' attributes and queue the assets for download.
    for (const attributeName of ['src', 'href']) {
      for (const element of parsedHtml.querySelectorAll(`[${attributeName}]`)) {
        const asset = resolveAssetLocation(element.getAttribute(attributeName), baseUrl);
        if (!asset) continue;

        element.setAttribute(attributeName, `${asset.relativePath}${asset.fragment}`);

        if (!seenUrls.has(asset.absoluteUrl)) {
          seenUrls.add(asset.absoluteUrl);
          downloads.push(
            downloadAsset(asset.absoluteUrl, path.join(outputDir, asset.relativePath))
          );
        }
      }
    }

    // Wait for the downloads before anything below can throw, so no request
    // is still in flight when the browser is closed.
    await Promise.allSettled(downloads);

    // Save the updated HTML content.
    await fs.outputFile(path.join(outputDir, 'index.html'), parsedHtml.toString());
  } finally {
    await browser.close();
  }
}

// Example invocation: save the page at `urlToSave` into `outputDir`.
const urlToSave = 'https://example.com/';
const outputDir = 'saved-website';

saveWebpage(urlToSave, outputDir).catch((error) => {
  console.error('Error:', error);
  // Report the failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});
BrowserCat
  • 26
  • 3