0

I'm trying to scrape a website and obtain information that is only visible in the requests shown in the browser's Network tab.

I found two situations:

  1. I can't get the routes at runtime, because page.tracing saves all the information to a file, and even after the file has been generated I can't read it while the program is running. If I use another approach, like page.on('request', ...), I don't get the route I want. Apparently not all routes are captured.

  2. When I try to run the program with the browser with headless: true, I apparently get an error: TimeoutError: waiting for target failed: timeout 30000ms exceeded.

Below I will leave my example code:

import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";

/**
 * Pauses for a while; used between UI interactions.
 *
 * @param time Delay in milliseconds. When omitted (or null), a random whole
 *   number of milliseconds from 1000 to 3999 is used instead.
 * @returns A promise that resolves (with no value) once the delay has elapsed.
 */
function holdOn(time?: number) {
  const delayMs = time ?? Math.floor(Math.random() * 3000 + 1000);
  return new Promise((resolve) => {
    setTimeout(resolve, delayMs);
  });
}

/**
 * Logs in to pixbet.com in a Puppeteer browser (with the stealth plugin
 * registered), opens the live Spaceman game page, and prints the method and
 * URL of every request the page reports. A DevTools trace of the session is
 * written to ./tracing.json.
 *
 * Changes compared with the previous version:
 *  - The request listener is attached before the first navigation, so requests
 *    made during page load and login are logged too. Interception is no longer
 *    enabled because the listener only observes requests.
 *  - The user agent is passed as a well-formed `--user-agent=` switch and the
 *    Accept-Language header is set through the page API; the earlier
 *    `accept-language:...` / `--user-agent:...` strings were not valid switches.
 *  - The login click and the navigation wait start together to avoid a race.
 *  - The browser is always closed, even when a step throws.
 *
 * @returns A promise that resolves once the browser has been closed.
 * @throws If the login button cannot be found, or if any Puppeteer step fails.
 */
async function crawler() {
  puppeteer.use(StealthPlugin());

  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: null,
    ignoreHTTPSErrors: true,
    args: [
      "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    ],
    ignoreDefaultArgs: [
      "--disable-extensions",
      "--disable-default-apps",
      "--disable-component-extensions-with-background-pages",
    ],
  });

  try {
    const [page] = await browser.pages();

    await page.setExtraHTTPHeaders({ "Accept-Language": "en-US,en;q=0.9" });

    // Attach the listener before any navigation so nothing issued during the
    // initial load or the login flow is missed.
    page.on("request", (request) => {
      console.log(">>", request.method(), request.url());
    });

    await page.tracing.start({
      screenshots: true,
      categories: ["devtools.timeline"],
      path: "./tracing.json",
    });

    // 0 disables the navigation timeout entirely.
    page.setDefaultNavigationTimeout(0);

    await page.goto("http://pixbet.com/", { waitUntil: "networkidle0" });

    await page.waitForSelector(".reg_login_btn_area");

    const element = await page.$(".btn_general");
    if (!element) {
      throw new Error('Login button ".btn_general" was not found on the page');
    }
    await element.click();

    await page.waitForSelector("div#fe_login_box_popup");

    await holdOn();

    await page.focus('input[name="username"]');
    await page.keyboard.type("user_teste_sample", { delay: 40 });

    await holdOn();

    await page.focus('input[name="password"]');
    await page.keyboard.type("P4$$W0RD_S4MPL3", { delay: 100 });

    // Start waiting for the navigation before clicking; otherwise the
    // navigation can finish before waitForNavigation() begins listening.
    await Promise.all([
      page.waitForNavigation({ waitUntil: "networkidle0" }),
      page.click("div.fhtxt > button"),
    ]);

    await page.goto("https://pixbet.com/casino/game/35423-live-spaceman", {
      waitUntil: "networkidle0",
      timeout: 0,
    });

    await page.tracing.stop();

    console.log("Finish");
  } finally {
    // Closing the browser also closes its pages.
    await browser.close();
  }
}

// Entry point: run the crawler and report any failure instead of leaving the
// returned promise without a rejection handler.
crawler().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
ggorlen
  • 44,755
  • 7
  • 76
  • 106
Alisson Boucinhas
  • 181
  • 2
  • 2
  • 11
  • What data are you trying to get, specifically? Often, there's an easier way to get it than you assume – ggorlen Apr 12 '23 at 22:29
  • @ggorlen I'm trying to get a JSESSIONID used by some requests. – Alisson Boucinhas Apr 12 '23 at 23:32
  • Thanks, but that's not really clear enough for me to write an answer. Which JSESSIONID specifically (exact value, please)? Also, I suggest explaining [why you want to do this in the first place](https://meta.stackexchange.com/a/233676/399876). – ggorlen Apr 12 '23 at 23:34

1 Answer

0

You can use something like this to listen to HTTP responses (see "Intercept a certain request and get its response (puppeteer)") and then extract the cookie value from the response headers when it is set:

/**
 * Logs the Set-Cookie header of a response when it mentions JSESSIONID.
 *
 * @param {{ headers: () => Record<string, string> }} response - Object whose
 *   headers() method returns the response headers keyed by header name.
 * @returns {void}
 */
function doSomething(response) {
    const headers = response.headers();
    // Puppeteer reports header names in lower case, so "Set-Cookie" would
    // never match.
    const cookie = headers["set-cookie"];
    if (cookie && cookie.includes("JSESSIONID")) {
        console.log("cookie: " + cookie);
    }
}

// Hand every response reported for this page to doSomething.
const forwardResponse = async (response) => {
    doSomething(response);
};
page.on('response', forwardResponse);

Or you could listen to all requests and extract the cookies being used:

// Logs the Cookie header of each outgoing request that carries a JSESSIONID,
// then lets the request proceed. request.continue() only works when request
// interception has been enabled on the page.
// NOTE(review): the browser may attach cookies after Puppeteer has captured
// the request headers, in which case no "cookie" entry appears here — confirm
// against the target site, or read the cookie jar via the page API instead.
page.on('request', async (request) => {
    const headers = request.headers();
    // Puppeteer reports header names in lower case, so "Cookie" would never match.
    const cookie = headers["cookie"];
    if (cookie && cookie.includes("JSESSIONID")) {
        console.log("cookie: " + cookie);
    }

    // Another listener may already have continued/aborted/responded to this
    // request; resolving it a second time throws.
    if (request.isInterceptResolutionHandled()) {
        return;
    }

    try {
        await request.continue();
    } catch (error) {
        // Report the failure here: the event emitter ignores the promise this
        // handler returns, so a rejection would otherwise go unhandled.
        console.error("Failed to continue request " + request.url(), error);
    }
});
lezhumain
  • 387
  • 2
  • 8