0

I'm experimenting with Pyppeteer and am unsure why running with headless=True results in bot detection, while running with headless=False doesn't.

Here is my code (please note that the --proxy-server argument will need to be removed or replaced with your own proxy server):

I'm assuming there is some setting I need to tweak when running in headless mode?

import glob
import re
import datetime
import time
import random
import logging
import requests_html
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
import lxml
import asyncio
import pandas as pd
from pyppeteer import launch
import pyppeteer


# URL template for a transcript listing page; the trailing `{}` placeholder is
# filled in with a number via str.format().
S_ALPHA_URL = (
    'https://seekingalpha.com'
    '/earnings/earnings-call-transcripts/{}'
)


async def make_request(url, *, headless=False, attempts=3):
    """Load ``url`` in a pyppeteer-driven browser page and return its HTML.

    A fresh browser is launched for every attempt and is always closed
    again, whether the attempt succeeds or fails.

    Args:
        url: Address of the page to load.
        headless: Passed through to ``launch``; ``False`` (the default)
            opens a visible browser window.
        attempts: Maximum number of launch/load attempts before giving up.

    Returns:
        The page content as returned by ``page.content()``, or ``None`` if
        every attempt failed with a ``PageError`` or ``TimeoutError``.

    Raises:
        Any exception other than ``pyppeteer.errors.PageError`` and
        ``pyppeteer.errors.TimeoutError`` propagates to the caller.
    """
    args = ['--proxy-server=xx.xxxx.']  # xxx replaced by your proxy server
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    )

    for _ in range(attempts):
        # Bind the name before the try block: if launch() itself raises,
        # the finally clause must not fail on an unbound `browser` and
        # mask the original error.
        browser = None
        try:
            browser = await launch(headless=headless, args=args)

            page = await browser.newPage()
            await page.setUserAgent(user_agent)
            await page.goto(url, {'waitUntil': 'domcontentloaded'})
            await page.reload()
            content = await page.content()

            print('return')

            return content

        except (pyppeteer.errors.PageError, pyppeteer.errors.TimeoutError):
            # Retry with a newly launched browser on the next iteration.
            continue

        finally:
            if browser is not None:
                await browser.close()

    # All attempts failed with a retryable error.
    return None
    

def sa_test():
    """Fetch two randomly numbered transcript pages and print the result.

    For each of two iterations a number between 1 and 100 is substituted
    into ``S_ALPHA_URL``, the page is fetched with ``make_request``, and the
    parsed document plus the elapsed wall-clock time are printed.
    """
    # NOTE(review): BeautifulSoup is not imported anywhere in the visible
    # file; presumably `from bs4 import BeautifulSoup` is required -- confirm.
    loop = asyncio.get_event_loop()

    for _ in range(2):

        start = time.time()

        num = random.randint(1, 100)
        url = S_ALPHA_URL.format(num)
        content = loop.run_until_complete(make_request(url))

        if content is None:
            # make_request returns None when every attempt failed; do not
            # hand that to the HTML parser.
            print('request failed for', url)
        else:
            soup = BeautifulSoup(content, 'html.parser')
            print(soup)

        print('time taken:', time.time() - start)

    
# Run the test only when this file is executed as a script, not on import.
# The comparison must use `==`; a single `=` here is a SyntaxError.
if __name__ == '__main__':

    sa_test()
Nuno André
  • 4,739
  • 1
  • 33
  • 46
MasayoMusic
  • 594
  • 1
  • 6
  • 24

1 Answer

0

Use the pyppeteer_stealth package and apply it to the page before setting the user agent:

from pyppeteer_stealth import stealth

# Fragment meant to run inside an async function: `browser` and `agent` are
# not defined here and must be supplied by the surrounding code
# (presumably a launched pyppeteer browser and a user-agent string -- confirm).
# The page is created from a separate incognito browser context.
context = await browser.createIncognitoBrowserContext()
page = await context.newPage()
# Apply pyppeteer_stealth to the page, then set the user agent.
# NOTE(review): presumably this has to happen before navigating -- confirm
# against the pyppeteer_stealth documentation.
await stealth(page)
await page.setUserAgent(agent)
TheTechRobo the Nerd
  • 1,249
  • 15
  • 28