I am trying to scrape this website items, however when I used httpx or even requests sometimes it passes and gets the response sometimes it doesn't. It seems random, that's why I tried doing a rerun of the failed items to get the results. However this does not seem to work 100% of all the time. Is there something that I am not doing right? Can somebody help?
Below is a sample of the web I am scraping, if it does not show any errors you can loop it to scrape the same thing twice.
Here's my current code:
urllist=['https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-16912/_summary?defaultItemSku=RAM-70107-16912-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-04124/_summary?defaultItemSku=RAM-70107-04124-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-19598/_summary?defaultItemSku=RAM-70107-19598-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-01115/_summary?defaultItemSku=RAM-70107-01115-00001&pickupPointCode=PP-3206709&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-02620/_summary?defaultItemSku=RAM-70107-02620-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-17552/_summary?defaultItemSku=RAM-70107-17552-00001&pickupPointCode=PP-3023611&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-08377/_summary?defaultItemSku=RAM-70107-08377-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-03274/_summary?defaultItemSku=RAM-70107-03274-00001&pickupPointCode=PP-3069769&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-04345/_summary?defaultItemSku=RAM-70107-04345-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-01030/_summary?defaultItemSku=RAM-70107-01030-00001&pickupPointCode=PP-3239816&cnc=false',
'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-01534/_summary?defaultItemSku=RAM-70107-01534-00001&pickupPointCode=PP-3239816&cnc=false']
rancherrors=pd.DataFrame()
ranchdf=pd.DataFrame()
for url in urllist:
try:
ua = UserAgent()
USER_AGENT = ua.random
headers={
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
"cache-control": "max-age=0",
"sec-ch-ua": 'Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99',
"sec-ch-ua-platform": "Windows",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
'X-Forwarded-For': f'{random_ip}',
"user-agent" : f"{USER_AGENT}",
'referer':'https://www.blibli.com/'
}
response =httpx.get(url,headers=headers)
try:
price=str(response.json()['data']['price']['listed']).replace(".0","")
discount=str(response.json()['data']['price']['totalDiscount'])
except:
price="0"
discount="0"
try:
unit=str(response.json()['data']['uniqueSellingPoint']).replace("• ","")
except:
unit=""
dat={
'product_name':response.json()['data']['name'],
'normal_price':price,
'discount':discount,
'competitor_id':response.json()['data']['ean'],
'url':input,
'unit':unit,
'astro_id':id,
'date_key':today,
'web':'ranch market'
}
dat=pd.DataFrame([dat])
ranchdf=ranchdf.append(dat)
sleep(randint(2,5))
except Exception as e:
datum={
'id':id,
'url':url
}
datum=pd.DataFrame([datum])
rancherrors=rancherrors.append(datum)
print(f'{url} error {e}')