Storing requests locally and restoring them as Beautifoul Soup object latter on
If you are iterating through pages of web site you can store each page with request
explained here.
Create folder soupCategory
in same folder where your script is.
Use any latest user agent for headers
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15'}
def getCategorySoup():
session = requests.Session()
retry = Retry(connect=7, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
basic_url = "https://www.somescrappingdomain.com/apartments?adsWithImages=1&page="
t0 = time.time()
j=0
totalPages = 1525 # put your number of pages here
for i in range(1,totalPages):
url = basic_url+str(i)
r = requests.get(url, headers=headers)
pageName = "./soupCategory/"+str(i)+".html"
with open(pageName, mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
f.write(r.text)
print (pageName, end=" ")
t1 = time.time()
total = t1-t0
print ("Total time for getting ",totalPages," category pages is ", round(total), " seconds")
return
Latter on you can create Beautifoul Soup object as @merlin2011 mentioned with:
with open("/soupCategory/1.html") as f:
soup = BeautifulSoup(f)