I am working on a project where I have to scrape as many URLs as possible (listed in a file in an S3 bucket) in a limited time and store the results in a searchable database. Right now I am having an issue while scraping web pages inside AWS Lambda. The function that does this work takes only 7-8 seconds to execute and produce the desired results when run in a Google Colab environment, but the same function deployed as a Lambda takes almost 10x longer to execute. Here is my code; a rough sketch of how I time both runs follows it.
import requests
import re
import validators
import boto3
from smart_open import open
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
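# /tmp is the only writable path inside Lambda, so the NLTK stopwords corpus is
# downloaded there when the module loads (i.e., on every cold start)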
nltk.data.path.append("/tmp")
nltk.download("stopwords", download_dir = "/tmp")
def CrawlingLambda(event, context):
    """
    This lambda crawls a list of web pages, reading the URLs from a file in an S3 bucket,
    and returns a dictionary pairing each URL with the keywords extracted from its page.
    Args:
        event: The event object passed to the Lambda invocation (unused)
        context: The Lambda runtime context object (unused)
    Returns:
        dict: A mapping from each valid URL to the tuple of keywords scraped from it
    """
    results = {}
    client = boto3.client('s3')
    for line in open('s3://urls-to-monitor/URLs1T.txt', transport_params={'client': client}):
        # Trim the trailing characters (line ending) before validating the URL
        if line[len(line) - 1] != '/':
            url = line[:len(line) - 2]
        else:
            url = line
        if not validation(url):
            continue
        try:
            web_content = scrape_web(url)
            results[url] = web_content
        except Exception:
            continue
    return results
def validation(url):
    """
    Validates the URL string. This method uses regular expressions under the hood.
    Args:
        url: URL to validate
    Returns:
        bool: True if the passed string is a valid URL and False otherwise.
    """
    return validators.url(url)
def scrape_web(url):
    """
    This function scrapes a given URL's web page for a specific set of keywords.
    Args:
        url: Page's URL to be scraped
    Returns:
        filtered_words: A refined tuple of extracted words from the web page.
    """
    try:
        res = requests.get(url, timeout=2)
    except requests.RequestException:
        raise ValueError
    if res.status_code != 200:
        raise ValueError
    html_page = res.content
    soup = remove_tags(html_page)
    content = soup.get_text()
    words = re.split(r"\s+|/", content.lower())
    filtered_words = clean_wordlist(words)
    return tuple(filtered_words)
def remove_tags(html):
    """
    Removes the specified tags from the HTML response received from requests.get().
    Args:
        html: HTML response of the web page
    Returns:
        soup: Parsed response of HTML
    """
    # Parse the HTML content
    soup = BeautifulSoup(html, "html.parser")
    # Remove style, script and noscript tags along with their content
    for data in soup(['style', 'script', 'noscript']):
        data.decompose()
    # Return the parsed tree, keeping only the visible tag content
    return soup
def clean_wordlist(wordlist):
    """
    This function removes punctuation marks and stop words from the extracted wordlist.
    Args:
        wordlist: A list of raw words extracted from the HTML response of the web page.
    Returns:
        key_words: A filtered list of words containing only the key words.
    """
    words_without_symbol = []
    for word in wordlist:
        # Symbols to strip out of each word
        symbols = "!@#$%^&*()_-+={[}]|\\;:\"<>?/., "
        for i in range(len(symbols)):
            word = word.replace(symbols[i], '')
        if len(word) > 0:
            words_without_symbol.append(word)
    # Ignore the stop words
    key_words = [word for word in words_without_symbol if word not in stopwords.words()]
    return key_words
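For reference, the 7-8 second figure comes from calling the handler directly in the Colab notebook, roughly like this (a simplified sketch with placeholder event/context arguments):

import time

start = time.perf_counter()
results = CrawlingLambda({}, None)
print(f"Scraped {len(results)} URLs in {time.perf_counter() - start:.1f} s")

The deployed Lambda is invoked for comparison roughly as below; the function name and region are placeholders, and the Lambda's own duration also shows up in the CloudWatch REPORT line:

import json
import time
import boto3

lambda_client = boto3.client('lambda', region_name='us-east-1')

start = time.perf_counter()
response = lambda_client.invoke(
    FunctionName='crawling-lambda',    # placeholder function name
    InvocationType='RequestResponse',  # wait for the function to finish
    Payload=json.dumps({}),
)
print(f"Lambda returned in {time.perf_counter() - start:.1f} s")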
Any directions on why there is such a large time difference, and how I can reduce it, would be appreciated.