I wrote Python code to summarize news articles, and it works both on my laptop and when I run it inside a Docker image I built for it.
try:
    from bs4 import BeautifulSoup
    import requests
    from urllib.request import urlopen
    import base64
    import re
    from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
    import json
except Exception as e:
    print("Import error: {}".format(e))
def lambda_handler(event=None, context=None):
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Fetch the Google News RSS feed and parse it as XML
    url = 'https://news.google.com/news/rss'
    client = urlopen(url)
    xml_page = client.read()
    client.close()
    soup = BeautifulSoup(xml_page, 'xml')
    contents = soup.find_all("item")

    # Collect links, headlines, and publication dates, skipping YouTube sources
    encoded_links = []
    headlines = []
    dates = []
    for news in contents:
        if "youtube.com" in str(news.source):
            continue
        encoded_links.append(news.link.text)
        headlines.append(news.title.text)
        dates.append(news.pubDate.text)

    # Keep only the first 15 items
    encoded_links = encoded_links[:15]
    headlines = headlines[:15]
    dates = dates[:15]
    # Google News wraps the real article URL in a base64-encoded token;
    # strip the wrapper characters and pad until the token decodes cleanly
    decoded_links = []
    for link in encoded_links:
        coded = link[44:-5]
        while True:
            try:
                base64.b64decode(coded)
                break
            except ValueError:
                coded += "a"
        url = str(base64.b64decode(coded))
        # Pull the first http(s) URL out of the decoded bytes and trim trailing junk
        strip1 = re.search(r"(?P<url>https?://[^\s]+)", url).group("url")
        strip2 = strip1.split('$', 1)[0]
        strip3 = strip2.split('\\', 1)[0]
        decoded_links.append(strip3)
    # Load the BART summarization model and tokenizer
    summarized_texts = []
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    for link in decoded_links:
        try:
            new_page = requests.get(link, headers=headers)
        except requests.RequestException:
            continue

        # Concatenate all paragraph text from the article page
        new_soup = BeautifulSoup(new_page.text, 'lxml')
        text = ""
        paragraphs = new_soup.find_all("p")
        for p in paragraphs:
            text += p.text

        # Summarize the article text (truncated to BART's 1024-token input limit)
        inputs = tokenizer.batch_encode_plus([text], return_tensors='pt', truncation=True, max_length=1024)
        summary_ids = model.generate(inputs['input_ids'], early_stopping=True)
        bart_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summarized_texts.append(bart_summary)
        print(bart_summary)

    print("Success,", len(summarized_texts), "summaries created.")
    returned = [{'headline': title, 'date': date, 'summary': summarized}
                for title, date, summarized in zip(headlines, dates, summarized_texts)]
    json_summaries = json.dumps(returned)
    return json_summaries
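For reference, this is roughly how I test it on my laptop: a minimal sketch at the bottom of app.py that just calls the handler directly (the __main__ guard only matters outside Lambda):

if __name__ == "__main__":
    # Parse the JSON string the handler returns and print one entry per article
    for item in json.loads(lambda_handler()):
        print(item['headline'], '|', item['date'])
        print(item['summary'])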
My Dockerfile is as follows:
FROM public.ecr.aws/lambda/python:3.9
COPY requirements.txt ./
RUN pip3 install -r requirements.txt
RUN python -m nltk.downloader punkt
COPY app.py ./
CMD ["app.lambda_handler"]
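I also verified the container locally before deploying. A minimal sketch of how I invoke it, assuming the image is already running via docker run -p 9000:8080 <image> so the Runtime Interface Emulator bundled in the base image is listening on port 9000:

# Local-invoke sketch; assumes the container was started with
#   docker run -p 9000:8080 <image>
import requests

resp = requests.post(
    "http://localhost:9000/2015-03-31/functions/function/invocations",
    json={},  # empty event; the handler ignores it anyway
)
print(resp.status_code)
print(resp.text[:300])  # first part of the returned JSON string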
But when I push the Docker image to AWS ECR and use it in Lambda, I get the following error:
[ERROR] OSError: [Errno 30] Read-only file system: '/home/sbx_user1051'
Traceback (most recent call last):
File "/var/task/app.py", line 58, in lambda_handler
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
File "/var/lang/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 1744, in from_pretrained
resolved_vocab_files[file_id] = cached_path(
File "/var/lang/lib/python3.9/site-packages/transformers/utils/hub.py", line 284, in cached_path
output_path = get_from_cache(
File "/var/lang/lib/python3.9/site-packages/transformers/utils/hub.py", line 486, in get_from_cache
os.makedirs(cache_dir, exist_ok=True)
File "/var/lang/lib/python3.9/os.py", line 215, in makedirs
makedirs(head, exist_ok=exist_ok)
File "/var/lang/lib/python3.9/os.py", line 215, in makedirs
makedirs(head, exist_ok=exist_ok)
File "/var/lang/lib/python3.9/os.py", line 215, in makedirs
makedirs(head, exist_ok=exist_ok)
File "/var/lang/lib/python3.9/os.py", line 225, in makedirs
mkdir(name, mode)
I can't figure out what is causing this error. I can see that it comes from the Transformers library, but since the code ran fine in the Docker image locally, I don't think the issue is due to missing libraries or files.
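From the traceback it looks like Transformers is trying to create its download cache under my home directory, which Lambda apparently mounts read-only (only /tmp is writable). A quick check of where the default cache resolves; the import path of this constant seems to vary across transformers versions, so this is just a sketch:

import os
from transformers.utils.hub import TRANSFORMERS_CACHE  # location of this constant varies by version

# By default the Hugging Face cache lives under the home directory,
# which in the Lambda sandbox is /home/sbx_user... and is not writable.
print(os.path.expanduser("~"))
print(TRANSFORMERS_CACHE)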