I am trying to use Luigi to build a small scraping pipeline and I'm using Pillow to save the images from the pages I scrape. However, I'm struggling with the output when I try to save each image in loop (e.g. I want to save img_1, img_2, img_3, etc. in the output folder). I tried to pass an "image_id" parameter within the output function but it doesn't work and I can't figure out how to accomplish this.
class DownloadImages(luigi.Task):
def requires(self):
pass # taking out dependencies for this example
def output(self, image_id):
return luigi.LocalTarget(f"img/img_{image_id}.jpeg")
def run(self):
resp = requests.get("https://my-site.com")
soup = BeautifulSoup(resp.content, "html.parser")
images_list = soup.select("img")
for image_id in range(len(images_list)):
image_url = images_list[image_id]["src"]
img = Image.open(requests.get(image_url, stream=True).raw)
img.save(self.output(image_id).path)