Scraped images are corrupt

Question

Hi, I am trying to scrape the front-page images on digg.com with the following code. The issue is that 0.jpg to 6.jpg are normal, but 7.jpg to 47.jpg are corrupt. I am not sure why.

Here is the code (also on GitHub: https://github.com/kenpeter/py_mm):

# os
import os
# http request
import requests
#
import pprint

import time

# import html from lxml
from lxml import html

# Module-level helpers: a page-number placeholder (not read by the functions
# shown here) and a pretty-printer, presumably kept around for debug output.
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)

# write to file
def download_image(img_urls):
    """Download every URL in *img_urls* into ``img/<index>.jpg``.

    Files are numbered by their position in the list, starting at 0, and
    the ``img`` directory is created on demand.  Progress is printed for
    each file.  The response body is written as-is, whatever it contains.
    """
    total = len(img_urls)

    for position, url in enumerate(img_urls):
        target = 'img/%s.jpg' % (position)
        # Make sure the destination directory exists before writing.
        os.makedirs(os.path.dirname(target), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % target)
        print('Downloading: %s out of %s' % (position, total))

        # The file is opened first, then the body is fetched and written.
        with open(target, 'wb') as out_file:
            payload = requests.get(url).content
            out_file.write(payload)


def get_page_number(num):
    """Scrape the digg.com front page and download its thumbnail images.

    *num* is accepted but not used; the front page is always fetched.
    Image URLs are taken from the ``src`` attribute of each thumbnail
    ``<img>`` only.  Returns the list of URLs that were handed to
    ``download_image``.
    """
    page_bytes = requests.get('http://digg.com').content
    tree = html.fromstring(page_bytes)

    thumb_srcs = tree.xpath(
        "//div[@class='digg-story__image--thumb']/a/img/@src")

    # Story descriptions are extracted as well, but nothing consumes them yet.
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumb_srcs)
    return thumb_srcs


if __name__ == '__main__':
    # TODO: read the page number from user input instead of hard-coding it.
    page_number = 4
    get_page_number(page_number)

Claudio · Accepted Answer · 2017-04-21T04:36:22.667

The reason why the images are "corrupt" is that the markup changes partway down the page: from that point on, the real image URL is "hidden" in the data-src attribute (the images are lazy-loaded), while the src attribute, which is what your code reads, contains only a placeholder image. Here is an example from the source of the fetched page showing both attributes:

<img
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
width="312"
height="170"
alt=""
/>

In other words, you have to check both attributes, src and data-src, giving data-src priority over src when building the list of image URLs.

The following code does the trick and downloads the proper images:

# os
import os
# http request
import requests
#
import pprint

import time

# import html from lxml
from lxml import html

# Module-level helpers: a page-number placeholder (not read by the functions
# shown here) and a pretty-printer, presumably kept around for debug output.
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)

# write to file
def download_image(img_urls, timeout=30):
    """Download each URL in *img_urls* to ``img/<index>.jpg``.

    Args:
        img_urls: Sequence of image URLs; the position in the sequence
            (starting at 0) becomes the file name.
        timeout: Seconds to wait for the server before giving up on a
            request.

    Raises:
        requests.exceptions.RequestException: If a request cannot be
            completed (connection failure, timeout, ...).

    A URL whose response is not successful (HTTP status >= 400) is reported
    and skipped, so an error page is never saved under an image name.
    """
    if not img_urls:
        # Nothing to download; do not even create the target directory.
        return

    amount = len(img_urls)
    # The target directory is the same for every file, so create it once.
    os.makedirs('img', exist_ok=True)

    for index, url in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file: a failed request must not leave an
        # empty or truncated file behind.
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            print('Skipping %s: HTTP %s' % (url, response.status_code))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Scrape the digg.com front page and download its thumbnail images.

    Lazily loaded thumbnails carry their real URL in ``data-src`` while
    ``src`` holds a placeholder, so ``data-src`` is preferred whenever it
    is present and ``src`` is used only as a fallback.

    Args:
        num: Page number. Currently unused; the front page is always fetched.

    Returns:
        The list of image URLs, in page order, that were passed to
        ``download_image``.
    """
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    # Resolve the URL per <img> element instead of concatenating separate
    # ``src`` and ``data-src`` lists: this keeps the images in page order,
    # never yields two URLs for one image, and does not depend on the
    # placeholder's file name.
    images = selector.xpath("//div[@class='digg-story__image--thumb']/a/img")
    candidates = (img.get('data-src') or img.get('src') for img in images)
    img_urls = [img_url for img_url in candidates if img_url]

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # TODO: read the page number from user input instead of hard-coding it.
    page_number = 4
    get_page_number(page_number)

Scraped images are corrupt

1 Answer

Linked