Playwright Python: Get only selected attribute urls

Question

import asyncio
from playwright.async_api import Playwright, async_playwright, expect

#Get images urls
#Output: img_urls.csv (consists of instances of {"property_id": str, "img_urls": []})
async def run(playwright):
    """Open a Zoopla listing, advance its gallery and print every <img> src.

    Args:
        playwright: The object yielded by ``async_playwright()``; it is used
            to launch a headed Chromium browser.

    The browser context and the browser are closed in ``finally`` blocks so
    they are released even when navigation or a click raises.
    """
    browser = await playwright.chromium.launch(headless=False)
    try:
        context = await browser.new_context()
        try:
            # Open new page
            page = await context.new_page()

            # Go to https://www.zoopla.co.uk/for-sale/details/49240624/
            await page.goto("https://www.zoopla.co.uk/for-sale/details/49240624/")

            # The cookie banner is rendered inside the privacy-manager frame,
            # so the button has to be located through that frame.
            await page.frame_locator("[aria-label=\"Privacy Manager window\\.\"]").locator("button:has-text(\"Accept all cookies\")").click()

            # Click "next image" five times; the locator is built once and reused.
            next_arrow = page.locator("[data-testid=\"arrow_right\"]")
            for _ in range(5):
                await next_arrow.click()

            # Fetch img urls: print the src of every <img> currently in the page.
            imgs = await page.query_selector_all("img")
            for img in imgs:
                src = await img.get_attribute("src")
                print(src)
        finally:
            await context.close()
    finally:
        await browser.close()
async def main() -> None:
    """Enter the ``async_playwright()`` context and pass its object to ``run``."""
    async with async_playwright() as pw:
        await run(pw)
if __name__ == "__main__":
    # Run only when executed as a script, so importing this file
    # does not launch a browser as a side effect.
    asyncio.run(main())

The above would return

https://lid.zoocdn.com/u/2400/1800/26d9845a91c7fe21834b531a292533dcf16f6754.jpg
https://lid.zoocdn.com/u/2400/1800/2267900ffd5e795f568bf1305a5eab0b95e59e5f.jpg
https://lid.zoocdn.com/u/2400/1800/75d0a22274ed94c1b33db52f5c5cc1022905df02.jpg
https://lid.zoocdn.com/u/2400/1800/d459c1d0ff8e7a1b52f667a48e593f72f91ea368.jpg
https://lid.zoocdn.com/u/2400/1800/0e21775e213c064fd83916b27e795536c816edb0.jpg
https://maps.googleapis.com/maps/api/staticmap?size=792x398&format=jpg&scale=2&center=51.535651,-0.006482&maptype=roadmap&zoom=15&channel=Lex-LDP&client=gme-zooplapropertygroup&sensor=false&markers=scale:2%7Cicon:https://r.zoocdn.com/assets/map-static-pin-purple-76.png%7C51.535651,-0.006482&signature=EZukT7ugiBKGYFT9F9phLleIXBs=
https://lid.zoocdn.com/u/2400/1800/55e734ae277ee03b2d46f55298acd762727bd727.gif
https://r.zoocdn.com/_next/static/images/natwest-dd532b27dc13112df4f05058c26a990a.svg
https://st.zoocdn.com/zoopla_static_agent_logo_(584439).png

Every time you click the right arrow to move on to the next image, a different number of URLs is printed to the console. How do you get only the URLs that end with ".jpg" (or simply contain ".jpg")? (I could check for ".jpg" in Python, but I want to know whether there is an official way to achieve this with Playwright Python.)

score 0 · Answer 1 · answered Sep 19 '22 at 11:24

0

Look for `src` values that end with the extension you want (`.jpg` in the question):

 #Fetch img urls
    imgs = await page.query_selector_all("img")
    for img in imgs:
        src = await img.get_attribute("src")
        if src.endswith('.png'):
            print(src)

answered Sep 19 '22 at 11:24

Tal Angel

1,301
3
29
63

Playwright Python: Get only selected attribute urls

1 Answers1