2

I'm trying to scrape the different values (answers to questions) from the following website: 'https://www.unpri.org/organisation/schroders-144205' — more specifically, from the report linked on that page: https://reporting.unpri.org/surveys/PRI-Reporting-Framework-2016/6a23ed84-6bbf-4416-9d0b-6c49f63bc9ac/79894dbc337a40828d895f9402aa63de/html/2/?lang=&a=1.

If a question is unanswered, I would like to add a blank space to the list; if it is answered, I would like to add the answer. I have tried many different approaches now, and will post the code here even though it is a freaking mess.

**So the question is: how can I scrape the answers to the questions on the report link and, if a question is not answered, add a blank element? All answers or blank elements should be added to a list.**

urls = ['https://www.unpri.org/organisation/schroders-144205']

# XPath of the link on the organisation page that opens the report.
REPORT_LINK_XPATH = '//*[@id="main-content"]/div[2]/div/div/div[2]/p/a'

# Lengths of the raw "org-type" text for which only the first and last
# character are trimmed; any other length uses the wider [40:-37] slice.
SHORT_ORG_TYPE_LENGTHS = {10, 11, 12, 13, 16}

CHECKED_RADIO_SRC = "/Style/img/checkedradio.png"
CHECKED_CHECKBOX_SRC = "/Style/img/checkedcheckbox.png"

# Classes of the <span> elements holding free-text, numeric and date answers,
# in the order their texts are collected.
RESPONSE_SPAN_CLASSES = ('n-text-p response', 'response number', 'response date')


def f7(seq):
    """Return the items of *seq* without duplicates, in first-seen order.

    Items must be hashable. Repeated values (including repeated empty
    strings) are kept only once.
    """
    return list(dict.fromkeys(seq))


def _clean_org_type(raw_text):
    """Trim the padding around the text of an ``org-type`` span.

    Texts whose length is in ``SHORT_ORG_TYPE_LENGTHS`` lose their first and
    last character; all other texts are cut down to ``[40:-37]``.
    """
    # NOTE(review): the pasted code replaced ' ' with ' ', which is a no-op;
    # presumably the first argument was a non-breaking space - confirm.
    text = raw_text.replace('\xa0', ' ')
    if len(text) in SHORT_ORG_TYPE_LENGTHS:
        return text[1:-1]
    return text[40:-37]


def _label_text(image):
    """Return the stripped text of the first <span> under *image*'s parent.

    Returns ``''`` when the parent contains no <span>.
    """
    label = image.parent.find("span")
    return label.get_text(strip=True) if label is not None else ""


def _checked_input_value(block):
    """Return ``data-original`` of the first checked <input> in *block*.

    Returns ``''`` when there is no checked input or the attribute is missing.
    """
    checked_inputs = block.select("input[checked='checked']")
    if not checked_inputs:
        return ""
    return checked_inputs[0].attrs.get("data-original", "")


def _first_checked_answers(blocks, image_src):
    """Collect one answer per <h3> heading found in each block.

    The answer of a block is the label of its first image whose ``src`` is
    *image_src*; without such an image it falls back to the value of the
    first checked <input>, and to ``''`` when neither exists.
    """
    answers = []
    for block in blocks:
        marker = block.find("img", src=image_src)
        if marker is not None:
            answer = _label_text(marker)
        else:
            answer = _checked_input_value(block)
        # The answer depends only on the block, so compute it once and
        # repeat it for every <h3> heading the block contains.
        answers.extend([answer] * len(block.find_all("h3")))
    return answers


def _all_checked_answers(blocks, image_src):
    """Collect the labels of every matching image, per <h3> heading.

    For each block, all images whose ``src`` is *image_src* contribute their
    label; a block without such images contributes the checked <input>
    value, or ``''`` when there is none.

    NOTE(review): the indentation of this section was ambiguous in the
    pasted code (a for/else that read the loop variable after the loop);
    this is the reading that mirrors the single-answer pass - confirm.
    """
    answers = []
    for block in blocks:
        markers = block.find_all("img", src=image_src)
        if markers:
            block_answers = [_label_text(marker) for marker in markers]
        else:
            block_answers = [_checked_input_value(block)]
        for _ in block.find_all("h3"):
            answers.extend(block_answers)
    return answers


def _response_texts(blocks, css_class):
    """Return the stripped text of every <span class=css_class> in *blocks*."""
    texts = []
    for block in blocks:
        for span in block.find_all("span", class_=css_class):
            texts.append(span.get_text(strip=True))
    return texts


def scrape_report(url):
    """Scrape the organisation page at *url* and the report it links to.

    Relies on the module-level ``browser`` (splinter) and ``bs``
    (BeautifulSoup module) objects.

    Returns a list of five lists, each de-duplicated with ``f7``:
    page URL plus organisation types, radio-button answers, all checked
    checkbox answers, first checked checkbox answers, and free-text /
    number / date answers. Returns ``None`` when the page has no report
    link or clicking it did not open a second window.
    """
    browser.visit(url)
    browser.windows[0].is_current = True

    temp_list = [browser.url]
    soupnew = bs.BeautifulSoup(browser.html, 'lxml')
    for info in soupnew.find_all('span', class_='org-type'):
        # NOTE(review): the pasted code appended only in the final branch,
        # which discarded the five short-length results; appending for
        # every span is presumed to be the intent - confirm.
        temp_list.append(_clean_org_type(str(info.text)))

    report_links = browser.find_by_xpath(REPORT_LINK_XPATH)
    if len(report_links) == 0:
        return None
    report_links.first.click()
    time.sleep(2)  # wait for the report window to open

    if len(browser.windows) <= 1:
        return None
    # NOTE(review): with several URLs, windows[1] may still be the report
    # opened for an earlier URL - confirm before extending `urls`.
    browser.windows[1].is_current = True

    soupnew2 = bs.BeautifulSoup(browser.html, 'lxml')
    parent = soupnew2.select('div[class="indent type_^ parent_S"]')

    header_values = _first_checked_answers(parent, CHECKED_RADIO_SRC)
    text_values1 = _all_checked_answers(parent, CHECKED_CHECKBOX_SRC)
    text_values2 = _first_checked_answers(parent, CHECKED_CHECKBOX_SRC)

    text_values3 = []
    for css_class in RESPONSE_SPAN_CLASSES:
        text_values3.extend(_response_texts(parent, css_class))

    return [
        f7(temp_list),
        f7(header_values),
        f7(text_values1),
        f7(text_values2),
        f7(text_values3),
    ]


if __name__ == "__main__":
    for i in urls:
        list_final = scrape_report(i)
        if list_final is not None:
            print(list_final)
Briyan
  • 23
  • 4

1 Answers1

0

I had a look at your link, and the checkboxes are not actual checkbox elements, but rather images, but what you want can still be done. If you look,

This is the image in the checked radio buttons

<img src="/Style/img/checkedradio.png" class="readradio">

And this is the image tag for the unchecked ones

 <img src="/Style/img/uncheckedradio.png" class="readradio">

Hence, you can pick checked or unchecked answers based on that somewhat like this:

    # Each question on the report page is wrapped in a "question-block" div.
    all_question_blocks = soup_obj.find_all("div", {"class": "question-block"})
    for question_block in all_question_blocks:
        # The radio buttons are <img> tags, so match on the image source.
        # An empty result means the question was not answered.
        checked = question_block.find_all("img", {"src": "/Style/img/checkedradio.png"})
        # The options that were not selected.
        unchecked = question_block.find_all("img", {"src": "/Style/img/uncheckedradio.png"})

You can then move upwards in the HTML tree and get the parent elements one by one if you wish to extract other attributes.

Hope this helps!

Rudresh Panchal
  • 980
  • 4
  • 16