
I am trying to determine the simplest way to record the contents of webpages that are linked from webpages that are themselves linked from an original webpage. I would like my output to be a table whose rows correspond to the contents of the pages at that third layer down.

As you can see from the code, I am currently only able to get the first instance of the desired item on each third-level page. Also, while my current code returns one row per h2 item on the base URL, I would like multiple rows per h2 item (as many as there are matches for "span.'case-doc-details' a" on the second layer).

Some additional info: at each linking stage, I do not know in advance how many pages will be linked. I am using Python and ScraperWiki and am new to both. I have tried to research the question, but have hit a roadblock in knowing what to ask. Thanks in advance for any help.

import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter=0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Count']=counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()

            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title=caseroots.cssselect("title")
            record['Title'] = title[0].text_content()
            ids=caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids)<=2:
                    record['Rules']="None"
                    record['Treaty']="None"
                else:
                    record['Rules']=ids[2].text_content()
                    record['Treaty']=ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[0].attrib.get('href')).read()
            caseroots2=lxml.html.fromstring(caselinkurl2)
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            for i in pars:     
                if len(pars)==0:
                    record['DetailsURL']="None"
                else:                    
                    record['DetailsURL']=pars[0].attrib.get('href')
                pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2)==0:
                    record['Doc Date']="None"
                else:                        
                    record['Doc Date']=pars2[0].text_content()
                pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) ==0:
                    record['Doc Type Link']="None"
                    record['Doc Type']="None"  
                else:
                    record['Doc Type Link']=pars3[0].attrib.get('href')
                    record['Doc Type']=pars3[0].text_content()
                pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4)==0:
                    record['Claimant Nominee']="None"
                else:
                    record['Claimant Nominee']=pars4[0].text_content()
                pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5)==0:
                    record['Respondent Nominee']="None"
                else:
                    record['Respondent Nominee']=pars5[0].text_content()
                pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6)==0:
                    record['President']="None"
                else:
                    record['President']=pars6[0].text_content()

            print record, '------------'
            scraperwiki.sqlite.save(['Count'],record)
            counter+=1
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)

2 Answers


Here's the code I've got so far. It doesn't yet grab the document link data (or save anything), but that should be a case of extending the principles here into another function (a rough sketch of one such extension follows the code):

import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "NO URL"
        record['arbrule'] = "NO ARBRULE"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "NO LEGAL BASIS GIVEN"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks']  = extralinks
    else:
        record['extralinks']  = "NO EXTRA LINKS"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" Case No.")[1]
    print record


def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com'+linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
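
For example, that extra function might look roughly like the sketch below. It reuses the imports at the top of the script; the function name scrape_documents is mine, and the selectors for the document links and dates are lifted from the question's code, so they may need adjusting against the live markup.

def scrape_documents(caseurl):
    # Sketch only: follow each document link on a case page and pull a
    # couple of fields from the document page it points to.
    html = scraperwiki.scrape(caseurl)
    root = lxml.html.fromstring(html)
    doclinks = root.cssselect("span.case-doc-details a")
    for doclink in doclinks:
        docurl = 'http://www.italaw.com' + doclink.attrib.get("href")
        docroot = lxml.html.fromstring(scraperwiki.scrape(docurl))
        record = {}
        record['caseurl'] = caseurl
        record['docurl'] = docurl
        dates = docroot.cssselect("span.date-display-single")
        if dates:
            record['docdate'] = dates[0].text_content()
        else:
            record['docdate'] = "NO DATE"
        print record
        scraperwiki.sqlite.save(['docurl'], record)

scrape_page could then call scrape_documents(linkurl) once it has built its own record.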

Here is what I got to work for this problem.

A few instructive general points:

  1. Use an if/else block to distinguish the case where your key selector returns zero matches from the case where it returns one or more.

  2. Just before that block, create your record dictionary.

  3. In both the if and the else branches, include the printing, saving, and counter-increment statements. Set the counter to zero just before entering the block.

  4. In the else branch, write a for loop that iterates over each instance i, setting the key attribute to the ith instance and every other attribute to the zeroth instance (see the sketch after this list).

  5. Finally, when dealing with an arbitrary number of triply-nested links, it is generally best to scrape all of the data (if possible) from the lowest level you reach. In my case this worked because all of the attributes I wanted to record were repeated at that level. In other cases, I am not sure what the best way to proceed would be.
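
Here, in outline, is the pattern from points 1-4 (the setup lines are illustrative stand-ins only; the names record, pars and counter match the working code further down):

import scraperwiki
import lxml.html

# Illustrative setup: a tiny stand-in for a parsed case page with two document links.
caseroots = lxml.html.fromstring(
    '<span class="case-doc-details"><a href="/doc/1">one</a> <a href="/doc/2">two</a></span>')
pars = caseroots.cssselect("span.case-doc-details a")   # the key attribute

record = {}          # point 2: create the dictionary just before the check
counter = 0          # point 3: set the counter to zero before entering the if/else
if len(pars) == 0:                                   # point 1: zero matches
    record['DetailsURL'] = "None"
    record['Count'] = counter
    print record
    scraperwiki.sqlite.save(['Count'], record)
    counter += 1
else:
    for i in range(0, len(pars)):                    # point 4: one row per instance
        record['Count'] = counter
        record['DetailsURL'] = pars[i].attrib.get('href')   # key attribute from the ith instance
        # ...every other attribute would be set from the zeroth (or only) instance...
        print record
        scraperwiki.sqlite.save(['Count'], record)
        counter += 1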

Thanks to Paul for nudging this forward.

import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    counter=0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            table_cellsurls = table_cells[0].cssselect("a")
            #record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            record = {}
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            if  len(pars)==0:
                record['DetailsURL']="None"
                record['Count']=counter
                print record, '------------'
                scraperwiki.sqlite.save(['Count'],record)
                counter+=1
            else:
                for i in range(0,len(pars)):                       
                    record['Count']=counter
                    caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[i].attrib.get('href')).read()
                    caseroots2=lxml.html.fromstring(caselinkurl2)
                    record['DetailsURL']=pars[i].attrib.get('href')
                    title=caseroots2.cssselect("h2")
                    record['Title'] = title[1].text_content()
                    rules=caseroots2.cssselect("div.'field-name-field-arbitration-rules'")
                    if len(rules)==0:
                        record['Rules']="None"
                    else:
                        record['Rules']=rules[0].text_content()
                    treaty=caseroots2.cssselect("div.'field-name-field-case-treaties'")
                    if len(treaty)==0:
                        record['Treaty']="None"                                    
                    else:
                        record['Treaty']=treaty[0].text_content()
                    pars2=caseroots2.cssselect("div.'field-name-field-case-document-date'")
                    if len(pars2)==0:
                        record['Doc Date']="None"
                    else:                        
                        record['Doc Date']=pars2[0].text_content()
                    pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                    if len(pars3) ==0:
                        record['Doc Type Link']="None"
                        record['Doc Type']="None"  
                    else:
                        record['Doc Type Link']=pars3[0].attrib.get('href')
                        record['Doc Type']=pars3[0].text_content()
                    pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    if len(pars4)==0:
                        record['Claimant Nominee']="None"
                    else:
                        record['Claimant Nominee']=pars4[0].text_content()
                    pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    if len(pars5)==0:
                        record['Respondent Nominee']="None"
                    else:
                        record['Respondent Nominee']=pars5[0].text_content()
                    pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    if len(pars6)==0:
                        record['President']="None"
                    else:
                        record['President']=pars6[0].text_content()

                    print record, '------------'
                    scraperwiki.sqlite.save(['Count'],record)
                    counter+=1
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)