
I'm writing a toy PageRank, including a crawler as well. Something looks off: the PR values fail to converge, and the delta between iterations stays 0. Part of the output looks like this:

url: http://en.m.wikipedia.org/wiki/Israel_State_Cup
links_to_node: set(['http://en.m.wikipedia.org/wiki/Association_football', 'http://en.m.wikipedia.org/wiki/Wikipedia:General_disclaimer'])
links_from_node: set(['http://en.m.wikipedia.org/wiki/Israel_State_Cup'])
PR_score: 2.41759524248e+38
ttl_time: 1
last_delta: 0

The code is as follows:

import requests 
import lxml.html
import random

class pr_node:
        """WDM PR node"""
        url = ""
        links_to_node = set([])
        links_from_node = set([])
        PR_score = 0.0001
        ttl_time = 0
        last_delta = 0

        def __init__(self, url, ttl_time):
            self.url = url
            self.links_to_node = set([])
            self.links_from_node = set([])
            self.PR_score = 0.1
            self.ttl_time = ttl_time

        def print_node_out_links(self):
            print "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = "
            s = self.links_to_node
            print "{" + "\, ".join(str(e) for e in s) + "}"

        def print_node_pr(self):
            print "\n\n" + self.url + " PR is: " + str(self.PR_score)

        def print_all(self):
            print "url: " + self.url
            print "links_to_node: " + repr(self.links_to_node)
            print "links_from_node: " + repr(self.links_from_node)
            print "PR_score: " + str(self.PR_score)
            print "ttl_time: " + str(self.ttl_time)
            print "last_delta: " + str(self.last_delta)



def crawl(url, url_ttl):
        """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice"""
        if url_ttl > 0 and (url not in visited_urls):

            # create new node p from parsed page
            print "crawling to " + url + "...\n"
            res = requests.get(url)
            doc = lxml.html.fromstring(res.content)
            p = pr_node(url, url_ttl)

            # add new PR node
            global pr_nodes
            pr_nodes[url] = p

            # get all wiki links
            all_links_to_node = set([])
            for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
                add_val = ""
                if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                    add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                    all_links_to_node.add(add_val)
                elif t.attrib['href'].startswith("http://"):
                    add_val = t.attrib['href']
                    all_links_to_node.add(add_val)
                else:
                    pass

            # select random 10 of them and crawl to them
            iter_count = 0
            iter_stop_lim = 10
            while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
                    current_url = random.sample(all_links_to_node, 1)[0]

                    all_links_to_node.remove(current_url)  # don't do it twice...
                    iter_count = + 1
                    if not (current_url in visited_urls) and url_ttl > 1:
                        p.links_to_node.add(current_url)
                        crawl(current_url, url_ttl - 1)
                        visited_urls.add(url)
                    elif current_url in visited_urls and url_ttl == 1:
                        p.links_to_node.add(current_url)

        else:
            print "max depth reached or you've already been here"
        return


def calc_graph_pr(pr_iter_count, damp_factor):
    "print calculating PageRank"
    current_iter = 0
    global pr_nodes

    g1 = {}
    g2 = {}
    for node in pr_nodes.itervalues():
        g1[node.url] = node
        g2[node.url] = node

    g = [g1, g2]

    while current_iter < pr_iter_count:
        print "PageRank iteration #" + str(current_iter)
        for p in g[current_iter % 2].itervalues():
            in_links_addition = 0
            for l in p.links_to_node:
                l_val = g[(current_iter - 1) % 2][l]
                l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score
                in_links_addition += l_val.PR_score/len(l_val.links_from_node)
            p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition
        current_iter += 1

    pr_nodes = g[0] #WLOG could be also g[1]...

    for p in pr_nodes.itervalues():
        p.print_all()

    print "check bool:"
    print g1 == g2
    return


def update_graph_links():
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(u)
    return

visited_urls = set([])
pr_nodes = {}

glob_pr_iter_count = 50
glob_damp_factor = 0.2

crawl("http://en.m.wikipedia.org/wiki/Nikola_Mitrovic", 3)

update_graph_links()
calc_graph_pr(glob_pr_iter_count, glob_damp_factor)
  • So it would just carry on forever? I see where delta (the difference between the current PR score and the PR score from the last iteration) is calculated, but I don't see you putting that value to any use. Typically the graph is said to converge when the delta of every node is less than 0.0001, so there needs to be some code that ends the iterations when this happens (see the sketch after these comments). – jksnw May 04 '15 at 01:04
  • @jksnw it is not currently used, yet it demonstrates that something is rotten in this piece of code. – GalB1t May 04 '15 at 03:55
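
A minimal sketch of the convergence check jksnw describes, assuming two hypothetical dicts scores and prev_scores that map each URL to its PR value in the current and previous iteration; the loop would stop once no node has moved by more than the threshold:

def has_converged(prev_scores, scores, threshold=0.0001):
    """Return True when no node's PR score changed by more than threshold."""
    max_delta = max(abs(scores[u] - prev_scores[u]) for u in scores)
    return max_delta < threshold

# inside the PageRank loop, after all nodes have been updated:
#     if has_converged(prev_scores, scores):
#         break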

1 Answer


It was the edge-adding function that ruined it all: the original version did pr_nodes[u].links_from_node.add(u), registering each linked page under its own URL instead of under the node that points at it, so len(links_from_node) was stuck at 1, the contributions were never divided down, and the scores exploded. Fixed it to:

def update_graph_links():
    """register each node with neighbours pointing at it"""
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(node.url)
    return
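
A quick way to sanity-check the fix on a tiny hand-built graph, using two made-up URLs and assuming the pr_node class and the fixed update_graph_links above are already defined (purely illustrative): if A links to B, then after update_graph_links() node B's links_from_node should contain A's URL rather than its own.

pr_nodes = {}
a = pr_node("http://en.m.wikipedia.org/wiki/A", 1)
b = pr_node("http://en.m.wikipedia.org/wiki/B", 1)
a.links_to_node.add(b.url)
pr_nodes[a.url] = a
pr_nodes[b.url] = b

update_graph_links()
print b.links_from_node  # expected: set(['http://en.m.wikipedia.org/wiki/A'])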

After a few adjustments, some refactoring, and adding proper comments, it came out as the following code:

import requests 
import lxml.html
import random
import sys

class pr_node:
        """WDM PR node"""

        url = ""
        links_to_node = set([])
        links_from_node = set([])
        PR_score = 0.01
        ttl_time = 0
        last_delta = 0  # used for debug only

        def __init__(self, url, ttl_time):
            """CTOR"""
            self.url = url
            self.links_to_node = set([])
            self.links_from_node = set([])
            self.PR_score = 0.01
            self.ttl_time = ttl_time


        def print_node_out_links(self):
            """print for q1a"""
            print "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = "
            s = self.links_to_node
            print "{" + "\, ".join(str(e) for e in s) + "}"


        def print_node_pr(self):
            """print for q1b"""
            print "\n\n" + self.url + " PR is: " + str(self.PR_score)


        def print_all(self):
            """print for q1b and debug"""
            print "url: " + self.url
            print "links_to_node: " + repr(self.links_to_node)
            print "links_from_node: " + repr(self.links_from_node)
            print "PR_score: " + str(self.PR_score)
            print "ttl_time: " + str(self.ttl_time)
            print "last_delta: " + str(self.last_delta)


def crawl(url, url_ttl):
        """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice"""
        if url_ttl > 0 and (url not in visited_urls):

            # create new node p from parsed page
            print "crawling to " + url + "...\n"
            res = requests.get(url)
            doc = lxml.html.fromstring(res.content)
            p = pr_node(url, url_ttl)

            # add new PR node
            global pr_nodes
            pr_nodes[url] = p

            # get all wiki links, format to legit URL
            all_links_to_node = set([])
            for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
                add_val = ""
                if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                    add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                    all_links_to_node.add(add_val)
                elif t.attrib['href'].startswith("http://"):
                    add_val = t.attrib['href']
                    all_links_to_node.add(add_val)
                else:
                    pass

            # select random 10 of them and crawl to them
            iter_count = 0
            iter_stop_lim = 10
            while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
                    # sample random site of linked sites
                    current_url = random.sample(all_links_to_node, 1)[0]
                    # don't sample it twice...
                    all_links_to_node.remove(current_url)
                    iter_count += 1

                    # crawl if we haven't been there and the TTL still allows it
                    if not (current_url in visited_urls) and url_ttl > 1:
                        p.links_to_node.add(current_url)
                        crawl(current_url, url_ttl - 1)
                        visited_urls.add(url)

                    # if reached with TTL == 1 just check links to existing nodes
                    elif current_url in visited_urls and url_ttl == 1:
                        p.links_to_node.add(current_url)

        else:
            print "max depth reached or you've already been here"
        return


def calc_graph_pr(pr_nodes, pr_iter_count, damp_factor):
    """calculate and print the graph's PageRank"""
    current_iter = 0

    # use two graph copies to prevent auto-interference
    g1 = {}
    g2 = {}
    for node in pr_nodes.itervalues():
        g1[node.url] = node
        g2[node.url] = node

    g = [g1, g2]

    # do actual page rank here
    while current_iter < pr_iter_count:
        for p in g[current_iter % 2].itervalues():
            in_links_addition = 0
            # iterate over all pointing nodes and sum their PR/out_link_count
            for l in p.links_to_node:
                l_val = g[(current_iter - 1) % 2][l]
                l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score
                in_links_addition += l_val.PR_score/len(l_val.links_from_node)
            # update w.r.t the computed sum and damp_factor
            p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition
        current_iter += 1

    # WLOG could be also g[1]...
    pr_nodes = g[0]

    for p in pr_nodes.itervalues():
        p.print_node_pr()

    return


def update_graph_links():
    """register each node with neighbours pointing at him"""
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(node.url)
    return


if __name__ == '__main__':

    urlToCrawl = "http://en.m.wikipedia.org/wiki/Nikola_Mitrovic"

    # crawl to the requested site as default
    if len(sys.argv) > 2:
        sys.exit("Unexpected input")
    elif len(sys.argv) == 1:
        pass
    else:
        urlToCrawl = sys.argv[1]

    print_q1a = False
    print_q1b = True

    # set global data structures for crawling and ranking
    visited_urls = set([])
    pr_nodes = {}

    # parameters for PageRank
    glob_pr_iter_count = 100
    glob_damp_factor = 0.2

    # perform crawl in depth 3
    crawl(urlToCrawl, 3)

    if print_q1a:
        for p in pr_nodes.itervalues():
            p.print_node_out_links()

    elif print_q1b:
        # first update the backlinks then start ranking
        update_graph_links()
        calc_graph_pr(pr_nodes, glob_pr_iter_count, glob_damp_factor)
    else:
        pass
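
For reference, the per-node update inside calc_graph_pr boils down to the small function below. This is just a restatement of the inner loop for readability (the argument names are invented for the sketch), not code the program uses.

def pr_update(linked_urls, scores, from_counts, damp_factor):
    """Mirror of the per-node update in calc_graph_pr:
    damp + (1 - damp) * sum of PR(l) / len(l.links_from_node) over the linked URLs l."""
    contribution = sum(scores[l] / from_counts[l] for l in linked_urls)
    return damp_factor + (1 - damp_factor) * contribution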