
I'm trying to loop through multiple articles on Reddit, extract the most relevant entity from each article (by filtering for the highest relevance score), and then add that entity to the master_locations list:

from __future__ import print_function
from alchemyapi import AlchemyAPI
import json
import urllib2
from bs4 import BeautifulSoup

alchemyapi = AlchemyAPI()
reddit_url = 'http://www.reddit.com/r/worldnews'
urls = []
locations = []
relevance = []
master_locations = []

def get_all_links(page):
    html = urllib2.urlopen(page).read()
    soup = BeautifulSoup(html)
    for a in soup.find_all('a', 'title may-blank ', href=True):
        urls.append(a['href'])
        run_alchemy_entity_per_link(a['href'])

def run_alchemy_entity_per_link(articleurl):
    response = alchemyapi.entities('url', articleurl) 
    if response['status'] == 'OK':
        for entity in response['entities']:
            if entity['type'] in entity == 'Country' or entity['type'] == 'Region' or entity['type'] == 'City' or entity['type'] == 'StateOrCountry' or entity['type'] == 'Continent':
                if entity.get('disambiguated'):
                    locations.append(entity['disambiguated']['name'])
                    relevance.append(entity['relevance'])
                else:
                    locations.append(entity['text'])
                    relevance.append(entity['relevance'])         
            else:
                locations.append('No Location')
                relevance.append('0')
        max_pos = relevance.index(max(relevance)) # get nth position of the highest relevancy score
        master_locations.append(locations[max_pos]) #Use n to get nth position of location and store that location name to master_locations
        del locations[0] # RESET LIST
        del relevance[0] # RESET LIST
    else:
        print('Error in entity extraction call: ', response['statusInfo'])

get_all_links('http://www.reddit.com/r/worldnews') # Gets all URLs per article, then analyzes entity

for item in master_locations:
    print(item)

But for some reason the locations and relevance lists don't seem to be getting reset between articles. Am I doing this wrong?

The result of printing this is:

Holland
Holland
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Johor Bahru

(likely from the lists not being cleared)

  • I have downvoted because this is a long wall of code, mostly irrelevant, that could have been simplified a lot. http://sscce.org/ – Davidmh Sep 06 '14 at 10:05

2 Answers


del list[0] deletes only the first item of the list.

If you want to delete all items, use one of the following:

del list[:]

or

list[:] = []
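
For example, a quick sketch with a throwaway list named lst (not part of the original code):

lst = [1, 2, 3]
del lst[0]       # removes only the first item  -> [2, 3]
del lst[:]       # removes every remaining item -> []

lst = [1, 2, 3]
lst[:] = []      # slice assignment also empties the list in place -> []

Both forms empty the same list object, so every reference to it sees the cleared list; rebinding with lst = [] would create a brand-new list instead.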
falsetru

In your case, don't reuse the lists; simply create new ones:

from __future__ import print_function
from alchemyapi import AlchemyAPI
import json
import urllib2
from bs4 import BeautifulSoup

alchemyapi = AlchemyAPI()
reddit_url = 'http://www.reddit.com/r/worldnews'

def get_all_links(page):
    html = urllib2.urlopen(page).read()
    soup = BeautifulSoup(html)
    urls = []
    master_locations = []
    for a in soup.find_all('a', 'title may-blank ', href=True):
        urls.append(a['href'])
        master_locations.append(run_alchemy_entity_per_link(a['href']))
    return urls, master_locations

def run_alchemy_entity_per_link(articleurl):
    response = alchemyapi.entities('url', articleurl) 
    if response['status'] != 'OK':
        print('Error in entity extraction call: ', response['statusInfo'])
        return
    locations_with_relevance = []
    for entity in response['entities']:
        if entity['type'] in ('Country', 'Region', 'City', 'StateOrCountry', 'Continent'):
            if entity.get('disambiguated'):
                location = entity['disambiguated']['name']
            else:
                location = entity['text']
            locations_with_relevance.append((float(entity['relevance']), location))  # relevance is a decimal score, e.g. "0.89"
        else:
            locations_with_relevance.append((0, 'No Location'))
    if not locations_with_relevance:
        return 'No Location'  # guard: the article had no entities at all
    return max(locations_with_relevance)[1]

def main():
    _urls, master_locations = get_all_links(reddit_url) # Gets all URLs per article, then analyzes entity

    for item in master_locations:
        print(item)

if __name__ == '__main__':
    main()

When you have more than one piece of data to store per item, put them together in a tuple and keep a single list of tuples, instead of two or more parallel lists.
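
For example, here is a minimal sketch of how max() picks the highest-relevance location from such a list of tuples (the relevance values below are made up):

locations_with_relevance = [(0.33, 'No Location'), (0.91, 'Beirut'), (0.54, 'Mogadishu')]
best = max(locations_with_relevance)  # tuples compare element by element, so the highest relevance wins
print(best[1])  # -> Beirut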

Daniel