I'm trying to loop through multiple articles on Reddit, go through each article and extract the top relevant entity (done by filtering for highest relevancy score), and then add that to the master_locations
list:
from __future__ import print_function
from alchemyapi import AlchemyAPI
import json
import urllib2
from bs4 import BeautifulSoup
alchemyapi = AlchemyAPI()
reddit_url = 'http://www.reddit.com/r/worldnews'
urls = []
locations = []
relevance = []
master_locations = []
def get_all_links(page):
html = urllib2.urlopen(page).read()
soup = BeautifulSoup(html)
for a in soup.find_all('a', 'title may-blank ', href=True):
urls.append(a['href'])
run_alchemy_entity_per_link(a['href'])
def run_alchemy_entity_per_link(articleurl):
response = alchemyapi.entities('url', articleurl)
if response['status'] == 'OK':
for entity in response['entities']:
if entity['type'] in entity == 'Country' or entity['type'] == 'Region' or entity['type'] == 'City' or entity['type'] == 'StateOrCountry' or entity['type'] == 'Continent':
if entity.get('disambiguated'):
locations.append(entity['disambiguated']['name'])
relevance.append(entity['relevance'])
else:
locations.append(entity['text'])
relevance.append(entity['relevance'])
else:
locations.append('No Location')
relevance.append('0')
max_pos = relevance.index(max(relevance)) # get nth position of the highest relevancy score
master_locations.append(locations[max_pos]) #Use n to get nth position of location and store that location name to master_locations
del locations[0] # RESET LIST
del relevance[0] # RESET LIST
else:
print('Error in entity extraction call: ', response['statusInfo'])
get_all_links('http://www.reddit.com/r/worldnews') # Gets all URLs per article, then analyzes entity
for item in master_locations:
print(item)
But I think for some reason, the lists locations
and relevance
aren't being reset. Am I doing this wrong?
The result of printing this is:
Holland
Holland
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Johor Bahru
(likely from the lists not being cleared)