What do I need to do when I try to crawl an article but a pop-up ad keeps getting in the way? Specifically, the kind that appears in the middle of the screen asking you to log in or sign up, and which you have to close manually before you can read the article.
Because of it, my crawler can't extract anything. Is there a way to code a "close the ad before crawling" step with pyquery?
Edit: I'm now working with Selenium to try to get rid of the pop-ups. Any advice would be much appreciated.
import mechanize
import time
import urllib2
import pdb
import lxml.html
import re
from pyquery import PyQuery as pq


def open_url(url):
    print 'open url:', url
    try:
        br = mechanize.Browser()
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.addheaders = [('user-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
        response = br.open(url)
        html = response.get_data()
        return html
    except Exception:
        print u"!!! url cannot be opened by mechanize either !!!\n"


def extract_text_pyquery(html):
    p = pq(html)
    # Article body paragraphs
    article_whole = p.find(".entry-content")
    p_tag = article_whole('p')
    print len(p_tag)
    print p_tag
    for i in range(0, len(p_tag)):
        text = p_tag.eq(i).text()
        print text
    # Author line
    entire = p.find(".grid_12")
    author = entire.find('p')
    print len(author)
    print "By:", author.text()
    # Main photo links containing 'smedia'
    images = p.find('#main_photo')
    link = images('img')
    print len(link)
    for i in range(len(link)):
        url = pq(link[i])
        result = url.attr('src').find('smedia')
        if result > 0:
            print url.attr('src')


if __name__ == '__main__':
    url_list = ['http://www.newsobserver.com/2014/10/17/4240490/obama-weighs-ebola-czar-texas.html?sp=/99/100/&ihp=1',
                ]
    html = open_url(url_list[0])
    # dissect_article(html)
    extract_text_pyquery(html)
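For context, here is a minimal sketch of the Selenium approach I have in mind: load the page in a real browser, wait for the login/sign-up overlay, click its close button, and then hand the rendered HTML to the existing extract_text_pyquery function. The '.modal-close' selector below is a placeholder assumption, not the site's actual markup; you would have to inspect the pop-up in the browser's dev tools and substitute the real selector for its close button.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def open_url_selenium(url):
    # Use a real browser so the overlay's JavaScript actually runs.
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        try:
            # Wait up to 10 seconds for the overlay's close button.
            # '.modal-close' is a hypothetical selector -- replace it with
            # whatever dev tools show for the real close button.
            close_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '.modal-close'))
            )
            close_btn.click()
        except TimeoutException:
            # No overlay appeared in time; the article may have loaded without one.
            pass
        # Return the rendered HTML, with the overlay (hopefully) dismissed.
        return driver.page_source
    finally:
        driver.quit()

The returned page_source could then be fed to the same pyquery extraction, e.g. extract_text_pyquery(open_url_selenium(url_list[0])), in place of the mechanize HTML.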