The following code downloads the HTML of each page without its images; you can still access the images through the XML file that is parsed to get the URL.
from time import sleep
import urllib
import urllib2
from xml.dom import minidom, Node
def _read_keywords(path):
    """Return the search keywords stored one per line in the file at *path*.

    Surrounding whitespace is stripped and blank lines are dropped, so an
    empty or trailing blank line never turns into an empty search.
    """
    with open(path, 'r') as key_file:
        candidates = [line.strip() for line in key_file]
    return [keyword for keyword in candidates if keyword]


def _child_elements(parent, name):
    """Return the direct child elements of *parent* whose tag name is *name*."""
    return [
        node for node in parent.childNodes
        if node.nodeType == Node.ELEMENT_NODE and node.nodeName == name
    ]


def _first_text(item, name):
    """Return the text of the first *name* child of *item* holding exactly one node.

    Returns None when *item* has no such child.
    """
    for node in _child_elements(item, name):
        if len(node.childNodes) == 1:
            return node.childNodes[0].data
    return None


def _search(keyword):
    """Query the Wikipedia opensearch API for *keyword*.

    Returns the list of ``Item`` elements of the first ``Section`` element
    of the XML response, or None when the response has no ``Section``.
    """
    # Percent-encode the keyword: spaces, '&', '#' and similar characters
    # would otherwise corrupt the query string.
    url = ("http://en.wikipedia.org/w/api.php?format=xml&action=opensearch&search="
           + urllib.quote(keyword))
    response = urllib.urlopen(url)
    try:
        xmldoc = minidom.parse(response)
    finally:
        response.close()
    sections = _child_elements(xmldoc.documentElement, "Section")
    if not sections:
        return None
    return _child_elements(sections[0], "Item")


def _save_page(title, page_url):
    """Download *page_url* and store its body as ``Html/<title>.html``."""
    import os
    import re

    # Titles can contain characters that are not valid in file names
    # (for example "AC/DC"), so replace them before building the path.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title.encode('utf-8'))
    if not os.path.isdir("Html"):
        os.makedirs("Html")
    file_name = os.path.join("Html", "%s.html" % safe_title)

    # A browser-like User-Agent is sent with the request; presumably the
    # default urllib2 agent is refused by the server -- TODO confirm.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'
    request = urllib2.Request(url=page_url, headers={'User-Agent': user_agent})
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        response.close()

    # The file is opened only once the download succeeded, and in binary
    # mode so the downloaded bytes are stored unchanged.
    with open(file_name, 'wb') as out_file:
        out_file.write(html)


def main():
    """Search Wikipedia for every keyword in ``example.txt`` and save the top hit.

    For each keyword the opensearch API is queried, the title of every
    result is printed, and the page of the first result is downloaded to
    ``Html/<title>.html``.  The loop sleeps two seconds after each keyword.

    The network helpers rely on the Python 2 ``urllib``/``urllib2`` modules
    imported at the top of the file.  Network and parse errors are not
    caught and propagate to the caller.
    """
    print("Hello World")
    keywords = _read_keywords("example.txt")
    print("Total keywords: %d" % len(keywords))
    for keyword in keywords:
        items = _search(keyword)
        if items is not None:
            if not items:
                print("NO results found")
            else:
                print("\nResults found for " + keyword + ":\n")
                for item in items:
                    for node in _child_elements(item, "Text"):
                        if len(node.childNodes) == 1:
                            print(node.childNodes[0].data.encode('utf-8'))
                # Only the first result is downloaded, and only when both
                # its title and its URL are present in the response.
                title = _first_text(items[0], "Text")
                page_url = _first_text(items[0], "Url")
                if title is not None and page_url is not None:
                    _save_page(title, page_url)
        print("Sleeping")
        sleep(2)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()