There are a couple of problems with your code.
First, the conditions you've set to pull different content from the PDF need to be made more restrictive and precise (e.g. if int(el.attrib['left']) < 215
will pull any text that has a left position of less than 215 pixels, which applies to other content in the PDF pages you're looking at, e.g. the text "Constituency").
Second, you need a way to check when you have all the data for that row and can move on to the next one. (You could try and pull the data out by rows, but I just found it easier to grab data from each field in turn and make a new row when I had all the data for that row.)
(As to why scraperwiki.sqlite.save
wasn't working, it's probably because you had rows of empty values in there, but your data as you had it wasn't correct anyway.)
This works for me:
import scraperwiki
import urllib2
import lxml.etree
def create_blank_row():
    """Return a fresh candidate record with every field set to None."""
    fields = ('Rank', 'Name', 'Sex', 'Party', 'Votes', 'Percentage')
    # dict.fromkeys defaults every value to None, matching an "unset" row.
    return dict.fromkeys(fields)
def row_is_filled(dictionary):
    """Return True if every value of `dictionary` is truthy, else False.

    An empty dictionary returns True, matching the semantics of the
    built-in all() (and of the original explicit loop).
    """
    # all() short-circuits on the first falsy value, exactly like the
    # hand-written loop it replaces.
    return all(dictionary.values())
def main():
    """Scrape per-candidate result rows from the 1998 Delhi election
    statistical report PDF and save them to a ScraperWiki SQLite table.

    The PDF is converted to XML in which each <text> element carries its
    pixel position in a 'left' attribute; fields are identified purely by
    that horizontal position.
    """
    url = ('http://eci.nic.in/eci_main/statisticalreports'
           '/SE_1998/StatisticalReport-DEL98.pdf')
    pdfdata = urllib2.urlopen(url).read()
    # Convert the raw PDF bytes to positional XML markup.
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    # how many pages in PDF (each child of the root is one page)
    pages = list(root)
    print "There are", len(pages), "pages"
    output_data = []
    candidate_data = create_blank_row()
    # from page 86 to 107
    # NOTE(review): pages[86:107] is a zero-based slice, so it covers the
    # 87th through 107th pages -- confirm this matches the intended range.
    for page in pages[86:107]:
        for el in page:
            if el.tag == "text":
                # Each field sits in its own column; match on the 'left'
                # pixel position to decide which field this fragment is.
                if 206 < int(el.attrib['left']) <= 214:
                    # There are some None values here which we want to ignore.
                    if el.text:
                        candidate_data['Rank'] = el.text
                if int(el.attrib['left']) == 222:
                    # Also removes ". " from start of names.
                    candidate_data['Name'] = el.text[2:]
                if int(el.attrib['left']) == 591:
                    candidate_data['Sex'] = el.text
                if int(el.attrib['left']) == 622:
                    candidate_data['Party'] = el.text
                if 725 < int(el.attrib['left']) <= 753:
                    candidate_data['Votes'] = el.text
                if 790 < int(el.attrib['left']) < 801:
                    candidate_data['Percentage'] = el.text
                # Once every field is populated, the row is complete:
                # store it and start collecting the next candidate.
                if row_is_filled(candidate_data):
                    output_data.append(candidate_data)
                    candidate_data = create_blank_row()
    # Collect candidate data into a list then add to SQL database.
    # Calls to this SQL write function slow, so minimise how many times we do.
    scraperwiki.sqlite.save(unique_keys=['Rank', 'Name', 'Sex', 'Party',
                                         'Votes'],
                            table_name='ecidata1998',
                            data=output_data)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()