I try to run the following code to extract all the text from an XML file:
please pay attention to "word_1_14" - which the word.text is found to be Nonetype thus not printed out...I found that it is because the text is with the strong tag, thus making it invisible. Do you know how to find the word with strong tag and print it out?
This line has problem - it appears that word_1_14's word is nonetype object...which makes it impossible to print out the text.
In the Python code:
for word in ocr_word:
In the XML file:
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
It seems to me that the ET.find cannot handle XML text that is highlighted with STRONG tag.
Python code:
##marktag: print the text on top of the image:
#whether it is the area name, or the characters itself
def marktag(xmlObject,draw_img,color,printText,printTag,strongWord=None):
if printText:
if xmlObject.text:
if strongWord:
textInTag = strongWord.text
print('debug strong '+textInTag)
else:
textInTag = xmlObject.text
#debug
print('debug 1:'+textInTag)
draw.text((bbCoord_x0,bbCoord_y0),textInTag,font = fnt, fill = color)
return xmlObject
#processing the image and show it
os.chdir('/home/DocData/PDF_DOC/')
file = '2001ABI-7.png'
XMLfilename = file+'.hocr'
tree = ET.parse(XMLfilename) #2550x3300 pixels
root = tree.getroot()
ocr_carea = root.findall(".//{http://www.w3.org/1999/xhtml}div[@class='ocr_carea']")
img = Image.open('/home/bnpp/DocData/PDF_DOC/'+file)
draw = ImageDraw.Draw(img)
area_color = 255
para_color = 145
line_color = 90
word_color = 40
for area in ocr_carea:
marktag(area,draw,area_color,False,True)
ocr_para = area.findall(".//{http://www.w3.org/1999/xhtml}p[@class='ocr_par']")
for para in ocr_para:
marktag(para,draw,para_color,False,True)
#some word shown under line
ocr_line = para.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocr_line']")
for line in ocr_line:
marktag(line,draw,line_color,False,True)
ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")
for word in ocr_word:
try:
strong_word =word[0].text
except Exception:
marktag(word,draw,word_color,True,False)
break
marktag(word,draw,word_color,False,True,strong_word)
This is the xml:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.03' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
<strong>T</strong>
</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
</span>
</body>
</html>
Output:
bbox 762 112 1394 161
ocr_carea-block_1_1
bbox 762 112 1394 161
ocr_par-par_1_1
bbox 762 112 1394 161; baseline 0 -1
ocr_line-line_1_1
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 1056 112 1394 161; x_wconf 91
debug 1:LOCATION
bbox 1192 182 1818 318
ocr_carea-block_1_2
bbox 1203 205 1611 307
ocr_par-par_1_2
bbox 1373 205 1611 221; baseline 0 -1
ocr_line-line_1_2
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
bbox 1218 264 1581 281; baseline 0.006 -2
ocr_line-line_1_3
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
bbox 1203 292 1276 307; baseline 0 0
ocr_line-line_1_4
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
para_word
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
para_word
bbox 1536 205 1611 221; x_wconf 80
debug 1:ABOVE
para_word
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
para_word
bbox 1493 265 1581 281; x_wconf 85
debug 1:GROUND
para_word
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
bbox 131 211 1057 1378
ocr_carea-block_1_3
bbox 131 211 1057 1378
ocr_par-par_1_3
bbox 1012 211 1028 229; baseline 0 0
ocr_line-line_1_5
bbox 1012 211 1028 229; x_wconf 92
debug 1:L
bbox 1011 236 1027 254; baseline 0 0
ocr_line-line_1_6
bbox 1011 236 1027 254; x_wconf 88
debug 1:A
bbox 1013 261 1027 279; baseline 0 0
ocr_line-line_1_7
bbox 1013 261 1027 279; x_wconf 97
ocrx_word-word_1_10
bbox 1012 286 1020 304; baseline 0 0
ocr_line-line_1_8
bbox 1012 286 1020 304; x_wconf 97
ocrx_word-word_1_11
bbox 1013 311 1027 329; baseline 0 0
ocr_line-line_1_9
bbox 1013 311 1027 329; x_wconf 97
debug 1:T
bbox 1012 335 1027 354; baseline 0 0
ocr_line-line_1_10
bbox 1012 335 1027 354; x_wconf 92
debug 1:U
bbox 621 360 1030 387; baseline 0.002 -7
ocr_line-line_1_11
bbox 621 383 624 387; x_wconf 50
ocrx_word-word_1_14
bbox 761 383 764 387; x_wconf 50
ocrx_word-word_1_15
bbox 849 362 922 381; x_wconf 68
debug 1:Afifine