# Detailed video walkthrough: https://www.youtube.com/watch?v=N5Y6gZgvryQ
# Shell script for Tesseract custom training.
N=3 # number of images
# Image name pattern: languagename.fontname.expN.filetype
# Step 1: make a box file for every training image.
# (This description used to be a bare "make box file" line, which the shell
# would have executed as a `make` command with the targets "box" and "file".)
for i in $(seq 1 "$N"); do
  tesseract "testlan.arial.exp$i.png" "testlan.arial.exp$i" batch.nochop makebox
done
# After manually correcting the box files, the following steps need to be done.
# Step 2: Create the .tr files (combining each image file with its box file).
# Step 3: Extract the charset from the box files (the output of this command is the unicharset file).
# Build the .tr training file for each image/box pair and remember the box
# file names so the charset can be extracted in a single call afterwards.
box_files=""
for i in $(seq 1 "$N"); do
  tesseract "testlan.arial.exp$i.png" "testlan.arial.exp$i" box.train
  box_files="$box_files testlan.arial.exp$i.box"
done
# Extract the charset from ALL box files at once. Every unicharset_extractor
# run rewrites the "unicharset" output file, so running it once per box file
# (as this loop did before) kept only the characters of the last image.
# Word splitting of $box_files is intentional: the generated names contain no
# whitespace.
# shellcheck disable=SC2086
unicharset_extractor $box_files
# Step 4: Create a font_properties file describing the training font(s).
# font_properties holds one line per font, in the form:
#   [fontname] [italic (0 or 1)] [bold (0 or 1)] [monospace (0 or 1)] [serif (0 or 1)] [fraktur (0 or 1)]
# The format description is kept here as a comment: previously it was written
# to the file and then immediately overwritten by the real entry below, so it
# never had any effect.
# NOTE(review): the third flag (monospace) is 1 here, but Arial is a
# proportional font - confirm whether "arial 0 0 0 0 0" was intended.
printf '%s\n' "arial 0 0 1 0 0" > font_properties
# Step 5: Train the data (mftraining).
# Step 6: Run cntraining on the same .tr files.
# Collect every .tr file so both trainers see the complete training set.
tr_files=""
for i in $(seq 1 "$N"); do
  tr_files="$tr_files testlan.arial.exp$i.tr"
done
# Run each trainer exactly once over all .tr files. Invoking them once per
# file (as this loop did before) regenerated their output files on every pass,
# so the final result only reflected the last image.
# Word splitting of $tr_files is intentional: the generated names contain no
# whitespace.
# shellcheck disable=SC2086
mftraining -F font_properties -U unicharset -O testlan.unicharset $tr_files
# shellcheck disable=SC2086
cntraining $tr_files
# After steps 5 and 6, the files shapetable, inttemp, pffmtable and normproto have been created.
# Step 7: Rename these four files (shapetable, inttemp, pffmtable, normproto) to [langname].shapetable, [langname].inttemp, [langname].pffmtable and [langname].normproto.
# Give each training output the "testlan." language prefix.
for part in inttemp normproto pffmtable shapetable; do
  mv "$part" "testlan.$part"
done
# Combine the prefixed files; the trailing dot in "testlan." is required.
combine_tessdata testlan.
# Finally, move testlan.traineddata to C:\Program Files\Tesseract-OCR\tessdata