@jpnadas In this case the code you copied from my answer in this post isn't really suitable because it addresses the case when a table doesn't have surrounding grid. That algorithm looks for repeating blocks of texts and tries to find a pattern that resembles a table heuristically.
But in this particular case the table does have the grid and by taking this advantage we can achieve a lot more accurate result.
The strategy is the following:
- Increase image gamma to make the grid darker
- Get rid of colour and apply Otsu thresholding
- Find long vertical an horizontal lines in the image and create a mask from it using
erode
and dilate
functions
- Find the cell blocks in the mask using
findContours
function.
Find table objects
5.1 The rest can be as in the post about finding a table without the
grid: find table structure heuristically
5.2 Alternative approach could be using hierarchy
returned by the findContours
function. This approach is even more accurate and
allows to find multiple tables on a single image.
Having cell coordinates it's easy to extract certain cell image from the original image:
cell_image = image[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
Apply OCR to each cell_image
.
BUT! I consider the OpenCV approach as a last resort when you're not able to read the PDF's contents: for instance in case when a PDF contains raster image inside.
If it's a vector-based PDF and its contents are readable it makes more sense to find the table inside contents and just read the text from it instead of doing heavy 'OCR lifting'.
Here's the code for reference for more accurate table recognition:
import os
import imutils
import numpy as np
import argparse
import cv2
def gamma_correction(image, gamma = 1.0):
look_up_table = np.empty((1,256), np.uint8)
for i in range(256):
look_up_table[0,i] = np.clip(pow(i / 255.0, gamma) * 255.0, 0, 255)
result = cv2.LUT(image, look_up_table)
return result
def pre_process_image(image):
# Let's get rid of color first
# Applying gamma to make the table lines darker
gamma = gamma_correction(image, 2)
# Getting rid of color
gray = cv2.cvtColor(gamma, cv2.COLOR_BGR2GRAY)
# Then apply Otsu threshold to reveal important areas
ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
# inverting the thresholded image
return ~thresh
def get_horizontal_lines_mask(image, horizontal_size=100):
horizontal = image.copy()
horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
horizontal = cv2.erode(horizontal, horizontal_structure, anchor=(-1, -1), iterations=1)
horizontal = cv2.dilate(horizontal, horizontal_structure, anchor=(-1, -1), iterations=1)
return horizontal
def get_vertical_lines_mask(image, vertical_size=100):
vertical = image.copy()
vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
vertical = cv2.erode(vertical, vertical_structure, anchor=(-1, -1), iterations=1)
vertical = cv2.dilate(vertical, vertical_structure, anchor=(-1, -1), iterations=1)
return vertical
def make_lines_mask(preprocessed, min_horizontal_line_size=100, min_vertical_line_size=100):
hor = get_horizontal_lines_mask(preprocessed, min_horizontal_line_size)
ver = get_vertical_lines_mask(preprocessed, min_vertical_line_size)
mask = np.zeros((preprocessed.shape[0], preprocessed.shape[1], 1), dtype=np.uint8)
mask = cv2.bitwise_or(mask, hor)
mask = cv2.bitwise_or(mask, ver)
return ~mask
def find_cell_boxes(mask):
# Looking for the text spots contours
# OpenCV 3
# img, contours, hierarchy = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# OpenCV 4
contours = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours = imutils.grab_contours(contours)
contours = sorted(contours, key=cv2.contourArea, reverse=True)
image_width = mask.shape[1]
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
box = cv2.boundingRect(contour)
w = box[2]
# Excluding the page box shape but adding smaller boxes
if w < 0.95 * image_width:
boxes.append(box)
return boxes
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
rows = {}
cols = {}
# Clustering the bounding boxes by their positions
for box in boxes:
(x, y, w, h) = box
col_key = x // cell_threshold
row_key = y // cell_threshold
cols[row_key] = [box] if col_key not in cols else cols[col_key] + [box]
rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]
# Filtering out the clusters having less than 2 cols
table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
# Sorting the row cells by x coord
table_cells = [list(sorted(tb)) for tb in table_cells]
# Sorting rows by the y coord
table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))
return table_cells
def build_vertical_lines(table_cells):
if table_cells is None or len(table_cells) <= 0:
return [], []
max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]
max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
max_y = max_last_row_height_box[1] + max_last_row_height_box[3]
hor_lines = []
ver_lines = []
for box in table_cells:
x = box[0][0]
y = box[0][1]
hor_lines.append((x, y, max_x, y))
for box in table_cells[0]:
x = box[0]
y = box[1]
ver_lines.append((x, y, x, max_y))
(x, y, w, h) = table_cells[0][-1]
ver_lines.append((max_x, y, max_x, max_y))
(x, y, w, h) = table_cells[0][0]
hor_lines.append((x, max_y, max_x, max_y))
return hor_lines, ver_lines
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to images directory")
args = vars(ap.parse_args())
in_file = args["image"]
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
img = cv2.imread(in_file)
pre_processed = pre_process_image(img)
# Visualizing pre-processed image
cv2.imwrite(filename_base + ".pre.png", pre_processed)
lines_mask = make_lines_mask(pre_processed, min_horizontal_line_size=1800, min_vertical_line_size=500)
# Visualizing table lines mask
cv2.imwrite(filename_base + ".mask.png", lines_mask)
cell_boxes = find_cell_boxes(lines_mask)
cells = find_table_in_boxes(cell_boxes)
# apply OCR to each cell rect here
# the cells array contains cell coordinates in tuples (x, y, w, h)
hor_lines, ver_lines = build_vertical_lines(cells)
# Visualize the table lines
vis = img.copy()
for line in hor_lines:
[x1, y1, x2, y2] = line
cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
for line in ver_lines:
[x1, y1, x2, y2] = line
cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
cv2.imwrite(filename_base + ".result.png", vis)
Some parameters are hard-coded:
page size threshold - 0.95
min horizontal line size - 1800 px
min vertical line size - 500 px
You can provide them as configurable parameters or make them relative to image size.
Results:
