I am trying to find the coordinates of structured column text in a PDF (single or multi-page). To give the user visual feedback, I draw a red rectangle over that text. Currently, I am using the Vision framework to perform the OCR and the detection, but it's not working correctly.
The idea is to be able to detect the location of the transaction data in bank statements. This is always presented in columns going from left to right. Of course, the number of columns changes depending on the bank, so I need to be able to accurately detect where there are at least 3 columns going from left to right across the page, and return the coordinates from the beginning of the transactions to the end of the transactions.
For example, if the transactions looked like:
04-11-2023 Blah description $4.56 $6.78 $100.00
04-12-2023 Foo description $5.67 $7.79 $104.23
the coordinates returned would begin at the top left of the first transaction, and end at the bottom right of the last transaction.
From there I can perform OCR only at those coordinates to correctly output the transactions as strings.
Another issue to mention is that sometimes the descriptions for each transaction will be on multiple lines and can get fairly lengthy.
Here's the current code to display the PDFPreview with the red rectangle as well as output a debug image so I can see what's being detected.
import SwiftUI
import PDFKit
import Vision
private extension VNRecognizedTextObservation {
    /// Converts this observation's normalized bounding box into page coordinates.
    ///
    /// Vision reports `boundingBox` in a normalized (0...1) space whose origin is the
    /// lower-left corner of the image. PDF page space and non-flipped AppKit/Core
    /// Graphics contexts use the same orientation, so the box only has to be scaled
    /// and offset; it must not be flipped vertically. Fractional values are preserved
    /// rather than truncated to integers.
    ///
    /// - Parameters:
    ///   - text: The recognized string. Unused by the conversion; kept so existing
    ///     call sites continue to compile.
    ///   - pageBounds: Bounds of the page the recognized image was rendered from.
    /// - Returns: The bounding box in the coordinate space of `pageBounds`
    ///   (lower-left origin).
    /// - Throws: Nothing in the current implementation; `throws` is retained for
    ///   callers that invoke this with `try`.
    func boundingBox(for text: String, in pageBounds: CGRect) throws -> CGRect {
        let normalized: CGRect = self.boundingBox
        return CGRect(
            x: pageBounds.minX + normalized.minX * pageBounds.width,
            y: pageBounds.minY + normalized.minY * pageBounds.height,
            width: normalized.width * pageBounds.width,
            height: normalized.height * pageBounds.height
        )
    }
}
private extension String {
    /// Splits `text` into consecutive runs of characters that compare equal,
    /// ignoring case, to the first character of the run.
    ///
    /// The receiver is not consulted; only `text` is examined.
    ///
    /// - Parameter text: The string to partition.
    /// - Returns: Half-open ranges of character offsets into `text`, one per run,
    ///   in order and covering the whole string. Empty when `text` is empty.
    func glyphRanges(in text: String) -> [Range<Int>] {
        let characters = Array(text)
        var runs: [Range<Int>] = []
        var runStart = 0

        while runStart < characters.count {
            // Every member of a run is compared against the run's first character.
            let head = String(characters[runStart])
            var runEnd = runStart + 1
            while runEnd < characters.count,
                  head.compare(String(characters[runEnd]), options: .caseInsensitive) == .orderedSame {
                runEnd += 1
            }
            runs.append(runStart..<runEnd)
            runStart = runEnd
        }

        return runs
    }
}
/// Error type declared for the text-recognition flow.
/// NOTE(review): not referenced anywhere in the visible portion of this file — confirm it is still needed.
enum RecognitionError: Error {
/// Presumably signals that recognition produced no usable result — TODO confirm intended use.
case invalidResult
}
/// A SwiftUI wrapper around `PDFView` that shows the PDF at `url` and overlays
/// translucent red rectangles on every text region Vision recognizes.
///
/// Only the page that is current when the view is created is analysed.
struct PDFPreview: NSViewRepresentable {
    /// Location of the PDF document to display.
    let url: URL
    /// Zero-based index of the page to show; clamped to the document's page range.
    @Binding var currentPageIndex: Int
    /// Scale factor applied to the underlying `PDFView`.
    @Binding var zoomLevel: CGFloat

    /// Builds the `PDFView`, loads the document and annotates the initial page.
    func makeNSView(context: Context) -> PDFView {
        let pdfView = PDFView()
        pdfView.document = PDFDocument(url: url)
        pdfView.autoScales = true
        pdfView.backgroundColor = .white
        pdfView.displayMode = .singlePage
        pdfView.displayDirection = .vertical
        pdfView.displaysPageBreaks = true

        if let currentPage = pdfView.currentPage {
            modifyPageWithColumnRectangles(page: currentPage) { modifiedPage in
                // The page has already been swapped into the document at its own
                // index by `embedColumnRectsOnPage`; replacing it again here using
                // `currentPageIndex` could remove an unrelated page.
                guard let modifiedPage = modifiedPage else { return }
                pdfView.go(to: modifiedPage)
            }
        }
        return pdfView
    }

    /// Synchronises the view with the current page index and zoom level.
    func updateNSView(_ nsView: PDFView, context: Context) {
        if let document = nsView.document, document.pageCount > 0 {
            // Parenthesisation matters: `pageCount ?? 0 - 1` parses as `pageCount ?? -1`.
            let pageIndex = min(max(0, currentPageIndex), document.pageCount - 1)
            // Skip navigation when the page is already shown so that unrelated
            // updates (e.g. zoom changes) do not reset the scroll position.
            if let page = document.page(at: pageIndex), nsView.currentPage !== page {
                nsView.go(to: page)
            }
        }
        nsView.scaleFactor = zoomLevel
    }

    /// Runs text recognition on `page`, writes a debug image, and produces a copy
    /// of the page with the recognized regions highlighted.
    ///
    /// - Parameters:
    ///   - page: The page to analyse; it must belong to a document.
    ///   - completion: Called exactly once with the highlighted replacement page,
    ///     or `nil` if rendering, recognition or page generation failed.
    private func modifyPageWithColumnRectangles(page: PDFPage, completion: @escaping (PDFPage?) -> Void) {
        guard page.document != nil,
              let pageImage = convertPageToImage(page: page),
              let cgImage = pageImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            completion(nil)
            return
        }
        let pageBounds = page.bounds(for: .mediaBox)

        // A failure may be reported both through the request's handler and through
        // the error thrown by `perform`; make sure the caller hears back only once.
        var didComplete = false
        let finish: (PDFPage?) -> Void = { result in
            guard !didComplete else { return }
            didComplete = true
            completion(result)
        }

        let textRecognitionRequest = VNRecognizeTextRequest { request, error in
            if let error = error {
                print("Error: \(error)")
                finish(nil)
                return
            }
            guard let observations = request.results as? [VNRecognizedTextObservation] else {
                finish(nil)
                return
            }

            let columnRects: [CGRect] = observations.compactMap { observation -> CGRect? in
                guard let topCandidate = observation.topCandidates(1).first else { return nil }
                return try? observation.boundingBox(for: topCandidate.string, in: pageBounds)
            }
            print("Boundaries: \(columnRects)")

            // Visualize the detected rectangles on the rendered page for debugging,
            // and store the result as a real JPEG (not TIFF data with a .jpg name).
            let debugImage = self.visualizeRectangles(rectangles: columnRects, onImage: pageImage)
            self.saveImage(debugImage, withName: "column_rectangles_debug.jpg")

            finish(self.embedColumnRectsOnPage(page: page, columnRects: columnRects))
        }
        textRecognitionRequest.recognitionLevel = VNRequestTextRecognitionLevel.accurate
        textRecognitionRequest.usesLanguageCorrection = false
        textRecognitionRequest.customWords = []

        do {
            try VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([textRecognitionRequest])
        } catch {
            print("Error: \(error)")
            finish(nil)
        }
    }

    /// Draws `rectangles` as red outlines on top of `image`.
    ///
    /// - Parameters:
    ///   - rectangles: Rectangles in page units with a lower-left origin.
    ///   - image: The rendered page, as produced by `convertPageToImage(page:)`.
    /// - Returns: A new image with the outlines drawn, or `image` unchanged if an
    ///   offscreen bitmap could not be created.
    private func visualizeRectangles(rectangles: [CGRect], onImage image: NSImage) -> NSImage {
        let imageSize = image.size
        let scale = NSScreen.main?.backingScaleFactor ?? 1.0
        guard imageSize.width > 0, imageSize.height > 0,
              let bitmap = NSBitmapImageRep(
                bitmapDataPlanes: nil,
                pixelsWide: Int(imageSize.width * scale),
                pixelsHigh: Int(imageSize.height * scale),
                bitsPerSample: 8,
                samplesPerPixel: 4,
                hasAlpha: true,
                isPlanar: false,
                colorSpaceName: .calibratedRGB,
                bytesPerRow: 0,
                bitsPerPixel: 0
              ) else {
            return image
        }
        // Set the point size before creating the context, as the original code did.
        bitmap.size = imageSize
        guard let graphicsContext = NSGraphicsContext(bitmapImageRep: bitmap) else {
            return image
        }

        NSGraphicsContext.saveGraphicsState()
        NSGraphicsContext.current = graphicsContext
        image.draw(at: .zero, from: .zero, operation: .copy, fraction: 1.0)

        let context = graphicsContext.cgContext
        context.setLineWidth(2.0)
        context.setStrokeColor(NSColor.red.cgColor)
        // The image is `scale` times larger (in points) than the page, so page-space
        // rectangles are scaled by the same factor.
        let pageToImage = CGAffineTransform(scaleX: scale, y: scale)
        for rectangle in rectangles {
            context.stroke(rectangle.applying(pageToImage))
        }
        context.flush()
        NSGraphicsContext.restoreGraphicsState()

        guard let renderedImage = bitmap.cgImage else {
            return image
        }
        return NSImage(cgImage: renderedImage, size: imageSize)
    }

    /// Writes `image` as a JPEG named `name` into the user's Documents directory,
    /// logging the outcome to the console.
    private func saveImage(_ image: NSImage, withName name: String) {
        guard let imageData = image.tiffRepresentation else {
            print("Failed to get TIFF representation of the image.")
            return
        }
        guard let bitmapImageRep = NSBitmapImageRep(data: imageData) else {
            print("Failed to create NSBitmapImageRep from the image data.")
            return
        }
        guard let jpegData = bitmapImageRep.representation(using: .jpeg, properties: [:]) else {
            print("Failed to create JPEG representation of the image.")
            return
        }
        do {
            let fileURL = try FileManager.default
                .url(for: .documentDirectory, in: .userDomainMask, appropriateFor: nil, create: false)
                .appendingPathComponent(name)
            try jpegData.write(to: fileURL)
            print("Saved image at: \(fileURL.path)")
        } catch {
            print("Failed to save image: \(error)")
        }
    }

    /// Creates a copy of `page` with `columnRects` filled in translucent red and
    /// swaps it into the page's document at the same index.
    ///
    /// - Parameters:
    ///   - page: The source page; it must belong to a document.
    ///   - columnRects: Rectangles in page coordinates (lower-left origin).
    /// - Returns: The replacement page now in the document, or `nil` if the PDF
    ///   could not be generated or the page is not part of its document.
    private func embedColumnRectsOnPage(page: PDFPage, columnRects: [CGRect]) -> PDFPage? {
        guard let pageRef = page.pageRef, let document = page.document else {
            return nil
        }
        var mediaBox = page.bounds(for: .mediaBox)
        let pdfData = NSMutableData()
        guard let pdfConsumer = CGDataConsumer(data: pdfData as CFMutableData),
              let pdfContext = CGContext(consumer: pdfConsumer, mediaBox: &mediaBox, nil) else {
            return nil
        }

        pdfContext.beginPage(mediaBox: &mediaBox)
        pdfContext.drawPDFPage(pageRef)
        pdfContext.setFillColor(NSColor.red.withAlphaComponent(0.3).cgColor)
        // The rectangles are already in page space, so they are drawn as-is.
        for rect in columnRects {
            pdfContext.fill(rect)
        }
        pdfContext.endPage()
        pdfContext.closePDF()

        // Keep the temporary document alive until its page has been inserted.
        guard let newDocument = PDFDocument(data: pdfData as Data),
              let newPage = newDocument.page(at: 0) else {
            return nil
        }
        let pageIndex = document.index(for: page)
        guard pageIndex != NSNotFound, pageIndex >= 0, pageIndex < document.pageCount else {
            return nil
        }
        replacePage(at: pageIndex, with: newPage, in: document)
        return newPage
    }

    /// Renders `page` into an image suitable for text recognition.
    ///
    /// The page is drawn on an opaque white background: PDF pages frequently paint
    /// no background of their own, and dark text on transparency recognizes poorly
    /// and turns black when exported as JPEG.
    ///
    /// - Returns: The rendered image, or `nil` if the page has an empty media box
    ///   or no drawing context is available.
    private func convertPageToImage(page: PDFPage) -> NSImage? {
        let bounds = page.bounds(for: .mediaBox)
        guard bounds.width > 0, bounds.height > 0 else {
            return nil
        }
        let scale = NSScreen.main?.backingScaleFactor ?? 1.0
        let imageSize = NSSize(width: bounds.width * scale, height: bounds.height * scale)
        let image = NSImage(size: imageSize)

        image.lockFocus()
        defer { image.unlockFocus() }
        guard let context = NSGraphicsContext.current?.cgContext else {
            return nil
        }
        context.setFillColor(NSColor.white.cgColor)
        context.fill(CGRect(origin: .zero, size: imageSize))
        context.scaleBy(x: scale, y: scale)
        page.draw(with: .mediaBox, to: context)
        return image
    }

    /// Replaces the page at `index` in `document` with `newPage`.
    /// Does nothing when `index` is outside the document's page range.
    private func replacePage(at index: Int, with newPage: PDFPage, in document: PDFDocument) {
        guard index >= 0 && index < document.pageCount else {
            return
        }
        document.removePage(at: index)
        document.insert(newPage, at: index)
    }
}
I am probably overcomplicating this, or maybe not; I don't really know. I have been working on this for days and haven't been able to get it to work correctly.
This may also be possible without Vision and instead using CGPDF but I spent all day trying and can't figure it out using that method either.
Thanks in advance for any and all help!