0

I am trying to find the coordinates of structured column text in a PDF (single or multi-page) and, to give the user visual feedback, draw a red rectangle over that text. So far I have tried using the Vision framework to perform the OCR and do the detection, but it's not working correctly.

The idea is to be able to detect the location of the transaction data in bank statements. This is always presented in columns going from left to right. Of course, the number of columns changes depending on the bank, so I need to be able to accurately detect where there are at least 3 columns going from left to right across the page and return the coordinates from the beginning of the transactions to the end of the transactions.

For example, if the transactions looked like:

04-11-2023     Blah description     $4.56     $6.78     $100.00
04-12-2023     Foo description      $5.67     $7.79     $104.23

the coordinates returned would begin at the top left of the first transaction, and end at the bottom right of the last transaction.

From there I can perform OCR only at those coordinates to correctly output the transactions as strings.

Another issue to mention is that sometimes the descriptions for each transaction will be on multiple lines and can get fairly lengthy.

Here's the current code to display the PDFPreview with the red rectangle as well as output a debug image so I can see what's being detected.

import SwiftUI
import PDFKit
import Vision

private extension VNRecognizedTextObservation {
    /// Converts this observation's normalized bounding box into a rectangle in `pageBounds` space.
    ///
    /// Vision reports `boundingBox` in normalized coordinates (0...1) whose origin is the
    /// lower-left corner of the image. PDF page space and non-flipped AppKit drawing contexts
    /// use a lower-left origin as well, so the box only has to be scaled and offset; flipping
    /// the y axis would mirror every rectangle vertically.
    ///
    /// - Parameters:
    ///   - text: The recognized string. It does not influence the result and is kept so that
    ///     existing call sites continue to compile.
    ///   - pageBounds: The rectangle (for example the page's media box) the normalized box is
    ///     mapped into.
    /// - Returns: The bounding box expressed in the coordinate space of `pageBounds`, with a
    ///   lower-left origin and sub-point precision preserved.
    /// - Throws: Nothing at present; `throws` is retained so callers using `try?` are unaffected.
    func boundingBox(for text: String, in pageBounds: CGRect) throws -> CGRect {
        let normalized = boundingBox

        return CGRect(
            x: pageBounds.origin.x + normalized.origin.x * pageBounds.size.width,
            y: pageBounds.origin.y + normalized.origin.y * pageBounds.size.height,
            width: normalized.size.width * pageBounds.size.width,
            height: normalized.size.height * pageBounds.size.height
        )
    }
}

private extension String {
    /// Splits `text` into runs of consecutive characters that compare equal, ignoring case,
    /// to the first character of the run.
    ///
    /// The receiver itself is not consulted; only `text` is scanned.
    ///
    /// - Parameter text: The string to scan.
    /// - Returns: Half-open ranges of character offsets into `text`, one per run, in order.
    ///   An empty `text` yields an empty array.
    func glyphRanges(in text: String) -> [Range<Int>] {
        var runs: [Range<Int>] = []
        var runStart = text.startIndex
        // Character offset of `runStart`, tracked alongside the index.
        var runStartOffset = 0

        while runStart != text.endIndex {
            let head = text[runStart...runStart]
            var cursor = text.index(after: runStart)
            var cursorOffset = runStartOffset + 1

            // Extend the run while each following character matches the run's first character.
            while cursor != text.endIndex,
                  head.compare(text[cursor...cursor], options: .caseInsensitive) == .orderedSame {
                cursor = text.index(after: cursor)
                cursorOffset += 1
            }

            runs.append(runStartOffset..<cursorOffset)
            runStart = cursor
            runStartOffset = cursorOffset
        }

        return runs
    }
}

/// Errors describing a failed text-recognition step.
enum RecognitionError: Error {
    // NOTE(review): no code visible in this listing throws this case — confirm it is still needed.
    case invalidResult
}

struct PDFPreview: NSViewRepresentable {
    let url: URL
    @Binding var currentPageIndex: Int
    @Binding var zoomLevel: CGFloat

    /// Builds the `PDFView`, loads the document at `url`, and overlays the detected text
    /// rectangles on the page that is current right after loading.
    ///
    /// - Parameter context: The representable context supplied by SwiftUI (unused).
    /// - Returns: The configured `PDFView`.
    func makeNSView(context: Context) -> PDFView {
        let pdfView = PDFView()
        pdfView.document = PDFDocument(url: url)
        pdfView.autoScales = true
        pdfView.backgroundColor = .white
        pdfView.displayMode = .singlePage
        pdfView.displayDirection = .vertical
        pdfView.displaysPageBreaks = true

        if let currentPage = pdfView.currentPage {
            modifyPageWithColumnRectangles(page: currentPage) { modifiedPage in
                guard let modifiedPage = modifiedPage else { return }

                // `embedColumnRectsOnPage` has already swapped the annotated page into the
                // document at the original page's index. Replacing again at `currentPageIndex`
                // would remove an unrelated page whenever the two indices differ, so only
                // navigate to the new page here.
                pdfView.go(to: modifiedPage)
            }
        }

        return pdfView
    }

    /// Synchronizes the view with the bound page index and zoom level.
    ///
    /// - Parameters:
    ///   - nsView: The `PDFView` created by `makeNSView(context:)`.
    ///   - context: The representable context supplied by SwiftUI (unused).
    func updateNSView(_ nsView: PDFView, context: Context) {
        // Note: `pageCount ?? 0 - 1` parses as `pageCount ?? (0 - 1)`, which would allow an index
        // equal to `pageCount`. Compute the last valid index explicitly instead.
        if let document = nsView.document, document.pageCount > 0 {
            let lastIndex = document.pageCount - 1
            let pageIndex = min(max(0, currentPageIndex), lastIndex)

            // Skip navigation rather than jumping to a blank placeholder page.
            if let page = document.page(at: pageIndex) {
                nsView.go(to: page)
            }
        }

        nsView.scaleFactor = zoomLevel
    }

    /// Runs text recognition on a rendered image of `page`, then produces a copy of the page
    /// with every recognized text box highlighted.
    ///
    /// A debug image with the boxes stroked in red is also written to the Documents directory
    /// as `column_rectangles_debug.jpg`.
    ///
    /// - Parameters:
    ///   - page: The page to analyze. It must belong to a document.
    ///   - completion: Called exactly once with the highlighted page, or `nil` when rendering,
    ///     recognition, or page generation fails.
    private func modifyPageWithColumnRectangles(page: PDFPage, completion: @escaping (PDFPage?) -> Void) {
        // The request handler may report an error and `perform` may throw for the same failure;
        // route every outcome through `finish` so `completion` is never invoked twice.
        var hasFinished = false
        let finish: (PDFPage?) -> Void = { result in
            guard !hasFinished else { return }
            hasFinished = true
            completion(result)
        }

        guard page.document != nil,
              let pageImage = convertPageToImage(page: page),
              let cgImage = pageImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            finish(nil)
            return
        }

        let pageBounds = page.bounds(for: .mediaBox)
        let requestHandler = VNImageRequestHandler(cgImage: cgImage, options: [:])

        let textRecognitionRequest = VNRecognizeTextRequest { request, error in
            if let error = error {
                print("Error: \(error)")
                finish(nil)
                return
            }

            guard let observations = request.results as? [VNRecognizedTextObservation] else {
                finish(nil)
                return
            }

            // One rectangle per observation that produced at least one text candidate.
            let columnRects: [CGRect] = observations.compactMap { observation -> CGRect? in
                guard let topCandidate = observation.topCandidates(1).first else {
                    return nil
                }
                return try? observation.boundingBox(for: topCandidate.string, in: pageBounds)
            }

            print("Boundaries: \(columnRects)")

            // Visualize the detected rectangles on the image for debugging. `saveImage` encodes
            // real JPEG data, so the file contents match the `.jpg` extension.
            let debugImage = self.visualizeRectangles(rectangles: columnRects, onImage: pageImage)
            self.saveImage(debugImage, withName: "column_rectangles_debug.jpg")

            finish(self.embedColumnRectsOnPage(page: page, columnRects: columnRects))
        }

        textRecognitionRequest.recognitionLevel = VNRequestTextRecognitionLevel.accurate
        textRecognitionRequest.usesLanguageCorrection = false
        textRecognitionRequest.customWords = []

        do {
            try requestHandler.perform([textRecognitionRequest])
        } catch {
            print("Error: \(error)")
            finish(nil)
        }
    }


    /// Draws `image` into a new bitmap and strokes each rectangle in red on top of it.
    ///
    /// - Parameters:
    ///   - rectangles: Rectangles to stroke. Each is multiplied by the main screen's backing
    ///     scale factor before drawing.
    ///   - image: The background image.
    /// - Returns: The annotated image, or `image` unchanged when the bitmap or its drawing
    ///   context cannot be created (for example for a zero-sized image).
    private func visualizeRectangles(rectangles: [CGRect], onImage image: NSImage) -> NSImage {
        let imageSize = image.size
        let scale = NSScreen.main?.backingScaleFactor ?? 1.0

        guard let bitmap = NSBitmapImageRep(
            bitmapDataPlanes: nil,
            pixelsWide: Int(imageSize.width * scale),
            pixelsHigh: Int(imageSize.height * scale),
            bitsPerSample: 8,
            samplesPerPixel: 4,
            hasAlpha: true,
            isPlanar: false,
            colorSpaceName: .calibratedRGB,
            bytesPerRow: 0,
            bitsPerPixel: 0
        ) else {
            return image
        }

        // Set the point size before creating the context, as the original code did.
        bitmap.size = imageSize

        guard let bitmapContext = NSGraphicsContext(bitmapImageRep: bitmap) else {
            return image
        }

        NSGraphicsContext.saveGraphicsState()
        NSGraphicsContext.current = bitmapContext

        image.draw(at: .zero, from: .zero, operation: .copy, fraction: 1.0)

        let context = bitmapContext.cgContext
        context.setLineWidth(2.0)
        context.setStrokeColor(NSColor.red.cgColor)

        for rectangle in rectangles {
            let scaledRect = rectangle.applying(CGAffineTransform(scaleX: scale, y: scale))
            context.stroke(scaledRect)
        }

        context.flush()

        NSGraphicsContext.restoreGraphicsState()

        guard let renderedImage = bitmap.cgImage else {
            return image
        }

        return NSImage(cgImage: renderedImage, size: imageSize)
    }

    /// Encodes `image` as JPEG and writes it to the user's Documents directory.
    ///
    /// Every failure is reported with `print`; nothing is thrown or returned.
    ///
    /// - Parameters:
    ///   - image: The image to encode.
    ///   - name: File name (including extension) appended to the Documents directory URL.
    private func saveImage(_ image: NSImage, withName name: String) {
        guard let tiffData = image.tiffRepresentation else {
            print("Failed to get TIFF representation of the image.")
            return
        }
        guard let rep = NSBitmapImageRep(data: tiffData) else {
            print("Failed to create NSBitmapImageRep from the image data.")
            return
        }
        guard let encoded = rep.representation(using: .jpeg, properties: [:]) else {
            print("Failed to create JPEG representation of the image.")
            return
        }

        // Resolve the destination first, then write; both steps share the same failure message.
        let documentsURL: URL
        do {
            documentsURL = try FileManager.default.url(
                for: .documentDirectory,
                in: .userDomainMask,
                appropriateFor: nil,
                create: false
            )
        } catch {
            print("Failed to save image: \(error)")
            return
        }

        let destination = documentsURL.appendingPathComponent(name)
        do {
            try encoded.write(to: destination)
        } catch {
            print("Failed to save image: \(error)")
            return
        }

        print("Saved image at: \(destination.path)")
    }


    /// Re-renders `page` into a new single-page PDF with translucent red fills over
    /// `columnRects`, then swaps that new page into the original document in place of `page`.
    ///
    /// - Parameters:
    ///   - page: The page to copy and annotate. It must belong to a document.
    ///   - columnRects: Rectangles to fill. They are passed to the PDF context unchanged.
    /// - Returns: The newly inserted page, or `nil` when the page has no backing `CGPDFPage`
    ///   or document, or when the PDF context or resulting document cannot be created.
    private func embedColumnRectsOnPage(page: PDFPage, columnRects: [CGRect]) -> PDFPage? {
        guard let pageRef = page.pageRef, let document = page.document else {
            return nil
        }

        // Defensive: presumably `index(for:)` yields `NSNotFound` for a page that is not in
        // the document; never hand that value to `removePage(at:)`.
        let pageIndex = document.index(for: page)
        guard pageIndex != NSNotFound else {
            return nil
        }

        var mediaBox = page.bounds(for: .mediaBox)
        let pdfData = NSMutableData()

        guard let pdfConsumer = CGDataConsumer(data: pdfData as CFMutableData),
              let pdfContext = CGContext(consumer: pdfConsumer, mediaBox: &mediaBox, nil) else {
            return nil
        }

        pdfContext.beginPage(mediaBox: &mediaBox)
        pdfContext.drawPDFPage(pageRef)
        pdfContext.setFillColor(NSColor.red.withAlphaComponent(0.3).cgColor)

        // Dividing by the page size and multiplying back again is an identity transform, so the
        // rectangles are filled directly.
        for rect in columnRects {
            pdfContext.fill(rect)
        }

        pdfContext.endPage()
        pdfContext.closePDF()

        guard let newDocument = PDFDocument(data: pdfData as Data),
              let newPage = newDocument.page(at: 0) else {
            return nil
        }

        document.removePage(at: pageIndex)
        document.insert(newPage, at: pageIndex)
        return newPage
    }

    /// Renders the media box of `page` into an `NSImage` whose size is the page size multiplied
    /// by the main screen's backing scale factor (1.0 when no main screen is available).
    ///
    /// - Parameter page: The page to render.
    /// - Returns: The rendered image. The image is returned even if no current graphics context
    ///   was available while drawing, in which case nothing was drawn into it.
    private func convertPageToImage(page: PDFPage) -> NSImage? {
        let mediaBounds = page.bounds(for: .mediaBox)
        let factor = NSScreen.main?.backingScaleFactor ?? 1.0
        let canvasSize = NSSize(
            width: mediaBounds.size.width * factor,
            height: mediaBounds.size.height * factor
        )

        let rendered = NSImage(size: canvasSize)
        rendered.lockFocus()
        defer { rendered.unlockFocus() }

        guard let cgContext = NSGraphicsContext.current?.cgContext else {
            return rendered
        }

        // Scale so that one page point covers `factor` image points.
        cgContext.scaleBy(x: factor, y: factor)
        page.draw(with: .mediaBox, to: cgContext)

        return rendered
    }

    /// Replaces the page at `index` in `document` with `newPage`.
    ///
    /// Does nothing when `index` is outside `0..<document.pageCount`.
    ///
    /// - Parameters:
    ///   - index: Zero-based index of the page to replace.
    ///   - newPage: The page to insert at `index`.
    ///   - document: The document to modify.
    private func replacePage(at index: Int, with newPage: PDFPage, in document: PDFDocument) {
        let validIndices = 0..<document.pageCount

        if validIndices.contains(index) {
            document.removePage(at: index)
            document.insert(newPage, at: index)
        }
    }
}

I am probably overcomplicating this, or maybe not; I don't really know. I have been working on this for days and haven't been able to get it to work correctly.

This may also be possible without Vision and instead using CGPDF but I spent all day trying and can't figure it out using that method either.

Thanks in advance for any and all help!

kittonian
  • 1,020
  • 10
  • 22

0 Answers0