I am trying to draw rectangles around the text areas found by the Vision framework, but they are always slightly off. I am doing it like this:

    public func drawOccurrencesOnImage(_ occurrences: [CGRect], _ image: UIImage) -> UIImage? {
        UIGraphicsBeginImageContextWithOptions(image.size, false, 0.0)
        image.draw(at: .zero)

        let currentContext = UIGraphicsGetCurrentContext()
        currentContext?.addRects(occurrences)
        currentContext?.setStrokeColor(UIColor.red.cgColor)
        currentContext?.setLineWidth(2.0)
        currentContext?.strokePath()

        let drawnImage = UIGraphicsGetImageFromCurrentImageContext()
        UIGraphicsEndImageContext()
        return drawnImage
    }

But the rectangles in the returned image are always almost, but not quite, in the right place:

[three screenshots showing the red boxes drawn slightly offset from the recognized text]

This is how I create the boxes, in exactly the same way as Apple's sample code:

    let boundingRects: [CGRect] = observations.compactMap { observation in
        guard let candidate = observation.topCandidates(1).first else { return .zero }

        let stringRange = candidate.string.startIndex..<candidate.string.endIndex
        let boxObservation = try? candidate.boundingBox(for: stringRange)
        let boundingBox = boxObservation?.boundingBox ?? .zero

        return VNImageRectForNormalizedRect(boundingBox,
                                            Int(UIViewController.chosenImage?.width ?? 0),
                                            Int(UIViewController.chosenImage?.height ?? 0))
    }

(source: https://developer.apple.com/documentation/vision/recognizing_text_in_images)

Thank you.

Filip Z
  • @Rob According to the Apple documentation (https://developer.apple.com/documentation/vision/recognizing_text_in_images). I edited the question and added it in. – Filip Z Aug 18 '22 at 07:04

1 Answer

VNImageRectForNormalizedRect returns a CGRect with the y-coordinates flipped. (I suspect it was written for macOS, which uses a different coordinate system than iOS.)
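
In principle, you could compensate by flipping the result yourself. A minimal sketch, assuming imageWidth and imageHeight are the image's dimensions in pixels (those names are mine, not from the question):

// Hypothetical workaround: flip the y-axis of the rect that
// VNImageRectForNormalizedRect returns, so (0, 0) is the upper-left corner.
var rect = VNImageRectForNormalizedRect(boundingBox, imageWidth, imageHeight) // y measured from the bottom edge
rect.origin.y = CGFloat(imageHeight) - rect.maxY                              // now y is measured from the top edge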

Instead, I might suggest a version of boundingBox adapted from Detecting Objects in Still Images:

/// Convert Vision coordinates to pixel coordinates within image.
///
/// Adapted from `boundingBox` method from
/// [Detecting Objects in Still Images](https://developer.apple.com/documentation/vision/detecting_objects_in_still_images).
/// This flips the y-axis.
///
/// - Parameters:
///   - boundingBox: The bounding box returned by Vision framework.
///   - bounds: The bounds within the image (in pixels, not points).
///
/// - Returns: The bounding box in pixel coordinates, flipped vertically so (0, 0) is in the upper-left corner.
func convert(boundingBox: CGRect, to bounds: CGRect) -> CGRect {
    let imageWidth = bounds.width
    let imageHeight = bounds.height

    // Begin with input rect.
    var rect = boundingBox

    // Reposition origin.
    rect.origin.x *= imageWidth
    rect.origin.x += bounds.minX
    rect.origin.y = (1 - rect.maxY) * imageHeight + bounds.minY

    // Rescale normalized coordinates.
    rect.size.width *= imageWidth
    rect.size.height *= imageHeight

    return rect
}

Note that I changed the method name, because it does not return a bounding box, but rather converts a bounding box (with normalized values in [0, 1]) into a CGRect. I also fixed a little bug in their boundingBox implementation. But it captures the main idea, namely flipping the y-axis of the bounding box.
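
For example, a quick sanity check of the conversion (the numbers here are purely illustrative, not from the original post):

// A Vision rect is normalized, with its origin at the *bottom*-left of the image.
let visionBox = CGRect(x: 0.1, y: 0.8, width: 0.5, height: 0.1)
let imageBounds = CGRect(x: 0, y: 0, width: 1000, height: 2000)
let pixelBox = convert(boundingBox: visionBox, to: imageBounds)
// pixelBox is (100.0, 200.0, 500.0, 200.0), with y now measured from the top.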

Anyway, that yields the right boxes:

[screenshot of the sample image with the red boxes correctly aligned over the text]


E.g.

func recognizeText(in image: UIImage) {
    guard let cgImage = image.cgImage else { return }
    let imageRequestHandler = VNImageRequestHandler(cgImage: cgImage, orientation: .up)

    let size = CGSize(width: cgImage.width, height: cgImage.height) // note, in pixels from `cgImage`; this assumes you have already rotated the image, too
    let bounds = CGRect(origin: .zero, size: size)
    // Create a new request to recognize text.
    let request = VNRecognizeTextRequest { [self] request, error in
        guard
            let results = request.results as? [VNRecognizedTextObservation],
            error == nil
        else { return }

        let rects = results.map {
            convert(boundingBox: $0.boundingBox, to: bounds)
        }

        let string = results.compactMap {
            $0.topCandidates(1).first?.string
        }.joined(separator: "\n")

        let format = UIGraphicsImageRendererFormat()
        format.scale = 1
        let final = UIGraphicsImageRenderer(bounds: bounds, format: format).image { _ in
            image.draw(in: bounds)
            UIColor.red.setStroke()
            for rect in rects {
                let path = UIBezierPath(rect: rect)
                path.lineWidth = 5
                path.stroke()
            }
        }

        DispatchQueue.main.async { [self] in
            imageView.image = final
            label.text = string
        }
    }

    DispatchQueue.global(qos: .userInitiated).async {
        do {
            try imageRequestHandler.perform([request])
        } catch {
            print("Failed to perform image request: \(error)")
            return
        }
    }
}

This uses the convert(boundingBox:to:) method shown above, plus the following helper to scale and orient the image before handing it to Vision:

///  Scale and orient picture for Vision framework
///
///  From [Detecting Objects in Still Images](https://developer.apple.com/documentation/vision/detecting_objects_in_still_images).
///
///  - Parameter image: Any `UIImage` with any orientation
///  - Returns: An image that has been rotated such that it can be safely passed to Vision framework for detection.
func scaleAndOrient(image: UIImage) -> UIImage {

    // Set a default value for limiting image size.
    let maxResolution: CGFloat = 640

    guard let cgImage = image.cgImage else {
        print("UIImage has no CGImage backing it!")
        return image
    }

    // Compute parameters for transform.
    let width = CGFloat(cgImage.width)
    let height = CGFloat(cgImage.height)
    var transform = CGAffineTransform.identity

    var bounds = CGRect(x: 0, y: 0, width: width, height: height)

    if width > maxResolution ||
        height > maxResolution {
        let ratio = width / height
        if width > height {
            bounds.size.width = maxResolution
            bounds.size.height = round(maxResolution / ratio)
        } else {
            bounds.size.width = round(maxResolution * ratio)
            bounds.size.height = maxResolution
        }
    }

    let scaleRatio = bounds.size.width / width
    let orientation = image.imageOrientation
    switch orientation {
    case .up:
        transform = .identity
    case .down:
        transform = CGAffineTransform(translationX: width, y: height).rotated(by: .pi)
    case .left:
        let boundsHeight = bounds.size.height
        bounds.size.height = bounds.size.width
        bounds.size.width = boundsHeight
        transform = CGAffineTransform(translationX: 0, y: width).rotated(by: 3.0 * .pi / 2.0)
    case .right:
        let boundsHeight = bounds.size.height
        bounds.size.height = bounds.size.width
        bounds.size.width = boundsHeight
        transform = CGAffineTransform(translationX: height, y: 0).rotated(by: .pi / 2.0)
    case .upMirrored:
        transform = CGAffineTransform(translationX: width, y: 0).scaledBy(x: -1, y: 1)
    case .downMirrored:
        transform = CGAffineTransform(translationX: 0, y: height).scaledBy(x: 1, y: -1)
    case .leftMirrored:
        let boundsHeight = bounds.size.height
        bounds.size.height = bounds.size.width
        bounds.size.width = boundsHeight
        transform = CGAffineTransform(translationX: height, y: width).scaledBy(x: -1, y: 1).rotated(by: 3.0 * .pi / 2.0)
    case .rightMirrored:
        let boundsHeight = bounds.size.height
        bounds.size.height = bounds.size.width
        bounds.size.width = boundsHeight
        transform = CGAffineTransform(scaleX: -1, y: 1).rotated(by: .pi / 2.0)
    default:
        transform = .identity
    }

    return UIGraphicsImageRenderer(size: bounds.size).image { rendererContext in
        let context = rendererContext.cgContext

        if orientation == .right || orientation == .left {
            context.scaleBy(x: -scaleRatio, y: scaleRatio)
            context.translateBy(x: -height, y: 0)
        } else {
            context.scaleBy(x: scaleRatio, y: -scaleRatio)
            context.translateBy(x: 0, y: -height)
        }
        context.concatenate(transform)
        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
    }
}
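
And a hypothetical call site tying these together (the name photo is mine, not from the original code):

// Downscale and orient the image first so Vision sees an upright bitmap,
// then run the recognition and drawing pipeline above.
let prepared = scaleAndOrient(image: photo)
recognizeText(in: prepared)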
Rob
    I solved it by ditching VNImageRectForNormalizedRect and returning the boundingBox directly, then using the convert() function provided by @Rob, where boundingBox is the normalized bounding box (a CGRect) and the second parameter is a CGRect built from the image's size in pixels (not image.size, which is in points). Convert all the occurrences with this function and draw using the results. I am just writing this for anyone stumbling onto this in the future. Thank you. – Filip Z Aug 19 '22 at 06:26
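
A sketch of the fix described in that comment, assuming observations and cgImage from the code above (the exact wiring is my reconstruction, not from the post):

// Return the raw normalized boundingBox from each observation and convert it
// with convert(boundingBox:to:) using the image's size in pixels from its
// cgImage (not image.size, which is in points).
let pixelBounds = CGRect(x: 0, y: 0,
                         width: CGFloat(cgImage.width),
                         height: CGFloat(cgImage.height))

let boundingRects: [CGRect] = observations.compactMap { observation in
    guard let candidate = observation.topCandidates(1).first else { return nil }
    let stringRange = candidate.string.startIndex..<candidate.string.endIndex
    guard let box = try? candidate.boundingBox(for: stringRange) else { return nil }
    return convert(boundingBox: box.boundingBox, to: pixelBounds)
}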