diff --git a/android/src/main/java/com/visioncameratextrecognition/BitmapUtils.kt b/android/src/main/java/com/visioncameratextrecognition/BitmapUtils.kt
new file mode 100644
index 0000000..410f702
--- /dev/null
+++ b/android/src/main/java/com/visioncameratextrecognition/BitmapUtils.kt
@@ -0,0 +1,222 @@
+package com.visioncameratextrecognition
+
+import android.graphics.Bitmap
+import android.graphics.BitmapFactory
+import android.graphics.ImageFormat
+import android.graphics.Matrix
+import android.graphics.Rect
+import android.graphics.YuvImage
+import android.media.Image.Plane
+import android.util.Log
+import com.mrousavy.camera.core.FrameInvalidError
+import com.mrousavy.camera.core.types.Orientation
+import com.mrousavy.camera.frameprocessors.Frame
+import java.io.ByteArrayOutputStream
+import java.nio.ByteBuffer
+
+/** Converts Vision Camera frames (YUV_420_888) to upright bitmaps via NV21 and JPEG. */
+object BitmapUtils {
+    /** Log tag used when a conversion fails. */
+    private const val TAG = "BitmapUtils"
+
+    /** JPEG quality (0-100) of the intermediate NV21 -> JPEG step. */
+    private const val JPEG_QUALITY = 80
+
+    /**
+     * Converts an NV21 byte buffer to a bitmap rotated by `metadata.rotation` degrees.
+     *
+     * The buffer is compressed to JPEG and decoded again. Returns `null`, after logging,
+     * when any step of the conversion fails.
+     */
+    private fun getBitmap(data: ByteBuffer, metadata: FrameMetadata): Bitmap? {
+        data.rewind()
+        val nv21 = ByteArray(data.limit())
+        data.get(nv21, 0, nv21.size)
+        return try {
+            val yuvImage =
+                YuvImage(nv21, ImageFormat.NV21, metadata.width, metadata.height, null)
+            val jpeg = ByteArrayOutputStream().use { stream ->
+                val frameRect = Rect(0, 0, metadata.width, metadata.height)
+                yuvImage.compressToJpeg(frameRect, JPEG_QUALITY, stream)
+                stream.toByteArray()
+            }
+
+            // decodeByteArray returns null for data it cannot decode; report that explicitly
+            // instead of passing a null bitmap on to rotateBitmap.
+            val decoded: Bitmap? = BitmapFactory.decodeByteArray(jpeg, 0, jpeg.size)
+            if (decoded == null) {
+                Log.e(TAG, "Error: could not decode the JPEG produced from the NV21 buffer")
+                null
+            } else {
+                rotateBitmap(decoded, metadata.rotation, flipX = false, flipY = false)
+            }
+        } catch (e: Exception) {
+            Log.e(TAG, "Error: ${e.message}", e)
+            null
+        }
+    }
+
+    /**
+     * Converts a YUV_420_888 image from the Vision Camera API to a bitmap.
+     *
+     * The bitmap is rotated according to the frame's orientation. Returns `null` when the
+     * frame data could not be converted.
+     */
+    @Throws(FrameInvalidError::class)
+    fun getBitmap(image: Frame): Bitmap? {
+        val frameMetadata =
+            FrameMetadata.Builder()
+                .setWidth(image.width)
+                .setHeight(image.height)
+                .setRotation(getRotationDegreeFromOrientation(image.orientation))
+                .build()
+
+        val nv21Buffer =
+            yuv420ThreePlanesToNV21(image.image.planes, image.width, image.height)
+        return getBitmap(nv21Buffer, frameMetadata)
+    }
+
+    /** Maps a Vision Camera [Orientation] to the rotation, in degrees, applied to the bitmap. */
+    private fun getRotationDegreeFromOrientation(orientation: Orientation): Int =
+        when (orientation) {
+            Orientation.LANDSCAPE_LEFT -> 270
+            Orientation.LANDSCAPE_RIGHT -> 90
+            Orientation.PORTRAIT_UPSIDE_DOWN -> 180
+            // PORTRAIT, and any orientation not listed above, needs no rotation.
+            else -> 0
+        }
+
+    /**
+     * Rotates a bitmap by [rotationDegrees] and optionally mirrors it.
+     *
+     * The input bitmap is recycled when the result is a different bitmap.
+     */
+    private fun rotateBitmap(
+        bitmap: Bitmap, rotationDegrees: Int, flipX: Boolean, flipY: Boolean
+    ): Bitmap {
+        val matrix = Matrix()
+
+        // Rotate the image back to straight.
+        matrix.postRotate(rotationDegrees.toFloat())
+
+        // Mirror the image along the X or Y axis.
+        matrix.postScale(if (flipX) -1.0f else 1.0f, if (flipY) -1.0f else 1.0f)
+        val rotatedBitmap =
+            Bitmap.createBitmap(bitmap, 0, 0, bitmap.width, bitmap.height, matrix, true)
+
+        // Recycle the old bitmap if it has changed.
+        if (rotatedBitmap != bitmap) {
+            bitmap.recycle()
+        }
+        return rotatedBitmap
+    }
+
+    /**
+     * Converts YUV_420_888 to NV21 bytebuffer.
+     *
+     * The NV21 format consists of a single byte array containing the Y, U and V values. For an
+     * image of size S, the first S positions of the array contain all the Y values. The remaining
+     * positions contain interleaved V and U values. U and V are subsampled by a factor of 2 in both
+     * dimensions, so there are S/4 U values and S/4 V values. In summary, the NV21 array will contain
+     * S Y values followed by S/4 VU values: YYYYYYYYYYYYYY(...)YVUVUVUVU(...)VU
+     *
+     * YUV_420_888 is a generic format that can describe any YUV image where U and V are subsampled
+     * by a factor of 2 in both dimensions. [Image.getPlanes] returns an array with the Y, U and
+     * V planes. The Y plane is guaranteed not to be interleaved, so we can just copy its values into
+     * the first part of the NV21 array. The U and V planes may already have the representation in the
+     * NV21 format. This happens if the planes share the same buffer, the V buffer is one position
+     * before the U buffer and the planes have a pixelStride of 2. If this is the case, we can just
+     * copy them to the NV21 array.
+     */
+    private fun yuv420ThreePlanesToNV21(
+        yuv420888planes: Array<Plane>, width: Int, height: Int
+    ): ByteBuffer {
+        val imageSize = width * height
+        val out = ByteArray(imageSize + 2 * (imageSize / 4))
+
+        if (areUVPlanesNV21(yuv420888planes, width, height)) {
+            // Copy the Y values.
+            yuv420888planes[0].buffer.get(out, 0, imageSize)
+
+            val uBuffer = yuv420888planes[1].buffer
+            val vBuffer = yuv420888planes[2].buffer
+            // Get the first V value from the V buffer, since the U buffer does not contain it.
+            vBuffer.get(out, imageSize, 1)
+            // Copy the first U value and the remaining VU values from the U buffer.
+            uBuffer.get(out, imageSize + 1, 2 * imageSize / 4 - 1)
+        } else {
+            // Fallback to copying the UV values one by one, which is slower but also works.
+            // Unpack Y.
+            unpackPlane(yuv420888planes[0], width, height, out, 0, 1)
+            // Unpack U.
+            unpackPlane(yuv420888planes[1], width, height, out, imageSize + 1, 2)
+            // Unpack V.
+            unpackPlane(yuv420888planes[2], width, height, out, imageSize, 2)
+        }
+
+        return ByteBuffer.wrap(out)
+    }
+
+    /** Checks if the UV plane buffers of a YUV_420_888 image are in the NV21 format. */
+    private fun areUVPlanesNV21(planes: Array<Plane>, width: Int, height: Int): Boolean {
+        val imageSize = width * height
+
+        val uBuffer = planes[1].buffer
+        val vBuffer = planes[2].buffer
+
+        // Backup buffer properties.
+        val vBufferPosition = vBuffer.position()
+        val uBufferLimit = uBuffer.limit()
+
+        // Advance the V buffer by 1 byte, since the U buffer will not contain the first V value.
+        vBuffer.position(vBufferPosition + 1)
+        // Chop off the last byte of the U buffer, since the V buffer will not contain the last U value.
+        uBuffer.limit(uBufferLimit - 1)
+
+        // Check that the buffers are equal and have the expected number of elements.
+        val areNV21 =
+            (vBuffer.remaining() == (2 * imageSize / 4 - 2)) && (vBuffer.compareTo(uBuffer) == 0)
+
+        // Restore buffers to their initial state.
+        vBuffer.position(vBufferPosition)
+        uBuffer.limit(uBufferLimit)
+
+        return areNV21
+    }
+
+    /**
+     * Unpack an image plane into a byte array.
+     *
+     * The input plane data will be copied in 'out', starting at 'offset' and every pixel will be
+     * spaced by 'pixelStride'. Note that there is no row padding on the output.
+     */
+    private fun unpackPlane(
+        plane: Plane, width: Int, height: Int, out: ByteArray, offset: Int, pixelStride: Int
+    ) {
+        val buffer = plane.buffer
+        buffer.rewind()
+
+        // Compute the size of the current plane.
+        // We assume that it has the same aspect ratio as the original image.
+        val numRow = (buffer.limit() + plane.rowStride - 1) / plane.rowStride
+        if (numRow == 0) {
+            return
+        }
+        val scaleFactor = height / numRow
+        val numCol = width / scaleFactor
+
+        // Extract the data in the output buffer.
+        var outputPos = offset
+        var rowStart = 0
+        for (row in 0 until numRow) {
+            var inputPos = rowStart
+            for (col in 0 until numCol) {
+                out[outputPos] = buffer[inputPos]
+                outputPos += pixelStride
+                inputPos += plane.pixelStride
+            }
+            rowStart += plane.rowStride
+        }
+    }
+}
+
diff --git a/android/src/main/java/com/visioncameratextrecognition/FrameMetadata.kt b/android/src/main/java/com/visioncameratextrecognition/FrameMetadata.kt
new file mode 100644
index 0000000..def6599
--- /dev/null
+++ b/android/src/main/java/com/visioncameratextrecognition/FrameMetadata.kt
@@ -0,0 +1,27 @@
+package com.visioncameratextrecognition
+
+/** Immutable frame description: dimensions plus the rotation BitmapUtils applies, in degrees. */
+class FrameMetadata private constructor(
+    @JvmField val width: Int,
+    @JvmField val height: Int,
+    @JvmField val rotation: Int
+) {
+    /** Builder of [FrameMetadata]. */
+    class Builder {
+        private var width = 0
+        private var height = 0
+        private var rotation = 0
+
+        /** Sets the frame width and returns this builder for chaining. */
+        fun setWidth(width: Int): Builder = apply { this.width = width }
+
+        /** Sets the frame height and returns this builder for chaining. */
+        fun setHeight(height: Int): Builder = apply { this.height = height }
+
+        /** Sets the rotation and returns this builder for chaining. */
+        fun setRotation(rotation: Int): Builder = apply { this.rotation = rotation }
+
+        /** Creates the [FrameMetadata]; values that were never set default to 0. */
+        fun build(): FrameMetadata = FrameMetadata(width, height, rotation)
+    }
+}
diff --git a/android/src/main/java/com/visioncameratextrecognition/VisionCameraTextRecognitionPlugin.kt b/android/src/main/java/com/visioncameratextrecognition/VisionCameraTextRecognitionPlugin.kt
index f48c947..0b1c918 100644
--- a/android/src/main/java/com/visioncameratextrecognition/VisionCameraTextRecognitionPlugin.kt
+++ b/android/src/main/java/com/visioncameratextrecognition/VisionCameraTextRecognitionPlugin.kt
@@ -1,5 +1,6 @@
 package com.visioncameratextrecognition
 
+import android.graphics.Bitmap
 import android.graphics.Point
 import android.graphics.Rect
 import android.media.Image
@@ -18,12 +19,12 @@ import com.google.mlkit.vision.text.latin.TextRecognizerOptions
 import com.mrousavy.camera.frameprocessors.Frame
 import 
com.mrousavy.camera.frameprocessors.FrameProcessorPlugin
 import com.mrousavy.camera.frameprocessors.VisionCameraProxy
-import java.util.HashMap
 
 class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map?) :
     FrameProcessorPlugin() {
 
     private var recognizer = TextRecognition.getClient(TextRecognizerOptions.DEFAULT_OPTIONS)
+    private var scanRegion: Map<*, *>? = null
     private val latinOptions = TextRecognizerOptions.DEFAULT_OPTIONS
     private val chineseOptions = ChineseTextRecognizerOptions.Builder().build()
     private val devanagariOptions = DevanagariTextRecognizerOptions.Builder().build()
@@ -32,6 +33,7 @@ class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map?
         recognizer = when (language) {
             "latin" -> TextRecognition.getClient(latinOptions)
             "chinese" -> TextRecognition.getClient(chineseOptions)
@@ -44,9 +46,37 @@ class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map?): HashMap? {
         val data = WritableNativeMap()
-        val mediaImage: Image = frame.image
-        val image =
-            InputImage.fromMediaImage(mediaImage, frame.imageProxy.imageInfo.rotationDegrees)
+        val rotationDegrees = frame.imageProxy.imageInfo.rotationDegrees
+        val region = scanRegion
+        val image: InputImage = if (region != null) {
+            val source: Bitmap = BitmapUtils.getBitmap(frame) ?: return null
+
+            // Region entries are percentages of the bitmap size. A missing or non-numeric
+            // entry falls back to the full frame, matching the defaults of the iOS plugin.
+            fun fraction(key: String, defaultPercent: Double): Double =
+                ((region[key] as? Number)?.toDouble() ?: defaultPercent) / 100.0
+
+            // Clamp the crop rectangle to the bitmap: Bitmap.createBitmap throws when the
+            // rectangle is empty or reaches outside the source.
+            val left = (fraction("left", 0.0) * source.width).toInt()
+                .coerceIn(0, source.width - 1)
+            val top = (fraction("top", 0.0) * source.height).toInt()
+                .coerceIn(0, source.height - 1)
+            val width = (fraction("width", 100.0) * source.width).toInt()
+                .coerceIn(1, source.width - left)
+            val height = (fraction("height", 100.0) * source.height).toInt()
+                .coerceIn(1, source.height - top)
+
+            val cropped = Bitmap.createBitmap(source, left, top, width, height, null, false)
+            // Release the full-size bitmap once a separate cropped copy exists.
+            if (cropped !== source) {
+                source.recycle()
+            }
+            InputImage.fromBitmap(cropped, rotationDegrees)
+        } else {
+            val mediaImage: Image = frame.image
+            InputImage.fromMediaImage(mediaImage, rotationDegrees)
+        }
         val task: Task<Text> = recognizer.process(image)
         try {
             val text: Text = Tasks.await(task)
diff --git a/ios/VisionCameraTextRecognition.swift b/ios/VisionCameraTextRecognition.swift
index 8bf8aef..2739368 100644
--- a/ios/VisionCameraTextRecognition.swift
+++ b/ios/VisionCameraTextRecognition.swift
@@ -12,6 +12,7 @@ import MLKitCommon
 public class VisionCameraTextRecognition: FrameProcessorPlugin {
 
     private var textRecognizer = TextRecognizer()
+    private var scanRegion: [String: Int]? = nil
     private static let latinOptions = TextRecognizerOptions()
     private static let chineseOptions = ChineseTextRecognizerOptions()
     private static let devanagariOptions = DevanagariTextRecognizerOptions()
@@ -23,6 +24,7 @@ public class VisionCameraTextRecognition: FrameProcessorPlugin {
     public override init(proxy: VisionCameraProxyHolder, options: [AnyHashable: Any]! = [:]) {
         super.init(proxy: proxy, options: options)
         let language = options["language"] as? String ?? "latin"
+        scanRegion = options["scanRegion"] as? 
[String: Int]
         switch language {
         case "chinese":
             self.textRecognizer = TextRecognizer.textRecognizer(options: VisionCameraTextRecognition.chineseOptions)
@@ -40,11 +42,43 @@ public class VisionCameraTextRecognition: FrameProcessorPlugin {
 
     public override func callback(_ frame: Frame, withArguments arguments: [AnyHashable: Any]?) -> Any {
         let buffer = frame.buffer
-        let image = VisionImage(buffer: buffer)
-        image.orientation = getOrientation(orientation: frame.orientation)
-
+        var image: VisionImage?
         do {
-            let result = try self.textRecognizer.results(in: image)
+            if let region = scanRegion {
+                guard let pixelBuffer = CMSampleBufferGetImageBuffer(buffer) else {
+                    return [:]
+                }
+                let ciImage = CIImage(cvPixelBuffer: pixelBuffer).oriented(.right)
+                // NOTE(review): a CIContext is created for every frame; consider reusing one.
+                let context = CIContext(options: nil)
+                // createCGImage can fail; return an empty result instead of leaving `image`
+                // unset, which a force-unwrap further down used to crash on.
+                guard let cgImage = context.createCGImage(ciImage, from: ciImage.extent) else {
+                    return [:]
+                }
+                // Region values are percentages of the oriented image; missing keys select
+                // the full frame.
+                let imgWidth = Double(cgImage.width)
+                let imgHeight = Double(cgImage.height)
+                let cropRegion = CGRect(
+                    x: Double(region["left"] ?? 0) / 100.0 * imgWidth,
+                    y: Double(region["top"] ?? 0) / 100.0 * imgHeight,
+                    width: Double(region["width"] ?? 100) / 100.0 * imgWidth,
+                    height: Double(region["height"] ?? 100) / 100.0 * imgHeight
+                )
+                guard let croppedCGImage = cgImage.cropping(to: cropRegion) else {
+                    return [:]
+                }
+                image = VisionImage(image: UIImage(cgImage: croppedCGImage))
+            } else {
+                let fullImage = VisionImage(buffer: buffer)
+                fullImage.orientation = getOrientation(orientation: frame.orientation)
+                image = fullImage
+            }
+            guard let visionImage = image else {
+                return [:]
+            }
+            let result = try self.textRecognizer.results(in: visionImage)
             let blocks = VisionCameraTextRecognition.processBlocks(blocks: result.blocks)
             data["resultText"] = result.text
             data["blocks"] = blocks
diff --git a/src/Camera.tsx b/src/Camera.tsx
index 8ae84f5..3ab8371 100644
--- a/src/Camera.tsx
+++ b/src/Camera.tsx
@@ -44,7 +44,7 @@ export const Camera = forwardRef(function Camera(
   const frameProcessor: ReadonlyFrameProcessor = useFrameProcessor(
     (frame: Frame) => {
       'worklet';
-      const data: Text[] | string = plugin(frame);
+      const data: Text | string = plugin(frame);
       // @ts-ignore
       useWorklets(data);
     },
diff --git a/src/scanText.ts b/src/scanText.ts
index fbd9d79..70fbeae 100644
--- a/src/scanText.ts
+++ b/src/scanText.ts
@@ -18,10 +18,10 @@ export function createTextRecognitionPlugin(
     throw new Error(LINKING_ERROR);
   }
   return {
-    scanText: (frame: Frame): Text[] => {
+    scanText: (frame: Frame): Text => {
       'worklet';
       // @ts-ignore
-      return plugin.call(frame) as Text[];
+      return plugin.call(frame) as Text;
     },
   };
 }
diff --git a/src/types.ts b/src/types.ts
index ad397e6..dffa681 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -68,8 +68,23 @@ export type Languages =
   | 'vi'
   | 'cy';
 
+/**
+ * Region of the camera frame that text recognition is restricted to.
+ *
+ * All values are percentages (0-100) of the image handed to the recognizer:
+ * `left`/`width` of its width, `top`/`height` of its height.
+ */
+export type ScanRegion = {
+  left: number;
+  top: number;
+  width: number;
+  height: number;
+};
+
 export type TextRecognitionOptions = {
   language: 'latin' | 'chinese' | 'devanagari' | 'japanese' | 'korean';
+  /** Optional crop applied before recognition; the whole frame is used when omitted. */
+  scanRegion?: ScanRegion;
 };
 
 export type TranslatorOptions = {
@@ -85,27 +100,26 @@ export type CameraTypes = {
     | { mode: 'translate'; options: TranslatorOptions }
   );;
 
-
-
 export type TextRecognitionPlugin = {
-  scanText: (frame: Frame) => Text[];
+  scanText: (frame: Frame) => Text;
 };
 
+
 export type TranslatorPlugin = {
   translate: (frame: Frame) => string;
 };
 
 export type Text = {
-  blocks: BlocksData;
+  blocks: BlocksData[];
   resultText: string;
 };
 
-type BlocksData = [
+type BlocksData = {
   blockFrame: FrameType,
   blockCornerPoints: CornerPointsType,
   lines: LinesData,
   blockLanguages: string[] | [],
   blockText: string,
-];
+};
 
 type CornerPointsType = [{ x: number; y: number }];