I know Core ML has image preprocessing options for this. But I want to do the image normalization myself in Swift, so I can check that my result is the same as what a Core ML model with image preprocessing produces.
For the Core ML image preprocessing options, I set the scale and bias as suggested for PyTorch models: scale = 1/(0.226*255.0), bias = [-0.485/0.229, -0.456/0.224, -0.406/0.225].
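As a side note on those numbers: the per-channel PyTorch normalization (x/255 - mean)/std can only match scale*x + bias approximately, because Core ML's image scale is a single scalar (0.226 is roughly the average of the three stds). A quick Swift sanity check of what I expect (the values are just the standard ImageNet stats):

let means: [Float] = [0.485, 0.456, 0.406] // ImageNet means, RGB order
let stds: [Float] = [0.229, 0.224, 0.225]  // ImageNet stds
let scale: Float = 1 / (0.226 * 255.0)     // single scalar; 0.226 ≈ average std
let biases: [Float] = [-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225]

let x: Float = 200 // a sample 8-bit red value
let torchValue = (x / 255.0 - means[0]) / stds[0] // ≈ 1.307
let coreMLValue = scale * x + biases[0]           // ≈ 1.353
print(torchValue, coreMLValue) // close but not identical, since 0.226 ≠ 0.229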
For a Core ML model converted without those options, I tried to normalize the image myself using the two methods below, but with no luck.
- UIImage converted to CVPixelBuffer -> pixel buffer standardization -> model.predict:
import CoreVideo

let mean: [Float] = [0.406, 0.456, 0.485] // BGR order, matching a 32BGRA source buffer
let std: [Float] = [0.225, 0.224, 0.229]

func normalizePixelBuffer(_ pixelBuffer: CVPixelBuffer) -> CVPixelBuffer? {
    let width = CVPixelBufferGetWidth(pixelBuffer)
    let height = CVPixelBufferGetHeight(pixelBuffer)
    // Create a new pixel buffer for the normalized data.
    // 32BGRA only holds 8-bit channels, so use a 128-bit float
    // format that can actually store the Float values written below.
    var normalizedPixelBuffer: CVPixelBuffer?
    CVPixelBufferCreate(kCFAllocatorDefault, width, height,
                        kCVPixelFormatType_128RGBAFloat, nil, &normalizedPixelBuffer)
    guard let normalizedBuffer = normalizedPixelBuffer else {
        return nil
    }
    CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
    CVPixelBufferLockBaseAddress(normalizedBuffer, [])
    // Get pointers to both pixel buffers
    let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer)
    let normalizedBaseAddress = CVPixelBufferGetBaseAddress(normalizedBuffer)
    // Normalize the pixel values
    let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
    let normalizedBytesPerRow = CVPixelBufferGetBytesPerRow(normalizedBuffer)
    for y in 0..<height {
        for x in 0..<width {
            let pixelOffset = y * bytesPerRow + x * 4                       // 4 bytes per BGRA pixel
            let normalizedPixelOffset = y * normalizedBytesPerRow + x * 16  // 4 Floats per pixel
            // Access the pixel values
            let pixel = baseAddress!.advanced(by: pixelOffset).assumingMemoryBound(to: UInt8.self)
            let normalizedPixel = normalizedBaseAddress!.advanced(by: normalizedPixelOffset).assumingMemoryBound(to: Float.self)
            // Standardize the 8-bit values, keeping the source BGRA channel order
            normalizedPixel[0] = (Float(pixel[0]) / 255 - mean[0]) / std[0]
            normalizedPixel[1] = (Float(pixel[1]) / 255 - mean[1]) / std[1]
            normalizedPixel[2] = (Float(pixel[2]) / 255 - mean[2]) / std[2]
            normalizedPixel[3] = Float(pixel[3]) / 255 // alpha is only rescaled, not standardized
        }
    }
    CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly)
    CVPixelBufferUnlockBaseAddress(normalizedBuffer, [])
    return normalizedBuffer
}
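Because I'm not sure a float-format pixel buffer is even accepted as an image input, an alternative I considered is bypassing CVPixelBuffer entirely and handing the normalized floats to a model converted with a multiarray input. A minimal sketch, where the [1, 3, height, width] CHW shape and the function name are my own assumptions about the converted model:

import CoreML

// Pack already-normalized per-channel floats (row-major) into a CHW MLMultiArray.
func makeNormalizedInput(_ channels: [[Float]], width: Int, height: Int) throws -> MLMultiArray {
    let array = try MLMultiArray(shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
                                 dataType: .float32)
    for c in 0..<3 {
        for y in 0..<height {
            for x in 0..<width {
                array[[0, NSNumber(value: c), NSNumber(value: y), NSNumber(value: x)]] =
                    NSNumber(value: channels[c][y * width + x])
            }
        }
    }
    return array
}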
- "UIImage normalization" -> converting to CVPixelBuffer -> model.predict
import UIKit

extension UIImage {
    func normalize() -> UIImage? {
        let colorSpace = CGColorSpaceCreateDeviceRGB()
        guard let cgImage = cgImage else {
            return nil
        }
        let width = cgImage.width
        let height = cgImage.height
        var rawData = [UInt8](repeating: 0, count: width * height * 4)
        let bytesPerPixel = 4
        let bytesPerRow = bytesPerPixel * width
        let bitsPerComponent = 8
        let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue // RGBA byte order
        guard let context = CGContext(data: &rawData,
                                      width: width,
                                      height: height,
                                      bitsPerComponent: bitsPerComponent,
                                      bytesPerRow: bytesPerRow,
                                      space: colorSpace,
                                      bitmapInfo: bitmapInfo) else { return nil }
        let drawingRect = CGRect(origin: .zero, size: CGSize(width: width, height: height))
        context.draw(cgImage, in: drawingRect)
        // Sanity check that the image is not a constant color
        var maxValue: UInt8 = 0
        var minValue: UInt8 = 255
        for pixel in 0 ..< width * height {
            let baseOffset = pixel * 4
            for offset in baseOffset ..< baseOffset + 3 {
                let value = rawData[offset]
                if value > maxValue { maxValue = value }
                if value < minValue { minValue = value }
            }
        }
        let range = Float(maxValue - minValue)
        guard range > 0 else { return nil }
        // Standardize each RGB channel, clamp to [0, 1], then quantize back to 8 bits
        for pixel in 0 ..< width * height {
            let baseOffset = pixel * 4
            rawData[baseOffset]     = UInt8(max(0, min(1, (Float(rawData[baseOffset])     / 255 - 0.485) / 0.229)) * 255)
            rawData[baseOffset + 1] = UInt8(max(0, min(1, (Float(rawData[baseOffset + 1]) / 255 - 0.456) / 0.224)) * 255)
            rawData[baseOffset + 2] = UInt8(max(0, min(1, (Float(rawData[baseOffset + 2]) / 255 - 0.406) / 0.225)) * 255)
        }
        return context.makeImage().map { UIImage(cgImage: $0, scale: scale, orientation: imageOrientation) }
    }
}
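For completeness, the conversion to CVPixelBuffer between those steps is the usual CGContext render into a 32BGRA buffer, roughly like this (the helper name is mine; no normalization happens here):

import UIKit
import CoreVideo

// Render a UIImage into a freshly created 32BGRA CVPixelBuffer.
func makePixelBuffer(from image: UIImage, width: Int, height: Int) -> CVPixelBuffer? {
    let attrs = [kCVPixelBufferCGImageCompatibilityKey: kCFBooleanTrue,
                 kCVPixelBufferCGBitmapContextCompatibilityKey: kCFBooleanTrue] as CFDictionary
    var buffer: CVPixelBuffer?
    guard CVPixelBufferCreate(kCFAllocatorDefault, width, height,
                              kCVPixelFormatType_32BGRA, attrs, &buffer) == kCVReturnSuccess,
          let pixelBuffer = buffer, let cgImage = image.cgImage else { return nil }
    CVPixelBufferLockBaseAddress(pixelBuffer, [])
    defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, []) }
    guard let context = CGContext(data: CVPixelBufferGetBaseAddress(pixelBuffer),
                                  width: width, height: height,
                                  bitsPerComponent: 8,
                                  bytesPerRow: CVPixelBufferGetBytesPerRow(pixelBuffer),
                                  space: CGColorSpaceCreateDeviceRGB(),
                                  bitmapInfo: CGImageAlphaInfo.premultipliedFirst.rawValue |
                                              CGBitmapInfo.byteOrder32Little.rawValue)
    else { return nil }
    context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
    return pixelBuffer
}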
The 1st gets the wrong result, and the 2nd has no effect, as if no normalization were applied. I think a Core ML model predicting in Swift expects each CVPixelBuffer channel in [0, 255]; is that right? Is there anything wrong, or did I forget something?