How do I modify this intrinsics code going from YUV420 to RGB24 to output RGBA32?

61 Views Asked by Connor At 27 March 2024 at 20:10

I find myself needing to really use intrinsics for the first time, as an optimization for image conversion. I found this project here: https://github.com/jabernet/YCbCr2RGB/blob/master/conversion.cpp and it almost works, but my output target needs to be in r8g8b8a8, while this code outputs r8g8b8.

I tried modifying this bit here:

const __m128 rgb1_1 = _mm_shuffle_ps(
                _mm_shuffle_ps(b1, g1, _MM_SHUFFLE(0, 0, 0, 0)),
                _mm_shuffle_ps(r1, b1, _MM_SHUFFLE(1, 1, 0, 0)),
                _MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb1_2 = _mm_shuffle_ps(
                _mm_shuffle_ps(g1, r1, _MM_SHUFFLE(1, 1, 1, 1)), 
                _mm_shuffle_ps(b1, g1, _MM_SHUFFLE(2, 2, 2, 2)), 
                _MM_SHUFFLE(2, 0, 2, 0)); 
const __m128 rgb1_3 = _mm_shuffle_ps(
                _mm_shuffle_ps(r1, b1, _MM_SHUFFLE(3, 3, 2, 2)), 
                _mm_shuffle_ps(g1, r1, _MM_SHUFFLE(3, 3, 3, 3)), 
                _MM_SHUFFLE(2, 0, 2, 0));

const __m128 rgb2_1 = _mm_shuffle_ps(
                _mm_shuffle_ps(b2, g2, _MM_SHUFFLE(0, 0, 0, 0)),
                _mm_shuffle_ps(r2, b2, _MM_SHUFFLE(1, 1, 0, 0)), 
                _MM_SHUFFLE(2, 0, 2, 0)); 
const __m128 rgb2_2 = _mm_shuffle_ps(
                _mm_shuffle_ps(g2, r2, _MM_SHUFFLE(1, 1, 1, 1)), 
                _mm_shuffle_ps(b2, g2, _MM_SHUFFLE(2, 2, 2, 2)), 
                _MM_SHUFFLE(2, 0, 2, 0)); 
const __m128 rgb2_3 = _mm_shuffle_ps(
                _mm_shuffle_ps(r2, b2, _MM_SHUFFLE(3, 3, 2, 2)), 
                _mm_shuffle_ps(g2, r2, _MM_SHUFFLE(3, 3, 3, 3)), 
                _MM_SHUFFLE(2, 0, 2, 0));

const __m128 rgb3_1 = _mm_shuffle_ps(
                _mm_shuffle_ps(b3, g3, _MM_SHUFFLE(0, 0, 0, 0)),
                _mm_shuffle_ps(r3, b3, _MM_SHUFFLE(1, 1, 0, 0)),
                _MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb3_2 = _mm_shuffle_ps(
                _mm_shuffle_ps(g3, r3, _MM_SHUFFLE(1, 1, 1, 1)),
                _mm_shuffle_ps(b3, g3, _MM_SHUFFLE(2, 2, 2, 2)),
                _MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb3_3 = _mm_shuffle_ps(
                _mm_shuffle_ps(r3, b3, _MM_SHUFFLE(3, 3, 2, 2)),
                _mm_shuffle_ps(g3, r3, _MM_SHUFFLE(3, 3, 3, 3)),
                _MM_SHUFFLE(2, 0, 2, 0));

const __m128 rgb4_1 = _mm_shuffle_ps(
                _mm_shuffle_ps(b4, g4, _MM_SHUFFLE(0, 0, 0, 0)),
                _mm_shuffle_ps(r4, b4, _MM_SHUFFLE(1, 1, 0, 0)),
                _MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb4_2 = _mm_shuffle_ps(
                _mm_shuffle_ps(g4, r4, _MM_SHUFFLE(1, 1, 1, 1)),
                _mm_shuffle_ps(b4, g4, _MM_SHUFFLE(2, 2, 2, 2)),
                _MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb4_3 = _mm_shuffle_ps(
                _mm_shuffle_ps(r4, b4, _MM_SHUFFLE(3, 3, 2, 2)),
                _mm_shuffle_ps(g4, r4, _MM_SHUFFLE(3, 3, 3, 3)),
                _MM_SHUFFLE(2, 0, 2, 0));

const __m128i pack1l = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_1), _mm_cvtps_epi32(rgb1_2));
const __m128i pack1h = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_3), _mm_cvtps_epi32(rgb2_1));
const __m128i pack1 = _mm_packus_epi16(pack1l, pack1h);
    
const __m128i pack2l = _mm_packs_epi32(_mm_cvtps_epi32(rgb2_2), _mm_cvtps_epi32(rgb2_3));
const __m128i pack2h = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_1), _mm_cvtps_epi32(rgb3_2));
const __m128i pack2 = _mm_packus_epi16(pack2l, pack2h);
    
const __m128i pack3l = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_3), _mm_cvtps_epi32(rgb4_1));
const __m128i pack3h = _mm_packs_epi32(_mm_cvtps_epi32(rgb4_2), _mm_cvtps_epi32(rgb4_3));
const __m128i pack3 = _mm_packus_epi16(pack3l, pack3h);
const __m128i packAlpha = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF };
    
    // and finally store in output
    
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 0 * 16), pack1);
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 1 * 16), pack2);
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 2 * 16), pack3);
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 3 * 16), packAlpha);

with the `packAlpha` lines added by me. I thought it would line things up properly, but what I ended up with was grayscale and had vertical white lines throughout. It's also flipped on both axes, but that was true before my changes, and can be solved by rotating the texture that `pixels` is given to later.

Edit: added the calculation of the rgbX_Y variables to the top of the code block, per a comment request. Also edited the numbers in the store calls to reflect that I did in fact change those 3s to 4s.

Original Q&A

How do I modify this intrinsics code going from YUV420 to RGB24 to output RGBA32?

There are 0 best solutions below

Related Questions in C

Related Questions in RGB

Related Questions in INTRINSICS

Related Questions in YUV

Related Questions in IMAGE-CONVERSION

Trending Questions

Popular # Hashtags

Popular Questions