I find myself in the position of needing to really use intrinsics for the first time as optimization for image conversion. I found this project here: https://github.com/jabernet/YCbCr2RGB/blob/master/conversion.cpp and it almost works, but my output target needs to be in r8g8b8a8 while this is outputting in r8g8b8.
I tried modifiying this bit here:
const __m128 rgb1_1 = _mm_shuffle_ps(
_mm_shuffle_ps(b1, g1, _MM_SHUFFLE(0, 0, 0, 0)),
_mm_shuffle_ps(r1, b1, _MM_SHUFFLE(1, 1, 0, 0)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb1_2 = _mm_shuffle_ps(
_mm_shuffle_ps(g1, r1, _MM_SHUFFLE(1, 1, 1, 1)),
_mm_shuffle_ps(b1, g1, _MM_SHUFFLE(2, 2, 2, 2)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb1_3 = _mm_shuffle_ps(
_mm_shuffle_ps(r1, b1, _MM_SHUFFLE(3, 3, 2, 2)),
_mm_shuffle_ps(g1, r1, _MM_SHUFFLE(3, 3, 3, 3)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb2_1 = _mm_shuffle_ps(
_mm_shuffle_ps(b2, g2, _MM_SHUFFLE(0, 0, 0, 0)),
_mm_shuffle_ps(r2, b2, _MM_SHUFFLE(1, 1, 0, 0)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb2_2 = _mm_shuffle_ps(
_mm_shuffle_ps(g2, r2, _MM_SHUFFLE(1, 1, 1, 1)),
_mm_shuffle_ps(b2, g2, _MM_SHUFFLE(2, 2, 2, 2)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb2_3 = _mm_shuffle_ps(
_mm_shuffle_ps(r2, b2, _MM_SHUFFLE(3, 3, 2, 2)),
_mm_shuffle_ps(g2, r2, _MM_SHUFFLE(3, 3, 3, 3)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb3_1 = _mm_shuffle_ps(
_mm_shuffle_ps(b3, g3, _MM_SHUFFLE(0, 0, 0, 0)),
_mm_shuffle_ps(r3, b3, _MM_SHUFFLE(1, 1, 0, 0)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb3_2 = _mm_shuffle_ps(
_mm_shuffle_ps(g3, r3, _MM_SHUFFLE(1, 1, 1, 1)),
_mm_shuffle_ps(b3, g3, _MM_SHUFFLE(2, 2, 2, 2)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb3_3 = _mm_shuffle_ps(
_mm_shuffle_ps(r3, b3, _MM_SHUFFLE(3, 3, 2, 2)),
_mm_shuffle_ps(g3, r3, _MM_SHUFFLE(3, 3, 3, 3)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb4_1 = _mm_shuffle_ps(
_mm_shuffle_ps(b4, g4, _MM_SHUFFLE(0, 0, 0, 0)),
_mm_shuffle_ps(r4, b4, _MM_SHUFFLE(1, 1, 0, 0)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb4_2 = _mm_shuffle_ps(
_mm_shuffle_ps(g4, r4, _MM_SHUFFLE(1, 1, 1, 1)),
_mm_shuffle_ps(b4, g4, _MM_SHUFFLE(2, 2, 2, 2)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 rgb4_3 = _mm_shuffle_ps(
_mm_shuffle_ps(r4, b4, _MM_SHUFFLE(3, 3, 2, 2)),
_mm_shuffle_ps(g4, r4, _MM_SHUFFLE(3, 3, 3, 3)),
_MM_SHUFFLE(2, 0, 2, 0));
const __m128i pack1l = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_1), _mm_cvtps_epi32(rgb1_2));
const __m128i pack1h = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_3), _mm_cvtps_epi32(rgb2_1));
const __m128i pack1 = _mm_packus_epi16(pack1l, pack1h);
const __m128i pack2l = _mm_packs_epi32(_mm_cvtps_epi32(rgb2_2), _mm_cvtps_epi32(rgb2_3));
const __m128i pack2h = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_1), _mm_cvtps_epi32(rgb3_2));
const __m128i pack2 = _mm_packus_epi16(pack2l, pack2h);
const __m128i pack3l = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_3), _mm_cvtps_epi32(rgb4_1));
const __m128i pack3h = _mm_packs_epi32(_mm_cvtps_epi32(rgb4_2), _mm_cvtps_epi32(rgb4_3));
const __m128i pack3 = _mm_packus_epi16(pack3l, pack3h);
const __m128i packAlpha = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF };
// and finally store in output
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 0 * 16), pack1);
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 1 * 16), pack2);
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 2 * 16), pack3);
_mm_storeu_si128((__m128i*)(pixels + ((wh - width) * 4) - h * width * 4 + w * 4 + 3 * 16), packAlpha);
with the packAlpha lines added by me. I thought it would line things up properly, but what I ended up with was grayscale and had vertical while lines throughout. It's also flipped on both axes, but that was true before my changes, and can be solved by rotating the texture that pixels is given too later.
Edit: appended the calculation for the rgbX_Y variables to the top of the code block, per comment request. Also edited the numbers in the storing functions to reflect that I did in fact change those 3s to 4s