/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ /* * Copyright the Collabora Online contributors. * * SPDX-License-Identifier: MPL-2.0 */ // This is a C file - to avoid inclusion of C++ headers // since compiling with different instruction set can generate // versions of inlined code that get injected outside of this // module by the linker. #include #include #include #include #include "DeltaSimd.h" #if ENABLE_SIMD # include #define DEBUG_LUT 0 // set of control data bytes for vperd static __m256i vpermd_lut[256]; static __m256i vpermd_shift_left; static __m256i vpermd_last_first_swap; static __m256i low_pixel_mask; // Build table we can lookup bitmasks in to generate gather data void init_gather_lut() { for (unsigned int pattern = 0; pattern < 256; ++pattern) { unsigned int i = 0, src = 0; uint8_t lut[8]; for (uint32_t bitToCheck = 1; bitToCheck < 256; bitToCheck <<= 1) { if (!(pattern & bitToCheck)) // set bit is a duplicate -> ignore. lut[i++] = src; src++; } while (i<8) // pad to copy first point lut[i++] = 0; #if DEBUG_LUG fprintf(stderr, "lut mask: 0x%x generates %d %d %d %d %d %d %d %d\n", pattern, lut[7], lut[6], lut[5], lut[4], lut[3], lut[2], lut[1], lut[0]); #endif vpermd_lut[pattern] = _mm256_set_epi8( 0, 0, 0, lut[7], 0, 0, 0, lut [6], 0, 0, 0, lut[5], 0, 0, 0, lut [4], 0, 0, 0, lut[3], 0, 0, 0, lut [2], 0, 0, 0, lut[1], 0, 0, 0, lut [0]); } vpermd_shift_left = _mm256_set_epi8( 0, 0, 0, 6, 0, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0); vpermd_last_first_swap = _mm256_set_epi8( 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 7); low_pixel_mask = _mm256_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); } // non-intuitively we need to use the sign bit as // if floats to gather bits from 32bit words static uint64_t diffMask(__m256i prev, __m256i curr) { __m256i res = _mm256_cmpeq_epi32(prev, curr); __m256 m256 = _mm256_castsi256_ps(res); return _mm256_movemask_ps(m256); } #endif // accelerated compression of a 256 pixel run int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, unsigned int *scratchLen, uint64_t *rleMaskBlock) { #if !ENABLE_SIMD // no fun. (void)from; (void)scratch; (void)scratchLen; (void)rleMask; return 0; #else // ENABLE_SIMD static int lut_initialized = 0; if (!lut_initialized) { lut_initialized = 1; init_gather_lut(); } *scratchLen = 0; unsigned int x = 0; const uint32_t* block = from; __m256i prev = _mm256_setzero_si256(); // transparent for (unsigned int nMask = 0; nMask < 4; ++nMask) { uint64_t rleMask = 0; uint64_t newMask = 0; int remaining = 256 - x; assert(remaining % 8 == 0); int blocks = remaining/8; if (blocks > 8) blocks = 8; for (int i = 0; i < blocks; ++i) { __m256i curr = _mm256_loadu_si256((const __m256i_u*)(block)); // Generate mask // get the last pixel into the least significant pixel __m256i lastPix = _mm256_permutevar8x32_epi32(prev, vpermd_last_first_swap); lastPix = _mm256_and_si256(low_pixel_mask, lastPix); // shift the current pixels left prev = _mm256_permutevar8x32_epi32(curr, vpermd_shift_left); // mask out the bottom pixel prev = _mm256_andnot_si256(low_pixel_mask, prev); // merge in the last pixel prev = _mm256_or_si256(prev, lastPix); // turn that into a bit-mask. newMask = diffMask(prev, curr); rleMask |= newMask << (i * 8); assert (newMask < 256); // Shuffle the pixels and pack them __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]); __m256i packed = _mm256_permutevar8x32_epi32(curr, control_vector); unsigned int countBitsUnset = _mm_popcnt_u32(newMask ^ 0xff); assert(countBitsUnset <= 8); // we are guaranteed enough space worst-case _mm256_storeu_si256((__m256i*)scratch, packed); #if DEBUG_LUT if (countBitsUnset > 0) fprintf(stderr, "for mask: 0x%2x bits-unset %d we have:\n" "%4x%4x%4x%4x%4x%4x%4x%4x\n" "%4x%4x%4x%4x%4x%4x%4x%4x\n", (unsigned int)newMask, countBitsUnset, block[0], block[1], block[2], block[3], block[4], block[5], block[6], block[7], scratch[0], scratch[1], scratch[2], scratch[3], scratch[4], scratch[5], scratch[6], scratch[7]); #endif prev = curr; // ? scratch += countBitsUnset; *scratchLen += countBitsUnset; block += 8; x += 8; } rleMaskBlock[nMask] = rleMask; } return 1; #endif } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */