libreoffice-online/kit/DeltaSimd.c
Michael Meeks 0153ddb554 First cut at getting aligned loads and simpler loop structure.
Remove the special case for the first pixel, and instead have a
previous pixel run initialized to zero.

AVX2 has no effective shift for the whole si256 so use permutation
to shift the last pixel of the previous run into the right place,
mask it and combine.

Saves a second un-aligned load of the same data, and branch.

Change-Id: I77c9cdead13d37aaf4d9f31d98cbd5c4a9c5ce24
Signed-off-by: Michael Meeks <michael.meeks@collabora.com>
2023-09-25 16:55:04 +01:00

176 lines
5.4 KiB
C

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* Copyright the Collabora Online contributors.
*
* SPDX-License-Identifier: MPL-2.0
*/
// This is a C file - to avoid inclusion of C++ headers
// since compiling with different instruction set can generate
// versions of inlined code that get injected outside of this
// module by the linker.
#include <config.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include "DeltaSimd.h"
#if ENABLE_SIMD
# include <immintrin.h>
#define DEBUG_LUT 0
// set of control data bytes for vpermd
static __m256i vpermd_lut[256];
static __m256i vpermd_shift_left;
static __m256i vpermd_last_first_swap;
static __m256i low_pixel_mask;
// Build the permutation tables used by the SIMD row compressor.
//
// vpermd_lut[pattern]: for an 8-bit pattern where a set bit marks a pixel
// that duplicates its predecessor, holds the 32-bit lane indices of the
// pixels whose bit is clear, packed into the low lanes in source order.
// Unused high lanes are padded with index 0.
//
// vpermd_shift_left: moves every lane up by one (lane 0 is duplicated).
// vpermd_last_first_swap: rotates lanes up by one, so lane 7 lands in lane 0.
// low_pixel_mask: all ones in lane 0, zero elsewhere.
void init_gather_lut(void)
{
    for (unsigned int pattern = 0; pattern < 256; ++pattern)
    {
        unsigned int i = 0, src = 0;
        uint8_t lut[8];
        for (uint32_t bitToCheck = 1; bitToCheck < 256; bitToCheck <<= 1)
        {
            if (!(pattern & bitToCheck)) // set bit is a duplicate -> ignore.
                lut[i++] = (uint8_t)src;
            src++;
        }
        while (i < 8) // pad to copy first point
            lut[i++] = 0;
// The guard must match the DEBUG_LUT switch, otherwise this dump can never
// be turned on.
#if DEBUG_LUT
        fprintf(stderr, "lut mask: 0x%x generates %d %d %d %d %d %d %d %d\n",
                pattern, lut[7], lut[6], lut[5], lut[4], lut[3], lut[2], lut[1], lut[0]);
#endif
        // _mm256_set_epi8 takes bytes from most to least significant, so each
        // group of four arguments forms one little-endian 32-bit lane index.
        vpermd_lut[pattern] = _mm256_set_epi8(
            0, 0, 0, lut[7], 0, 0, 0, lut[6],
            0, 0, 0, lut[5], 0, 0, 0, lut[4],
            0, 0, 0, lut[3], 0, 0, 0, lut[2],
            0, 0, 0, lut[1], 0, 0, 0, lut[0]);
    }
    vpermd_shift_left = _mm256_set_epi8(
        0, 0, 0, 6, 0, 0, 0, 5,
        0, 0, 0, 4, 0, 0, 0, 3,
        0, 0, 0, 2, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0);
    vpermd_last_first_swap = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 6,
        0, 0, 0, 5, 0, 0, 0, 4,
        0, 0, 0, 3, 0, 0, 0, 2,
        0, 0, 0, 1, 0, 0, 0, 7);
    low_pixel_mask = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
}
// Compare the eight 32-bit lanes of prev and curr and return one bit per
// lane (bit N for lane N). Despite the name, a bit is SET where the lanes
// are EQUAL.
//
// There is no integer move-mask for 32-bit lanes, so the comparison result
// is reinterpreted as packed floats and their sign bits are gathered.
static uint64_t diffMask(__m256i prev, __m256i curr)
{
    const __m256i equalLanes = _mm256_cmpeq_epi32(prev, curr);
    return _mm256_movemask_ps(_mm256_castsi256_ps(equalLanes));
}
#endif
// Accelerated compression of a 256 pixel run.
//
// from         - 256 source pixels, read eight at a time.
// scratch      - receives the pixels that differ from their predecessor,
//                packed contiguously. Every step stores a full eight-pixel
//                vector, so room for 256 pixels is needed.
// scratchLen   - set to the number of pixels kept in scratch.
// rleMaskBlock - four 64-bit words; bit N of the combined 256 bits is set
//                when pixel N equals the pixel before it (pixel 0 is
//                compared against an all-zero pixel).
//
// Returns 1 when the row was processed, or 0 - with the outputs left
// untouched - when built without SIMD support.
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, unsigned int *scratchLen, uint64_t *rleMaskBlock)
{
#if !ENABLE_SIMD
    // no fun.
    (void)from; (void)scratch; (void)scratchLen; (void)rleMaskBlock;
    return 0;
#else // ENABLE_SIMD
    static int lut_initialized = 0;
    if (!lut_initialized)
    {
        lut_initialized = 1;
        init_gather_lut();
    }

    *scratchLen = 0;
    unsigned int x = 0;
    const uint32_t* block = from;
    __m256i prev = _mm256_setzero_si256(); // transparent

    for (unsigned int nMask = 0; nMask < 4; ++nMask)
    {
        uint64_t rleMask = 0;
        int remaining = 256 - x;
        assert(remaining % 8 == 0);

        // each 64-bit mask word covers at most eight runs of eight pixels
        int blocks = remaining / 8;
        if (blocks > 8)
            blocks = 8;

        for (int i = 0; i < blocks; ++i)
        {
            __m256i curr = _mm256_loadu_si256((const __m256i_u*)(block));

            // Generate mask

            // get the last pixel of the previous run into the least significant pixel
            __m256i lastPix = _mm256_permutevar8x32_epi32(prev, vpermd_last_first_swap);
            lastPix = _mm256_and_si256(low_pixel_mask, lastPix);

            // shift the current pixels up by one lane
            prev = _mm256_permutevar8x32_epi32(curr, vpermd_shift_left);
            // mask out the bottom pixel
            prev = _mm256_andnot_si256(low_pixel_mask, prev);
            // merge in the last pixel
            prev = _mm256_or_si256(prev, lastPix);

            // turn that into a bit-mask: set bit == same as predecessor
            const uint64_t newMask = diffMask(prev, curr);
            assert(newMask < 256);
            rleMask |= newMask << (i * 8);

            // Shuffle the pixels and pack them
            __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
            __m256i packed = _mm256_permutevar8x32_epi32(curr, control_vector);

            // clear bits are the pixels we keep
            unsigned int countBitsUnset = _mm_popcnt_u32((unsigned int)(newMask ^ 0xff));
            assert(countBitsUnset <= 8);

            // we are guaranteed enough space worst-case
            _mm256_storeu_si256((__m256i*)scratch, packed);

#if DEBUG_LUT
            if (countBitsUnset > 0)
                fprintf(stderr, "for mask: 0x%2x bits-unset %d we have:\n"
                        "%4x%4x%4x%4x%4x%4x%4x%4x\n"
                        "%4x%4x%4x%4x%4x%4x%4x%4x\n",
                        (unsigned int)newMask, countBitsUnset,
                        block[0], block[1], block[2], block[3], block[4], block[5], block[6], block[7],
                        scratch[0], scratch[1], scratch[2], scratch[3], scratch[4], scratch[5], scratch[6], scratch[7]);
#endif

            // keep the unshifted run: its lane 7 is the predecessor of the
            // next run's first pixel
            prev = curr;

            // only the kept pixels count; the padding lanes get overwritten
            // by the next store
            scratch += countBitsUnset;
            *scratchLen += countBitsUnset;

            block += 8;
            x += 8;
        }
        rleMaskBlock[nMask] = rleMask;
    }

    return 1;
#endif
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */