libreoffice-online/kit/DeltaSimd.c
Michael Meeks a9d4dcb71d deltas: zstd compress already RLE compressed pixels.
Previously we compressed raw pixels - which was a lot more data,
so this should accelerate initial keyframe compression significantly.

It should also significantly reduce the time, space and GC thrash
caused by de-compression, and potentially help the firefox performance
problems seen in the JS around fzstd decompression.

Finally, since we un-premultiply only runs of similar pixels instead of
every pixel, it should potentially remove lots of un-premultiplying
from the JS profile too.

Change-Id: I412fe0cb7272ea1ca72dc6ac1ca2823878fb7422
Signed-off-by: Michael Meeks <michael.meeks@collabora.com>
2024-06-20 15:55:01 +01:00
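
For reference, a scalar sketch of the run-length scheme the AVX2 code below vectorises: one mask byte per 8 pixels, a set bit marking a pixel equal to its left neighbour, and only the differing pixels emitted. The helper name is hypothetical and not part of the commit:

/* Hypothetical scalar reference for the RLE scheme in simd_initPixRowSimd:
 * not part of the commit, shown only to illustrate the output format. */
#include <stdint.h>
#include <string.h>

static size_t rlePixRowScalar(const uint32_t *from /* 256 pixels */,
                              uint32_t *out, uint8_t mask[256 / 8])
{
    size_t len = 0;
    uint32_t prev = 0; // the first pixel compares against transparent
    memset(mask, 0, 256 / 8);
    for (unsigned int x = 0; x < 256; ++x)
    {
        if (from[x] == prev)
            mask[x >> 3] |= 1u << (x & 7); // duplicate: set its mask bit
        else
            out[len++] = from[x];          // differing pixel: emit it
        prev = from[x];
    }
    return len; // number of pixels written to out
}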


/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* Copyright the Collabora Online contributors.
*
* SPDX-License-Identifier: MPL-2.0
*/
// This is a C file - to avoid inclusion of C++ headers,
// since compiling with a different instruction set can generate
// versions of inlined code that get injected outside of this
// module by the linker.
#include "config.h"
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <endian.h>
#include "DeltaSimd.h"
#if ENABLE_SIMD
# include <immintrin.h>
#define DEBUG_LUT 0
// set of control data bytes for vpermd
static __m256i vpermd_lut[256];
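// control vectors to shift a block's pixels one lane left and to carry
// the previous block's last pixel into lane 0, so each lane can be
// compared against its left-hand neighbour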
static __m256i vpermd_shift_left;
static __m256i vpermd_last_to_first;
static __m256i low_pixel_mask;
// Build a table we can look bitmasks up in, to generate gather control data
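// e.g. pattern 0b00000101 - duplicates in lanes 0 and 2 - gathers source
// lanes 1,3,4,5,6,7 and pads with two zeros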
void init_gather_lut()
{
    for (unsigned int pattern = 0; pattern < 256; ++pattern)
    {
        unsigned int i = 0, src = 0;
        uint8_t lut[8];
        for (uint32_t bitToCheck = 1; bitToCheck < 256; bitToCheck <<= 1)
        {
            if (!(pattern & bitToCheck)) // set bit is a duplicate -> ignore.
                lut[i++] = src;
            src++;
        }
        while (i < 8) // pad to copy first point
            lut[i++] = 0;
#if DEBUG_LUT
        fprintf(stderr, "lut mask: 0x%x generates %d %d %d %d %d %d %d %d\n",
                pattern, lut[7], lut[6], lut[5], lut[4], lut[3], lut[2], lut[1], lut[0]);
#endif
        vpermd_lut[pattern] = _mm256_set_epi8(
            0, 0, 0, lut[7], 0, 0, 0, lut[6],
            0, 0, 0, lut[5], 0, 0, 0, lut[4],
            0, 0, 0, lut[3], 0, 0, 0, lut[2],
            0, 0, 0, lut[1], 0, 0, 0, lut[0]);
    }

    vpermd_shift_left = _mm256_set_epi8(
        0, 0, 0, 6, 0, 0, 0, 5,
        0, 0, 0, 4, 0, 0, 0, 3,
        0, 0, 0, 2, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0);

    vpermd_last_to_first = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 7);

    low_pixel_mask = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
}

// Non-intuitively we have to cast the comparison results to floats:
// movemask_ps gathers the sign bit of each 32-bit lane into one byte.
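// e.g. if only lanes 0-2 of prev and curr are equal this returns 0b00000111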
static uint64_t diffMask(__m256i prev, __m256i curr)
{
    __m256i res = _mm256_cmpeq_epi32(prev, curr);
    __m256 m256 = _mm256_castsi256_ps(res);
    return _mm256_movemask_ps(m256);
}

#endif
void simd_deltaInit(void)
{
#if ENABLE_SIMD
    init_gather_lut();
#endif
}

// accelerated compression of a 256 pixel run
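// Produces one mask byte per 8 input pixels - a set bit marks a pixel equal
// to its left neighbour - and packs only the differing pixels into scratch.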
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlockWide)
{
#if !ENABLE_SIMD
    // no fun.
    (void)from; (void)scratch; (void)scratchLen; (void)rleMaskBlockWide;
    return 0;
#else // ENABLE_SIMD
    *scratchLen = 0;

    uint8_t *rleMaskBlock = (uint8_t *)rleMaskBlockWide;
    for (unsigned int x = 0; x < 256/8; ++x)
        rleMaskBlock[x] = 0;

    const uint32_t* block = from;
    uint32_t* dest = scratch;

    __m256i prev = _mm256_setzero_si256(); // transparent

    for (unsigned int x = 0; x < 256; x += 8) // 8 pixels per cycle
    {
        __m256i curr = _mm256_loadu_si256((const __m256i_u*)(block + x));

        // Generate mask

        // get the last pixel into the least significant pixel
        // FIXME: mask at the same time ?
        // __m256i lastPix = _mm256_maskz_permutexvar_epi32(0x1, prev, vpermd_last_to_first);
        __m256i lastPix = _mm256_permutevar8x32_epi32(prev, vpermd_last_to_first);
        lastPix = _mm256_and_si256(low_pixel_mask, lastPix);

        // shift the current pixels left
        prev = _mm256_permutevar8x32_epi32(curr, vpermd_shift_left);
        // mask out the bottom pixel
        prev = _mm256_andnot_si256(low_pixel_mask, prev);
        // merge in the last pixel
        prev = _mm256_or_si256(prev, lastPix);

        // turn that into a bit-mask.
        uint64_t newMask = diffMask(prev, curr);
        assert(newMask < 256);

        // invert the bitmask to count the pixels that differ [!]
        uint32_t newMaskInverse = ~newMask & 0xff;

        // stash our mask for these 8 pixels
        rleMaskBlock[x>>3] = newMask;

        // Shuffle the pixels and pack them
        __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
        __m256i packed = _mm256_permutevar8x32_epi32(curr, control_vector);

        unsigned int countBitsUnset = _mm_popcnt_u32(newMaskInverse);
        assert(countBitsUnset <= 8);

        // over-store in dest: we are guaranteed enough space worst-case
        _mm256_storeu_si256((__m256i*)dest, packed);
#if DEBUG_LUT
        if (countBitsUnset > 0)
            fprintf(stderr, "for mask: 0x%2x bits-unset %d we have:\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n",
                    (unsigned int)newMask, countBitsUnset,
                    block[x + 0], block[x + 1], block[x + 2], block[x + 3],
                    block[x + 4], block[x + 5], block[x + 6], block[x + 7],
                    dest[0], dest[1], dest[2], dest[3],
                    dest[4], dest[5], dest[6], dest[7]);
#endif
        // move on for the next run.
        dest += countBitsUnset;

        // stash current for use next time around
        prev = curr;
    }
    *scratchLen += dest - scratch;

    // a no-op for LE architectures - ~everyone.
    for (unsigned int x = 0; x < 4; ++x)
        rleMaskBlockWide[x] = htole64(rleMaskBlockWide[x]);

    return 1;
#endif // ENABLE_SIMD
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
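
A minimal calling sketch, assuming ENABLE_SIMD was compiled in; the row pointer and buffer names are hypothetical:

#include <stdint.h>
#include <stddef.h>
#include "DeltaSimd.h"

void compressRow(const uint32_t *row /* 256 RGBA pixels */)
{
    uint32_t scratch[256];   // worst case: every pixel differs; the 8-pixel
                             // over-store always stays within these bounds
    uint64_t rleMask[4];     // 256 bits of RLE mask, one per pixel
    size_t scratchLen = 0;

    simd_deltaInit();        // in real use: build the LUTs once at startup
    if (simd_initPixRowSimd(row, scratch, &scratchLen, rleMask))
    {
        // scratchLen differing pixels in scratch, run info in rleMask;
        // feed both to the zstd stage
    }
    // else: SIMD support not compiled in - use a scalar fallback
}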