a9d4dcb71d
Previously we compressed raw pixels - which was a lot more data, so this should accelerate initial keyframe compression significantly. It should also significantly reduce the time, space and GC thrash caused by de-compression, and potentially help Firefox performance problems in the JS from fzstd decompression. Finally, since we un-premultiply runs of similar pixels instead of all pixels, it should potentially remove lots of un-premultiplying from the JS profile too. Change-Id: I412fe0cb7272ea1ca72dc6ac1ca2823878fb7422 Signed-off-by: Michael Meeks <michael.meeks@collabora.com>
184 lines
5.6 KiB
C
184 lines
5.6 KiB
C
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
|
|
/*
|
|
* Copyright the Collabora Online contributors.
|
|
*
|
|
* SPDX-License-Identifier: MPL-2.0
|
|
*/
|
|
|
|
// This is a C file - to avoid inclusion of C++ headers
|
|
// since compiling with different instruction set can generate
|
|
// versions of inlined code that get injected outside of this
|
|
// module by the linker.
|
|
|
|
#include "config.h"
|
|
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <stdint.h>
|
|
#include <endian.h>
|
|
|
|
#include "DeltaSimd.h"
|
|
|
|
#if ENABLE_SIMD
|
|
# include <immintrin.h>
|
|
|
|
#define DEBUG_LUT 0
|
|
|
|
// set of control data bytes for vperd
|
|
static __m256i vpermd_lut[256];
|
|
static __m256i vpermd_shift_left;
|
|
static __m256i vpermd_last_to_first;
|
|
static __m256i low_pixel_mask;
|
|
|
|
// Build table we can lookup bitmasks in to generate gather data
|
|
void init_gather_lut()
|
|
{
|
|
for (unsigned int pattern = 0; pattern < 256; ++pattern)
|
|
{
|
|
unsigned int i = 0, src = 0;
|
|
uint8_t lut[8];
|
|
for (uint32_t bitToCheck = 1; bitToCheck < 256; bitToCheck <<= 1)
|
|
{
|
|
if (!(pattern & bitToCheck)) // set bit is a duplicate -> ignore.
|
|
lut[i++] = src;
|
|
src++;
|
|
}
|
|
while (i<8) // pad to copy first point
|
|
lut[i++] = 0;
|
|
|
|
#if DEBUG_LUG
|
|
fprintf(stderr, "lut mask: 0x%x generates %d %d %d %d %d %d %d %d\n",
|
|
pattern, lut[7], lut[6], lut[5], lut[4], lut[3], lut[2], lut[1], lut[0]);
|
|
#endif
|
|
vpermd_lut[pattern] = _mm256_set_epi8(
|
|
0, 0, 0, lut[7], 0, 0, 0, lut [6],
|
|
0, 0, 0, lut[5], 0, 0, 0, lut [4],
|
|
0, 0, 0, lut[3], 0, 0, 0, lut [2],
|
|
0, 0, 0, lut[1], 0, 0, 0, lut [0]);
|
|
}
|
|
|
|
vpermd_shift_left = _mm256_set_epi8(
|
|
0, 0, 0, 6, 0, 0, 0, 5,
|
|
0, 0, 0, 4, 0, 0, 0, 3,
|
|
0, 0, 0, 2, 0, 0, 0, 1,
|
|
0, 0, 0, 0, 0, 0, 0, 0);
|
|
|
|
vpermd_last_to_first = _mm256_set_epi8(
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 7);
|
|
|
|
low_pixel_mask = _mm256_set_epi8(
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
|
|
}
|
|
|
|
// Return an 8-bit mask with bit N set when 32-bit lane N of 'prev'
// equals lane N of 'curr'.
//
// Non-intuitively AVX2 has no integer movemask at 32-bit granularity,
// so we reinterpret the compare result as packed floats and harvest the
// sign bit of each lane with the float movemask - cmpeq sets all bits
// (including the sign bit) throughout every equal lane.
static uint64_t diffMask(__m256i prev, __m256i curr)
{
    const __m256 equalLanes =
        _mm256_castsi256_ps(_mm256_cmpeq_epi32(prev, curr));
    return (uint64_t)_mm256_movemask_ps(equalLanes);
}
|
|
|
|
#endif
|
|
|
|
// Public one-time initialization for the SIMD delta path: builds the
// vpermd lookup tables and fixed shuffle controls used by
// simd_initPixRowSimd. A no-op when compiled without SIMD support.
void simd_deltaInit(void)
{
#if ENABLE_SIMD
    init_gather_lut();
#endif
}
|
|
|
|
// accelerated compression of a 256 pixel run
//
// Reads 256 32-bit pixels from 'from' and run-length encodes them
// against their immediate predecessor (the pixel before the first one
// is treated as transparent / zero):
//  - rleMaskBlockWide receives 256 mask bits (4 x uint64_t, stored
//    little-endian): bit N set means pixel N equals pixel N-1,
//  - scratch receives only the non-duplicate pixels, densely packed,
//  - *scratchLen is set to the number of pixels written to scratch.
//
// Returns 1 when the SIMD path ran; 0 when built without SIMD support,
// in which case nothing is written and the caller must use a scalar
// fallback.
//
// NOTE(review): 'scratch' must be sized for the worst case - up to 256
// pixels plus the 8-pixel over-store noted below; confirm with callers.
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlockWide)
{
#if !ENABLE_SIMD
    // no fun.
    (void)from; (void)scratch; (void)scratchLen; (void)rleMaskBlockWide;
    return 0;

#else // ENABLE_SIMD

    *scratchLen = 0;
    // Clear all 256 mask bits up front; the loop fills one byte per pass.
    uint8_t *rleMaskBlock = (uint8_t *)rleMaskBlockWide;
    for (unsigned int x = 0; x < 256/8; ++x)
        rleMaskBlock[x] = 0;

    const uint32_t* block = from;
    uint32_t* dest = scratch;
    __m256i prev = _mm256_setzero_si256(); // transparent

    for (unsigned int x = 0; x < 256; x += 8) // 8 pixels per cycle
    {
        __m256i curr = _mm256_loadu_si256((const __m256i_u*)(block + x));

        // Generate mask: build a vector holding each pixel's predecessor
        // in the matching lane, then compare it against 'curr'.

        // get the last pixel into the least significant pixel
        // FIXME: mask at the same time ?
        // __m256i lastPix = _mm256_maskz_permutexvar_epi32(0x1, prev, vpermd_last_to_first);
        __m256i lastPix = _mm256_permutevar8x32_epi32(prev, vpermd_last_to_first);
        lastPix = _mm256_and_si256(low_pixel_mask, lastPix);

        // shift the current pixels left by one lane (lane N <- lane N-1)
        prev = _mm256_permutevar8x32_epi32(curr, vpermd_shift_left);
        // mask out the bottom pixel - its shifted content is meaningless
        prev = _mm256_andnot_si256(low_pixel_mask, prev);
        // merge in the last pixel carried over from the previous vector
        prev = _mm256_or_si256(prev, lastPix);

        // turn that into a bit-mask: bit N set => pixel N == pixel N-1
        uint64_t newMask = diffMask(prev, curr);
        assert (newMask < 256);

        // invert bitmask for counting non-same foo ... [!]
        uint32_t newMaskInverse = ~newMask & 0xff;

        // stash our mask for these 8 pixels
        rleMaskBlock[x>>3] = newMask;

        // Shuffle the pixels and pack them: the LUT's control vector
        // gathers only the non-duplicate pixels into the low lanes.
        __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
        __m256i packed = _mm256_permutevar8x32_epi32(curr, control_vector);

        // Number of unique pixels in this group of 8.
        unsigned int countBitsUnset = _mm_popcnt_u32(newMaskInverse);
        assert(countBitsUnset <= 8);

        // over-store in dest: we are guaranteed enough space worst-case
        _mm256_storeu_si256((__m256i*)dest, packed);

#if DEBUG_LUT
        if (countBitsUnset > 0)
            fprintf(stderr, "for mask: 0x%2x bits-unset %d we have:\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n",
                    (unsigned int)newMask, countBitsUnset,
                    block[x + 0], block[x + 1], block[x + 2], block[x + 3],
                    block[x + 4], block[x + 5], block[x + 6], block[x + 7],
                    dest[0], dest[1], dest[2], dest[3],
                    dest[4], dest[5], dest[6], dest[7]);
#endif

        // move on for the next run: advance only past the unique pixels,
        // overwriting the over-stored tail next iteration.
        dest += countBitsUnset;

        // stash current for use next time around
        prev = curr;
    }
    *scratchLen += dest - scratch;

    // a no-op for LE architectures - ~everyone.
    for (unsigned int x = 0; x < 4; ++x)
        rleMaskBlockWide[x] = htole64(rleMaskBlockWide[x]);

    return 1;
#endif // ENABLE_SIMD
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|