remove AVX and AVX512 code from Calc

It's been a source of numerous problems since the beginning.
Poor separation of C++ code causing the compiler to emit some generic
code as CPU-specific, compiler optimizations moving CPU-specific
code out of #ifdef to unguarded static initialization, etc.

And it doesn't seem to even particularly improve performance,
on my Ryzen2500U for one full column (1m cells) sumArray() takes
about 1.6ms with AVX, 1.9ms with SSE2 and 4.6ms with generic code.
So SSE2 code is perhaps worth it, especially given that SSE2 is our
baseline requirement on x86_64 everywhere and x86 on Windows,
but AVX+ is nowhere near worth the trouble.

So this code removes all AVX+ code from Calc, and makes SSE2
a hardcoded option on where it's guaranteed. If we raise the baseline
to AVX, the SSE2 code may be replaced by the one removed by this
commit. Generic code is there for other platforms, if other platforms
add CPU-specific code, they should preferably follow the same rules.

This does not necessarily mean that CPU-specific code cannot
be used at all. Some externals use them, for example. It just
needs to be working, maintained, and worth the trouble.

Change-Id: I5ab919930df9d0223db68a94bf84947984d313ac
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129733
Tested-by: Jenkins
Reviewed-by: Eike Rathke <erack@redhat.com>
This commit is contained in:
Luboš Luňák 2022-02-09 16:23:33 +01:00 committed by Eike Rathke
parent 51d1f84abf
commit 2d36e7f518
11 changed files with 37 additions and 363 deletions

View file

@ -23,6 +23,8 @@ or inline functions, otherwise their possibly emitted copies compiled
with the CPU-specific instructions might be chosen by the linker as the copy
to keep.
Also see the note at the top of simdsupport.hxx .
*/
namespace cpuid {

View file

@ -8,6 +8,16 @@
*
*/
// IMPORTANT: Having CPU-specific routines turned out to be a maintenance
// problem, because of various problems such as compilers moving CPU-specific
// code out of #ifdef code into static initialization or our code using C++
// features that caused the compiler to emit code that used CPU-specific
// instructions (even cpuid.hxx isn't safe, see the comment there).
// The only safe usage is using CPU-specific code that's always available,
// such as SSE2-specific code for x86_64. Do not use for anything else
// unless you really know what you are doing (and you check git history
// to learn from past problems).
// Determine the compiler support for SIMD compiler intrinsics.
// This changes from one compiled unit to the other, depending if
// the support has been detected and if the compiled unit contains

View file

@ -207,6 +207,7 @@ $(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/core/tool/address \
sc/source/core/tool/adiasync \
sc/source/core/tool/appoptio \
sc/source/core/tool/arraysumSSE2 \
sc/source/core/tool/autoform \
sc/source/core/tool/calcconfig \
sc/source/core/tool/callform \
@ -682,18 +683,6 @@ $(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/ui/xmlsource/xmlsourcedlg \
))
$(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \
))
$(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \
))
$(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \
))
ifeq ($(ENABLE_FORMULA_LOGGER),TRUE)
$(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/core/tool/formulalogger \

View file

@ -12,16 +12,25 @@
#include <cmath>
#include "kahan.hxx"
#include "arraysumfunctorinternal.hxx"
#include <tools/cpuid.hxx>
#include "arraysumfunctor.hxx"
#include <formula/errorcodes.hxx>
namespace sc::op
{
/* Checkout available optimization options */
const bool hasAVX512F = hasAVX512FCode() && cpuid::hasAVX512F();
const bool hasAVX = hasAVXCode() && cpuid::hasAVX();
const bool hasSSE2 = hasSSE2Code() && cpuid::hasSSE2();
// Checkout available optimization options.
// Note that it turned out to be problematic to support CPU-specific code
// that's not guaranteed to be available on that specific platform (see
// git history). SSE2 is guaranteed on x86_64 and it is our baseline requirement
// for x86 on Windows, so SSE2 use is hardcoded on those platforms.
// Whenever we raise baseline to e.g. AVX, this may get
// replaced with AVX code (get it from git history).
// Do it similarly with other platforms.
#if defined(X86_64) || (defined(X86) && defined(_WIN32))
#define SC_USE_SSE2 1
KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
#else
#define SC_USE_SSE2 0
#endif
/**
* If no boosts available, Unrolled KahanSum.
@ -60,12 +69,9 @@ static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pC
*/
static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent)
{
if (hasAVX512F)
return executeAVX512F(i, nSize, pCurrent);
if (hasAVX)
return executeAVX(i, nSize, pCurrent);
if (hasSSE2)
return executeSSE2(i, nSize, pCurrent);
#if SC_USE_SSE2
return executeSSE2(i, nSize, pCurrent);
#endif
return executeUnrolled(i, nSize, pCurrent);
}

View file

@ -1,36 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#pragma once
#include "scdllapi.h"
namespace sc::op
{
// Plain old data structure, to be used by code compiled with CPU intrinsics without generating any
// code for it (so that code requiring intrinsics doesn't get accidentally selected as the one copy
// when merging duplicates).
struct KahanSumSimple
{
double m_fSum;
double m_fError;
};
/* Available methods */
SC_DLLPUBLIC KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent);
SC_DLLPUBLIC KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent);
SC_DLLPUBLIC KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
SC_DLLPUBLIC bool hasAVX512FCode();
SC_DLLPUBLIC bool hasAVXCode();
SC_DLLPUBLIC bool hasSSE2Code();
} // namespace
/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */

View file

@ -12,8 +12,6 @@
#include <rtl/math.hxx>
#include <cmath>
#include "arraysumfunctorinternal.hxx"
/**
* This class provides LO with Kahan summation algorithm
* About this algorithm: https://en.wikipedia.org/wiki/Kahan_summation_algorithm
@ -40,12 +38,6 @@ public:
{
}
constexpr KahanSum(const sc::op::KahanSumSimple& sum)
: m_fSum(sum.m_fSum)
, m_fError(sum.m_fError)
{
}
constexpr KahanSum(const KahanSum& fSum) = default;
public:

View file

@ -1,5 +1,4 @@
#include "functions_test.hxx"
#include <arraysumfunctor.hxx>
class StatisticalFunctionsTest : public FunctionsTest
{
@ -7,11 +6,9 @@ public:
StatisticalFunctionsTest();
void testStatisticalFormulasFODS();
void testIntrinsicSums();
CPPUNIT_TEST_SUITE(StatisticalFunctionsTest);
CPPUNIT_TEST(testStatisticalFormulasFODS);
CPPUNIT_TEST(testIntrinsicSums);
CPPUNIT_TEST_SUITE_END();
};
@ -29,25 +26,6 @@ StatisticalFunctionsTest::StatisticalFunctionsTest():
{
}
void StatisticalFunctionsTest::testIntrinsicSums()
{
// Checkout SSE2, AVX and AVX512 operations
// Needs exactly 9 terms
double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 };
double* pCurrent = summands;
size_t i = 0;
if (sc::op::hasAVX512F)
CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX512F(i, 9, pCurrent)).get());
i = 0;
if (sc::op::hasAVX)
CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX(i, 9, pCurrent)).get());
i = 0;
if (sc::op::hasSSE2)
CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeSSE2(i, 9, pCurrent)).get());
i = 0;
CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get());
}
CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest);
CPPUNIT_PLUGIN_IMPLEMENT();

View file

@ -14,19 +14,6 @@
namespace sc::op
{
// Code must not be shared between different CPU intrinsics flags (e.g. in debug mode the compiler would not
// inline them, and merge the copies, keeping only the one with the most demanding CPU set that's not available otherwise).
// Put everything in a different namespace and additionally try to force inlining.
namespace LO_ARRAYSUM_SPACE
{
#if defined _MSC_VER
#define INLINE __forceinline static
#elif defined __GNUC__
#define INLINE __attribute__((always_inline)) static inline
#else
#define static inline
#endif
/**
* Performs one step of the Neumanier sum between doubles
* Overwrites the summand and error
@ -34,7 +21,7 @@ namespace LO_ARRAYSUM_SPACE
* @param err
* @param value
*/
INLINE void sumNeumanierNormal(double& sum, double& err, const double& value)
inline void sumNeumanierNormal(double& sum, double& err, const double& value)
{
double t = sum + value;
if (fabs(sum) >= fabs(value))
@ -44,9 +31,6 @@ INLINE void sumNeumanierNormal(double& sum, double& err, const double& value)
sum = t;
}
#undef INLINE
} // end namespace LO_ARRAYSUM_SPACE
} // end namespace sc::op
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View file

@ -1,121 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
*/
#define LO_ARRAYSUM_SPACE AVX
#include "arraysum.hxx"
#include <arraysumfunctorinternal.hxx>
#include <tools/simd.hxx>
#include <tools/simdsupport.hxx>
#include <stdlib.h>
namespace sc::op
{
#ifdef LO_AVX_AVAILABLE
bool hasAVXCode() { return true; }
using namespace AVX;
/** Kahan sum with AVX.
*/
static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value)
{
static const __m256d ANNULATE_SIGN_BIT
= _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
// Temporal parameter
__m256d t = _mm256_add_pd(sum, value);
// Absolute value of the total sum
__m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT);
// Absolute value of the value to add
__m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT);
// Compare the absolute values sum >= value
__m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ);
// The following code has this form ( a - t + b)
// Case 1: a = sum b = value
// Case 2: a = value b = sum
__m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value));
__m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum));
err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b));
// Store result
sum = t;
}
/** Execute Kahan sum with AVX.
*/
KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent)
{
// Make sure we don't fall out of bounds.
// This works by sums of 8 terms.
// So the 8'th term is i+7
// If we iterate until nSize won't fall out of bounds
if (nSize > i + 7)
{
// Setup sums and errors as 0
__m256d sum1 = _mm256_setzero_pd();
__m256d err1 = _mm256_setzero_pd();
__m256d sum2 = _mm256_setzero_pd();
__m256d err2 = _mm256_setzero_pd();
for (; i + 7 < nSize; i += 8)
{
// Kahan sum 1
__m256d load1 = _mm256_loadu_pd(pCurrent);
sumAVX(sum1, err1, load1);
pCurrent += 4;
// Kahan sum 2
__m256d load2 = _mm256_loadu_pd(pCurrent);
sumAVX(sum2, err2, load2);
pCurrent += 4;
}
// Now we combine pairwise summation with Kahan summation
// sum 1 + sum 2 -> sum 1
sumAVX(sum1, err1, sum2);
sumAVX(sum1, err1, err2);
// Store results
double sums[4];
double errs[4];
_mm256_storeu_pd(&sums[0], sum1);
_mm256_storeu_pd(&errs[0], err1);
// First Kahan & pairwise summation
// 0+1 1+2 -> 0, 2
sumNeumanierNormal(sums[0], errs[0], sums[1]);
sumNeumanierNormal(sums[2], errs[2], sums[3]);
sumNeumanierNormal(sums[0], errs[0], errs[1]);
sumNeumanierNormal(sums[2], errs[2], errs[3]);
// 0+2 -> 0
sumNeumanierNormal(sums[0], errs[0], sums[2]);
sumNeumanierNormal(sums[0], errs[0], errs[2]);
// Store result
return { sums[0], errs[0] };
}
return { 0.0, 0.0 };
}
#else // LO_AVX_AVAILABLE
bool hasAVXCode() { return false; }
KahanSumSimple executeAVX(size_t&, size_t, const double*) { abort(); }
#endif
} // end namespace sc::op
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View file

@ -1,120 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
*/
#define LO_ARRAYSUM_SPACE AVX512
#include "arraysum.hxx"
#include <arraysumfunctorinternal.hxx>
#include <tools/simd.hxx>
#include <tools/simdsupport.hxx>
#include <stdlib.h>
namespace sc::op
{
#ifdef LO_AVX512F_AVAILABLE
bool hasAVX512FCode() { return true; }
using namespace AVX512;
/** Kahan sum with AVX512.
*/
static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
{
// Temporal parameter
__m512d t = _mm512_add_pd(sum, value);
// Absolute value of the total sum
__m512d asum = _mm512_abs_pd(sum);
// Absolute value of the value to add
__m512d avalue = _mm512_abs_pd(value);
// Compare the absolute values sum >= value
__mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ);
// The following code has this form ( a - t + b)
// Case 1: a = sum b = value
// Case 2: a = value b = sum
__m512d a = _mm512_mask_blend_pd(mask, sum, value);
__m512d b = _mm512_mask_blend_pd(mask, value, sum);
err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b));
// Store result
sum = t;
}
/** Execute Kahan sum with AVX512.
*/
KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
{
// Make sure we don't fall out of bounds.
// This works by sums of 8 terms.
// So the 8'th term is i+7
// If we iterate until nSize won't fall out of bounds
if (nSize > i + 7)
{
// Setup sums and errors as 0
__m512d sum = _mm512_setzero_pd();
__m512d err = _mm512_setzero_pd();
// Sum the stuff
for (; i + 7 < nSize; i += 8)
{
// Kahan sum
__m512d load = _mm512_loadu_pd(pCurrent);
sumAVX512(sum, err, load);
pCurrent += 8;
}
// Store result
static_assert(sizeof(double) == 8);
double sums[8];
double errs[8];
_mm512_storeu_pd(&sums[0], sum);
_mm512_storeu_pd(&errs[0], err);
// First Kahan & pairwise summation
// 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
sumNeumanierNormal(sums[0], errs[0], sums[1]);
sumNeumanierNormal(sums[2], errs[2], sums[3]);
sumNeumanierNormal(sums[4], errs[4], sums[5]);
sumNeumanierNormal(sums[6], errs[6], sums[7]);
sumNeumanierNormal(sums[0], errs[0], errs[1]);
sumNeumanierNormal(sums[2], errs[2], errs[3]);
sumNeumanierNormal(sums[4], errs[4], errs[5]);
sumNeumanierNormal(sums[6], errs[6], errs[7]);
// Second Kahan & pairwise summation
// 0+2 4+6 -> 0, 4
sumNeumanierNormal(sums[0], errs[0], sums[2]);
sumNeumanierNormal(sums[4], errs[4], sums[6]);
sumNeumanierNormal(sums[0], errs[0], errs[2]);
sumNeumanierNormal(sums[4], errs[4], errs[6]);
// Third Kahan & pairwise summation
// 0+4 -> 0
sumNeumanierNormal(sums[0], errs[0], sums[4]);
sumNeumanierNormal(sums[0], errs[0], errs[4]);
// Return final result
return { sums[0], errs[0] };
}
return { 0.0, 0.0 };
}
#else // LO_AVX512F_AVAILABLE
bool hasAVX512FCode() { return false; }
KahanSumSimple executeAVX512F(size_t&, size_t, const double*) { abort(); }
#endif
} // end namespace sc::op
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View file

@ -8,24 +8,19 @@
*
*/
#define LO_ARRAYSUM_SPACE SSE2
#include "arraysum.hxx"
#include <arraysumfunctorinternal.hxx>
#include <arraysumfunctor.hxx>
#include <tools/simd.hxx>
#include <tools/simdsupport.hxx>
#include <stdlib.h>
#if SC_USE_SSE2
namespace sc::op
{
#ifdef LO_SSE2_AVAILABLE
bool hasSSE2Code() { return true; }
using namespace SSE2;
/** Kahan sum with SSE2.
*/
static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
@ -51,7 +46,7 @@ static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
/** Execute Kahan sum with SSE2.
*/
KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
{
// Make sure we don't fall out of bounds.
// This works by sums of 8 terms.
@ -121,13 +116,8 @@ KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
return { 0.0, 0.0 };
}
#else // LO_SSE2_AVAILABLE
bool hasSSE2Code() { return false; }
KahanSumSimple executeSSE2(size_t&, size_t, const double*) { abort(); }
} // namespace
#endif
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */