remove AVX and AVX512 code from Calc

It's been a source of numerous problems since the beginning. Poor separation of C++ code causing the compiler to emit some generic code as CPU-specific, compiler optimizations moving CPU-specific code out of #ifdef to unguarded static initialization, etc. And it doesn't seem to even particularly improve performance, on my Ryzen2500U for one full column (1m cells) sumArray() takes about 1.6ms with AVX, 1.9ms with SSE2 and 4.6ms with generic code. So SSE2 code is perhaps worth it, especially given that SSE2 is our baseline requirement on x86_64 everywhere and x86 on Windows, but AVX+ is nowhere near worth the trouble. So this code removes all AVX+ code from Calc, and makes SSE2 a hardcoded option on where it's guaranteed. If we raise the baseline to AVX, the SSE2 code may be replaced by the one removed by this commit. Generic code is there for other platforms, if other platforms add CPU-specific code, they should preferably follow the same rules. This does not necessarily mean that CPU-specific code cannot be used at all. Some externals use them, for example. It just needs to be working, maintained, and worth the trouble. Change-Id: I5ab919930df9d0223db68a94bf84947984d313ac Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129733 Tested-by: Jenkins Reviewed-by: Eike Rathke <erack@redhat.com>
2022-02-09 16:23:33 +01:00 · 2022-02-09 16:23:33 +01:00 · 2d36e7f518
commit 2d36e7f518
parent 51d1f84abf
11 changed files with 37 additions and 363 deletions
--- a/include/tools/cpuid.hxx
+++ b/include/tools/cpuid.hxx
@ -23,6 +23,8 @@ or inline functions, otherwise their possibly emitted copies compiled
 with the CPU-specific instructions might be chosen by the linker as the copy
 to keep.

+Also see the note at the top of simdsupport.hxx .
+
 */

 namespace cpuid {
--- a/include/tools/simdsupport.hxx
+++ b/include/tools/simdsupport.hxx
@ -8,6 +8,16 @@
 *
 */

+// IMPORTANT: Having CPU-specific routines turned out to be a maintenance
+// problem, because of various problems such as compilers moving CPU-specific
+// code out of #ifdef code into static initialization or our code using C++
+// features that caused the compiler to emit code that used CPU-specific
+// instructions (even cpuid.hxx isn't safe, see the comment there).
+// The only safe usage is using CPU-specific code that's always available,
+// such as SSE2-specific code for x86_64. Do not use for anything else
+// unless you really know what you are doing (and you check git history
+// to learn from past problems).
+
 // Determine the compiler support for SIMD compiler intrinsics.
 // This changes from one compiled unit to the other, depending if
 // the support has been detected and if the compiled unit contains
--- a/sc/Library_sc.mk
+++ b/sc/Library_sc.mk
@ -207,6 +207,7 @@ $(eval $(call gb_Library_add_exception_objects,sc,\
    sc/source/core/tool/address \
    sc/source/core/tool/adiasync \
    sc/source/core/tool/appoptio \
+    sc/source/core/tool/arraysumSSE2 \
    sc/source/core/tool/autoform \
    sc/source/core/tool/calcconfig \
    sc/source/core/tool/callform \
@ -682,18 +683,6 @@ $(eval $(call gb_Library_add_exception_objects,sc,\
    sc/source/ui/xmlsource/xmlsourcedlg \
 ))

-$(eval $(call gb_Library_add_exception_objects,sc,\
-    sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \
-))
-
-$(eval $(call gb_Library_add_exception_objects,sc,\
-    sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \
-))
-
-$(eval $(call gb_Library_add_exception_objects,sc,\
-    sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \
-))
-
 ifeq ($(ENABLE_FORMULA_LOGGER),TRUE)
 $(eval $(call gb_Library_add_exception_objects,sc,\
    sc/source/core/tool/formulalogger \
--- a/sc/inc/arraysumfunctor.hxx
+++ b/sc/inc/arraysumfunctor.hxx
@ -12,16 +12,25 @@

 #include <cmath>
 #include "kahan.hxx"
-#include "arraysumfunctorinternal.hxx"
-#include <tools/cpuid.hxx>
+#include "arraysumfunctor.hxx"
 #include <formula/errorcodes.hxx>

 namespace sc::op
 {
-/* Checkout available optimization options */
-const bool hasAVX512F = hasAVX512FCode() && cpuid::hasAVX512F();
-const bool hasAVX = hasAVXCode() && cpuid::hasAVX();
-const bool hasSSE2 = hasSSE2Code() && cpuid::hasSSE2();
+// Checkout available optimization options.
+// Note that it turned out to be problematic to support CPU-specific code
+// that's not guaranteed to be available on that specific platform (see
+// git history). SSE2 is guaranteed on x86_64 and it is our baseline requirement
+// for x86 on Windows, so SSE2 use is hardcoded on those platforms.
+// Whenever we raise baseline to e.g. AVX, this may get
+// replaced with AVX code (get it from git history).
+// Do it similarly with other platforms.
+#if defined(X86_64) || (defined(X86) && defined(_WIN32))
+#define SC_USE_SSE2 1
+KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
+#else
+#define SC_USE_SSE2 0
+#endif

 /**
  * If no boosts available, Unrolled KahanSum.
@ -60,12 +69,9 @@ static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pC
  */
 static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent)
 {
-    if (hasAVX512F)
-        return executeAVX512F(i, nSize, pCurrent);
-    if (hasAVX)
-        return executeAVX(i, nSize, pCurrent);
-    if (hasSSE2)
-        return executeSSE2(i, nSize, pCurrent);
+#if SC_USE_SSE2
+    return executeSSE2(i, nSize, pCurrent);
+#endif
    return executeUnrolled(i, nSize, pCurrent);
 }

--- a/sc/inc/arraysumfunctorinternal.hxx
+++ b/sc/inc/arraysumfunctorinternal.hxx
@ -1,36 +0,0 @@
-/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
-/*
- * This file is part of the LibreOffice project.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include "scdllapi.h"
-
-namespace sc::op
-{
-// Plain old data structure, to be used by code compiled with CPU intrinsics without generating any
-// code for it (so that code requiring intrinsics doesn't get accidentally selected as the one copy
-// when merging duplicates).
-struct KahanSumSimple
-{
-    double m_fSum;
-    double m_fError;
-};
-
-/* Available methods */
-SC_DLLPUBLIC KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent);
-SC_DLLPUBLIC KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent);
-SC_DLLPUBLIC KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
-
-SC_DLLPUBLIC bool hasAVX512FCode();
-SC_DLLPUBLIC bool hasAVXCode();
-SC_DLLPUBLIC bool hasSSE2Code();
-
-} // namespace
-
-/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */
--- a/sc/inc/kahan.hxx
+++ b/sc/inc/kahan.hxx
@ -12,8 +12,6 @@
 #include <rtl/math.hxx>
 #include <cmath>

-#include "arraysumfunctorinternal.hxx"
-
 /**
  * This class provides LO with Kahan summation algorithm
  * About this algorithm: https://en.wikipedia.org/wiki/Kahan_summation_algorithm
@ -40,12 +38,6 @@ public:
    {
    }

-    constexpr KahanSum(const sc::op::KahanSumSimple& sum)
-        : m_fSum(sum.m_fSum)
-        , m_fError(sum.m_fError)
-    {
-    }
-
    constexpr KahanSum(const KahanSum& fSum) = default;

 public:
--- a/sc/qa/unit/functions_statistical.cxx
+++ b/sc/qa/unit/functions_statistical.cxx
@ -1,5 +1,4 @@
 #include "functions_test.hxx"
-#include <arraysumfunctor.hxx>

 class StatisticalFunctionsTest : public FunctionsTest
 {
@ -7,11 +6,9 @@ public:
    StatisticalFunctionsTest();

    void testStatisticalFormulasFODS();
-    void testIntrinsicSums();

    CPPUNIT_TEST_SUITE(StatisticalFunctionsTest);
    CPPUNIT_TEST(testStatisticalFormulasFODS);
-    CPPUNIT_TEST(testIntrinsicSums);
    CPPUNIT_TEST_SUITE_END();

 };
@ -29,25 +26,6 @@ StatisticalFunctionsTest::StatisticalFunctionsTest():
 {
 }

-void StatisticalFunctionsTest::testIntrinsicSums()
-{
-    // Checkout SSE2, AVX and AVX512 operations
-        // Needs exactly 9 terms
-    double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 };
-    double* pCurrent = summands;
-    size_t i = 0;
-    if (sc::op::hasAVX512F)
-        CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX512F(i, 9, pCurrent)).get());
-    i = 0;
-    if (sc::op::hasAVX)
-        CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX(i, 9, pCurrent)).get());
-    i = 0;
-    if (sc::op::hasSSE2)
-        CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeSSE2(i, 9, pCurrent)).get());
-    i = 0;
-    CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get());
-}
-
 CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest);

 CPPUNIT_PLUGIN_IMPLEMENT();
--- a/sc/source/core/tool/arraysum.hxx
+++ b/sc/source/core/tool/arraysum.hxx
@ -14,19 +14,6 @@

 namespace sc::op
 {
-// Code must not be shared between different CPU intrinsics flags (e.g. in debug mode the compiler would not
-// inline them, and merge the copies, keeping only the one with the most demanding CPU set that's not available otherwise).
-// Put everything in a different namespace and additionally try to force inlining.
-namespace LO_ARRAYSUM_SPACE
-{
-#if defined _MSC_VER
-#define INLINE __forceinline static
-#elif defined __GNUC__
-#define INLINE __attribute__((always_inline)) static inline
-#else
-#define static inline
-#endif
-
 /**
  * Performs one step of the Neumanier sum between doubles
  * Overwrites the summand and error
@ -34,7 +21,7 @@ namespace LO_ARRAYSUM_SPACE
  * @param err
  * @param value
  */
-INLINE void sumNeumanierNormal(double& sum, double& err, const double& value)
+inline void sumNeumanierNormal(double& sum, double& err, const double& value)
 {
    double t = sum + value;
    if (fabs(sum) >= fabs(value))
@ -44,9 +31,6 @@ INLINE void sumNeumanierNormal(double& sum, double& err, const double& value)
    sum = t;
 }

-#undef INLINE
-
-} // end namespace LO_ARRAYSUM_SPACE
 } // end namespace sc::op

 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--- a/sc/source/core/tool/arraysumAVX.cxx
+++ b/sc/source/core/tool/arraysumAVX.cxx
@ -1,121 +0,0 @@
-/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
-/*
- * This file is part of the LibreOffice project.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- */
-
-#define LO_ARRAYSUM_SPACE AVX
-#include "arraysum.hxx"
-
-#include <arraysumfunctorinternal.hxx>
-
-#include <tools/simd.hxx>
-#include <tools/simdsupport.hxx>
-
-#include <stdlib.h>
-
-namespace sc::op
-{
-#ifdef LO_AVX_AVAILABLE
-
-bool hasAVXCode() { return true; }
-
-using namespace AVX;
-
-/** Kahan sum with AVX.
-  */
-static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value)
-{
-    static const __m256d ANNULATE_SIGN_BIT
-        = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
-    // Temporal parameter
-    __m256d t = _mm256_add_pd(sum, value);
-    // Absolute value of the total sum
-    __m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT);
-    // Absolute value of the value to add
-    __m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT);
-    // Compare the absolute values sum >= value
-    __m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ);
-    // The following code has this form ( a - t + b)
-    // Case 1: a = sum b = value
-    // Case 2: a = value b = sum
-    __m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value));
-    __m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum));
-    err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b));
-    // Store result
-    sum = t;
-}
-
-/** Execute Kahan sum with AVX.
-  */
-KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent)
-{
-    // Make sure we don't fall out of bounds.
-    // This works by sums of 8 terms.
-    // So the 8'th term is i+7
-    // If we iterate until nSize won't fall out of bounds
-    if (nSize > i + 7)
-    {
-        // Setup sums and errors as 0
-        __m256d sum1 = _mm256_setzero_pd();
-        __m256d err1 = _mm256_setzero_pd();
-        __m256d sum2 = _mm256_setzero_pd();
-        __m256d err2 = _mm256_setzero_pd();
-
-        for (; i + 7 < nSize; i += 8)
-        {
-            // Kahan sum 1
-            __m256d load1 = _mm256_loadu_pd(pCurrent);
-            sumAVX(sum1, err1, load1);
-            pCurrent += 4;
-
-            // Kahan sum 2
-            __m256d load2 = _mm256_loadu_pd(pCurrent);
-            sumAVX(sum2, err2, load2);
-            pCurrent += 4;
-        }
-
-        // Now we combine pairwise summation with Kahan summation
-
-        // sum 1 + sum 2 -> sum 1
-        sumAVX(sum1, err1, sum2);
-        sumAVX(sum1, err1, err2);
-
-        // Store results
-        double sums[4];
-        double errs[4];
-        _mm256_storeu_pd(&sums[0], sum1);
-        _mm256_storeu_pd(&errs[0], err1);
-
-        // First Kahan & pairwise summation
-        // 0+1 1+2 -> 0, 2
-        sumNeumanierNormal(sums[0], errs[0], sums[1]);
-        sumNeumanierNormal(sums[2], errs[2], sums[3]);
-        sumNeumanierNormal(sums[0], errs[0], errs[1]);
-        sumNeumanierNormal(sums[2], errs[2], errs[3]);
-
-        // 0+2 -> 0
-        sumNeumanierNormal(sums[0], errs[0], sums[2]);
-        sumNeumanierNormal(sums[0], errs[0], errs[2]);
-
-        // Store result
-        return { sums[0], errs[0] };
-    }
-    return { 0.0, 0.0 };
-}
-
-#else // LO_AVX_AVAILABLE
-
-bool hasAVXCode() { return false; }
-
-KahanSumSimple executeAVX(size_t&, size_t, const double*) { abort(); }
-
-#endif
-
-} // end namespace sc::op
-
-/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--- a/sc/source/core/tool/arraysumAVX512.cxx
+++ b/sc/source/core/tool/arraysumAVX512.cxx
@ -1,120 +0,0 @@
-/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
-/*
- * This file is part of the LibreOffice project.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- */
-
-#define LO_ARRAYSUM_SPACE AVX512
-#include "arraysum.hxx"
-
-#include <arraysumfunctorinternal.hxx>
-
-#include <tools/simd.hxx>
-#include <tools/simdsupport.hxx>
-
-#include <stdlib.h>
-
-namespace sc::op
-{
-#ifdef LO_AVX512F_AVAILABLE
-
-bool hasAVX512FCode() { return true; }
-
-using namespace AVX512;
-
-/** Kahan sum with AVX512.
-  */
-static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
-{
-    // Temporal parameter
-    __m512d t = _mm512_add_pd(sum, value);
-    // Absolute value of the total sum
-    __m512d asum = _mm512_abs_pd(sum);
-    // Absolute value of the value to add
-    __m512d avalue = _mm512_abs_pd(value);
-    // Compare the absolute values sum >= value
-    __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ);
-    // The following code has this form ( a - t + b)
-    // Case 1: a = sum b = value
-    // Case 2: a = value b = sum
-    __m512d a = _mm512_mask_blend_pd(mask, sum, value);
-    __m512d b = _mm512_mask_blend_pd(mask, value, sum);
-    err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b));
-    // Store result
-    sum = t;
-}
-
-/** Execute Kahan sum with AVX512.
-  */
-KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
-{
-    // Make sure we don't fall out of bounds.
-    // This works by sums of 8 terms.
-    // So the 8'th term is i+7
-    // If we iterate until nSize won't fall out of bounds
-    if (nSize > i + 7)
-    {
-        // Setup sums and errors as 0
-        __m512d sum = _mm512_setzero_pd();
-        __m512d err = _mm512_setzero_pd();
-
-        // Sum the stuff
-        for (; i + 7 < nSize; i += 8)
-        {
-            // Kahan sum
-            __m512d load = _mm512_loadu_pd(pCurrent);
-            sumAVX512(sum, err, load);
-            pCurrent += 8;
-        }
-
-        // Store result
-        static_assert(sizeof(double) == 8);
-        double sums[8];
-        double errs[8];
-        _mm512_storeu_pd(&sums[0], sum);
-        _mm512_storeu_pd(&errs[0], err);
-
-        // First Kahan & pairwise summation
-        // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
-        sumNeumanierNormal(sums[0], errs[0], sums[1]);
-        sumNeumanierNormal(sums[2], errs[2], sums[3]);
-        sumNeumanierNormal(sums[4], errs[4], sums[5]);
-        sumNeumanierNormal(sums[6], errs[6], sums[7]);
-        sumNeumanierNormal(sums[0], errs[0], errs[1]);
-        sumNeumanierNormal(sums[2], errs[2], errs[3]);
-        sumNeumanierNormal(sums[4], errs[4], errs[5]);
-        sumNeumanierNormal(sums[6], errs[6], errs[7]);
-
-        // Second Kahan & pairwise summation
-        // 0+2 4+6 -> 0, 4
-        sumNeumanierNormal(sums[0], errs[0], sums[2]);
-        sumNeumanierNormal(sums[4], errs[4], sums[6]);
-        sumNeumanierNormal(sums[0], errs[0], errs[2]);
-        sumNeumanierNormal(sums[4], errs[4], errs[6]);
-
-        // Third Kahan & pairwise summation
-        // 0+4 -> 0
-        sumNeumanierNormal(sums[0], errs[0], sums[4]);
-        sumNeumanierNormal(sums[0], errs[0], errs[4]);
-
-        // Return final result
-        return { sums[0], errs[0] };
-    }
-    return { 0.0, 0.0 };
-}
-
-#else // LO_AVX512F_AVAILABLE
-
-bool hasAVX512FCode() { return false; }
-
-KahanSumSimple executeAVX512F(size_t&, size_t, const double*) { abort(); }
-
-#endif
-
-} // end namespace sc::op
-
-/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--- a/sc/source/core/tool/arraysumSSE2.cxx
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@ -8,24 +8,19 @@
 *
 */

-#define LO_ARRAYSUM_SPACE SSE2
 #include "arraysum.hxx"

-#include <arraysumfunctorinternal.hxx>
+#include <arraysumfunctor.hxx>

 #include <tools/simd.hxx>
 #include <tools/simdsupport.hxx>

 #include <stdlib.h>

+#if SC_USE_SSE2
+
 namespace sc::op
 {
-#ifdef LO_SSE2_AVAILABLE
-
-bool hasSSE2Code() { return true; }
-
-using namespace SSE2;
-
 /** Kahan sum with SSE2.
  */
 static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
@ -51,7 +46,7 @@ static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)

 /** Execute Kahan sum with SSE2.
  */
-KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
+KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
 {
    // Make sure we don't fall out of bounds.
    // This works by sums of 8 terms.
@ -121,13 +116,8 @@ KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
    return { 0.0, 0.0 };
 }

-#else // LO_SSE2_AVAILABLE
-
-bool hasSSE2Code() { return false; }
-
-KahanSumSimple executeSSE2(size_t&, size_t, const double*) { abort(); }
+} // namespace

 #endif
-}

 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */