SIMD boilerplate.

2024-05-27 22:01:53 +02:00
parent 8b3d61ff5d
commit fb2e6f960d
7 changed files with 453 additions and 50 deletions
--- a/Engine/Source/Runtime/Core/public/Math/Detail/Vector4Decl.inl
+++ b/Engine/Source/Runtime/Core/public/Math/Detail/Vector4Decl.inl
@@ -0,0 +1,167 @@
+#pragma once
+
+#include "Core/public/Math/Boilerplate.h"
+#include "Core/public/Math/MathCommon.hpp"
+#include "Core/public/Math/MathFwd.h"
+
+namespace Phanes::Core::Math::Detail
+{
+    template<RealType T, bool A>
+    struct compute_vec4_add {};
+
+    template<RealType T, bool A>
+    struct compute_vec4_sub {};
+
+    template<RealType T, bool A>
+    struct compute_vec4_mul {};
+
+    template<RealType T, bool A>
+    struct compute_vec4_div {};
+
+    template<RealType T, bool A>
+    struct compute_vec4_eq{};
+
+    template<RealType T, bool A>
+    struct compute_vec4_ieq {};
+
+    template<RealType T, bool A>
+    struct compute_vec4_inc {};
+
+    template<RealType T, bool A>
+    struct compute_vec4_dec {};
+
+
+    template<RealType T>
+    struct compute_vec4_add<T, false>
+    { 
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
+        {
+            r.x = v1.x + v2.x;
+            r.y = v1.y + v2.y;
+            r.z = v1.z + v2.z;
+            r.w = v1.w + v2.w;
+        }
+
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
+        {
+            r.x = v1.x + s;
+            r.y = v1.y + s;
+            r.z = v1.z + s;
+            r.w = v1.w + s;
+        }
+    };
+
+
+    template<RealType T>
+    struct compute_vec4_sub<T, false>
+    {
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
+        {
+            r.x = v1.x - v2.x;
+            r.y = v1.y - v2.y;
+            r.z = v1.z - v2.z;
+            r.w = v1.w - v2.w;
+        }
+
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
+        {
+            r.x = v1.x - s;
+            r.y = v1.y - s;
+            r.z = v1.z - s;
+            r.w = v1.w - s;
+        }
+    };
+
+
+    template<RealType T>
+    struct compute_vec4_mul<T, false>
+    {
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
+        {
+            r.x = v1.x * v2.x;
+            r.y = v1.y * v2.y;
+            r.z = v1.z * v2.z;
+            r.w = v1.w * v2.w;
+        }
+
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
+        {
+            r.x = v1.x * s;
+            r.y = v1.y * s;
+            r.z = v1.z * s;
+            r.w = v1.w * s;
+        }
+    };
+
+
+    template<RealType T>
+    struct compute_vec4_div<T, false>
+    {
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
+        {
+            r.x = v1.x / v2.x;
+            r.y = v1.y / v2.y;
+            r.z = v1.z / v2.z;
+            r.w = v1.w / v2.w;
+        }
+
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
+        {
+            s = (T)1.0 / s;
+
+            r.x = v1.x * s;
+            r.y = v1.y * s;
+            r.z = v1.z * s;
+            r.w = v1.w * s;
+        }
+    };
+
+    template<RealType T>
+    struct compute_vec4_eq<T, false>
+    {
+        static constexpr bool map(const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
+        {
+            return (Phanes::Core::Math::Abs(v1.x - v2.x) < P_FLT_INAC &&
+                    Phanes::Core::Math::Abs(v1.y - v2.y) < P_FLT_INAC &&
+                    Phanes::Core::Math::Abs(v1.z - v2.z) < P_FLT_INAC &&
+                    Phanes::Core::Math::Abs(v1.w - v2.w) < P_FLT_INAC);
+        }
+    };
+
+    template<RealType T>
+    struct compute_vec4_ieq<T, false>
+    {
+        static constexpr bool map(const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
+        {
+            return (Phanes::Core::Math::Abs(v1.x - v2.x) > P_FLT_INAC ||
+                    Phanes::Core::Math::Abs(v1.y - v2.y) > P_FLT_INAC ||
+                    Phanes::Core::Math::Abs(v1.z - v2.z) > P_FLT_INAC ||
+                    Phanes::Core::Math::Abs(v1.w - v2.w) > P_FLT_INAC);
+        }
+    };
+
+    template<RealType T>
+    struct compute_vec4_inc<T, false>
+    {
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1)
+        {
+            r.x = v1.x + 1;
+            r.y = v1.y + 1;
+            r.z = v1.z + 1;
+            r.w = v1.w + 1;
+        }
+    };
+
+    template<RealType T>
+    struct compute_vec4_dec<T, false>
+    {
+        static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1)
+        {
+            r.x = v1.x - 1;
+            r.y = v1.y - 1;
+            r.z = v1.z - 1;
+            r.w = v1.w - 1;
+        }
+    };
+}
+
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h
@@ -0,0 +1,227 @@
+// This file includes the necessary header for vectorization intrinsics. If no specifics are defined SSE4.2 is used.
+// 
+// ARM is not supported.
+
+#include "Core/public/Math/SIMD/Platform.h"
+#include "Core/public/Math/MathTypes.h"
+
+#if P_INTRINSICS == P_INTRINSICS_AVX2
+#   include <immintrin.h>
+#elif P_INTRINSICS == P_INTRINSICS_AVX
+#   include <immintrin.h>
+#elif P_INTRINSICS == P_INTRINSICS_SSE
+#   include <nmmintrin.h> 
+#elif P_INTRINSICS == P_INTRINSICS_NEON
+#   include "neon.h" // <- Not supported
+#endif
+
+// use_simd for metaprogramming
+namespace Phanes::Core::Math::SIMD
+{
+    
+    /// <summary>
+    /// This decides, whether simd operations should be used, based on the vector type, it's size, the vector alignment and whether the right extension can be loaded during compiletime.
+    /// </summary>
+    /// <typeparam name="T">Type of vector</typeparam>
+    /// <typeparam name="L">Length of vector</typeparam>
+    /// <typeparam name="SimdActive">Whether SIMD intrinsics exist, that support the vector type and length.</typeparam>
+    /// <typeparam name="IsAligned">Whether the vector is aligned for simd usage.</typeparam>
+    template<typename T, size_t L, bool IsAligned>
+    struct use_simd
+    {
+        bool value = false;
+    };
+
+
+    // SSE / NEON
+
+    template<>
+    struct use_simd<float, 4, true>
+    {
+        bool value = true && (P_SSE__ || P_NEON__);
+    };
+
+    template<>
+    struct use_simd<float, 3, true>
+    {
+        bool value = true && (P_SSE__ || P_NEON__);
+    };
+
+    template<>
+    struct use_simd<int, 4, true>
+    {
+        bool value = true && (P_SSE__ || P_NEON__);
+    };
+
+    template<>
+    struct use_simd<int, 3, true>
+    {
+        bool value = true && (P_SSE__ || P_NEON__);
+    };
+
+    template<>
+    struct use_simd<unsigned int, 4, true>
+    {
+        bool value = true && (P_SSE__ || P_NEON__);
+    };
+
+    template<>
+    struct use_simd<unsigned int, 3, true>
+    {
+        bool value = true && (P_SSE__ || P_NEON__);
+    };
+
+    // SSE
+
+    template<>
+    struct use_simd<double, 2, true>
+    {
+        bool value = true && P_SSE__;
+    };
+
+    template<>
+    struct use_simd<Phanes::Core::Types::int64, 2, true>
+    {
+        bool value = true && P_SSE__;
+    };
+
+    template<>
+    struct use_simd<Phanes::Core::Types::uint64, 2, true>
+    {
+        bool value = true && P_SSE__;
+    };
+
+
+
+    // AVX 
+
+    template<>
+    struct use_simd<double, 4, true>
+    {
+        bool value = true && P_AVX__;
+    };
+
+    template<>
+    struct use_simd<double, 3, true>
+    {
+        bool value = true && P_AVX__;
+    };
+
+    template<>
+    struct use_simd<float, 8, true>
+    {
+        bool value = true && P_AVX__;
+    };
+
+
+    // AVX2
+
+    template<>
+    struct use_simd<Phanes::Core::Types::int64, 4, true>
+    {
+        bool value = true && P_AVX2__;
+    };
+
+    template<>
+    struct use_simd<Phanes::Core::Types::int64, 3, true>
+    {
+        bool value = true && P_AVX2__;
+    };
+
+    template<>
+    struct use_simd<Phanes::Core::Types::uint64, 4, true>
+    {
+        bool value = true && P_AVX2__;
+    };
+
+    template<>
+    struct use_simd<Phanes::Core::Types::uint64, 3, true>
+    {
+        bool value = true && P_AVX2__;
+    };
+
+    template<>
+    struct use_simd<int, 8, true>
+    {
+        bool value = true && P_AVX2__;
+    };
+
+    template<>
+    struct use_simd<unsigned int, 8, true>
+    {
+        bool value = true && P_AVX2__;
+    };
+}
+
+// Register aliases
+namespace Phanes::Core::Types
+{
+
+#if P_INTRINSICS >= 1
+
+    typedef __m128      Vec4f32Reg;
+    typedef __m128d     Vec2f64Reg;
+
+    typedef __m128i     Vec4i32Reg;
+    typedef __m128i     Vec2i64Reg;
+
+    typedef __m128i     Vec4u32Reg;
+    typedef __m128i     Vec2u64Reg;
+
+#elif P_INTRINSICS != P_INTRINSICS_NEON
+
+    typedef struct alignas(16) Vec4f32Reg { float data[4]; }                        Vec4f32Reg;
+    typedef struct alignas(16) Vec2f64Reg { double data[2]; }                       Vec2f64Reg;
+    typedef struct alignas(16) Vec4i32Reg { int data[4]; }                          Vec4i32Reg;
+    typedef struct alignas(16) Vec2i64Reg { Phanes::Core::Types::int64 data[2]; }   Vec2i64Reg;
+    typedef struct alignas(16) Vec4u32Reg { unsigned int data[4]; }                 Vec4u32Reg;
+    typedef struct alignas(16) Vec2u64Reg { Phanes::Core::Types::uint64 data[4]; }  Vec2u64Reg;
+
+#endif
+
+
+#if P_INTRINSICS >= 2
+
+    typedef __m256      Vec4x2f32Reg;
+    typedef __m256      Vec8f32Reg;
+    typedef __m256d     Vec2x2f64Reg;
+    typedef __m256d     Vec4f64Reg;
+
+#elif P_INTRINSICS != P_INTRINSICS_NEON
+
+    typedef struct alignas(32) Vec4x2f32Reg { float data[8]; }  Vec4x2f32Reg;
+    typedef struct alignas(32) Vec8f32Reg   { float data[8]; }  Vec8f32Reg;
+    typedef struct alignas(32) Vec2x2f64Reg { double data[4]; } Vec2x2f64Reg;
+    typedef struct alignas(32) Vec4f64Reg   { double data[4]; } Vec4f64Reg;
+
+#endif
+
+
+#if P_INTRINSICS == 3
+
+    typedef __m256i     Vec4x2i32Reg;
+    typedef __m256i     Vec8i32Reg;
+    typedef __m256i     Vec2x2i64Reg;
+    typedef __m256i     Vec4i64Reg;
+
+    typedef __m256i     Vec4x2u32Reg;
+    typedef __m256i     Vec8u32Reg;
+    typedef __m256i     Vec2x2u64Reg;
+    typedef __m256i     Vec4u64Reg;
+
+#elif P_INTRINSICS != P_INTRINSICS_NEON
+
+    typedef struct alignas(32) Vec4x2i32Reg { int data[8]; }                         Vec4x2i32Reg;
+    typedef struct alignas(32) Vec8i32Reg   { int data[8]; }                         Vec8i32Reg;
+    typedef struct alignas(32) Vec2x2i64Reg { Phanes::Core::Types::int64 data[4]; }  Vec2x2i64Reg;
+    typedef struct alignas(32) Vec4i64Reg   { Phanes::Core::Types::int64 data[4]; }  Vec4i64Reg;
+
+    typedef struct alignas(32) Vec4x2u32Reg { unsigned int data[8]; }                Vec4x2u32Reg;
+    typedef struct alignas(32) Vec8u32Reg   { unsigned int data[8]; }                Vec8u32Reg;
+    typedef struct alignas(32) Vec2x2u64Reg { Phanes::Core::Types::uint64 data[4]; } Vec2x2u64Reg;
+    typedef struct alignas(32) Vec4u64Reg   { Phanes::Core::Types::uint64 data[4]; } Vec4u64Reg;
+
+#endif
+
+    // NEON ...
+}
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
@@ -1,5 +1,25 @@
 #pragma once

+#include "Core/public/Math/Boilerplate.h"
 #include <nmmintrin.h> 

+// -> For IntelliSense

+// ============ //
+//   TVector4   //
+// ============ //
+
+
+namespace Phanes::Core::Math::Detail
+{
+    // Template class has already been defined and is included through: Storage.h -> Vector4.hpp -> SIMDIntrinsics.h -> PhanesVectorMathSEE.hpp
+
+    template<>
+    struct compute_vec4_add<float, true>
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2)
+        {
+            r.comp = _mm_add_ps(v1.comp, v2.comp);
+        }
+    };
+}
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
@@ -254,6 +254,11 @@

 // Define also supported instruction sets for Visual Studio, as it only defines the latest (e.g. only __AVX__ not __SSE4__ ...).

+#define P_AVX2__    0
+#define P_AVX__     0
+#define P_SSE__     0 
+#define P_NEON__    0
+
 #ifdef P_FORCE_INTRINSICS
    
 #   undef __AVX2__
@@ -267,22 +272,22 @@
 #else

 #   ifdef __AVX2__
-#       define P_AVX2__
+#       define P_AVX2__ 1
 #   elif defined(__AVX__)
-#       define P_AVX__
+#       define P_AVX__ 1
 #   elif defined(__SSE__)
-#       define P_SSE__
+#       define P_SSE__ 1
 #   endif


 #endif // !P_FORCE_INTRINSICS

 #ifdef P_AVX2__
-#   define P_AVX__
+#   define P_AVX__ 1
 #endif

 #ifdef P_AVX__
-#   define P_SSE__ 
+#   define P_SSE__ 1
 #endif


@@ -300,14 +305,15 @@
 #   undef P_SSE__
 #   undef P_SSE__
 #else
-#   if defined(P_AVX__) && !defined(P_AVX2__)
-#      define P_INTRINSICS P_INTRINSICS_AVX
-#   elif defined(P_AVX2__)
-#      define P_INTRINSICS P_INTRINSICS_AVX2
-#   elif  (defined(__SSE__) || defined(P_SSE__)) && !defined(P_AVX__)
-#      define P_INTRINSICS P_INTRINSICS_SSE
+#   if (P_AVX__ == 1) && (P_AVX2__ == 0)
+#       define P_INTRINSICS P_INTRINSICS_AVX
+#   elif P_AVX2__ == 1
+#       define P_INTRINSICS P_INTRINSICS_AVX2
+#   elif P_SSE__ == 1
+#       define P_INTRINSICS P_INTRINSICS_SSE
 #   elif defined(P_ARM_ARCH)
-#      define P_INTRINSICS P_INTRINSICS_NEON
+#       define P_INTRINSICS P_INTRINSICS_NEON
+#       define P_NEON__ 1
 #   elif !defined(P_FORCE_INTRINSICS)
 #       error No SIMD instruction set detected. Use P_FORCE_FPU to disable SIMD extensions.
 #   endif
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/SIMDIntrinsics.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/SIMDIntrinsics.h
@@ -0,0 +1,17 @@
+#pragma once
+
+
+#include "Core/public/Math/SIMD/Platform.h"
+
+#if P_INTRINSICS == P_INTRINSICS_AVX2
+#   include "PhanesVectorMathAVX2.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_AVX
+#   include "PhanesVectorMathAVX.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_SSE
+#   include "PhanesVectorMathSSE.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_NEON
+#   include "PhanesVectorMathNeon.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_FPU 
+#   include "PhanesVectorMathFPU.hpp"
+#endif
+
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/Storage.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/Storage.h
@@ -1,13 +1,13 @@
 // Defines on compile time, whether a xmm register or an array should be used.
 #pragma once

-#include "Core/public/Math/SIMD/PhanesSIMD.h"
+#include "Core/public/Math/SIMD/PhanesSIMDTypes.h"

 #include "Core/public/Math/MathTypes.h"

 namespace Phanes::Core::SIMD
 {
-    template<size_t L, typename T, bool IsAligned>
+    template<size_t L, typename T, bool UseSimd>
    struct Storage;

    // General unaligned memory storage
@@ -143,37 +143,3 @@ namespace Phanes::Core::SIMD
        typedef Phanes::Core::Types::Vec4x2u32Reg type;
    };
 }
-
-
-struct Vec4
-{
-public:
-    union
-    {
-        struct
-        {
-
-            int x, y, z, w;
-
-        };
-
-        typename Phanes::Core::SIMD::Storage<4, Phanes::Core::Types::int32, true>::type comp;
-    };
-};
-
-struct Vec4x2
-{
-public:
-    union
-    {
-        struct
-        {
-
-            Vec4 v1;
-            Vec4 v2;
-
-        };
-
-        typename Phanes::Core::SIMD::Storage<8, Phanes::Core::Types::int32, true>::type comp;
-    };
-};