From 676ee8477458943b0b3b8dfbd840387756f77d3f Mon Sep 17 00:00:00 2001
From: scorpioblood <77296181+scorpioblood@users.noreply.github.com>
Date: Fri, 24 May 2024 23:43:26 +0200
Subject: [PATCH] Create SIMD boilerplate code for detection and fallbacks.

---
 .../Source/Runtime/Core/public/Math/MathFwd.h |  44 ++---
 .../Core/public/Math/SIMD/PhanesSIMD.h        | 119 ++++++------
 .../public/Math/SIMD/PhanesVectorMathAVX.hpp  |   6 +
 .../public/Math/SIMD/PhanesVectorMathAVX2.hpp |   3 +
 .../public/Math/SIMD/PhanesVectorMathFPU.hpp  |   1 +
 .../public/Math/SIMD/PhanesVectorMathNeon.hpp |   2 +
 .../public/Math/SIMD/PhanesVectorMathSSE.hpp  |   5 +
 .../Runtime/Core/public/Math/SIMD/Platform.h  | 107 ++++++++++-
 .../Runtime/Core/public/Math/SIMD/Storage.h   | 179 ++++++++++++++++++
 9 files changed, 380 insertions(+), 86 deletions(-)
 create mode 100644 Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX.hpp
 create mode 100644 Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX2.hpp
 create mode 100644 Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathFPU.hpp
 create mode 100644 Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathNeon.hpp
 create mode 100644 Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
 create mode 100644 Engine/Source/Runtime/Core/public/Math/SIMD/Storage.h

diff --git a/Engine/Source/Runtime/Core/public/Math/MathFwd.h b/Engine/Source/Runtime/Core/public/Math/MathFwd.h
index bf9a53f..d22d332 100644
--- a/Engine/Source/Runtime/Core/public/Math/MathFwd.h
+++ b/Engine/Source/Runtime/Core/public/Math/MathFwd.h
@@ -24,28 +24,28 @@ namespace Phanes::Core::Math {
      * Template forward declarations.
      */
 
-    template<RealType T>			struct TColor;
-    template<RealType T>			struct TLinearColor;
-    template<RealType T>			struct TVector2;
-    template<RealType T>			struct TVector3;
-    template<RealType T>			struct TVector4;
-    template<RealType T>			struct TRay;
-    template<RealType T>			struct TLine;
-    template<RealType T>			struct TPlane;
-    template<RealType T>			struct TMatrix2;
-    template<RealType T>			struct TMatrix3;
-    template<RealType T>			struct TMatrix4;
-    template<RealType T>			struct TQuaternion;
-    template<RealType T>			struct TTransform;
-    template<RealType T>			struct TPoint2;
-    template<RealType T>			struct TPoint3;
-    template<RealType T>			struct TPoint4;
-    template<IntType T>				struct TIntVector2;
-    template<IntType T>				struct TIntVector3;
-    template<IntType T>				struct TIntVector4;
-    template<IntType T>				struct TIntPoint2;
-    template<IntType T>				struct TIntPoint3;
-    template<IntType T>				struct TIntPoint4;
+    template<RealType T>			        struct TColor;
+    template<RealType T>			        struct TLinearColor;
+    template<RealType T>			        struct TVector2;
+    template<RealType T>			        struct TVector3;
+    template<RealType T>			        struct TRay;
+    template<RealType T>			        struct TLine;
+    template<RealType T>			        struct TPlane;
+    template<RealType T>			        struct TMatrix2;
+    template<RealType T>			        struct TMatrix3;
+    template<RealType T>			        struct TMatrix4;
+    template<RealType T>			        struct TQuaternion;
+    template<RealType T>			        struct TTransform;
+    template<RealType T>			        struct TPoint2;
+    template<RealType T>			        struct TPoint3;
+    template<RealType T>			        struct TPoint4;
+    template<IntType T>				        struct TIntVector2;
+    template<IntType T>				        struct TIntVector3;
+    template<IntType T>				        struct TIntVector4;
+    template<IntType T>				        struct TIntPoint2;
+    template<IntType T>				        struct TIntPoint3;
+    template<IntType T>				        struct TIntPoint4;
+    template<RealType T, bool IsAligned>	struct TVector4;
 
     /**
      * Specific instantiation of forward declarations.
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMD.h b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMD.h
index 7151fcf..92c7812 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMD.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMD.h
@@ -3,80 +3,91 @@
 // ARM is not supported.
 
 #include "Core/public/Math/SIMD/Platform.h"
+#include "Core/public/Math/MathTypes.h"
 
 
-
-#include <nmmintrin.h> // SSE4.2
-
-#ifdef __AVX__
-#   include <immintrin.h>
+#if P_INTRINSICS == P_INTRINSICS_AVX2
+#   include "PhanesVectorMathAVX2.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_AVX
+#   include "PhanesVectorMathAVX.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_SSE
+#   include "PhanesVectorMathSSE.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_NEON
+#   include "PhanesVectorMathNeon.hpp"
+#elif P_INTRINSICS == P_INTRINSICS_FPU 
+#   include "PhanesVectorMathFPU.hpp"
 #endif
 
 
-namespace Phanes::Core::Math::SIMD
+// Register aliases
+namespace Phanes::Core::Types
 {
 
-    // XMM Register wrapper for 4x1 floats
+#if P_INTRINSICS >= 1
 
-    struct VectorRegister4f
-    {
-    public:
-        __m128 data;
-    };
+    typedef __m128      Vec4f32Reg;
+    typedef __m128d     Vec2f64Reg;
 
-    typedef VectorRegister4f    VectorRegister4f32;
+    typedef __m128i     Vec4i32Reg;
+    typedef __m128i     Vec2i64Reg;
+
+    typedef __m128i     Vec4u32Reg;
+    typedef __m128i     Vec2u64Reg;
+
+#elif P_INTRINSICS != P_INTRINSICS_NEON
+
+    typedef struct alignas(16) Vec4f32Reg { float data[4]; }                        Vec4f32Reg;  // 4 x float
+    typedef struct alignas(16) Vec2f64Reg { double data[2]; }                       Vec2f64Reg;  // 2 x double
+    typedef struct alignas(16) Vec4i32Reg { int data[4]; }                          Vec4i32Reg;  // 4 x int
+    typedef struct alignas(16) Vec2i64Reg { Phanes::Core::Types::int64 data[2]; }   Vec2i64Reg;  // 2 x int64
+    typedef struct alignas(16) Vec4u32Reg { unsigned int data[4]; }                 Vec4u32Reg;  // 4 x unsigned int
+    typedef struct alignas(16) Vec2u64Reg { Phanes::Core::Types::uint64 data[2]; }  Vec2u64Reg;  // 2 x uint64 (two lanes, like Vec2i64Reg)
+
+#endif
 
 
+#if P_INTRINSICS >= 2
 
-    // XMM Register wrapper for 2x1 doubles
-    struct VectorRegister2d
-    {
-    public:
-        __m128d data;
-    };
+    typedef __m256      Vec4x2f32Reg;
+    typedef __m256      Vec8f32Reg;
+    typedef __m256d     Vec2x2f64Reg;
+    typedef __m256d     Vec4f64Reg;
 
-    typedef VectorRegister2d    VectorRegister2f64;
+#elif P_INTRINSICS != P_INTRINSICS_NEON
+
+    typedef struct alignas(32) Vec4x2f32Reg { float data[8]; }  Vec4x2f32Reg;
+    typedef struct alignas(32) Vec8f32Reg   { float data[8]; }  Vec8f32Reg;
+    typedef struct alignas(32) Vec2x2f64Reg { double data[4]; } Vec2x2f64Reg;
+    typedef struct alignas(32) Vec4f64Reg   { double data[4]; } Vec4f64Reg;
+
+#endif
 
 
+#if P_INTRINSICS == 3
 
-    // XMM Register wrapper for 4x1 integers
-    struct VectorRegister4i
-    {
-    public:
-        __m128i data;
-    };
+    typedef __m256i     Vec4x2i32Reg;
+    typedef __m256i     Vec8i32Reg;
+    typedef __m256i     Vec2x2i64Reg;
+    typedef __m256i     Vec4i64Reg;
 
-    typedef VectorRegister4i    VectorRegister4i32;
+    typedef __m256i     Vec4x2u32Reg;
+    typedef __m256i     Vec8u32Reg;
+    typedef __m256i     Vec2x2u64Reg;
+    typedef __m256i     Vec4u64Reg;
 
+#elif P_INTRINSICS != P_INTRINSICS_NEON
 
+    typedef struct alignas(32) Vec4x2i32Reg { int data[8]; }                         Vec4x2i32Reg;
+    typedef struct alignas(32) Vec8i32Reg   { int data[8]; }                         Vec8i32Reg;
+    typedef struct alignas(32) Vec2x2i64Reg { Phanes::Core::Types::int64 data[4]; }  Vec2x2i64Reg;
+    typedef struct alignas(32) Vec4i64Reg   { Phanes::Core::Types::int64 data[4]; }  Vec4i64Reg;
 
-#   ifdef __AVX__
+    typedef struct alignas(32) Vec4x2u32Reg { unsigned int data[8]; }                Vec4x2u32Reg;
+    typedef struct alignas(32) Vec8u32Reg   { unsigned int data[8]; }                Vec8u32Reg;
+    typedef struct alignas(32) Vec2x2u64Reg { Phanes::Core::Types::uint64 data[4]; } Vec2x2u64Reg;
+    typedef struct alignas(32) Vec4u64Reg   { Phanes::Core::Types::uint64 data[4]; } Vec4u64Reg;
 
-    // AVX specific types
+#endif
 
-    // XMM Register wrapper for 4x1 doubles
-    struct VectorRegister4d
-    {
-    public:
-        __m256d data;
-    };
-
-    typedef VectorRegister4d    VectorRegister4f64;
-    
-#   endif
-
-
-#   ifdef __AVX2__
-
-    // AVX2 specific types
-
-    // XMM Register wrapper for 4x1 doubles
-    struct VectorRegister4i64
-    {
-    public:
-        __m256i data;
-    };
-
-
-#   endif
+    // NEON ...
 }
\ No newline at end of file
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX.hpp
new file mode 100644
index 0000000..b9ccf66
--- /dev/null
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX.hpp
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "PhanesVectorMathSSE.hpp" // Include previous 
+
+#include <immintrin.h>
+
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX2.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX2.hpp
new file mode 100644
index 0000000..409deda
--- /dev/null
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathAVX2.hpp
@@ -0,0 +1,3 @@
+#pragma once
+
+#include "PhanesVectorMathAVX.hpp" // Include previous
\ No newline at end of file
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathFPU.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathFPU.hpp
new file mode 100644
index 0000000..6f70f09
--- /dev/null
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathFPU.hpp
@@ -0,0 +1 @@
+#pragma once
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathNeon.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathNeon.hpp
new file mode 100644
index 0000000..5d661d5
--- /dev/null
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathNeon.hpp
@@ -0,0 +1,2 @@
+#pragma once
+#error ARM architecture is not yet supported by PhanesEngine.
\ No newline at end of file
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
new file mode 100644
index 0000000..4249084
--- /dev/null
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include <nmmintrin.h> 
+
+
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
index 50976c4..0b9e65a 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
@@ -2,26 +2,46 @@
 
 #pragma once
 
+// Architecture MACRO
+// Implicitly assumes x86 architecture.
+#ifndef P_ARM_ARCH
+#   define P_x86_ARCH
+#else
+#   ifdef P_x86_ARCH
+#       undef P_x86_ARCH
+#   endif
+#   error ARM architecture not supported.
+#endif
+
 // Set platform MACRO depending on defined build
+
+#define P_PLATFORM_WIN      0
+#define P_PLATFORM_LIN      1
+#define P_PLATFORM_MAC      2
+// #define P_PLATFORM_FBSD  3 -> Is planned for eventual PS5 support
+
 // User defines build platform
 #ifdef P_WIN_BUILD 
-    #define P_PLATFORM 0
+#   define P_PLATFORM P_PLATFORM_WIN
 #elif P_LINUX_BUILD
-    #define P_PLATFORM 1
-#elif P_APPLE_BUILD
-    #define P_PLATFORM 2
-#elif P_PS5_BUILD
-    #define P_PLATFORM 3
+#   define P_PLATFORM P_PLATFORM_LIN
+#   error Linux / Unix system is not yet supported.
+#elif P_MAC_BUILD
+#   define P_PLATFORM P_PLATFORM_MAC
+#   error Mac target system is not yet supported.
+#elif P_PS5_BUILD || P_FBSD_BUILD
+#   define P_PLATFORM P_PLATFORM_FBSD
+#   error FreeBSD is not yet supported.
 #else 
-    #error Your target system is either not supported, or you have yet to define it.
+#   error Your target system is either not supported, or you have yet to define it.
 #endif
 
 // Set compiler depending on defined compiler 
 
 // Compiler macro definition
 
-// ID's defines like [0-9][0-x]
-// First bracket is compiler, second is the version of the compiler.
+// ID's defined like [0-9][0-x]
+// First bracket defines compiler, second defines the version of the compiler.
 
 // Visual C++
 #define P_COMPILER_VC22         001
@@ -122,7 +142,7 @@
 // Clang
 
 #elif (defined(__clang__))
-
+#   error PhanesEngine only supports MSVC -> Visual Studio
 #   if defined(__apple_build_version__)
 #   
 #	    if (__clang_major__ < 6)
@@ -188,6 +208,7 @@
 
 // G++
 #elif defined(__GNUC__) || defined(__MINGW32__)
+#   error PhanesEngine only supports MSVC -> Visual Studio
 #   if __GNUC__ >= 14
 #		define P_COMPILER P_COMPILER_GCC14
 #	elif __GNUC__ >= 13
@@ -225,3 +246,69 @@
 
 #endif
 
+
+
+
+// Vector instruction sets
+
+
+// Also define the implied lower instruction sets, since Visual Studio only defines the macro of the highest enabled one (e.g. only __AVX__, not __SSE4__ ...).
+
+#ifdef P_FORCE_INTRINSICS
+    
+#   undef __AVX2__
+#   undef __AVX__
+#   undef __SSE__
+
+#   ifndef P_INTRINSICS
+#       error P_INTRINSICS must be defined by the user, when P_FORCE_INTRINSICS is used.
+#   endif
+
+#else
+
+#   ifdef __AVX2__ // Only the highest detected set is flagged here; the lower ones are implied further down.
+#       define P_AVX2__
+#   elif defined(__AVX__)
+#       define P_AVX__
+#   elif defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) // MSVC never defines __SSE__
+#       define P_SSE__
+#   endif
+
+
+#endif // !P_FORCE_INTRINSICS
+
+#ifdef P_AVX2__
+#   define P_AVX__
+#endif
+
+#ifdef P_AVX__
+#   define P_SSE__ 
+#endif
+
+
+#define P_INTRINSICS_FPU    0
+#define P_INTRINSICS_SSE    1
+#define P_INTRINSICS_AVX    2
+#define P_INTRINSICS_AVX2   3
+#define P_INTRINSICS_NEON   4
+
+
+// Resolve P_INTRINSICS to exactly one tier, unless the scalar path is forced via P_FORCE_FPU.
+#if !defined(P_FORCE_FPU)
+#   if defined(P_AVX2__)                            // Highest tier wins.
+#       define P_INTRINSICS P_INTRINSICS_AVX2
+#   elif defined(P_AVX__)
+#       define P_INTRINSICS P_INTRINSICS_AVX
+#   elif defined(P_SSE__) || defined(__SSE__)
+#       define P_INTRINSICS P_INTRINSICS_SSE
+#   elif defined(P_ARM_ARCH)
+#       define P_INTRINSICS P_INTRINSICS_NEON
+#   elif !defined(P_FORCE_INTRINSICS)
+#       error No SIMD instruction set detected. Use P_FORCE_FPU to disable SIMD extensions.
+#   endif
+#else // P_FORCE_FPU: no intrinsics may be used, so drop every SIMD feature flag.
+#   define P_INTRINSICS P_INTRINSICS_FPU
+#   undef P_SSE__
+#   undef P_AVX__
+#   undef P_AVX2__
+#endif
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/Storage.h b/Engine/Source/Runtime/Core/public/Math/SIMD/Storage.h
new file mode 100644
index 0000000..fa2b854
--- /dev/null
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/Storage.h
@@ -0,0 +1,179 @@
+// Decides at compile time whether an XMM register or a plain array is used as storage.
+#pragma once
+
+#include "Core/public/Math/SIMD/PhanesSIMD.h"
+
+#include "Core/public/Math/MathTypes.h"
+
+namespace Phanes::Core::SIMD
+{
+    template<size_t L, typename T, bool IsAligned>
+    struct Storage;
+
+    // Unaligned storage: a plain array holding L elements of T.
+    template<size_t L, typename T>
+    struct Storage<L, T, false>
+    {
+        struct type {
+            T data[L];
+        };
+    };
+
+    // Unaligned 3-element storage: the backing array holds four elements.
+    template<typename T>
+    struct Storage<3, T, false>
+    {
+        struct type {
+            T data[4];
+        };
+    };
+
+
+    // 128-bit register storage (SSE4.2 tier). 3-element variants reuse the 4-lane register type.
+
+    template<>
+    struct Storage<4, float, true>
+    {
+        using type = Phanes::Core::Types::Vec4f32Reg;
+    };
+
+    template<>
+    struct Storage<3, float, true>
+    {
+        using type = Phanes::Core::Types::Vec4f32Reg;
+    };
+
+    template<>
+    struct Storage<4, int, true>
+    {
+        using type = Phanes::Core::Types::Vec4i32Reg;
+    };
+
+    template<>
+    struct Storage<3, int, true>
+    {
+        using type = Phanes::Core::Types::Vec4i32Reg;
+    };
+
+    template<>
+    struct Storage<4, unsigned int, true>
+    {
+        using type = Phanes::Core::Types::Vec4u32Reg;
+    };
+
+    template<>
+    struct Storage<3, unsigned int, true>
+    {
+        using type = Phanes::Core::Types::Vec4u32Reg;
+    };
+
+    template<>
+    struct Storage<2, double, true>
+    {
+        using type = Phanes::Core::Types::Vec2f64Reg;
+    };
+
+    template<>
+    struct Storage<2, Phanes::Core::Types::int64, true>
+    {
+        using type = Phanes::Core::Types::Vec2i64Reg;
+    };
+
+    template<>
+    struct Storage<2, Phanes::Core::Types::uint64, true>
+    {
+        using type = Phanes::Core::Types::Vec2u64Reg;
+    };
+
+
+    // 256-bit floating-point register storage (AVX tier). 3 doubles reuse the 4-lane register type.
+    template<>
+    struct Storage<4, double, true>
+    {
+        using type = Phanes::Core::Types::Vec4f64Reg;
+    };
+
+    template<>
+    struct Storage<3, double, true>
+    {
+        using type = Phanes::Core::Types::Vec4f64Reg;
+    };
+
+    template<>
+    struct Storage<8, float, true>
+    {
+        using type = Phanes::Core::Types::Vec4x2f32Reg;
+    };
+
+
+    // 256-bit integer register storage (AVX2 tier). 3-element variants reuse the 4-lane register type.
+    template<>
+    struct Storage<4, Phanes::Core::Types::int64, true>
+    {
+        using type = Phanes::Core::Types::Vec4i64Reg;
+    };
+
+    template<>
+    struct Storage<3, Phanes::Core::Types::int64, true>
+    {
+        using type = Phanes::Core::Types::Vec4i64Reg;
+    };
+
+    template<>
+    struct Storage<4, Phanes::Core::Types::uint64, true>
+    {
+        using type = Phanes::Core::Types::Vec4u64Reg;
+    };
+
+    template<>
+    struct Storage<3, Phanes::Core::Types::uint64, true>
+    {
+        using type = Phanes::Core::Types::Vec4u64Reg;
+    };
+
+    template<>
+    struct Storage<8, int, true>
+    {
+        using type = Phanes::Core::Types::Vec4x2i32Reg;
+    };
+
+    template<>
+    struct Storage<8, unsigned int, true>
+    {
+        using type = Phanes::Core::Types::Vec4x2u32Reg;
+    };
+}
+
+
+// Overlays four named int members with the Storage<4, int32, true> type.
+struct Vec4
+{
+    union
+    {
+        struct
+        {
+            int x;
+            int y;
+            int z;
+            int w;
+        };
+        Phanes::Core::SIMD::Storage<4, Phanes::Core::Types::int32, true>::type comp;
+    };
+};
+
+// Overlays two Vec4 members with the Storage<8, int32, true> type.
+struct Vec4x2
+{
+    union
+    {
+        struct
+        {
+            // Member-wise view: two Vec4 values, v1 declared before v2.
+            Vec4 v1;
+            Vec4 v2;
+        };
+
+        // Whole-value view typed as the 8-element storage.
+        Phanes::Core::SIMD::Storage<8, Phanes::Core::Types::int32, true>::type comp;
+    };
+};
\ No newline at end of file