From d25345aa07ef7ca19b02c545642b868f28be565d Mon Sep 17 00:00:00 2001
From: scorpioblood <77296181+scorpioblood@users.noreply.github.com>
Date: Tue, 28 May 2024 22:01:51 +0200
Subject: [PATCH] More SIMD.

---
 .../Core/public/Math/SIMD/PhanesSIMDTypes.h   |   2 +
 .../public/Math/SIMD/PhanesVectorMathSSE.hpp  |  74 ++++++++++++-
 .../Runtime/Core/public/Math/SIMD/Platform.h  |  24 +++-
 .../Runtime/Core/public/Math/Vector4.hpp      |  12 +-
 .../Runtime/Core/public/Math/Vector4.inl      | 103 +++++++++++++++++-
 5 files changed, 193 insertions(+), 22 deletions(-)

diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h
index b614e0c..6ff9116 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h
@@ -1,3 +1,5 @@
+#pragma once
+
 // This file includes the necessary header for vectorization intrinsics. If no specifics are defined SSE4.2 is used.
 // 
 // ARM is not supported.
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
index 5901cb3..f0815c7 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
@@ -1,10 +1,30 @@
 #pragma once
 
-#include "Core/public/Math/Boilerplate.h"
 #include <nmmintrin.h> 
 
+#include "Core/public/Math/SIMD/PhanesSIMDTypes.h"
+#include "Core/public/Math/Boilerplate.h"
+#include "Core/public/Math/MathCommon.hpp"
+
+#include <iostream>
+
 // -> For IntelliSense
 
+#include "Core/public/Math/Vector4.hpp"
+
+// ========== //
+//   Common   //
+// ========== //
+
+
+// Per-lane absolute value: ANDs v with 0x7FFFFFFF, which clears the sign bit of each 32-bit float lane.
+// Declared inline because a non-inline function defined in a header is multiply defined once two translation units include it.
+inline Phanes::Core::Types::Vec4f32Reg p_vec4_abs(const Phanes::Core::Types::Vec4f32Reg& v)
+{
+    return _mm_and_ps(v, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
+}
+
+
 // ============ //
 //   TVector4   //
 // ============ //
@@ -22,4 +42,54 @@ namespace Phanes::Core::Math::Detail
             r.comp = _mm_add_ps(v1.comp, v2.comp);
         }
     };
-}
\ No newline at end of file
+
+    // SSE specialization of the 4-component subtraction kernel.
+    // Stores v1 - v2 in r.comp, handling all four float lanes with a single _mm_sub_ps.
+    template<>
+    struct compute_vec4_sub<float, true>
+    {
+        static FORCEINLINE void map(TVector4<float, true>& r, const TVector4<float, true>& v1, const TVector4<float, true>& v2)
+        { r.comp = _mm_sub_ps(v1.comp, v2.comp); }
+    };
+
+    // SSE specialization of the 4-component multiplication kernel.
+    // Stores the lane-wise product v1 * v2 in r.comp using a single _mm_mul_ps.
+    template<>
+    struct compute_vec4_mul<float, true>
+    {
+        static FORCEINLINE void map(TVector4<float, true>& r, const TVector4<float, true>& v1, const TVector4<float, true>& v2)
+        { r.comp = _mm_mul_ps(v1.comp, v2.comp); }
+    };
+
+    // SSE specialization of the 4-component division kernel.
+    // Stores the lane-wise quotient v1 / v2 in r.comp using a single _mm_div_ps.
+    template<>
+    struct compute_vec4_div<float, true>
+    {
+        static FORCEINLINE void map(TVector4<float, true>& r, const TVector4<float, true>& v1, const TVector4<float, true>& v2)
+        { r.comp = _mm_div_ps(v1.comp, v2.comp); }
+    };
+
+    // SSE specialization of the 4-component equality test: returns true only when all four lanes compare equal.
+    // _mm_movemask_ps packs the sign bit of each lane's compare mask into bits 0-3, so 0xF means "every lane equal".
+    template<>
+    struct compute_vec4_eq<float, true>
+    {
+        static FORCEINLINE bool map(const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2)
+        {
+            return _mm_movemask_ps(_mm_cmpeq_ps(v1.comp, v2.comp)) == 0xF;
+        }
+    };
+
+    // SSE specialization of the 4-component inequality test: returns true as soon as any lane differs.
+    // _mm_cmpneq_ps sets a lane's mask when its operands are not equal; any set movemask bit means "not equal".
+    template<>
+    struct compute_vec4_ieq<float, true>
+    {
+        static FORCEINLINE bool map(const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2)
+        {
+            return _mm_movemask_ps(_mm_cmpneq_ps(v1.comp, v2.comp)) != 0;
+        }
+    };
+}
+
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
index c15b890..a2b8b61 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h
@@ -254,11 +254,6 @@
 
 // Define also supported instruction sets for Visual Studio, as it only defines the latest (e.g. only __AVX__ not __SSE4__ ...).
 
-#define P_AVX2__    0
-#define P_AVX__     0
-#define P_SSE__     0 
-#define P_NEON__    0
-
 #ifdef P_FORCE_INTRINSICS
     
 #   undef __AVX2__
@@ -272,7 +267,7 @@
 #else
 
 #   ifdef __AVX2__
-#       define P_AVX2__ 1
+#    define P_AVX2__ 1
 #   elif defined(__AVX__)
 #       define P_AVX__ 1
 #   elif defined(__SSE__)
@@ -290,6 +285,23 @@
 #   define P_SSE__ 1
 #endif
 
+// Default every SIMD feature macro that was not enabled above to 0 (disabled).
+#ifndef P_AVX2__
+#   define P_AVX2__ 0
+#endif 
+
+// Deactivate unset SIMD
+#ifndef P_AVX__
+#   define P_AVX__ 0
+#endif 
+
+#ifndef P_SSE__
+#   define P_SSE__ 0
+#endif 
+
+#ifndef P_NEON__
+#   define P_NEON__ 0
+#endif
 
 #define P_INTRINSICS_FPU    0
 #define P_INTRINSICS_SSE    1
diff --git a/Engine/Source/Runtime/Core/public/Math/Vector4.hpp b/Engine/Source/Runtime/Core/public/Math/Vector4.hpp
index 63bcd2f..81f771d 100644
--- a/Engine/Source/Runtime/Core/public/Math/Vector4.hpp
+++ b/Engine/Source/Runtime/Core/public/Math/Vector4.hpp
@@ -90,13 +90,6 @@ namespace Phanes::Core::Math
         /// <param name="comp">Array of at least 4 components</param>
         TVector4(const Real* comp);
 
-        /// <summary>
-        /// Construct the vector, by calculating the way between two points.
-        /// </summary>
-        /// <param name="start">Starting point of the vector.</param>
-        /// <param name="end">End point of the vector.</param>
-        TVector4(const TPoint4<Real>& start, const TPoint4<Real>& end);
-
     };
 
     // ===================== //
@@ -698,8 +691,5 @@ namespace Phanes::Core::Math
     TVector4<T, A> PrespectiveDivideV(TVector4<T, A>& v1);
 }
 
-// No SIMD
-#include "Core/public/Math/Vector4.inl"
 
-// SIMD
-#include "Core/public/Math/SIMD/SIMDIntrinsics.h"
\ No newline at end of file
+#include "Core/public/Math/Vector4.inl"
diff --git a/Engine/Source/Runtime/Core/public/Math/Vector4.inl b/Engine/Source/Runtime/Core/public/Math/Vector4.inl
index 4f2cfcc..f419752 100644
--- a/Engine/Source/Runtime/Core/public/Math/Vector4.inl
+++ b/Engine/Source/Runtime/Core/public/Math/Vector4.inl
@@ -3,8 +3,13 @@
 #include "Core/public/Math/Boilerplate.h"
 
 #include "Core/public/Math/Detail/Vector4Decl.inl"
+#include "Core/public/Math/SIMD/SIMDIntrinsics.h"
+
 #include "Core/public/Math/Vector4.hpp"
 
+
+#include "Core/public/Math/SIMD/PhanesSIMDTypes.h"
+
 #include <stdio.h>
 
 namespace Phanes::Core::Math
@@ -25,6 +30,30 @@ namespace Phanes::Core::Math
         w(_w)
     {}
 
+    // Broadcast constructor.
+    // Initializes x, y, z and w with the same scalar s.
+    template<RealType T, bool A>
+    TVector4<T, A>::TVector4(Real s) :
+        x(s), y(s),
+        z(s), w(s)
+    {}
+
+    // Builds the vector from two 2D vectors.
+    // v1 supplies (x, y) and v2 supplies (z, w).
+    template<RealType T, bool A>
+    TVector4<T, A>::TVector4(const TVector2<Real>& v1, const TVector2<Real>& v2) :
+        x(v1.x), y(v1.y),
+        z(v2.x), w(v2.y)
+    {}
+
+    // Builds the vector from a raw array.
+    // Reads comp[0]..comp[3] into x, y, z and w, so comp must point to at least four elements.
+    template<RealType T, bool A>
+    TVector4<T, A>::TVector4(const Real* comp) :
+        x(comp[0]), y(comp[1]),
+        z(comp[2]), w(comp[3])
+    {}
+
     template<RealType T, bool A>
     TVector4<T, A> operator+=(TVector4<T, A>& v1, const TVector4<T, A>& v2)
     {
@@ -159,19 +188,87 @@ namespace Phanes::Core::Math
         return Detail::compute_vec4_ieq<T, A>::map(v1, v2);
     }
 
+    
+    
+    // Inc- / Decrement
 
 
+    template<RealType T, bool A>
+    TVector4<T, A>& operator++(TVector4<T, A>& v1)  // Prefix increment: adds one to each of the four components of v1 in place.
+    {
+        ++v1.x;
+        ++v1.y;
+        ++v1.z;
+        ++v1.w;
 
-    // SIMD
+        return v1;  // Hands back the same, now incremented, vector by reference.
+    }
 
+    // Prefix decrement: subtracts one from each of the four components of v1 in place.
+    template<RealType T, bool A>
+    TVector4<T, A>& operator--(TVector4<T, A>& v1)
+    {
+        --v1.x; --v1.y;
+        --v1.z; --v1.w;
+
+        // The modified vector itself is returned, not a copy.
+        return v1;
+    }
+
+    template<RealType T, bool A>
+    TVector4<T, A>& operator++(TVector4<T, A>& v1, int)  // Postfix increment; the unnamed int only selects this overload.
+    {
+        return ++v1;  // NOTE(review): forwards to the prefix operator, so callers receive the already-incremented vector, not its previous value - confirm this is intended.
+    }
+
+    template<RealType T, bool A>
+    TVector4<T, A>& operator--(TVector4<T, A>& v1, int)  // Postfix decrement; the unnamed int only selects this overload.
+    {
+        return --v1;  // NOTE(review): forwards to the prefix operator, so callers receive the already-decremented vector, not its previous value - confirm this is intended.
+    }
+
+
+    // SIMD constructor
 
     template<>
-    TVector4<float, true>::TVector4(Real _x, Real _y, Real _z, Real _w) :
+    TVector4<float, true>::TVector4(const TVector4<float, true>& v)  // Copy constructor of the float/SIMD specialization.
+    {
+        this->comp = _mm_load_ps(reinterpret_cast<const float*>(&v));  // Reads four floats from the start of v; _mm_load_ps needs a 16-byte aligned address - presumably the type guarantees it, TODO confirm.
+    }
+
+    template<>
+    TVector4<float, true>::TVector4(float _x, float _y, float _z, float _w) :  // Component-wise constructor of the float/SIMD specialization.
         x(_x),
         y(_y),
         z(_z),
         w(_w)
     {
-        this->comp = _mm_load_ps(reinterpret_cast<float*>(&x));
+        this->comp = _mm_load_ps(reinterpret_cast<float*>(&this->x));  // Loads four floats starting at x into comp; assumes x..w are contiguous and 16-byte aligned - TODO confirm.
+    }
+
+    template<>
+    TVector4<float, true>::TVector4(float s)  // Broadcast constructor of the float/SIMD specialization.
+    {
+        this->comp = _mm_set1_ps(s);  // Replicates s into all four lanes of the register member.
+    }
+
+    // Float/SIMD specialization: builds the vector as (v1.x, v1.y, v2.x, v2.y).
+    template<>
+    TVector4<float, true>::TVector4(const TVector2<float>& v1, const TVector2<float>& v2) :
+        x(v1.x), y(v1.y),
+        z(v2.x), w(v2.y)
+    {
+        // Load the four floats starting at x into comp; assumes x..w are contiguous and 16-byte aligned - TODO confirm.
+        this->comp = _mm_load_ps(reinterpret_cast<float*>(&this->x));
+    }
+
+    // Float/SIMD specialization: builds the vector from comp[0]..comp[3] (parameter comp, not the member).
+    template<>
+    TVector4<float, true>::TVector4(const float* comp) :
+        x(comp[0]), y(comp[1]),
+        z(comp[2]), w(comp[3])
+    {
+        // Load the four floats starting at x into the member comp; assumes x..w are contiguous and 16-byte aligned - TODO confirm.
+        this->comp = _mm_load_ps(reinterpret_cast<float*>(&this->x));
     }
 }
\ No newline at end of file