From 5861d75bdb384d2f971c2c51980171f36fe86331 Mon Sep 17 00:00:00 2001 From: scorpioblood <77296181+scorpioblood@users.noreply.github.com> Date: Thu, 20 Jun 2024 22:46:15 +0200 Subject: [PATCH] Add Matrix4 and Matrix4 SIMD. --- .../Core/public/Math/Detail/Matrix3Decl.inl | 24 + .../Core/public/Math/Detail/Matrix4Decl.inl | 98 ++++ .../Core/public/Math/Detail/Vector4Decl.inl | 10 +- .../Source/Runtime/Core/public/Math/MathFwd.h | 2 +- .../Runtime/Core/public/Math/Matrix2.hpp | 9 +- .../Runtime/Core/public/Math/Matrix3.hpp | 72 ++- .../Runtime/Core/public/Math/Matrix3.inl | 27 + .../Runtime/Core/public/Math/Matrix4.hpp | 144 ++++++ .../Runtime/Core/public/Math/Matrix4.inl | 46 ++ .../public/Math/SIMD/PhanesVectorMathSSE.hpp | 475 ++++++++++++++++++ 10 files changed, 855 insertions(+), 52 deletions(-) create mode 100644 Engine/Source/Runtime/Core/public/Math/Detail/Matrix3Decl.inl create mode 100644 Engine/Source/Runtime/Core/public/Math/Detail/Matrix4Decl.inl create mode 100644 Engine/Source/Runtime/Core/public/Math/Matrix3.inl create mode 100644 Engine/Source/Runtime/Core/public/Math/Matrix4.hpp create mode 100644 Engine/Source/Runtime/Core/public/Math/Matrix4.inl diff --git a/Engine/Source/Runtime/Core/public/Math/Detail/Matrix3Decl.inl b/Engine/Source/Runtime/Core/public/Math/Detail/Matrix3Decl.inl new file mode 100644 index 0000000..49a453e --- /dev/null +++ b/Engine/Source/Runtime/Core/public/Math/Detail/Matrix3Decl.inl @@ -0,0 +1,24 @@ +#pragma once + +#include "Core/public/Math/Boilerplate.h" +#include "Core/public/Math/MathCommon.hpp" + +namespace Phanes::Core::Math::Detail +{ + template + struct compute_mat3_transpose {}; + + template + struct compute_mat3_transpose + { + static constexpr void map(Phanes::Core::Math::TMatrix3& r, const TMatrix3& m1) + { + r = TMatrix4(m1(0, 0), m1(1, 0), m1(2, 0), + m1(0, 1), m1(1, 1), m1(2, 1), + m1(0, 2), m1(1, 2), m1(2, 2) + ); + } + + + }; +} \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/Detail/Matrix4Decl.inl b/Engine/Source/Runtime/Core/public/Math/Detail/Matrix4Decl.inl new file mode 100644 index 0000000..7fe4274 --- /dev/null +++ b/Engine/Source/Runtime/Core/public/Math/Detail/Matrix4Decl.inl @@ -0,0 +1,98 @@ +#pragma once + +#include "Core/public/Math/Boilerplate.h" +#include "Core/public/Math/MathCommon.hpp" + +namespace Phanes::Core::Math::Detail +{ + template + struct compute_mat4_det {}; + + template + struct compute_mat4_inv {}; + + template + struct compute_mat4_transpose {}; + + + template + struct compute_mat4_det + { + static constexpr T map(Phanes::Core::Math::TMatrix4& m) + { + const TVector3& a = reinterpret_cast&>(m[0]); + const TVector3& b = reinterpret_cast&>(m[1]); + const TVector3& c = reinterpret_cast&>(m[2]); + const TVector3& d = reinterpret_cast&>(m[3]); + + const float& x = m(3, 0); + const float& y = m(3, 1); + const float& z = m(3, 2); + const float& w = m(3, 3); + + TVector3 s = CrossP(a, b); + TVector3 t = CrossP(c, d); + TVector3 u = a * y - b * x; + TVector3 v = c * w - d * z; + return DotP(s, v) + DotP(t, u); + } + }; + + template + struct compute_mat4_inv + { + static constexpr bool map(Phanes::Core::Math::TMatrix4& r, const Phanes::Core::Math::TMatrix4& m) + { + const TVector3& a = reinterpret_cast&>(m[0]); + const TVector3& b = reinterpret_cast&>(m[1]); + const TVector3& c = reinterpret_cast&>(m[2]); + const TVector3& d = reinterpret_cast&>(m[3]); + + const float& x = m(3, 0); + const float& y = m(3, 1); + const float& z = m(3, 2); + const float& w = m(3, 3); + + TVector3 s = CrossP(a, b); + TVector3 t = CrossP(c, d); + TVector3 u = a * y - b * x; + TVector3 v = c * w - d * z; + + float _1_det = (T)1.0 / (DotP(s, v) + DotP(t, u)); + + if (_1_det == 0.0) + { + return false; + } + + s *= _1_det; + t *= _1_det; + u *= _1_det; + v *= _1_det; + + TVector3 r0 = Cross(b, v) + t * y; + TVector3 r1 = Cross(v, a) + t * x; + TVector3 r2 = Cross(d, u) + s * w; + TVector3 r3 = Cross(u, c) + s * z; + + r = TMatrix4(r0.x, r0.y, r0.z, -DotP(b, t), + r1.x, r1.y, r1.z, DotP(a, t), + r2.x, r2.y, r2.z, -DotP(d, s), + r3.x, r3.y, r3.z, DotP(c, s)); + + return true; + } + }; + + template + struct compute_mat4_transpose + { + static constexpr void map(Phanes::Core::Math::TMatrix4& r, const Phanes::Core::Math::TMatrix4& m) + { + r = Phanes::Core::Math::TMatrix4(m(0, 0), m(1, 0), m(2, 0), m(3, 0), + m(0, 1), m(1, 1), m(2, 1), m(3, 1), + m(0, 2), m(1, 2), m(2, 2), m(3, 2), + m(0, 3), m(1, 3), m(2, 3), m(3, 3)); + } + }; +} \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/Detail/Vector4Decl.inl b/Engine/Source/Runtime/Core/public/Math/Detail/Vector4Decl.inl index 560d371..09e8d00 100644 --- a/Engine/Source/Runtime/Core/public/Math/Detail/Vector4Decl.inl +++ b/Engine/Source/Runtime/Core/public/Math/Detail/Vector4Decl.inl @@ -38,10 +38,7 @@ namespace Phanes::Core::Math::Detail { static constexpr void map(Phanes::Core::Math::TVector4& v1, const TVector4& v2) { - v1.x = v2.x; - v1.y = v2.y; - v1.z = v2.z; - v1.w = v2.w; + memcpy(v1.data, v2.data, 4 * sizeof(T)); } @@ -72,10 +69,7 @@ namespace Phanes::Core::Math::Detail static constexpr void map(Phanes::Core::Math::TVector4& v1, const T* comp) { - v1.x = comp[0]; - v1.y = comp[1]; - v1.z = comp[2]; - v1.w = comp[3]; + memcpy(v1.data, comp, 4 * sizeof(T)); } }; diff --git a/Engine/Source/Runtime/Core/public/Math/MathFwd.h b/Engine/Source/Runtime/Core/public/Math/MathFwd.h index 36529b5..6d4c093 100644 --- a/Engine/Source/Runtime/Core/public/Math/MathFwd.h +++ b/Engine/Source/Runtime/Core/public/Math/MathFwd.h @@ -29,7 +29,6 @@ namespace Phanes::Core::Math { template struct TRay; template struct TLine; template struct TPlane; - template struct TMatrix4; template struct TQuaternion; template struct TTransform; template struct TPoint2; @@ -40,6 +39,7 @@ namespace Phanes::Core::Math { template struct TIntPoint4; template struct TMatrix2; template struct TMatrix3; + template struct TMatrix4; template struct TVector2; template struct TVector3; template struct TVector4; diff --git a/Engine/Source/Runtime/Core/public/Math/Matrix2.hpp b/Engine/Source/Runtime/Core/public/Math/Matrix2.hpp index 4f8165d..7a621c0 100644 --- a/Engine/Source/Runtime/Core/public/Math/Matrix2.hpp +++ b/Engine/Source/Runtime/Core/public/Math/Matrix2.hpp @@ -279,7 +279,7 @@ namespace Phanes::Core::Math { template TMatrix2 TransposeV(TMatrix2& m1) { - Swap(m1(0, 1), m1(1, 0)); + Swap(m1(0, 1), m1(1, 0)); } // =============== // @@ -299,7 +299,7 @@ namespace Phanes::Core::Math { TMatrix2 Transpose(const TMatrix2& m1) { return TMatrix2(m1(0, 0), m1(1, 0), - m1(0, 1), m1(1, 1)); + m1(0, 1), m1(1, 1)); } template @@ -312,4 +312,7 @@ namespace Phanes::Core::Math { } // Phanes::Core::Math -#endif // !MATRIX2_H \ No newline at end of file +#endif // !MATRIX2_H + + +#include "Core/public/Math/SIMD/SIMDIntrinsics.h" \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/Matrix3.hpp b/Engine/Source/Runtime/Core/public/Math/Matrix3.hpp index a4d9d17..1ff5178 100644 --- a/Engine/Source/Runtime/Core/public/Math/Matrix3.hpp +++ b/Engine/Source/Runtime/Core/public/Math/Matrix3.hpp @@ -111,17 +111,17 @@ namespace Phanes::Core::Math { FORCEINLINE T& operator() (int n, int m) { - return this->m[m][n]; + return this->data[m][n]; } - FORCEINLINE TVector3& operator[] (int m) + FORCEINLINE TVector3& operator[] (int m) { - return (*reinterpret_cast*>(this->m[m])); + return (*reinterpret_cast*>(this->m[m])); } FORCEINLINE const T& operator() (int n, int m) const { - return this->m[m][n]; + return this->data[m][n]; } FORCEINLINE const TVector3& operator[] (int m) const @@ -249,8 +249,8 @@ namespace Phanes::Core::Math { TMatrix3 operator+ (const TMatrix3& m, T s) { return TMatrix3(m.c0 + s, - m.c1 + s, - m.c2 + s); + m.c1 + s, + m.c2 + s); } /** @@ -384,25 +384,26 @@ namespace Phanes::Core::Math { */ template - TMatrix3 InverseV(TMatrix3& m1) + bool InverseV(TMatrix3& m1) { - const TVector3& v0 = m1[0]; - const TVector3& v1 = m1[1]; - const TVector3& v2 = m1[2]; - - TVector3 r0 = CrossP(v1, v2); - TVector3 r1 = CrossP(v2, v0); - TVector3 r2 = CrossP(v0, v1); + TVector3 r0 = CrossP(m1.c1, m1.c2); + TVector3 r1 = CrossP(m1.c2, m1.c0); + TVector3 r2 = CrossP(m1.c0, m1.c1); T _1_det = (T)1.0 / Determinant(m1); + if (_1_det == (T)0.0) + { + return false; + } + m1 = TMatrix3(r0.x, r0.y, r0.z, r1.x, r1.y, r1.z, r2.x, r2.y, r2.z); m1 *= _1_det; - return m1; + return true; } /** @@ -414,14 +415,7 @@ namespace Phanes::Core::Math { */ template - TMatrix3 TransposeV(TMatrix3& m1) - { - Swap(m1(0, 1), m1(1, 0)); - Swap(m1(0, 2), m1(2, 0)); - Swap(m1(1, 2), m1(2, 1)); - - return m1; - } + TMatrix3 TransposeV(TMatrix3& m1); // =============== // @@ -435,25 +429,26 @@ namespace Phanes::Core::Math { */ template - TMatrix3 Inverse(TMatrix3& m1) + bool Inverse(TMatrix3& r, const TMatrix3& m1) { - const TVector3& v0 = m1[0]; - const TVector3& v1 = m1[1]; - const TVector3& v2 = m1[2]; - - TVector3 r0 = CrossP(v1, v2); - TVector3 r1 = CrossP(v2, v0); - TVector3 r2 = CrossP(v0, v1); + TVector3 r0 = CrossP(m1.c1, m1.c2); + TVector3 r1 = CrossP(m1.c2, m1.c0); + TVector3 r2 = CrossP(m1.c0, m1.c1); T _1_det = (T)1.0 / Determinant(m1); - TMatrix3 inverse(r0.x, r0.y, r0.z, + if (_1_det == (T)0.0) + { + return false; + } + + r = TMatrix3(r0.x, r0.y, r0.z, r1.x, r1.y, r1.z, r2.x, r2.y, r2.z); - inverse *= _1_det; + r *= _1_det; - return inverse; + return true; } /** @@ -465,12 +460,7 @@ namespace Phanes::Core::Math { */ template - TMatrix3 Transpose(const TMatrix3& m1) - { - return TMatrix3(m1(0, 0), m1(1, 0), m1(2, 0), - m1(0, 1), m1(1, 1), m1(2, 1), - m1(0, 2), m1(1, 2), m1(2, 2)); - } + TMatrix3 Transpose(const TMatrix3& m1); /** * Checks if matrix is an identity matrix. @@ -488,3 +478,5 @@ namespace Phanes::Core::Math { #endif // !MATRIX3_H + +#include "Core/public/Math/Matrix3.inl" \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/Matrix3.inl b/Engine/Source/Runtime/Core/public/Math/Matrix3.inl new file mode 100644 index 0000000..3383d12 --- /dev/null +++ b/Engine/Source/Runtime/Core/public/Math/Matrix3.inl @@ -0,0 +1,27 @@ +#pragma once + +#include "Core/public/Math/Boilerplate.h" + +#include "Core/public/Math/Detail/Matrix3Decl.inl" +#include "Core/public/Math/SIMD/SIMDIntrinsics.h" + +#include "Core/public/Math/SIMD/PhanesSIMDTypes.h" + +namespace Phanes::Core::Math +{ + template + TMatrix3 TransposeV(const TMatrix3& m) + { + Detail::compute_mat3_transpose::map(m, m); + return m; + } + + template + TMatrix3 Transpose(const TMatrix3& m) + { + TMatrix3 r; + Detail::compute_mat3_transpose::map(r, m); + return r; + + } +} \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/Matrix4.hpp b/Engine/Source/Runtime/Core/public/Math/Matrix4.hpp new file mode 100644 index 0000000..25c1379 --- /dev/null +++ b/Engine/Source/Runtime/Core/public/Math/Matrix4.hpp @@ -0,0 +1,144 @@ +#pragma once + +#include "Core/public/Math/Boilerplate.h" + +#include "Core/public/Math/MathAbstractTypes.h" +#include "Core/public/Math/MathFwd.h" +#include "Core/public/Math/Vector4.hpp" + +#ifndef MATRIX4_H +#define MATRIX4_H + +namespace Phanes::Core::Math { + + // 4x4 Matrix defined in column-major order. + + template + struct TMatrix4 + { + public: + + union + { + struct + { + TVector4 c0; + TVector4 c1; + TVector4 c2; + TVector4 c3; + }; + + T data[4][4]; + }; + + public: + + FORCEINLINE T& operator() (int n, int m) + { + return this->data[m][n]; + } + FORCEINLINE TVector4& operator[] (int m) + { + return (*reinterpret_cast*>(this->m[m])); + } + + FORCEINLINE const T& operator() (int n, int m) const + { + return this->data[m][n]; + } + FORCEINLINE const TVector4& operator[] (int m) const + { + return (*reinterpret_cast*>(this->m[m])); + } + }; + + // ==================== // + // Matrix4 operator // + // ==================== // + + template + TMatrix4 operator+= (TMatrix4& a, T s); + + template + TMatrix4 operator+= (TMatrix4& a, const TMatrix4& b); + + template + TMatrix4 operator-= (TMatrix4& a, T s); + + template + TMatrix4 operator-= (TMatrix4& a, const TMatrix4& b); + + template + TMatrix4 operator*= (TMatrix4& a, T s); + + template + TMatrix4 operator*= (TMatrix4& a, const TMatrix4& b); + + template + TMatrix4 operator+ (const TMatrix4& a, T s); + + template + TMatrix4 operator+ (const TMatrix4& a, const TMatrix4& b); + + template + TMatrix4 operator- (const TMatrix4& a, T s); + + template + TMatrix4 operator- (const TMatrix4& a, const TMatrix4& b); + + template + TMatrix4 operator* (const TMatrix4& a, T s); + + template + TMatrix4 operator* (const TMatrix4& a, const TMatrix4& b); + + template + TVector4 operator* (const TMatrix4& a, const TVector4& v); + + template + bool operator== (const TMatrix4& a, const TMatrix4& b); + + template + bool operator!= (const TMatrix4& a, const TMatrix4& b); + + + // ================================ // + // Matrix4 function definition // + // ================================ // + + template + T Determinant(const TMatrix4& m); + + template + bool InverseV(TMatrix4& a); + + template + TMatrix4 TransposeV(TMatrix4& a); + + // =============== // + // WITH RETURN // + // =============== // + + + template + bool Inverse(TMatrix4& a); + + template + TMatrix4 Transpose(const TMatrix4& a); + + template + FORCEINLINE bool IsIndentityMatrix(const TMatrix4& a) + { + return (abs(m1(0, 0) - (T)1.0) < P_FLT_INAC && abs(m1(0, 1) - (T)0.0) < P_FLT_INAC && abs(m1(0, 2) - (T)0.0) < P_FLT_INAC && abs(m1(0, 3) - (T)0.0) < P_FLT_INAC && + abs(m1(1, 0) - (T)0.0) < P_FLT_INAC && abs(m1(1, 1) - (T)1.0) < P_FLT_INAC && abs(m1(1, 2) - (T)0.0) < P_FLT_INAC && abs(m1(1, 3) - (T)0.0) < P_FLT_INAC && + abs(m1(2, 0) - (T)0.0) < P_FLT_INAC && abs(m1(2, 1) - (T)0.0) < P_FLT_INAC && abs(m1(2, 2) - (T)1.0) < P_FLT_INAC && abs(m1(2, 3) - (T)0.0) < P_FLT_INAC && + abs(m1(3, 0) - (T)0.0) < P_FLT_INAC && abs(m1(3, 1) - (T)0.0) < P_FLT_INAC && abs(m1(3, 2) - (T)1.0) < P_FLT_INAC && abs(m1(3, 3) - (T)0.0) < P_FLT_INAC); + } + + +} // Phanes::Core::Math + + +#endif // !MATRIX4_H + +#include "Core/public/Math/Matrix4.inl" \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/Matrix4.inl b/Engine/Source/Runtime/Core/public/Math/Matrix4.inl new file mode 100644 index 0000000..4d9d6f0 --- /dev/null +++ b/Engine/Source/Runtime/Core/public/Math/Matrix4.inl @@ -0,0 +1,46 @@ +#pragma once + +#include "Core/public/Math/Boilerplate.h" + +#include "Core/public/Math/Detail/Matrix4Decl.inl" +#include "Core/public/Math/SIMD/SIMDIntrinsics.h" + +#include "Core/public/Math/SIMD/PhanesSIMDTypes.h" + + +namespace Phanes::Core::Math +{ + template + T Determinant(const TMatrix4& m) + { + return Detail::compute_mat4_det::map(m); + } + + template + bool InverseV(TMatrix4& a) + { + return Detail::compute_mat4_inv::map(a, a); + } + + template + TMatrix4 TransposeV(TMatrix4& a) + { + return Detail::compute_mat4_transpose::map(a, a); + } + + template + bool Inverse(TMatrix4& a) + { + TMatrix4 r; + return Detail::compute_mat4_inv::map(r, a); + return r; + } + + template + TMatrix4 Transpose(TMatrix4& a) + { + TMatrix4 r; + return Detail::compute_mat4_transpose::map(r, a); + return r; + } +} \ No newline at end of file diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp index eeb20d6..78dae78 100644 --- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp +++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp @@ -15,6 +15,9 @@ #include "Core/public/Math/IntVector3.hpp" #include "Core/public/Math/IntVector4.hpp" +#include "Core/public/Math/Matrix3.hpp" +#include "Core/public/Math/Matrix4.hpp" + // ========== // // Common // @@ -739,4 +742,476 @@ namespace Phanes::Core::Math::Detail r.comp = _mm_srl_epi64(v1.comp, _mm_set1_epi64x(s)); } }; + + + // =========== // + // Matrix3 // + // =========== // + + template<> + struct compute_mat3_transpose + { + static FORCEINLINE void map(Phanes::Core::Math::TMatrix3& r, const TMatrix3& m1) + { + __m128 tmp0 = _mm_shuffle_ps(m1.c0.data, m1.c1.data, 0x44); + __m128 tmp2 = _mm_shuffle_ps(m1.c0.data, m1.c1.data, 0xEE); + __m128 tmp1 = _mm_shuffle_ps(m1.c2.data, m1.c2.data, 0x44); + __m128 tmp3 = _mm_shuffle_ps(m1.c2.data, m1.c2.data, 0xEE); + + r.c0.data = _mm_shuffle_ps(tmp0, tmp1, 0x88); + r.c1.data = _mm_shuffle_ps(tmp0, tmp1, 0xDD); + r.c2.data = _mm_shuffle_ps(tmp2, tmp3, 0x88); + } + }; + + // =========== // + // Matrix4 // + // =========== // + + template<> + struct compute_mat4_det + { + + // From: GLM: https://github.com/g-truc/glm/blob/master/glm/simd/matrix.h (MIT License) + static FORCEINLINE float map(const TMatrix4& m1) + { + __m128 Fac0; + { + // valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3]; + // valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3]; + // valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3]; + // valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(2, 2, 2, 2)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac0 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac1; + { + // valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3]; + // valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3]; + // valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3]; + // valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac1 = _mm_sub_ps(Mul00, Mul01); + } + + + __m128 Fac2; + { + // valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2]; + // valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2]; + // valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2]; + // valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(2, 2, 2, 2)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac2 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac3; + { + // valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3]; + // valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3]; + // valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3]; + // valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac3 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac4; + { + // valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2]; + // valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2]; + // valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2]; + // valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(2, 2, 2, 2)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac4 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac5; + { + // valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1]; + // valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1]; + // valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1]; + // valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac5 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 SignA = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f); + __m128 SignB = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f); + + // m[1][0] + // m[0][0] + // m[0][0] + // m[0][0] + __m128 Temp0 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0)); + + // m[1][1] + // m[0][1] + // m[0][1] + // m[0][1] + __m128 Temp1 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0)); + + // m[1][2] + // m[0][2] + // m[0][2] + // m[0][2] + __m128 Temp2 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0)); + + // m[1][3] + // m[0][3] + // m[0][3] + // m[0][3] + __m128 Temp3 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0)); + + // col0 + // + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]), + // - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]), + // + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]), + // - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]), + __m128 Mul00 = _mm_mul_ps(Vec1, Fac0); + __m128 Mul01 = _mm_mul_ps(Vec2, Fac1); + __m128 Mul02 = _mm_mul_ps(Vec3, Fac2); + __m128 Sub00 = _mm_sub_ps(Mul00, Mul01); + __m128 Add00 = _mm_add_ps(Sub00, Mul02); + __m128 Inv0 = _mm_mul_ps(SignB, Add00); + + // col1 + // - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]), + // + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]), + // - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]), + // + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]), + __m128 Mul03 = _mm_mul_ps(Vec0, Fac0); + __m128 Mul04 = _mm_mul_ps(Vec2, Fac3); + __m128 Mul05 = _mm_mul_ps(Vec3, Fac4); + __m128 Sub01 = _mm_sub_ps(Mul03, Mul04); + __m128 Add01 = _mm_add_ps(Sub01, Mul05); + __m128 Inv1 = _mm_mul_ps(SignA, Add01); + + // col2 + // + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]), + // - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]), + // + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]), + // - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]), + __m128 Mul06 = _mm_mul_ps(Vec0, Fac1); + __m128 Mul07 = _mm_mul_ps(Vec1, Fac3); + __m128 Mul08 = _mm_mul_ps(Vec3, Fac5); + __m128 Sub02 = _mm_sub_ps(Mul06, Mul07); + __m128 Add02 = _mm_add_ps(Sub02, Mul08); + __m128 Inv2 = _mm_mul_ps(SignB, Add02); + + // col3 + // - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]), + // + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]), + // - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]), + // + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3])); + __m128 Mul09 = _mm_mul_ps(Vec0, Fac2); + __m128 Mul10 = _mm_mul_ps(Vec1, Fac4); + __m128 Mul11 = _mm_mul_ps(Vec2, Fac5); + __m128 Sub03 = _mm_sub_ps(Mul09, Mul10); + __m128 Add03 = _mm_add_ps(Sub03, Mul11); + __m128 Inv3 = _mm_mul_ps(SignA, Add03); + + __m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0)); + + // valType Determinant = m[0][0] * Inverse[0][0] + // + m[0][1] * Inverse[1][0] + // + m[0][2] * Inverse[2][0] + // + m[0][3] * Inverse[3][0]; + __m128 Det0 = Phanes::Core::Math::SIMD::vec4_dot(m1.c0.data, Row2); + return _mm_cvtss_f32(Det0); + } + }; + + + template<> + struct compute_mat4_inv + { + // From: GLM: https://github.com/g-truc/glm/blob/master/glm/simd/matrix.h (MIT License) + static FORCEINLINE bool map(Phanes::Core::Math::TMatrix4& r, const Phanes::Core::Math::TMatrix4& m1) + { + __m128 Fac0; + { + // valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3]; + // valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3]; + // valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3]; + // valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(2, 2, 2, 2)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac0 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac1; + { + // valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3]; + // valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3]; + // valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3]; + // valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac1 = _mm_sub_ps(Mul00, Mul01); + } + + + __m128 Fac2; + { + // valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2]; + // valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2]; + // valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2]; + // valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(2, 2, 2, 2)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac2 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac3; + { + // valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3]; + // valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3]; + // valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3]; + // valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac3 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac4; + { + // valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2]; + // valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2]; + // valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2]; + // valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(2, 2, 2, 2)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac4 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 Fac5; + { + // valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1]; + // valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1]; + // valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1]; + // valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1]; + + __m128 Swp0a = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Swp0b = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 Swp00 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); + __m128 Swp03 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); + __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); + Fac5 = _mm_sub_ps(Mul00, Mul01); + } + + __m128 SignA = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f); + __m128 SignB = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f); + + // m[1][0] + // m[0][0] + // m[0][0] + // m[0][0] + __m128 Temp0 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0)); + + // m[1][1] + // m[0][1] + // m[0][1] + // m[0][1] + __m128 Temp1 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0)); + + // m[1][2] + // m[0][2] + // m[0][2] + // m[0][2] + __m128 Temp2 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0)); + + // m[1][3] + // m[0][3] + // m[0][3] + // m[0][3] + __m128 Temp3 = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0)); + + // col0 + // + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]), + // - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]), + // + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]), + // - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]), + __m128 Mul00 = _mm_mul_ps(Vec1, Fac0); + __m128 Mul01 = _mm_mul_ps(Vec2, Fac1); + __m128 Mul02 = _mm_mul_ps(Vec3, Fac2); + __m128 Sub00 = _mm_sub_ps(Mul00, Mul01); + __m128 Add00 = _mm_add_ps(Sub00, Mul02); + __m128 Inv0 = _mm_mul_ps(SignB, Add00); + + // col1 + // - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]), + // + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]), + // - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]), + // + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]), + __m128 Mul03 = _mm_mul_ps(Vec0, Fac0); + __m128 Mul04 = _mm_mul_ps(Vec2, Fac3); + __m128 Mul05 = _mm_mul_ps(Vec3, Fac4); + __m128 Sub01 = _mm_sub_ps(Mul03, Mul04); + __m128 Add01 = _mm_add_ps(Sub01, Mul05); + __m128 Inv1 = _mm_mul_ps(SignA, Add01); + + // col2 + // + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]), + // - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]), + // + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]), + // - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]), + __m128 Mul06 = _mm_mul_ps(Vec0, Fac1); + __m128 Mul07 = _mm_mul_ps(Vec1, Fac3); + __m128 Mul08 = _mm_mul_ps(Vec3, Fac5); + __m128 Sub02 = _mm_sub_ps(Mul06, Mul07); + __m128 Add02 = _mm_add_ps(Sub02, Mul08); + __m128 Inv2 = _mm_mul_ps(SignB, Add02); + + // col3 + // - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]), + // + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]), + // - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]), + // + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3])); + __m128 Mul09 = _mm_mul_ps(Vec0, Fac2); + __m128 Mul10 = _mm_mul_ps(Vec1, Fac4); + __m128 Mul11 = _mm_mul_ps(Vec2, Fac5); + __m128 Sub03 = _mm_sub_ps(Mul09, Mul10); + __m128 Add03 = _mm_add_ps(Sub03, Mul11); + __m128 Inv3 = _mm_mul_ps(SignA, Add03); + + __m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0)); + + // valType Determinant = m[0][0] * Inverse[0][0] + // + m[0][1] * Inverse[1][0] + // + m[0][2] * Inverse[2][0] + // + m[0][3] * Inverse[3][0]; + __m128 Det0 = Phanes::Core::Math::SIMD::vec4_dot(m1.c0.data, Row2); + __m128 Rcp0 = _mm_div_ps(_mm_set1_ps(1.0f), Det0); + //__m128 Rcp0 = _mm_rcp_ps(Det0); + + // Inverse /= Determinant; + r.c0.data = _mm_mul_ps(Inv0, Rcp0); + r.c1.data = _mm_mul_ps(Inv1, Rcp0); + r.c2.data = _mm_mul_ps(Inv2, Rcp0); + r.c3.data = _mm_mul_ps(Inv3, Rcp0); + } + }; } \ No newline at end of file