Add Matrix4 and Matrix4 SIMD.

This commit is contained in:
scorpioblood 2024-06-20 22:46:15 +02:00
parent ed44c3695c
commit 5861d75bdb
10 changed files with 855 additions and 52 deletions

View File

@ -0,0 +1,24 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include "Core/public/Math/MathCommon.hpp"
namespace Phanes::Core::Math::Detail
{
/// Primary template. Intentionally empty: only the specializations provide map().
template<RealType T, bool S>
struct compute_mat3_transpose {};

/// Scalar (non-SIMD) transpose of a 3x3 matrix.
template<RealType T>
struct compute_mat3_transpose<T, false>
{
    /**
     * Writes the transpose of m1 into r.
     *
     * @param r  Receives the transposed matrix.
     * @param m1 Matrix to transpose.
     *
     * @note The transposed temporary is fully constructed before it is assigned,
     *       so r and m1 may refer to the same matrix.
     */
    static constexpr void map(Phanes::Core::Math::TMatrix3<T, false>& r, const TMatrix3<T, false>& m1)
    {
        // Must be a TMatrix3: nine values cannot initialize a 4x4 matrix.
        r = TMatrix3<T, false>(m1(0, 0), m1(1, 0), m1(2, 0),
                               m1(0, 1), m1(1, 1), m1(2, 1),
                               m1(0, 2), m1(1, 2), m1(2, 2));
    }
};
}

View File

@ -0,0 +1,98 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include "Core/public/Math/MathCommon.hpp"
namespace Phanes::Core::Math::Detail
{
// Primary templates of the Matrix4 compute helpers.
// They are intentionally empty; each operation is provided by a specialization
// selected through the second template argument S.

// Determinant of a 4x4 matrix.
template<RealType T, bool S>
struct compute_mat4_det {};
// Inverse of a 4x4 matrix.
template<RealType T, bool S>
struct compute_mat4_inv {};
// Transpose of a 4x4 matrix.
template<RealType T, bool S>
struct compute_mat4_transpose {};
/// Scalar (non-SIMD) determinant of a 4x4 matrix.
template<RealType T>
struct compute_mat4_det<T, false>
{
    /**
     * Computes the determinant of m.
     *
     * The upper three components of every column are treated as 3D vectors
     * (a, b, c, d) and combined with the entries of the last row (x, y, z, w).
     *
     * @param m Matrix whose determinant is requested.
     *
     * @return Determinant of m.
     *
     * @note Assumes the first three components of TVector4<T, false> can be
     *       viewed as a TVector3<T, false> -- TODO confirm layout.
     */
    static constexpr T map(const Phanes::Core::Math::TMatrix4<T, false>& m)
    {
        // Columns are read through the c0..c3 members; the casts keep constness.
        const TVector3<T, false>& a = reinterpret_cast<const TVector3<T, false>&>(m.c0);
        const TVector3<T, false>& b = reinterpret_cast<const TVector3<T, false>&>(m.c1);
        const TVector3<T, false>& c = reinterpret_cast<const TVector3<T, false>&>(m.c2);
        const TVector3<T, false>& d = reinterpret_cast<const TVector3<T, false>&>(m.c3);

        // Last row. Typed as T so that non-float instantiations do not bind to
        // a converted temporary and lose precision.
        const T& x = m(3, 0);
        const T& y = m(3, 1);
        const T& z = m(3, 2);
        const T& w = m(3, 3);

        const TVector3<T, false> s = CrossP(a, b);
        const TVector3<T, false> t = CrossP(c, d);
        const TVector3<T, false> u = a * y - b * x;
        const TVector3<T, false> v = c * w - d * z;

        return DotP(s, v) + DotP(t, u);
    }
};
/// Scalar (non-SIMD) inverse of a 4x4 matrix.
template<RealType T>
struct compute_mat4_inv<T, false>
{
    /**
     * Computes the inverse of m and stores it in r.
     *
     * The upper three components of every column are treated as 3D vectors
     * (a, b, c, d) and combined with the entries of the last row (x, y, z, w).
     *
     * @param r Receives the inverse. Left untouched when m is singular.
     * @param m Matrix to invert.
     *
     * @return True if the inverse exists, false if the determinant is zero.
     *
     * @note r is only written after every read of m has happened, so r and m
     *       may refer to the same matrix.
     * @note Assumes the first three components of TVector4<T, false> can be
     *       viewed as a TVector3<T, false> -- TODO confirm layout.
     */
    static constexpr bool map(Phanes::Core::Math::TMatrix4<T, false>& r, const Phanes::Core::Math::TMatrix4<T, false>& m)
    {
        // Columns are read through the c0..c3 members; the casts keep constness.
        const TVector3<T, false>& a = reinterpret_cast<const TVector3<T, false>&>(m.c0);
        const TVector3<T, false>& b = reinterpret_cast<const TVector3<T, false>&>(m.c1);
        const TVector3<T, false>& c = reinterpret_cast<const TVector3<T, false>&>(m.c2);
        const TVector3<T, false>& d = reinterpret_cast<const TVector3<T, false>&>(m.c3);

        // Last row of the matrix.
        const T& x = m(3, 0);
        const T& y = m(3, 1);
        const T& z = m(3, 2);
        const T& w = m(3, 3);

        TVector3<T, false> s = CrossP(a, b);
        TVector3<T, false> t = CrossP(c, d);
        TVector3<T, false> u = a * y - b * x;
        TVector3<T, false> v = c * w - d * z;

        // Test the determinant itself: its reciprocal is never zero for a finite
        // determinant, so checking the reciprocal cannot detect a singular matrix.
        const T det = DotP(s, v) + DotP(t, u);

        if (det == (T)0.0)
        {
            return false;
        }

        const T _1_det = (T)1.0 / det;

        s *= _1_det;
        t *= _1_det;
        u *= _1_det;
        v *= _1_det;

        // Rows of the inverse. The second and fourth rows subtract their second
        // term; adding it yields a wrong result whenever x or z is non-zero.
        const TVector3<T, false> r0 = CrossP(b, v) + t * y;
        const TVector3<T, false> r1 = CrossP(v, a) - t * x;
        const TVector3<T, false> r2 = CrossP(d, u) + s * w;
        const TVector3<T, false> r3 = CrossP(u, c) - s * z;

        r = TMatrix4<T, false>(r0.x, r0.y, r0.z, -DotP(b, t),
                               r1.x, r1.y, r1.z,  DotP(a, t),
                               r2.x, r2.y, r2.z, -DotP(d, s),
                               r3.x, r3.y, r3.z,  DotP(c, s));

        return true;
    }
};
/// Scalar (non-SIMD) transpose of a 4x4 matrix.
template<RealType T>
struct compute_mat4_transpose<T, false>
{
    /**
     * Writes the transpose of m into r.
     *
     * @param r Receives the transposed matrix.
     * @param m Matrix to transpose.
     *
     * @note The transposed temporary is fully constructed before it is assigned,
     *       so r and m may refer to the same matrix.
     */
    // This specialization has no template parameter S; the matrices are the
    // non-SIMD variant.
    static constexpr void map(Phanes::Core::Math::TMatrix4<T, false>& r, const Phanes::Core::Math::TMatrix4<T, false>& m)
    {
        r = Phanes::Core::Math::TMatrix4<T, false>(m(0, 0), m(1, 0), m(2, 0), m(3, 0),
                                                   m(0, 1), m(1, 1), m(2, 1), m(3, 1),
                                                   m(0, 2), m(1, 2), m(2, 2), m(3, 2),
                                                   m(0, 3), m(1, 3), m(2, 3), m(3, 3));
    }
};
}

View File

@ -38,10 +38,7 @@ namespace Phanes::Core::Math::Detail
{ {
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& v1, const TVector4<T, false>& v2) static constexpr void map(Phanes::Core::Math::TVector4<T, false>& v1, const TVector4<T, false>& v2)
{ {
v1.x = v2.x; memcpy(v1.data, v2.data, 4 * sizeof(T));
v1.y = v2.y;
v1.z = v2.z;
v1.w = v2.w;
} }
@ -72,10 +69,7 @@ namespace Phanes::Core::Math::Detail
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& v1, const T* comp) static constexpr void map(Phanes::Core::Math::TVector4<T, false>& v1, const T* comp)
{ {
v1.x = comp[0]; memcpy(v1.data, comp, 4 * sizeof(T));
v1.y = comp[1];
v1.z = comp[2];
v1.w = comp[3];
} }
}; };

View File

@ -29,7 +29,6 @@ namespace Phanes::Core::Math {
template<RealType T> struct TRay; template<RealType T> struct TRay;
template<RealType T> struct TLine; template<RealType T> struct TLine;
template<RealType T> struct TPlane; template<RealType T> struct TPlane;
template<RealType T> struct TMatrix4;
template<RealType T> struct TQuaternion; template<RealType T> struct TQuaternion;
template<RealType T> struct TTransform; template<RealType T> struct TTransform;
template<RealType T> struct TPoint2; template<RealType T> struct TPoint2;
@ -40,6 +39,7 @@ namespace Phanes::Core::Math {
template<IntType T> struct TIntPoint4; template<IntType T> struct TIntPoint4;
template<RealType T> struct TMatrix2; template<RealType T> struct TMatrix2;
template<RealType T, bool S> struct TMatrix3; template<RealType T, bool S> struct TMatrix3;
template<RealType T, bool S> struct TMatrix4;
template<RealType T, bool S> struct TVector2; template<RealType T, bool S> struct TVector2;
template<RealType T, bool S> struct TVector3; template<RealType T, bool S> struct TVector3;
template<RealType T, bool S> struct TVector4; template<RealType T, bool S> struct TVector4;

View File

@ -313,3 +313,6 @@ namespace Phanes::Core::Math {
#endif // !MATRIX2_H #endif // !MATRIX2_H
#include "Core/public/Math/SIMD/SIMDIntrinsics.h"

View File

@ -111,17 +111,17 @@ namespace Phanes::Core::Math {
FORCEINLINE T& operator() (int n, int m) FORCEINLINE T& operator() (int n, int m)
{ {
return this->m[m][n]; return this->data[m][n];
} }
FORCEINLINE TVector3<T>& operator[] (int m) FORCEINLINE TVector3<T, S>& operator[] (int m)
{ {
return (*reinterpret_cast<TVector3<T>*>(this->m[m])); return (*reinterpret_cast<TVector3<T, S>*>(this->m[m]));
} }
FORCEINLINE const T& operator() (int n, int m) const FORCEINLINE const T& operator() (int n, int m) const
{ {
return this->m[m][n]; return this->data[m][n];
} }
FORCEINLINE const TVector3<T, S>& operator[] (int m) const FORCEINLINE const TVector3<T, S>& operator[] (int m) const
@ -384,25 +384,26 @@ namespace Phanes::Core::Math {
*/ */
template<RealType T, bool S> template<RealType T, bool S>
TMatrix3<T, S> InverseV(TMatrix3<T, S>& m1) bool InverseV(TMatrix3<T, S>& m1)
{ {
const TVector3<T, S>& v0 = m1[0]; TVector3<T, S> r0 = CrossP(m1.c1, m1.c2);
const TVector3<T, S>& v1 = m1[1]; TVector3<T, S> r1 = CrossP(m1.c2, m1.c0);
const TVector3<T, S>& v2 = m1[2]; TVector3<T, S> r2 = CrossP(m1.c0, m1.c1);
TVector3<T, S> r0 = CrossP(v1, v2);
TVector3<T, S> r1 = CrossP(v2, v0);
TVector3<T, S> r2 = CrossP(v0, v1);
T _1_det = (T)1.0 / Determinant(m1); T _1_det = (T)1.0 / Determinant(m1);
if (_1_det == (T)0.0)
{
return false;
}
m1 = TMatrix3<T, S>(r0.x, r0.y, r0.z, m1 = TMatrix3<T, S>(r0.x, r0.y, r0.z,
r1.x, r1.y, r1.z, r1.x, r1.y, r1.z,
r2.x, r2.y, r2.z); r2.x, r2.y, r2.z);
m1 *= _1_det; m1 *= _1_det;
return m1; return true;
} }
/** /**
@ -414,14 +415,7 @@ namespace Phanes::Core::Math {
*/ */
template<RealType T, bool S> template<RealType T, bool S>
TMatrix3<T, S> TransposeV(TMatrix3<T, S>& m1) TMatrix3<T, S> TransposeV(TMatrix3<T, S>& m1);
{
Swap(m1(0, 1), m1(1, 0));
Swap(m1(0, 2), m1(2, 0));
Swap(m1(1, 2), m1(2, 1));
return m1;
}
// =============== // // =============== //
@ -435,25 +429,26 @@ namespace Phanes::Core::Math {
*/ */
template<RealType T, bool S> template<RealType T, bool S>
TMatrix3<T, S> Inverse(TMatrix3<T, S>& m1) bool Inverse(TMatrix3<T, S>& r, const TMatrix3<T, S>& m1)
{ {
const TVector3<T>& v0 = m1[0]; TVector3<T, S> r0 = CrossP(m1.c1, m1.c2);
const TVector3<T>& v1 = m1[1]; TVector3<T, S> r1 = CrossP(m1.c2, m1.c0);
const TVector3<T>& v2 = m1[2]; TVector3<T, S> r2 = CrossP(m1.c0, m1.c1);
TVector3<T> r0 = CrossP(v1, v2);
TVector3<T> r1 = CrossP(v2, v0);
TVector3<T> r2 = CrossP(v0, v1);
T _1_det = (T)1.0 / Determinant(m1); T _1_det = (T)1.0 / Determinant(m1);
TMatrix3<T, S> inverse(r0.x, r0.y, r0.z, if (_1_det == (T)0.0)
{
return false;
}
r = TMatrix3<T, S>(r0.x, r0.y, r0.z,
r1.x, r1.y, r1.z, r1.x, r1.y, r1.z,
r2.x, r2.y, r2.z); r2.x, r2.y, r2.z);
inverse *= _1_det; r *= _1_det;
return inverse; return true;
} }
/** /**
@ -465,12 +460,7 @@ namespace Phanes::Core::Math {
*/ */
template<RealType T, bool S> template<RealType T, bool S>
TMatrix3<T, S> Transpose(const TMatrix3<T, S>& m1) TMatrix3<T, S> Transpose(const TMatrix3<T, S>& m1);
{
return TMatrix3<T, S>(m1(0, 0), m1(1, 0), m1(2, 0),
m1(0, 1), m1(1, 1), m1(2, 1),
m1(0, 2), m1(1, 2), m1(2, 2));
}
/** /**
* Checks if matrix is an identity matrix. * Checks if matrix is an identity matrix.
@ -488,3 +478,5 @@ namespace Phanes::Core::Math {
#endif // !MATRIX3_H #endif // !MATRIX3_H
#include "Core/public/Math/Matrix3.inl"

View File

@ -0,0 +1,27 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include "Core/public/Math/Detail/Matrix3Decl.inl"
#include "Core/public/Math/SIMD/SIMDIntrinsics.h"
#include "Core/public/Math/SIMD/PhanesSIMDTypes.h"
namespace Phanes::Core::Math
{
/**
 * Transposes a 3x3 matrix in place.
 *
 * @param m Matrix to transpose; overwritten with its transpose.
 *
 * @return Copy of the transposed matrix.
 */
template<RealType T, bool S>
TMatrix3<T, S> TransposeV(TMatrix3<T, S>& m)
{
    // The matrix is the output as well as the input, so it has to be taken by
    // non-const reference.
    Detail::compute_mat3_transpose<T, S>::map(m, m);
    return m;
}
/**
 * Returns the transpose of a 3x3 matrix without modifying the input.
 *
 * @param m Matrix to transpose.
 *
 * @return Transposed copy of m.
 */
template<RealType T, bool S>
TMatrix3<T, S> Transpose(const TMatrix3<T, S>& m)
{
    using Impl = Detail::compute_mat3_transpose<T, S>;

    TMatrix3<T, S> transposed;
    Impl::map(transposed, m);

    return transposed;
}
}

View File

@ -0,0 +1,144 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include "Core/public/Math/MathAbstractTypes.h"
#include "Core/public/Math/MathFwd.h"
#include "Core/public/Math/Vector4.hpp"
#ifndef MATRIX4_H
#define MATRIX4_H
namespace Phanes::Core::Math {
// 4x4 Matrix defined in column-major order.
template<RealType T, bool S>
struct TMatrix4
{
public:

    union
    {
        struct
        {
            /// Columns of the matrix.
            TVector4<T, S> c0;
            TVector4<T, S> c1;
            TVector4<T, S> c2;
            TVector4<T, S> c3;
        };

        /// Raw storage, indexed as data[column][row].
        T data[4][4];
    };

public:

    /**
     * Element access.
     *
     * @param n Row index.
     * @param m Column index.
     *
     * @return Reference to the element in row n of column m.
     */
    FORCEINLINE T& operator() (int n, int m)
    {
        return this->data[m][n];
    }

    /**
     * Column access.
     *
     * @param m Column index.
     *
     * @return Reference to column m.
     */
    FORCEINLINE TVector4<T, S>& operator[] (int m)
    {
        // The storage member is called data; this struct has no member named m.
        return (*reinterpret_cast<TVector4<T, S>*>(this->data[m]));
    }

    /**
     * Element access (read-only).
     *
     * @param n Row index.
     * @param m Column index.
     *
     * @return Const reference to the element in row n of column m.
     */
    FORCEINLINE const T& operator() (int n, int m) const
    {
        return this->data[m][n];
    }

    /**
     * Column access (read-only).
     *
     * @param m Column index.
     *
     * @return Const reference to column m.
     */
    FORCEINLINE const TVector4<T, S>& operator[] (int m) const
    {
        // Cast to a pointer-to-const: reinterpret_cast must not drop constness.
        return (*reinterpret_cast<const TVector4<T, S>*>(this->data[m]));
    }
};
// ==================== //
// Matrix4 operator //
// ==================== //
// Only declarations are given here; the definitions are not part of this header section.
// NOTE(review): the compound assignment operators are declared to return the matrix
// by value rather than by reference -- confirm this is intended.

// Compound addition with a scalar / with another matrix.
template<RealType T, bool S>
TMatrix4<T, S> operator+= (TMatrix4<T, S>& a, T s);
template<RealType T, bool S>
TMatrix4<T, S> operator+= (TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// Compound subtraction with a scalar / with another matrix.
template<RealType T, bool S>
TMatrix4<T, S> operator-= (TMatrix4<T, S>& a, T s);
template<RealType T, bool S>
TMatrix4<T, S> operator-= (TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// Compound multiplication with a scalar / with another matrix.
template<RealType T, bool S>
TMatrix4<T, S> operator*= (TMatrix4<T, S>& a, T s);
template<RealType T, bool S>
TMatrix4<T, S> operator*= (TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// Addition with a scalar / with another matrix; operands are left unchanged.
template<RealType T, bool S>
TMatrix4<T, S> operator+ (const TMatrix4<T, S>& a, T s);
template<RealType T, bool S>
TMatrix4<T, S> operator+ (const TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// Subtraction with a scalar / with another matrix; operands are left unchanged.
template<RealType T, bool S>
TMatrix4<T, S> operator- (const TMatrix4<T, S>& a, T s);
template<RealType T, bool S>
TMatrix4<T, S> operator- (const TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// Multiplication with a scalar / with another matrix; operands are left unchanged.
template<RealType T, bool S>
TMatrix4<T, S> operator* (const TMatrix4<T, S>& a, T s);
template<RealType T, bool S>
TMatrix4<T, S> operator* (const TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// Multiplication of a matrix with a 4D vector.
template<RealType T, bool S>
TVector4<T, S> operator* (const TMatrix4<T, S>& a, const TVector4<T, S>& v);
// Equality / inequality comparison of two matrices.
template<RealType T, bool S>
bool operator== (const TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
template<RealType T, bool S>
bool operator!= (const TMatrix4<T, S>& a, const TMatrix4<T, S>& b);
// ================================ //
// Matrix4 function definition //
// ================================ //
// Determinant of the matrix.
template<RealType T, bool S>
T Determinant(const TMatrix4<T, S>& m);
// Inverts the matrix in place; the bool result reports whether the inversion succeeded.
template<RealType T, bool S>
bool InverseV(TMatrix4<T, S>& a);
// Transposes the matrix in place.
template<RealType T, bool S>
TMatrix4<T, S> TransposeV(TMatrix4<T, S>& a);
// =============== //
// WITH RETURN //
// =============== //
// NOTE(review): this declaration has no output parameter and returns bool, so the
// computed inverse cannot reach the caller through this signature -- confirm whether
// a (result, input) pair of parameters was intended.
template<RealType T, bool S>
bool Inverse(TMatrix4<T, S>& a);
// Returns the transposed matrix; the argument is left unchanged.
template<RealType T, bool S>
TMatrix4<T, S> Transpose(const TMatrix4<T, S>& a);
/**
 * Checks if a matrix is an identity matrix.
 *
 * Every element is compared against the matching identity element (one on the
 * diagonal, zero everywhere else) with a tolerance of P_FLT_INAC.
 *
 * @param a Matrix to check.
 *
 * @return True if every element is within the tolerance, false otherwise.
 */
template<RealType T, bool S>
FORCEINLINE bool IsIndentityMatrix(const TMatrix4<T, S>& a)
{
    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            // The diagonal (including the last row) must be one, the rest zero.
            const T expected = (row == col) ? (T)1.0 : (T)0.0;

            // Negated "<" so that a NaN element is rejected as well.
            if (!(abs(a(row, col) - expected) < P_FLT_INAC))
            {
                return false;
            }
        }
    }

    return true;
}
} // Phanes::Core::Math
#endif // !MATRIX4_H
#include "Core/public/Math/Matrix4.inl"

View File

@ -0,0 +1,46 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include "Core/public/Math/Detail/Matrix4Decl.inl"
#include "Core/public/Math/SIMD/SIMDIntrinsics.h"
#include "Core/public/Math/SIMD/PhanesSIMDTypes.h"
namespace Phanes::Core::Math
{
/**
 * Calculates the determinant of a 4x4 matrix.
 *
 * @param m Matrix whose determinant is requested.
 *
 * @return Determinant of m.
 */
template<RealType T, bool S>
T Determinant(const TMatrix4<T, S>& m)
{
    // Dispatch to the implementation selected by S.
    using Impl = Detail::compute_mat4_det<T, S>;

    return Impl::map(m);
}
/**
 * Inverts a 4x4 matrix in place.
 *
 * @param a Matrix to invert; it is passed as both output and input.
 *
 * @return The result reported by the inverse implementation.
 */
template<RealType T, bool S>
bool InverseV(TMatrix4<T, S>& a)
{
    // Dispatch to the implementation selected by S.
    using Impl = Detail::compute_mat4_inv<T, S>;

    const bool inverted = Impl::map(a, a);
    return inverted;
}
/**
 * Transposes a 4x4 matrix in place.
 *
 * @param a Matrix to transpose; overwritten with its transpose.
 *
 * @return Copy of the transposed matrix.
 */
template<RealType T, bool S>
TMatrix4<T, S> TransposeV(TMatrix4<T, S>& a)
{
    // map() returns void, so its result cannot be returned directly.
    Detail::compute_mat4_transpose<T, S>::map(a, a);
    return a;
}
template<RealType T, bool S>
bool Inverse(TMatrix4<T, S>& a)
{
TMatrix4<T, S> r;
return Detail::compute_mat4_inv<T, S>::map(r, a);
return r;
}
/**
 * Returns the transpose of a 4x4 matrix without modifying the input.
 *
 * @param a Matrix to transpose.
 *
 * @return Transposed copy of a.
 */
template<RealType T, bool S>
TMatrix4<T, S> Transpose(const TMatrix4<T, S>& a)
{
    TMatrix4<T, S> r;

    // map() returns void; the transposed matrix is delivered through r.
    Detail::compute_mat4_transpose<T, S>::map(r, a);

    return r;
}
}

View File

@ -15,6 +15,9 @@
#include "Core/public/Math/IntVector3.hpp" #include "Core/public/Math/IntVector3.hpp"
#include "Core/public/Math/IntVector4.hpp" #include "Core/public/Math/IntVector4.hpp"
#include "Core/public/Math/Matrix3.hpp"
#include "Core/public/Math/Matrix4.hpp"
// ========== // // ========== //
// Common // // Common //
@ -739,4 +742,476 @@ namespace Phanes::Core::Math::Detail
r.comp = _mm_srl_epi64(v1.comp, _mm_set1_epi64x(s)); r.comp = _mm_srl_epi64(v1.comp, _mm_set1_epi64x(s));
} }
}; };
// =========== //
// Matrix3 //
// =========== //
/// SSE transpose of a 3x3 float matrix.
template<>
struct compute_mat3_transpose<float, true>
{
    /**
     * Writes the transpose of m1 into r.
     *
     * @param r  Receives the transposed matrix.
     * @param m1 Matrix to transpose.
     *
     * @note All reads of m1 happen before the first write to r.
     * @note The fourth lane of every output column repeats the third lane.
     */
    static FORCEINLINE void map(Phanes::Core::Math::TMatrix3<float, true>& r, const TMatrix3<float, true>& m1)
    {
        // Lower / upper lane pairs of columns 0 and 1.
        const __m128 lo01 = _mm_shuffle_ps(m1.c0.data, m1.c1.data, _MM_SHUFFLE(1, 0, 1, 0));
        const __m128 hi01 = _mm_shuffle_ps(m1.c0.data, m1.c1.data, _MM_SHUFFLE(3, 2, 3, 2));

        // Lower / upper lane pairs of column 2, duplicated.
        const __m128 lo22 = _mm_shuffle_ps(m1.c2.data, m1.c2.data, _MM_SHUFFLE(1, 0, 1, 0));
        const __m128 hi22 = _mm_shuffle_ps(m1.c2.data, m1.c2.data, _MM_SHUFFLE(3, 2, 3, 2));

        // Even lanes give row 0 (and row 2), odd lanes give row 1.
        r.c0.data = _mm_shuffle_ps(lo01, lo22, _MM_SHUFFLE(2, 0, 2, 0));
        r.c1.data = _mm_shuffle_ps(lo01, lo22, _MM_SHUFFLE(3, 1, 3, 1));
        r.c2.data = _mm_shuffle_ps(hi01, hi22, _MM_SHUFFLE(2, 0, 2, 0));
    }
};
// =========== //
// Matrix4 //
// =========== //
/// SSE determinant of a 4x4 float matrix.
template<>
struct compute_mat4_det<float, true>
{
private:

    /**
     * Builds one vector of 2x2 sub-determinants taken from columns 1..3.
     *
     * With cN[i] being component i of column N, the lanes of the result are
     *   { c2[B]*c3[A] - c3[B]*c2[A],
     *     c2[B]*c3[A] - c3[B]*c2[A],
     *     c1[B]*c3[A] - c3[B]*c1[A],
     *     c1[B]*c2[A] - c2[B]*c1[A] }.
     */
    template<int A, int B>
    static FORCEINLINE __m128 sub_factor(const TMatrix4<float, true>& m1)
    {
        const __m128 pairA = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(A, A, A, A));
        const __m128 pairB = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(B, B, B, B));

        const __m128 lhs0 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(B, B, B, B));
        const __m128 lhs1 = _mm_shuffle_ps(pairA, pairA, _MM_SHUFFLE(2, 0, 0, 0));
        const __m128 rhs0 = _mm_shuffle_ps(pairB, pairB, _MM_SHUFFLE(2, 0, 0, 0));
        const __m128 rhs1 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(A, A, A, A));

        return _mm_sub_ps(_mm_mul_ps(lhs0, lhs1), _mm_mul_ps(rhs0, rhs1));
    }

    /// Gathers { c1[K], c0[K], c0[K], c0[K] }.
    template<int K>
    static FORCEINLINE __m128 gather(const TMatrix4<float, true>& m1)
    {
        const __m128 mixed = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(K, K, K, K));
        return _mm_shuffle_ps(mixed, mixed, _MM_SHUFFLE(2, 2, 2, 0));
    }

    /// Evaluates sign * ((v0 * f0 - v1 * f1) + v2 * f2) lane by lane.
    static FORCEINLINE __m128 signed_column(__m128 sign, __m128 v0, __m128 f0, __m128 v1, __m128 f1, __m128 v2, __m128 f2)
    {
        const __m128 difference = _mm_sub_ps(_mm_mul_ps(v0, f0), _mm_mul_ps(v1, f1));
        const __m128 sum        = _mm_add_ps(difference, _mm_mul_ps(v2, f2));

        return _mm_mul_ps(sign, sum);
    }

public:

    // From: GLM: https://github.com/g-truc/glm/blob/master/glm/simd/matrix.h (MIT License)

    /**
     * Computes the determinant of m1.
     *
     * The four signed columns are built as in the matching inverse routine;
     * their first lanes are then dotted with the first column of m1.
     *
     * @param m1 Matrix whose determinant is requested.
     *
     * @return Determinant of m1.
     */
    static FORCEINLINE float map(const TMatrix4<float, true>& m1)
    {
        const __m128 fac0 = sub_factor<3, 2>(m1);
        const __m128 fac1 = sub_factor<3, 1>(m1);
        const __m128 fac2 = sub_factor<2, 1>(m1);
        const __m128 fac3 = sub_factor<3, 0>(m1);
        const __m128 fac4 = sub_factor<2, 0>(m1);
        const __m128 fac5 = sub_factor<1, 0>(m1);

        const __m128 signA = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f);
        const __m128 signB = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f);

        const __m128 vec0 = gather<0>(m1);
        const __m128 vec1 = gather<1>(m1);
        const __m128 vec2 = gather<2>(m1);
        const __m128 vec3 = gather<3>(m1);

        const __m128 inv0 = signed_column(signB, vec1, fac0, vec2, fac1, vec3, fac2);
        const __m128 inv1 = signed_column(signA, vec0, fac0, vec2, fac3, vec3, fac4);
        const __m128 inv2 = signed_column(signB, vec0, fac1, vec1, fac3, vec3, fac5);
        const __m128 inv3 = signed_column(signA, vec0, fac2, vec1, fac4, vec2, fac5);

        // Collect lane 0 of every signed column into a single vector.
        const __m128 first01 = _mm_shuffle_ps(inv0, inv1, _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 first23 = _mm_shuffle_ps(inv2, inv3, _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 firsts  = _mm_shuffle_ps(first01, first23, _MM_SHUFFLE(2, 0, 2, 0));

        const __m128 det = Phanes::Core::Math::SIMD::vec4_dot(m1.c0.data, firsts);

        return _mm_cvtss_f32(det);
    }
};
/// SSE inverse of a 4x4 float matrix.
// Specialized for <float, true>: map() operates on SIMD matrices, so it must
// not take the place of the scalar float implementation.
template<>
struct compute_mat4_inv<float, true>
{
private:

    /**
     * Builds one vector of 2x2 sub-determinants taken from columns 1..3.
     *
     * With cN[i] being component i of column N, the lanes of the result are
     *   { c2[B]*c3[A] - c3[B]*c2[A],
     *     c2[B]*c3[A] - c3[B]*c2[A],
     *     c1[B]*c3[A] - c3[B]*c1[A],
     *     c1[B]*c2[A] - c2[B]*c1[A] }.
     */
    template<int A, int B>
    static FORCEINLINE __m128 sub_factor(const Phanes::Core::Math::TMatrix4<float, true>& m1)
    {
        const __m128 pairA = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(A, A, A, A));
        const __m128 pairB = _mm_shuffle_ps(m1.c3.data, m1.c2.data, _MM_SHUFFLE(B, B, B, B));

        const __m128 lhs0 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(B, B, B, B));
        const __m128 lhs1 = _mm_shuffle_ps(pairA, pairA, _MM_SHUFFLE(2, 0, 0, 0));
        const __m128 rhs0 = _mm_shuffle_ps(pairB, pairB, _MM_SHUFFLE(2, 0, 0, 0));
        const __m128 rhs1 = _mm_shuffle_ps(m1.c2.data, m1.c1.data, _MM_SHUFFLE(A, A, A, A));

        return _mm_sub_ps(_mm_mul_ps(lhs0, lhs1), _mm_mul_ps(rhs0, rhs1));
    }

    /// Gathers { c1[K], c0[K], c0[K], c0[K] }.
    template<int K>
    static FORCEINLINE __m128 gather(const Phanes::Core::Math::TMatrix4<float, true>& m1)
    {
        const __m128 mixed = _mm_shuffle_ps(m1.c1.data, m1.c0.data, _MM_SHUFFLE(K, K, K, K));
        return _mm_shuffle_ps(mixed, mixed, _MM_SHUFFLE(2, 2, 2, 0));
    }

    /// Evaluates sign * ((v0 * f0 - v1 * f1) + v2 * f2) lane by lane.
    static FORCEINLINE __m128 signed_column(__m128 sign, __m128 v0, __m128 f0, __m128 v1, __m128 f1, __m128 v2, __m128 f2)
    {
        const __m128 difference = _mm_sub_ps(_mm_mul_ps(v0, f0), _mm_mul_ps(v1, f1));
        const __m128 sum        = _mm_add_ps(difference, _mm_mul_ps(v2, f2));

        return _mm_mul_ps(sign, sum);
    }

public:

    // From: GLM: https://github.com/g-truc/glm/blob/master/glm/simd/matrix.h (MIT License)

    /**
     * Computes the inverse of m1 and stores it in r.
     *
     * @param r  Receives the inverse. Left untouched when m1 is singular.
     * @param m1 Matrix to invert.
     *
     * @return True if the inverse exists, false if the determinant is zero.
     *
     * @note All reads of m1 happen before the first write to r.
     */
    static FORCEINLINE bool map(Phanes::Core::Math::TMatrix4<float, true>& r, const Phanes::Core::Math::TMatrix4<float, true>& m1)
    {
        const __m128 fac0 = sub_factor<3, 2>(m1);
        const __m128 fac1 = sub_factor<3, 1>(m1);
        const __m128 fac2 = sub_factor<2, 1>(m1);
        const __m128 fac3 = sub_factor<3, 0>(m1);
        const __m128 fac4 = sub_factor<2, 0>(m1);
        const __m128 fac5 = sub_factor<1, 0>(m1);

        const __m128 signA = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f);
        const __m128 signB = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f);

        const __m128 vec0 = gather<0>(m1);
        const __m128 vec1 = gather<1>(m1);
        const __m128 vec2 = gather<2>(m1);
        const __m128 vec3 = gather<3>(m1);

        // Columns of the unscaled inverse.
        const __m128 inv0 = signed_column(signB, vec1, fac0, vec2, fac1, vec3, fac2);
        const __m128 inv1 = signed_column(signA, vec0, fac0, vec2, fac3, vec3, fac4);
        const __m128 inv2 = signed_column(signB, vec0, fac1, vec1, fac3, vec3, fac5);
        const __m128 inv3 = signed_column(signA, vec0, fac2, vec1, fac4, vec2, fac5);

        // Determinant: first column of m1 dotted with lane 0 of every column.
        const __m128 first01 = _mm_shuffle_ps(inv0, inv1, _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 first23 = _mm_shuffle_ps(inv2, inv3, _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 firsts  = _mm_shuffle_ps(first01, first23, _MM_SHUFFLE(2, 0, 2, 0));

        const __m128 det = Phanes::Core::Math::SIMD::vec4_dot(m1.c0.data, firsts);

        // A singular matrix has no inverse; report it instead of dividing by zero.
        if (_mm_cvtss_f32(det) == 0.0f)
        {
            return false;
        }

        // Exact division instead of _mm_rcp_ps, which is only an approximation.
        const __m128 rcp = _mm_div_ps(_mm_set1_ps(1.0f), det);

        r.c0.data = _mm_mul_ps(inv0, rcp);
        r.c1.data = _mm_mul_ps(inv1, rcp);
        r.c2.data = _mm_mul_ps(inv2, rcp);
        r.c3.data = _mm_mul_ps(inv3, rcp);

        return true;
    }
};
} }