SIMD boilerplate.

This commit is contained in:
scorpioblood 2024-05-27 22:01:53 +02:00
parent 8b3d61ff5d
commit fb2e6f960d
7 changed files with 453 additions and 50 deletions

View File

@ -0,0 +1,167 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include "Core/public/Math/MathCommon.hpp"
#include "Core/public/Math/MathFwd.h"
namespace Phanes::Core::Math::Detail
{
template<RealType T, bool A>
struct compute_vec4_add {};
template<RealType T, bool A>
struct compute_vec4_sub {};
template<RealType T, bool A>
struct compute_vec4_mul {};
template<RealType T, bool A>
struct compute_vec4_div {};
template<RealType T, bool A>
struct compute_vec4_eq{};
template<RealType T, bool A>
struct compute_vec4_ieq {};
template<RealType T, bool A>
struct compute_vec4_inc {};
template<RealType T, bool A>
struct compute_vec4_dec {};
template<RealType T>
struct compute_vec4_add<T, false>
{
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
{
r.x = v1.x + v2.x;
r.y = v1.y + v2.y;
r.z = v1.z + v2.z;
r.w = v1.w + v2.w;
}
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
{
r.x = v1.x + s;
r.y = v1.y + s;
r.z = v1.z + s;
r.w = v1.w + s;
}
};
template<RealType T>
struct compute_vec4_sub<T, false>
{
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
{
r.x = v1.x - v2.x;
r.y = v1.y - v2.y;
r.z = v1.z - v2.z;
r.w = v1.w - v2.w;
}
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
{
r.x = v1.x - s;
r.y = v1.y - s;
r.z = v1.z - s;
r.w = v1.w - s;
}
};
template<RealType T>
struct compute_vec4_mul<T, false>
{
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
{
r.x = v1.x * v2.x;
r.y = v1.y * v2.y;
r.z = v1.z * v2.z;
r.w = v1.w * v2.w;
}
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
{
r.x = v1.x * s;
r.y = v1.y * s;
r.z = v1.z * s;
r.w = v1.w * s;
}
};
template<RealType T>
struct compute_vec4_div<T, false>
{
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
{
r.x = v1.x / v2.x;
r.y = v1.y / v2.y;
r.z = v1.z / v2.z;
r.w = v1.w / v2.w;
}
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1, T s)
{
s = (T)1.0 / s;
r.x = v1.x * s;
r.y = v1.y * s;
r.z = v1.z * s;
r.w = v1.w * s;
}
};
template<RealType T>
struct compute_vec4_eq<T, false>
{
static constexpr bool map(const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
{
return (Phanes::Core::Math::Abs(v1.x - v2.x) < P_FLT_INAC &&
Phanes::Core::Math::Abs(v1.y - v2.y) < P_FLT_INAC &&
Phanes::Core::Math::Abs(v1.z - v2.z) < P_FLT_INAC &&
Phanes::Core::Math::Abs(v1.w - v2.w) < P_FLT_INAC);
}
};
template<RealType T>
struct compute_vec4_ieq<T, false>
{
static constexpr bool map(const Phanes::Core::Math::TVector4<T, false>& v1, const Phanes::Core::Math::TVector4<T, false>& v2)
{
return (Phanes::Core::Math::Abs(v1.x - v2.x) > P_FLT_INAC ||
Phanes::Core::Math::Abs(v1.y - v2.y) > P_FLT_INAC ||
Phanes::Core::Math::Abs(v1.z - v2.z) > P_FLT_INAC ||
Phanes::Core::Math::Abs(v1.w - v2.w) > P_FLT_INAC);
}
};
template<RealType T>
struct compute_vec4_inc<T, false>
{
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1)
{
r.x = v1.x + 1;
r.y = v1.y + 1;
r.z = v1.z + 1;
r.w = v1.w + 1;
}
};
template<RealType T>
struct compute_vec4_dec<T, false>
{
static constexpr void map(Phanes::Core::Math::TVector4<T, false>& r, const Phanes::Core::Math::TVector4<T, false>& v1)
{
r.x = v1.x - 1;
r.y = v1.y - 1;
r.z = v1.z - 1;
r.w = v1.w - 1;
}
};
}

View File

@ -0,0 +1,227 @@
// This file includes the necessary header for vectorization intrinsics. If no specifics are defined SSE4.2 is used.
//
// ARM is not supported.
#include "Core/public/Math/SIMD/Platform.h"
#include "Core/public/Math/MathTypes.h"
#if P_INTRINSICS == P_INTRINSICS_AVX2
# include <immintrin.h>
#elif P_INTRINSICS == P_INTRINSICS_AVX
# include <immintrin.h>
#elif P_INTRINSICS == P_INTRINSICS_SSE
# include <nmmintrin.h>
#elif P_INTRINSICS == P_INTRINSICS_NEON
# include "neon.h" // <- Not supported
#endif
// use_simd for metaprogramming
namespace Phanes::Core::Math::SIMD
{
/// <summary>
/// This decides, whether simd operations should be used, based on the vector type, it's size, the vector alignment and whether the right extension can be loaded during compiletime.
/// </summary>
/// <typeparam name="T">Type of vector</typeparam>
/// <typeparam name="L">Length of vector</typeparam>
/// <typeparam name="SimdActive">Whether SIMD intrinsics exist, that support the vector type and length.</typeparam>
/// <typeparam name="IsAligned">Whether the vector is aligned for simd usage.</typeparam>
template<typename T, size_t L, bool IsAligned>
struct use_simd
{
bool value = false;
};
// SSE / NEON
template<>
struct use_simd<float, 4, true>
{
bool value = true && (P_SSE__ || P_NEON__);
};
template<>
struct use_simd<float, 3, true>
{
bool value = true && (P_SSE__ || P_NEON__);
};
template<>
struct use_simd<int, 4, true>
{
bool value = true && (P_SSE__ || P_NEON__);
};
template<>
struct use_simd<int, 3, true>
{
bool value = true && (P_SSE__ || P_NEON__);
};
template<>
struct use_simd<unsigned int, 4, true>
{
bool value = true && (P_SSE__ || P_NEON__);
};
template<>
struct use_simd<unsigned int, 3, true>
{
bool value = true && (P_SSE__ || P_NEON__);
};
// SSE
template<>
struct use_simd<double, 2, true>
{
bool value = true && P_SSE__;
};
template<>
struct use_simd<Phanes::Core::Types::int64, 2, true>
{
bool value = true && P_SSE__;
};
template<>
struct use_simd<Phanes::Core::Types::uint64, 2, true>
{
bool value = true && P_SSE__;
};
// AVX
template<>
struct use_simd<double, 4, true>
{
bool value = true && P_AVX__;
};
template<>
struct use_simd<double, 3, true>
{
bool value = true && P_AVX__;
};
template<>
struct use_simd<float, 8, true>
{
bool value = true && P_AVX__;
};
// AVX2
template<>
struct use_simd<Phanes::Core::Types::int64, 4, true>
{
bool value = true && P_AVX2__;
};
template<>
struct use_simd<Phanes::Core::Types::int64, 3, true>
{
bool value = true && P_AVX2__;
};
template<>
struct use_simd<Phanes::Core::Types::uint64, 4, true>
{
bool value = true && P_AVX2__;
};
template<>
struct use_simd<Phanes::Core::Types::uint64, 3, true>
{
bool value = true && P_AVX2__;
};
template<>
struct use_simd<int, 8, true>
{
bool value = true && P_AVX2__;
};
template<>
struct use_simd<unsigned int, 8, true>
{
bool value = true && P_AVX2__;
};
}
// Register aliases
namespace Phanes::Core::Types
{
#if P_INTRINSICS >= 1
typedef __m128 Vec4f32Reg;
typedef __m128d Vec2f64Reg;
typedef __m128i Vec4i32Reg;
typedef __m128i Vec2i64Reg;
typedef __m128i Vec4u32Reg;
typedef __m128i Vec2u64Reg;
#elif P_INTRINSICS != P_INTRINSICS_NEON
typedef struct alignas(16) Vec4f32Reg { float data[4]; } Vec4f32Reg;
typedef struct alignas(16) Vec2f64Reg { double data[2]; } Vec2f64Reg;
typedef struct alignas(16) Vec4i32Reg { int data[4]; } Vec4i32Reg;
typedef struct alignas(16) Vec2i64Reg { Phanes::Core::Types::int64 data[2]; } Vec2i64Reg;
typedef struct alignas(16) Vec4u32Reg { unsigned int data[4]; } Vec4u32Reg;
typedef struct alignas(16) Vec2u64Reg { Phanes::Core::Types::uint64 data[4]; } Vec2u64Reg;
#endif
#if P_INTRINSICS >= 2
typedef __m256 Vec4x2f32Reg;
typedef __m256 Vec8f32Reg;
typedef __m256d Vec2x2f64Reg;
typedef __m256d Vec4f64Reg;
#elif P_INTRINSICS != P_INTRINSICS_NEON
typedef struct alignas(32) Vec4x2f32Reg { float data[8]; } Vec4x2f32Reg;
typedef struct alignas(32) Vec8f32Reg { float data[8]; } Vec8f32Reg;
typedef struct alignas(32) Vec2x2f64Reg { double data[4]; } Vec2x2f64Reg;
typedef struct alignas(32) Vec4f64Reg { double data[4]; } Vec4f64Reg;
#endif
#if P_INTRINSICS == 3
typedef __m256i Vec4x2i32Reg;
typedef __m256i Vec8i32Reg;
typedef __m256i Vec2x2i64Reg;
typedef __m256i Vec4i64Reg;
typedef __m256i Vec4x2u32Reg;
typedef __m256i Vec8u32Reg;
typedef __m256i Vec2x2u64Reg;
typedef __m256i Vec4u64Reg;
#elif P_INTRINSICS != P_INTRINSICS_NEON
typedef struct alignas(32) Vec4x2i32Reg { int data[8]; } Vec4x2i32Reg;
typedef struct alignas(32) Vec8i32Reg { int data[8]; } Vec8i32Reg;
typedef struct alignas(32) Vec2x2i64Reg { Phanes::Core::Types::int64 data[4]; } Vec2x2i64Reg;
typedef struct alignas(32) Vec4i64Reg { Phanes::Core::Types::int64 data[4]; } Vec4i64Reg;
typedef struct alignas(32) Vec4x2u32Reg { unsigned int data[8]; } Vec4x2u32Reg;
typedef struct alignas(32) Vec8u32Reg { unsigned int data[8]; } Vec8u32Reg;
typedef struct alignas(32) Vec2x2u64Reg { Phanes::Core::Types::uint64 data[4]; } Vec2x2u64Reg;
typedef struct alignas(32) Vec4u64Reg { Phanes::Core::Types::uint64 data[4]; } Vec4u64Reg;
#endif
// NEON ...
}

View File

@ -1,5 +1,25 @@
#pragma once
#include "Core/public/Math/Boilerplate.h"
#include <nmmintrin.h>
// -> For IntelliSense
// ============ //
// TVector4 //
// ============ //
namespace Phanes::Core::Math::Detail
{
// Template class has already been defined and is included through: Storage.h -> Vector4.hpp -> SIMDIntrinsics.h -> PhanesVectorMathSEE.hpp
template<>
struct compute_vec4_add<float, true>
{
static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2)
{
r.comp = _mm_add_ps(v1.comp, v2.comp);
}
};
}

View File

@ -254,6 +254,11 @@
// Define also supported instruction sets for Visual Studio, as it only defines the latest (e.g. only __AVX__ not __SSE4__ ...).
#define P_AVX2__ 0
#define P_AVX__ 0
#define P_SSE__ 0
#define P_NEON__ 0
#ifdef P_FORCE_INTRINSICS
# undef __AVX2__
@ -267,22 +272,22 @@
#else
# ifdef __AVX2__
# define P_AVX2__
# define P_AVX2__ 1
# elif defined(__AVX__)
# define P_AVX__
# define P_AVX__ 1
# elif defined(__SSE__)
# define P_SSE__
# define P_SSE__ 1
# endif
#endif // !P_FORCE_INTRINSICS
#ifdef P_AVX2__
# define P_AVX__
# define P_AVX__ 1
#endif
#ifdef P_AVX__
# define P_SSE__
# define P_SSE__ 1
#endif
@ -300,14 +305,15 @@
# undef P_SSE__
# undef P_SSE__
#else
# if defined(P_AVX__) && !defined(P_AVX2__)
# define P_INTRINSICS P_INTRINSICS_AVX
# elif defined(P_AVX2__)
# define P_INTRINSICS P_INTRINSICS_AVX2
# elif (defined(__SSE__) || defined(P_SSE__)) && !defined(P_AVX__)
# define P_INTRINSICS P_INTRINSICS_SSE
# if (P_AVX__ == 1) && (P_AVX2__ == 0)
# define P_INTRINSICS P_INTRINSICS_AVX
# elif P_AVX2__ == 1
# define P_INTRINSICS P_INTRINSICS_AVX2
# elif P_SSE__ == 1
# define P_INTRINSICS P_INTRINSICS_SSE
# elif defined(P_ARM_ARCH)
# define P_INTRINSICS P_INTRINSICS_NEON
# define P_INTRINSICS P_INTRINSICS_NEON
# define P_NEON__ 1
# elif !defined(P_FORCE_INTRINSICS)
# error No SIMD instruction set detected. Use P_FORCE_FPU to disable SIMD extensions.
# endif

View File

@ -0,0 +1,17 @@
#pragma once
#include "Core/public/Math/SIMD/Platform.h"
#if P_INTRINSICS == P_INTRINSICS_AVX2
# include "PhanesVectorMathAVX2.hpp"
#elif P_INTRINSICS == P_INTRINSICS_AVX
# include "PhanesVectorMathAVX.hpp"
#elif P_INTRINSICS == P_INTRINSICS_SSE
# include "PhanesVectorMathSSE.hpp"
#elif P_INTRINSICS == P_INTRINSICS_NEON
# include "PhanesVectorMathNeon.hpp"
#elif P_INTRINSICS == P_INTRINSICS_FPU
# include "PhanesVectorMathFPU.hpp"
#endif

View File

@ -1,13 +1,13 @@
// Defines on compile time, whether a xmm register or an array should be used.
#pragma once
#include "Core/public/Math/SIMD/PhanesSIMD.h"
#include "Core/public/Math/SIMD/PhanesSIMDTypes.h"
#include "Core/public/Math/MathTypes.h"
namespace Phanes::Core::SIMD
{
template<size_t L, typename T, bool IsAligned>
template<size_t L, typename T, bool UseSimd>
struct Storage;
// General unaligned memory storage
@ -143,37 +143,3 @@ namespace Phanes::Core::SIMD
typedef Phanes::Core::Types::Vec4x2u32Reg type;
};
}
struct Vec4
{
public:
union
{
struct
{
int x, y, z, w;
};
typename Phanes::Core::SIMD::Storage<4, Phanes::Core::Types::int32, true>::type comp;
};
};
struct Vec4x2
{
public:
union
{
struct
{
Vec4 v1;
Vec4 v2;
};
typename Phanes::Core::SIMD::Storage<8, Phanes::Core::Types::int32, true>::type comp;
};
};