From b55f456178d1275cefaf8362030f91161f26a50f Mon Sep 17 00:00:00 2001
From: mancini
Date: Thu, 7 Aug 2025 14:30:28 +0200
Subject: [PATCH] Add XSIMD implementation

---
 CMakeLists.txt                        |   1 +
 benchmarks/CMakeLists.txt             |   5 +
 benchmarks/benchmark_lookup_xsimd.cpp |  13 ++
 include/trigdx/lookup_xsimd.hpp       |  25 ++++
 src/CMakeLists.txt                    |   6 +
 src/lookup_xsimd.cpp                  | 199 ++++++++++++++++++++++++++
 tests/CMakeLists.txt                  |   6 +
 tests/test_lookup_xsimd.cpp           |  19 +++
 8 files changed, 274 insertions(+)
 create mode 100644 benchmarks/benchmark_lookup_xsimd.cpp
 create mode 100644 include/trigdx/lookup_xsimd.hpp
 create mode 100644 src/lookup_xsimd.cpp
 create mode 100644 tests/test_lookup_xsimd.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46c8e61..33d5d36 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 option(USE_MKL "Enable Intel MKL backend" OFF)
 option(USE_GPU "Enable GPU backend" OFF)
+option(USE_XSIMD "Enable XSIMD backend" OFF)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 266c822..64e8ee8 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -16,3 +16,8 @@ if(USE_GPU)
   add_executable(benchmark_gpu benchmark_gpu.cpp)
   target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
 endif()
+
+if(USE_XSIMD)
+  add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
+  target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx)
+endif()
diff --git a/benchmarks/benchmark_lookup_xsimd.cpp b/benchmarks/benchmark_lookup_xsimd.cpp
new file mode 100644
index 0000000..bcbbef9
--- /dev/null
+++ b/benchmarks/benchmark_lookup_xsimd.cpp
@@ -0,0 +1,13 @@
+#include <trigdx/lookup_xsimd.hpp>
+
+#include "benchmark_utils.hpp"
+
+int main() {
+  benchmark_sinf<LookupXSIMDBackend<16384>>();
+  benchmark_cosf<LookupXSIMDBackend<16384>>();
+  benchmark_sincosf<LookupXSIMDBackend<16384>>();
+
+  benchmark_sinf<LookupXSIMDBackend<32768>>();
+  benchmark_cosf<LookupXSIMDBackend<32768>>();
+  benchmark_sincosf<LookupXSIMDBackend<32768>>();
+}
diff --git a/include/trigdx/lookup_xsimd.hpp b/include/trigdx/lookup_xsimd.hpp
new file mode 100644
index 0000000..7b14713
--- /dev/null
+++ b/include/trigdx/lookup_xsimd.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstddef>
+#include <memory>
+
+#include "interface.hpp"
+
+// Backend computing sinf/cosf/sincosf from a sine lookup table with
+// NR_SAMPLES entries, implemented with xsimd. Only the sizes explicitly
+// instantiated in src/lookup_xsimd.cpp (16384 and 32768) can be linked.
+template <std::size_t NR_SAMPLES> class LookupXSIMDBackend : public Backend {
+public:
+  LookupXSIMDBackend();
+  ~LookupXSIMDBackend() override;
+
+  void init(size_t n = 0) override;
+  void compute_sinf(std::size_t n, const float *x, float *s) const override;
+  void compute_cosf(std::size_t n, const float *x, float *c) const override;
+  void compute_sincosf(std::size_t n, const float *x, float *s,
+                       float *c) const override;
+
+private:
+  struct Impl;
+  std::unique_ptr<Impl> impl;
+};
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f26d1d1..3f5d85c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,3 +18,9 @@ if(USE_GPU)
   target_link_libraries(trigdx PRIVATE CUDA::cudart)
   target_link_libraries(trigdx PRIVATE gpu)
 endif()
+
+if(USE_XSIMD)
+  find_package(xsimd REQUIRED)
+  target_sources(trigdx PRIVATE lookup_xsimd.cpp)
+  target_link_libraries(trigdx PRIVATE xsimd)
+endif()
diff --git a/src/lookup_xsimd.cpp b/src/lookup_xsimd.cpp
new file mode 100644
index 0000000..b5858d0
--- /dev/null
+++ b/src/lookup_xsimd.cpp
@@ -0,0 +1,199 @@
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <xsimd/xsimd.hpp>
+
+#include "trigdx/lookup_xsimd.hpp"
+
+// Sine table with NR_SAMPLES entries covering one full period [0, 2*pi).
+// Indices are wrapped with `& MASK`, so NR_SAMPLES must be a power of two.
+template <std::size_t NR_SAMPLES> struct lookup_table {
+  static_assert(NR_SAMPLES >= 4 && (NR_SAMPLES & (NR_SAMPLES - 1)) == 0,
+                "NR_SAMPLES must be a power of two");
+  static constexpr std::size_t MASK = NR_SAMPLES - 1;
+  // Converts an angle in radians to a (fractional) table index.
+  static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
+  lookup_table() : values{} {
+    for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
+      values[i] = sinf(i * (2.0f * float(M_PI) / NR_SAMPLES));
+    }
+  }
+  std::array<float, NR_SAMPLES> values;
+};
+
+// Functor for xsimd::dispatch: c[i] = cos(a[i]) via table lookup, using
+// cos(x) = sin(x + pi/2), i.e. the sine index shifted by a quarter period.
+template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
+  constexpr cosf_dispatcher() : lookup_table_(){};
+  template <class Tag, class Arch>
+  void operator()(Arch, const size_t n, const float *a, float *c, Tag) const {
+    using b_type = xsimd::batch<float, Arch>;
+    using m_type = xsimd::batch<int32_t, Arch>;
+
+    constexpr uint_fast32_t VL = b_type::size;
+    const uint_fast32_t VS = n - n % VL;
+    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
+    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
+    const m_type mask = m_type::broadcast(lookup_table_.MASK);
+
+    const m_type quarter_pi = m_type::broadcast(Q_PI);
+    uint_fast32_t i;
+    for (i = 0; i < VS; i += VL) {
+      const b_type vx = b_type::load(a + i, Tag());
+      const b_type scaled = xsimd::mul(vx, scale);
+      m_type idx = xsimd::to_int(scaled);
+      m_type idx_cos = xsimd::add(idx, quarter_pi);
+      idx_cos = xsimd::bitwise_and(idx_cos, mask);
+      const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
+
+      cosv.store(c + i, Tag());
+    }
+    // Scalar tail: truncate through a signed integer, as xsimd::to_int does
+    // above; converting a negative float to an unsigned type is undefined.
+    for (; i < n; i++) {
+      const auto raw = static_cast<int32_t>(a[i] * lookup_table_.SCALE);
+      std::size_t idx = static_cast<std::size_t>(raw) & lookup_table_.MASK;
+      std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
+
+      c[i] = lookup_table_.values[idx_cos];
+    }
+  }
+  lookup_table<NR_SAMPLES> lookup_table_;
+};
+
+// Functor for xsimd::dispatch: s[i] = sin(a[i]) via table lookup.
+template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
+  constexpr sinf_dispatcher() : lookup_table_(){};
+  template <class Tag, class Arch>
+  void operator()(Arch, const size_t n, const float *a, float *s, Tag) const {
+    using b_type = xsimd::batch<float, Arch>;
+    using m_type = xsimd::batch<int32_t, Arch>;
+
+    constexpr uint_fast32_t VL = b_type::size;
+    const uint_fast32_t VS = n - n % VL;
+    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
+    const m_type mask = m_type::broadcast(lookup_table_.MASK);
+
+    uint_fast32_t i;
+    for (i = 0; i < VS; i += VL) {
+      const b_type vx = b_type::load(a + i, Tag());
+      const b_type scaled = xsimd::mul(vx, scale);
+      m_type idx = xsimd::to_int(scaled);
+      idx = xsimd::bitwise_and(idx, mask);
+      const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
+
+      sinv.store(s + i, Tag());
+    }
+    // Scalar tail: truncate through a signed integer, as xsimd::to_int does
+    // above; converting a negative float to an unsigned type is undefined.
+    for (; i < n; i++) {
+      const auto raw = static_cast<int32_t>(a[i] * lookup_table_.SCALE);
+      std::size_t idx = static_cast<std::size_t>(raw) & lookup_table_.MASK;
+      s[i] = lookup_table_.values[idx];
+    }
+  }
+  lookup_table<NR_SAMPLES> lookup_table_;
+};
+
+// Functor for xsimd::dispatch: s[i] = sin(a[i]) and c[i] = cos(a[i]) from one
+// index computation; the cosine index is the sine index plus a quarter period.
+template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
+  template <class Tag, class Arch>
+  void operator()(Arch, const size_t n, const float *a, float *s, float *c,
+                  Tag) const {
+    using b_type = xsimd::batch<float, Arch>;
+    using m_type = xsimd::batch<int32_t, Arch>;
+
+    constexpr uint_fast32_t VL = b_type::size;
+    const uint_fast32_t VS = n - n % VL;
+    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
+    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
+    const m_type mask = m_type::broadcast(lookup_table_.MASK);
+
+    const m_type quarter_pi = m_type::broadcast(Q_PI);
+    uint_fast32_t i;
+    for (i = 0; i < VS; i += VL) {
+      const b_type vx = b_type::load(a + i, Tag());
+      const b_type scaled = xsimd::mul(vx, scale);
+      m_type idx = xsimd::to_int(scaled);
+      m_type idx_cos = xsimd::add(idx, quarter_pi);
+      idx = xsimd::bitwise_and(idx, mask);
+      idx_cos = xsimd::bitwise_and(idx_cos, mask);
+      const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
+      const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
+
+      sinv.store(s + i, Tag());
+      cosv.store(c + i, Tag());
+    }
+    // Scalar tail: truncate through a signed integer, as xsimd::to_int does
+    // above; converting a negative float to an unsigned type is undefined.
+    for (; i < n; i++) {
+      const auto raw = static_cast<int32_t>(a[i] * lookup_table_.SCALE);
+      std::size_t idx = static_cast<std::size_t>(raw) & lookup_table_.MASK;
+      std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
+      s[i] = lookup_table_.values[idx];
+      c[i] = lookup_table_.values[idx_cos];
+    }
+  }
+  lookup_table<NR_SAMPLES> lookup_table_;
+};
+
+// Owns one functor per operation; each call goes through xsimd::dispatch and
+// uses unaligned loads/stores (xsimd::unaligned_mode).
+template <std::size_t NR_SAMPLES> struct LookupXSIMDBackend<NR_SAMPLES>::Impl {
+  cosf_dispatcher<NR_SAMPLES> cosf_dispatcher_;
+  sinf_dispatcher<NR_SAMPLES> sinf_dispatcher_;
+  sin_cosf_dispatcher<NR_SAMPLES> sin_cosf_dispatcher_;
+
+  void init() {}
+
+  void compute_sincosf(std::size_t n, const float *x, float *s,
+                       float *c) const {
+    xsimd::dispatch(sin_cosf_dispatcher_)(n, x, s, c, xsimd::unaligned_mode());
+  }
+
+  void compute_sinf(std::size_t n, const float *x, float *s) const {
+    xsimd::dispatch(sinf_dispatcher_)(n, x, s, xsimd::unaligned_mode());
+  }
+
+  void compute_cosf(std::size_t n, const float *x, float *c) const {
+    xsimd::dispatch(cosf_dispatcher_)(n, x, c, xsimd::unaligned_mode());
+  }
+};
+
+template <std::size_t NR_SAMPLES>
+LookupXSIMDBackend<NR_SAMPLES>::LookupXSIMDBackend()
+    : impl(std::make_unique<Impl>()) {}
+
+template <std::size_t NR_SAMPLES>
+LookupXSIMDBackend<NR_SAMPLES>::~LookupXSIMDBackend() = default;
+
+template <std::size_t NR_SAMPLES>
+void LookupXSIMDBackend<NR_SAMPLES>::init(size_t) {
+  impl->init();
+}
+
+template <std::size_t NR_SAMPLES>
+void LookupXSIMDBackend<NR_SAMPLES>::compute_sinf(const std::size_t n,
+                                                  const float *x,
+                                                  float *s) const {
+  impl->compute_sinf(n, x, s);
+}
+
+template <std::size_t NR_SAMPLES>
+void LookupXSIMDBackend<NR_SAMPLES>::compute_cosf(const std::size_t n,
+                                                  const float *x,
+                                                  float *c) const {
+  impl->compute_cosf(n, x, c);
+}
+
+template <std::size_t NR_SAMPLES>
+void LookupXSIMDBackend<NR_SAMPLES>::compute_sincosf(const std::size_t n,
+                                                     const float *x, float *s,
+                                                     float *c) const {
+  impl->compute_sincosf(n, x, s, c);
+}
+
+// Explicit instantiations
+template class LookupXSIMDBackend<16384>;
+template class LookupXSIMDBackend<32768>;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9be3731..8978e43 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -32,3 +32,9 @@ if(USE_GPU)
   target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
   add_test(NAME test_gpu COMMAND test_gpu)
 endif()
+
+if(USE_XSIMD)
+  add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
+  target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
+  add_test(NAME test_lookup_xsimd COMMAND test_lookup_xsimd)
+endif()
diff --git a/tests/test_lookup_xsimd.cpp b/tests/test_lookup_xsimd.cpp
new file mode 100644
index 0000000..9192ad2
--- /dev/null
+++ b/tests/test_lookup_xsimd.cpp
@@ -0,0 +1,19 @@
+#include <catch2/catch_test_macros.hpp>
+#include <trigdx/lookup_xsimd.hpp>
+
+#include "test_utils.hpp"
+
+TEST_CASE("sincosf") {
+  test_sincosf<LookupXSIMDBackend<16384>>(1e-2f);
+  test_sincosf<LookupXSIMDBackend<32768>>(1e-2f);
+}
+
+TEST_CASE("sinf") {
+  test_sinf<LookupXSIMDBackend<16384>>(1e-2f);
+  test_sinf<LookupXSIMDBackend<32768>>(1e-2f);
+}
+
+TEST_CASE("cosf") {
+  test_cosf<LookupXSIMDBackend<16384>>(1e-2f);
+  test_cosf<LookupXSIMDBackend<32768>>(1e-2f);
+}