Add XSIMD implementation

2025-08-07 14:30:28 +02:00
parent b7c13be6c0
commit b55f456178
8 changed files with 255 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 # Optional backends. Each one is OFF by default and is switched on at
 # configure time, e.g. -DUSE_XSIMD=ON.
 option(USE_MKL "Enable Intel MKL backend" OFF)
 option(USE_GPU "Enable GPU backend" OFF)
 option(USE_XSIMD "Enable XSIMD backend" OFF)
 # Put the project's include/ directory on the header search path.
 include_directories(${PROJECT_SOURCE_DIR}/include)
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -16,3 +16,8 @@ if(USE_GPU)
  add_executable(benchmark_gpu benchmark_gpu.cpp)
  target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
 endif()
 # Build the XSIMD lookup-table benchmark only when that backend is enabled.
 if(USE_XSIMD)
  add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
  target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx)
 endif()
--- a/benchmarks/benchmark_lookup_xsimd.cpp
+++ b/benchmarks/benchmark_lookup_xsimd.cpp
@@ -0,0 +1,13 @@
 #include <trigdx/lookup_xsimd.hpp>
 #include "benchmark_utils.hpp"
 int main() {
  benchmark_sinf<LookupXSIMDBackend<16384>>();
  benchmark_cosf<LookupXSIMDBackend<16384>>();
  benchmark_sincosf<LookupXSIMDBackend<16384>>();
  benchmark_sinf<LookupXSIMDBackend<32768>>();
  benchmark_cosf<LookupXSIMDBackend<32768>>();
  benchmark_sincosf<LookupXSIMDBackend<32768>>();
 }
--- a/include/trigdx/lookup_xsimd.hpp
+++ b/include/trigdx/lookup_xsimd.hpp
@@ -0,0 +1,22 @@
 #pragma once
 #include <cstddef>
 #include <memory>
 #include "interface.hpp"
 /// Backend that evaluates sine and cosine through a lookup table, with the
 /// implementation hidden behind a pimpl so this header does not pull in the
 /// SIMD library headers.
 ///
 /// The member functions are defined in lookup_xsimd.cpp, which explicitly
 /// instantiates the template for NR_SAMPLES = 16384 and NR_SAMPLES = 32768;
 /// other sizes will compile against this header but fail to link.
 ///
 /// @tparam NR_SAMPLES Number of entries in the lookup table.
 template <std::size_t NR_SAMPLES> class LookupXSIMDBackend : public Backend {
 public:
   /// Creates the backend and its implementation object.
   LookupXSIMDBackend();

   /// Declared here and defined out of line because Impl is incomplete in this
   /// header.
   ~LookupXSIMDBackend() override;

   /// Backend initialisation hook.
   /// @param n Size hint required by the Backend interface.
   void init(std::size_t n = 0) override;

   /// Writes an approximation of sin(x[i]) to s[i] for i in [0, n).
   void compute_sinf(std::size_t n, const float *x, float *s) const override;

   /// Writes an approximation of cos(x[i]) to c[i] for i in [0, n).
   void compute_cosf(std::size_t n, const float *x, float *c) const override;

   /// Writes approximations of sin(x[i]) to s[i] and cos(x[i]) to c[i] for i in
   /// [0, n).
   void compute_sincosf(std::size_t n, const float *x, float *s,
                        float *c) const override;

 private:
   struct Impl;                 // defined in lookup_xsimd.cpp
   std::unique_ptr<Impl> impl;  // owning pointer to the hidden implementation
 };
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,3 +18,9 @@ if(USE_GPU)
  target_link_libraries(trigdx PRIVATE CUDA::cudart)
  target_link_libraries(trigdx PRIVATE gpu)
 endif()
 # XSIMD backend: requires an installed xsimd package; adds the backend source
 # to the trigdx library and links xsimd privately.
 if(USE_XSIMD)
  find_package(xsimd REQUIRED)
  target_sources(trigdx PRIVATE lookup_xsimd.cpp)
  target_link_libraries(trigdx PRIVATE xsimd)
 endif()
--- a/src/lookup_xsimd.cpp
+++ b/src/lookup_xsimd.cpp
@@ -0,0 +1,183 @@
 #include <algorithm>
 #include <array>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <vector>
 #include <xsimd/xsimd.hpp>
 #include "trigdx/lookup_xsimd.hpp"
 /// Sine table sampled uniformly over one period: values[i] holds
 /// sin(i * 2*pi / NR_SAMPLES).
 ///
 /// @tparam NR_SAMPLES Number of table entries. Must be a non-zero power of
 ///         two, because users wrap indices with `index & MASK`.
 template <std::size_t NR_SAMPLES> struct lookup_table {
   static_assert(NR_SAMPLES != 0 && (NR_SAMPLES & (NR_SAMPLES - 1)) == 0,
                 "NR_SAMPLES must be a power of two: indices wrap via MASK");

   /// 2*pi as a float literal; avoids depending on the non-standard M_PI macro.
   static constexpr float TWO_PI = 6.283185307179586476925f;
   /// Bit mask that reduces any index modulo NR_SAMPLES.
   static constexpr std::size_t MASK = NR_SAMPLES - 1;
   /// Multiplying an angle in radians by SCALE yields a (fractional) table index.
   static constexpr float SCALE = NR_SAMPLES / TWO_PI;

   /// Fills the table with one period of the sine function.
   lookup_table() : values{} {
     constexpr float step = TWO_PI / NR_SAMPLES;  // angle between two entries
     for (std::size_t i = 0; i < NR_SAMPLES; ++i) {
       values[i] = std::sin(static_cast<float>(i) * step);
     }
   }

   std::array<float, NR_SAMPLES> values;
 };
 /// Functor for xsimd::dispatch that computes a table-lookup cosine.
 ///
 /// Each input angle is multiplied by lookup_table::SCALE, truncated to an
 /// integer index, shifted by a quarter of the table (cos(x) = sin(x + pi/2))
 /// and wrapped with lookup_table::MASK before reading the sine table.
 ///
 /// The scaled angle must fit in a 32-bit signed integer; larger magnitudes are
 /// not supported by the float-to-int conversion.
 template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
   /// @param n   Number of elements in @p a and @p c.
   /// @param a   Input angles in radians.
   /// @param c   Output buffer receiving the cosine approximations.
   /// @tparam Tag  xsimd load/store mode (aligned or unaligned).
   /// @tparam Arch xsimd architecture chosen by the dispatcher.
   template <class Tag, class Arch>
   void operator()(Arch, const std::size_t n, const float *a, float *c,
                   Tag) const {
     using b_type = xsimd::batch<float, Arch>;
     using m_type = xsimd::batch<std::int32_t, Arch>;

     // std::size_t counters: a narrower type could truncate n and make the
     // tail loop below never terminate for very large inputs.
     constexpr std::size_t VL = b_type::size;
     const std::size_t VS = n - n % VL;  // elements handled by full batches
     constexpr std::size_t QUARTER_PERIOD = NR_SAMPLES / 4U;  // pi/2 in entries

     const b_type scale = b_type::broadcast(lookup_table_.SCALE);
     const m_type mask =
         m_type::broadcast(static_cast<std::int32_t>(lookup_table_.MASK));
     const m_type quarter_period =
         m_type::broadcast(static_cast<std::int32_t>(QUARTER_PERIOD));

     std::size_t i = 0;
     for (; i < VS; i += VL) {
       const b_type vx = b_type::load(a + i, Tag());
       const b_type scaled = xsimd::mul(vx, scale);
       m_type idx_cos = xsimd::add(xsimd::to_int(scaled), quarter_period);
       idx_cos = xsimd::bitwise_and(idx_cos, mask);
       const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
       cosv.store(c + i, Tag());
     }

     // Scalar tail. Convert through int32 exactly like the vector path:
     // converting a negative float directly to an unsigned type is undefined
     // behaviour, whereas int32 -> uint32 wraps and the mask then yields the
     // same index as the SIMD code.
     for (; i < n; ++i) {
       const std::int32_t raw =
           static_cast<std::int32_t>(a[i] * lookup_table_.SCALE);
       const std::size_t idx_cos =
           (static_cast<std::uint32_t>(raw) + QUARTER_PERIOD) &
           lookup_table_.MASK;
       c[i] = lookup_table_.values[idx_cos];
     }
   }

   lookup_table<NR_SAMPLES> lookup_table_;
 };
 /// Functor for xsimd::dispatch that computes a table-lookup sine.
 ///
 /// Each input angle is multiplied by lookup_table::SCALE, truncated to an
 /// integer index and wrapped with lookup_table::MASK before reading the sine
 /// table.
 ///
 /// The scaled angle must fit in a 32-bit signed integer; larger magnitudes are
 /// not supported by the float-to-int conversion.
 template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
   /// @param n   Number of elements in @p a and @p s.
   /// @param a   Input angles in radians.
   /// @param s   Output buffer receiving the sine approximations.
   /// @tparam Tag  xsimd load/store mode (aligned or unaligned).
   /// @tparam Arch xsimd architecture chosen by the dispatcher.
   template <class Tag, class Arch>
   void operator()(Arch, const std::size_t n, const float *a, float *s,
                   Tag) const {
     using b_type = xsimd::batch<float, Arch>;
     using m_type = xsimd::batch<std::int32_t, Arch>;

     // std::size_t counters: a narrower type could truncate n and make the
     // tail loop below never terminate for very large inputs.
     constexpr std::size_t VL = b_type::size;
     const std::size_t VS = n - n % VL;  // elements handled by full batches

     const b_type scale = b_type::broadcast(lookup_table_.SCALE);
     const m_type mask =
         m_type::broadcast(static_cast<std::int32_t>(lookup_table_.MASK));

     std::size_t i = 0;
     for (; i < VS; i += VL) {
       const b_type vx = b_type::load(a + i, Tag());
       const b_type scaled = xsimd::mul(vx, scale);
       const m_type idx = xsimd::bitwise_and(xsimd::to_int(scaled), mask);
       const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
       sinv.store(s + i, Tag());
     }

     // Scalar tail. Convert through int32 exactly like the vector path:
     // converting a negative float directly to an unsigned type is undefined
     // behaviour, whereas int32 -> uint32 wraps and the mask then yields the
     // same index as the SIMD code.
     for (; i < n; ++i) {
       const std::int32_t raw =
           static_cast<std::int32_t>(a[i] * lookup_table_.SCALE);
       const std::size_t idx =
           static_cast<std::uint32_t>(raw) & lookup_table_.MASK;
       s[i] = lookup_table_.values[idx];
     }
   }

   lookup_table<NR_SAMPLES> lookup_table_;
 };
 /// Functor for xsimd::dispatch that computes table-lookup sine and cosine in a
 /// single pass.
 ///
 /// Each input angle is multiplied by lookup_table::SCALE and truncated to an
 /// integer index. The sine uses that index; the cosine uses the index shifted
 /// by a quarter of the table (cos(x) = sin(x + pi/2)). Both are wrapped with
 /// lookup_table::MASK before reading the sine table.
 ///
 /// The scaled angle must fit in a 32-bit signed integer; larger magnitudes are
 /// not supported by the float-to-int conversion.
 template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
   /// @param n   Number of elements in @p a, @p s and @p c.
   /// @param a   Input angles in radians.
   /// @param s   Output buffer receiving the sine approximations.
   /// @param c   Output buffer receiving the cosine approximations.
   /// @tparam Tag  xsimd load/store mode (aligned or unaligned).
   /// @tparam Arch xsimd architecture chosen by the dispatcher.
   template <class Tag, class Arch>
   void operator()(Arch, const std::size_t n, const float *a, float *s,
                   float *c, Tag) const {
     using b_type = xsimd::batch<float, Arch>;
     using m_type = xsimd::batch<std::int32_t, Arch>;

     // std::size_t counters: a narrower type could truncate n and make the
     // tail loop below never terminate for very large inputs.
     constexpr std::size_t VL = b_type::size;
     const std::size_t VS = n - n % VL;  // elements handled by full batches
     constexpr std::size_t QUARTER_PERIOD = NR_SAMPLES / 4U;  // pi/2 in entries

     const b_type scale = b_type::broadcast(lookup_table_.SCALE);
     const m_type mask =
         m_type::broadcast(static_cast<std::int32_t>(lookup_table_.MASK));
     const m_type quarter_period =
         m_type::broadcast(static_cast<std::int32_t>(QUARTER_PERIOD));

     std::size_t i = 0;
     for (; i < VS; i += VL) {
       const b_type vx = b_type::load(a + i, Tag());
       const b_type scaled = xsimd::mul(vx, scale);
       m_type idx = xsimd::to_int(scaled);
       m_type idx_cos = xsimd::add(idx, quarter_period);
       idx = xsimd::bitwise_and(idx, mask);
       idx_cos = xsimd::bitwise_and(idx_cos, mask);
       const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
       const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
       sinv.store(s + i, Tag());
       cosv.store(c + i, Tag());
     }

     // Scalar tail. Convert through int32 exactly like the vector path:
     // converting a negative float directly to an unsigned type is undefined
     // behaviour, whereas int32 -> uint32 wraps and the mask then yields the
     // same indices as the SIMD code.
     for (; i < n; ++i) {
       const std::int32_t raw =
           static_cast<std::int32_t>(a[i] * lookup_table_.SCALE);
       const std::size_t wrapped = static_cast<std::uint32_t>(raw);
       const std::size_t idx = wrapped & lookup_table_.MASK;
       const std::size_t idx_cos =
           (wrapped + QUARTER_PERIOD) & lookup_table_.MASK;
       s[i] = lookup_table_.values[idx];
       c[i] = lookup_table_.values[idx_cos];
     }
   }

   lookup_table<NR_SAMPLES> lookup_table_;
 };
 /// Hidden implementation of LookupXSIMDBackend: owns one dispatcher functor
 /// per operation and hands each call to xsimd::dispatch, always using
 /// unaligned loads and stores.
 template <std::size_t NR_SAMPLES> struct LookupXSIMDBackend<NR_SAMPLES>::Impl {
   /// Memory access mode passed to every kernel.
   using access_mode = xsimd::unaligned_mode;

   cosf_dispatcher<NR_SAMPLES> cosf_dispatcher_;
   sinf_dispatcher<NR_SAMPLES> sinf_dispatcher_;
   sin_cosf_dispatcher<NR_SAMPLES> sin_cosf_dispatcher_;

   /// Nothing to set up.
   void init() {}

   /// Sine of n angles from x into s.
   void compute_sinf(std::size_t n, const float *x, float *s) const {
     xsimd::dispatch(sinf_dispatcher_)(n, x, s, access_mode{});
   }

   /// Cosine of n angles from x into c.
   void compute_cosf(std::size_t n, const float *x, float *c) const {
     xsimd::dispatch(cosf_dispatcher_)(n, x, c, access_mode{});
   }

   /// Sine and cosine of n angles from x into s and c.
   void compute_sincosf(std::size_t n, const float *x, float *s,
                        float *c) const {
     xsimd::dispatch(sin_cosf_dispatcher_)(n, x, s, c, access_mode{});
   }
 };
 // Constructs the backend by allocating its Impl object.
 template <std::size_t NR_SAMPLES>
 LookupXSIMDBackend<NR_SAMPLES>::LookupXSIMDBackend()
    : impl(std::make_unique<Impl>()) {}
 // Defaulted in this file, after Impl has been defined, so that
 // std::unique_ptr<Impl> sees a complete type when destroying it.
 template <std::size_t NR_SAMPLES>
 LookupXSIMDBackend<NR_SAMPLES>::~LookupXSIMDBackend() = default;
 // Forwards to Impl::init(); the size argument is ignored.
 template <std::size_t NR_SAMPLES>
 void LookupXSIMDBackend<NR_SAMPLES>::init(size_t) {
  impl->init();
 }
 // Forwards to Impl::compute_sinf: n input values from x, results into s.
 template <std::size_t NR_SAMPLES>
 void LookupXSIMDBackend<NR_SAMPLES>::compute_sinf(const std::size_t n,
                                                  const float *x,
                                                  float *s) const {
  impl->compute_sinf(n, x, s);
 }
 // Forwards to Impl::compute_cosf: n input values from x, results into c.
 template <std::size_t NR_SAMPLES>
 void LookupXSIMDBackend<NR_SAMPLES>::compute_cosf(const std::size_t n,
                                                  const float *x,
                                                  float *c) const {
  impl->compute_cosf(n, x, c);
 }
 // Forwards to Impl::compute_sincosf: n input values from x, sine results into
 // s and cosine results into c.
 template <std::size_t NR_SAMPLES>
 void LookupXSIMDBackend<NR_SAMPLES>::compute_sincosf(const std::size_t n,
                                                     const float *x, float *s,
                                                     float *c) const {
  impl->compute_sincosf(n, x, s, c);
 }
 // Explicit instantiations. The member definitions above live in this source
 // file rather than in the header, so only the table sizes listed here are
 // available to code that links against the library.
 template class LookupXSIMDBackend<16384>;
 template class LookupXSIMDBackend<32768>;
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -32,3 +32,9 @@ if(USE_GPU)
  target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
  add_test(NAME test_gpu COMMAND test_gpu)
 endif()
 # Build and register the XSIMD lookup-table test only when that backend is
 # enabled.
 if(USE_XSIMD)
  add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
  target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
  add_test(NAME test_lookup_xsimd COMMAND test_lookup_xsimd)
 endif()
--- a/tests/test_lookup_xsimd.cpp
+++ b/tests/test_lookup_xsimd.cpp
@@ -0,0 +1,19 @@
 #include <catch2/catch_test_macros.hpp>
 #include <trigdx/lookup_xsimd.hpp>
 #include "test_utils.hpp"
 // Runs the shared sincosf check against both instantiated table sizes.
 TEST_CASE("sincosf") {
   using Table16K = LookupXSIMDBackend<16384>;
   using Table32K = LookupXSIMDBackend<32768>;
   test_sincosf<Table16K>(1e-2f);
   test_sincosf<Table32K>(1e-2f);
 }
 // Runs the shared sinf check against both instantiated table sizes.
 TEST_CASE("sinf") {
   using Table16K = LookupXSIMDBackend<16384>;
   using Table32K = LookupXSIMDBackend<32768>;
   test_sinf<Table16K>(1e-2f);
   test_sinf<Table32K>(1e-2f);
 }
 // Runs the shared cosf check against both instantiated table sizes.
 TEST_CASE("cosf") {
   using Table16K = LookupXSIMDBackend<16384>;
   using Table32K = LookupXSIMDBackend<32768>;
   test_cosf<Table16K>(1e-2f);
   test_cosf<Table32K>(1e-2f);
 }