Add build subdirectory

Use v4
Switch to upload/download-artifacts that retain permissions
2025-08-21 15:06:10 +02:00 · 2025-08-21 14:58:57 +02:00 · 2025-08-21 14:54:46 +02:00 · 2025-08-21 09:32:25 +02:00 · 2025-08-21 09:18:29 +02:00
13 changed files with 76 additions and 216 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,4 +8,3 @@ repos:
    hooks:
      - id: cmake-format
      - id: cmake-lint
        args: [--disabled-codes=C0301]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,11 +12,6 @@ option(TRIGDX_BUILD_TESTS "Build tests" ON)
 option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
 option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
 # Add compiler flags
 set(CMAKE_CXX_FLAGS
    "${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
 )
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 configure_file(
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in
--- a/README.md
+++ b/README.md
@@ -1,54 +0,0 @@
 # TrigDx
 High‑performance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signal‑processing, and real‑time systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
 ## Why TrigDx?
 Many applications use the standard library implementations, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
 - Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable.
 - Use SIMD/vectorized implementations and compact lookup tables for high-throughput lookups.
 - Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
 ## Requirements
 - A C++ compiler with at least C++17 support (GCC, Clang)
 - CMake 3.15+
 - Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
 - Optional: GoogleTest (for unit tests) and Google Benchmark (for microbenchmarks)
 ## Building
 ```bash
 git clone https://github.com/astron-rd/TrigDx.git
 cd TrigDx
 mkdir build && cd build
 # CPU-only:
 cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
 cmake --build . -j
 # Enable CUDA (if available):
 cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
 cmake --build . -j
 # Run tests:
 ctest --output-on-failure -j
 ```
 Common CMake options:
 - `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
 - `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
 - `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
 - `TRIGDX_BUILD_PYTHON=ON/OFF` — build Python interface.
 ## Contributing
 - Fork → create a feature branch → open a PR.
 - Include unit tests for correctness‑sensitive changes and benchmark results for performance changes.
 - Follow project style (clang‑format) and run tests locally before submitting.
 ## Reporting issues
 When opening an issue for incorrect results or performance regressions, please include:
 - Platform and CPU/GPU model.
 - Compiler and version with exact compile flags.
 - Small reproducer (input data and the TrigDx implementation used).
 ## License
 See the LICENSE file in the repository for licensing details.
--- a/benchmarks/benchmark_utils.hpp
+++ b/benchmarks/benchmark_utils.hpp
@@ -2,14 +2,13 @@
 #include <chrono>
 #include <cmath>
 #include <stdexcept>
 #include <string>
 #include <vector>
 #include <benchmark/benchmark.h>
-void init_x(float *x, size_t n) {
+void init_x(std::vector<float> &x) {
-  for (size_t i = 0; i < n; ++i) {
+  for (size_t i = 0; i < x.size(); ++i) {
    x[i] = (i % 360) * 0.0174533f; // degrees to radians
  }
 }
@@ -17,31 +16,24 @@ void init_x(float *x, size_t n) {
 template <typename Backend>
 static void benchmark_sinf(benchmark::State &state) {
  const size_t N = static_cast<size_t>(state.range(0));
  std::vector<float> x(N), s(N);
  init_x(x);
  Backend backend;
  auto start = std::chrono::high_resolution_clock::now();
  backend.init(N);
  float *x =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *s =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
          .count() /
      1.e3;
  init_x(x, N);
  for (auto _ : state) {
-    backend.compute_sinf(N, x, s);
+    backend.compute_sinf(N, x.data(), s.data());
    benchmark::DoNotOptimize(s);
  }
  backend.free_memory(x);
  backend.free_memory(s);
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(N));
 }
@@ -49,35 +41,24 @@ static void benchmark_sinf(benchmark::State &state) {
 template <typename Backend>
 static void benchmark_cosf(benchmark::State &state) {
  const size_t N = static_cast<size_t>(state.range(0));
  std::vector<float> x(N), c(N);
  init_x(x);
  Backend backend;
  auto start = std::chrono::high_resolution_clock::now();
  backend.init(N);
  float *x =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *c =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  if (!x || !c) {
    throw std::runtime_error("Buffer allocation failed");
  }
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
          .count() /
      1.e3;
  init_x(x, N);
  for (auto _ : state) {
-    backend.compute_cosf(N, x, c);
+    backend.compute_cosf(N, x.data(), c.data());
    benchmark::DoNotOptimize(c);
  }
  backend.free_memory(x);
  backend.free_memory(c);
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(N));
 }
@@ -85,38 +66,25 @@ static void benchmark_cosf(benchmark::State &state) {
 template <typename Backend>
 static void benchmark_sincosf(benchmark::State &state) {
  const size_t N = static_cast<size_t>(state.range(0));
  std::vector<float> x(N), s(N), c(N);
  init_x(x);
  Backend backend;
  auto start = std::chrono::high_resolution_clock::now();
  backend.init(N);
  float *x =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *s =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *c =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  if (!x || !s || !c) {
    throw std::runtime_error("Buffer allocation failed");
  }
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
          .count() /
      1.e3;
  init_x(x, N);
  for (auto _ : state) {
-    backend.compute_sincosf(N, x, s, c);
+    backend.compute_sincosf(N, x.data(), s.data(), c.data());
    benchmark::DoNotOptimize(s);
    benchmark::DoNotOptimize(c);
  }
  backend.free_memory(x);
  backend.free_memory(s);
  backend.free_memory(c);
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(N));
 }
--- a/include/trigdx/gpu.hpp
+++ b/include/trigdx/gpu.hpp
@@ -11,8 +11,7 @@ public:
  GPUBackend();
  ~GPUBackend() override;
-  void *allocate_memory(size_t bytes) const override;
+  void init(size_t n = 0) override;
  void free_memory(void *ptr) const override;
  void compute_sinf(size_t n, const float *x, float *s) const override;
  void compute_cosf(size_t n, const float *x, float *c) const override;
  void compute_sincosf(size_t n, const float *x, float *s,
--- a/include/trigdx/interface.hpp
+++ b/include/trigdx/interface.hpp
@@ -1,8 +1,6 @@
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 // Base interface for all math backends
 class Backend {
@@ -12,12 +10,6 @@ public:
  // Optional initialization
  virtual void init(size_t n = 0) {}
  virtual void *allocate_memory(size_t bytes) const {
    return static_cast<void *>(new uint8_t[bytes]);
  };
  virtual void free_memory(void *ptr) const { std::free(ptr); };
  // Compute sine for n elements
  virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,6 +1,4 @@
-find_package(pybind11 CONFIG QUIET)
+if(NOT TARGET pybind11)
 if(NOT pybind11_FOUND)
  FetchContent_Declare(
    pybind11
    GIT_REPOSITORY https://github.com/pybind/pybind11.git
@@ -8,16 +6,5 @@ if(NOT pybind11_FOUND)
  FetchContent_MakeAvailable(pybind11)
 endif()
 # Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
 find_package(Python REQUIRED)
 pybind11_add_module(pytrigdx bindings.cpp)
 target_link_libraries(pytrigdx PRIVATE trigdx)
 set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
 set(PYTHON_SITE_PACKAGES
    "${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
 )
 install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
 install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})
--- a/python/init.py
+++ b/python/init.py
@@ -1,16 +0,0 @@
 from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
 try:
    from .trigdx import MKL
 except ImportError:
    pass
 try:
    from .trigdx import GPU
 except ImportError:
    pass
 try:
    from .trigdx import LookupXSIMD16K, LookupXSIMD32K
 except ImportError:
    pass
--- a/python/bindings.cpp
+++ b/python/bindings.cpp
@@ -72,9 +72,7 @@ void bind_backend(py::module &m, const char *name) {
      .def("compute_sincosf", &compute_sincos<float>);
 }
-PYBIND11_MODULE(trigdx, m) {
+PYBIND11_MODULE(pytrigdx, m) {
  m.doc() = "TrigDx python bindings";
  py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
      .def("init", &Backend::init);
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,24 +2,6 @@ include(FetchContent)
 include(FindAVX)
 add_library(trigdx reference.cpp lookup.cpp)
 if(HAVE_AVX2)
  target_compile_definitions(trigdx PUBLIC HAVE_AVX2)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
                                               "IntelLLVM")
    target_compile_options(trigdx PUBLIC -xCORE-AVX2)
  else()
    target_compile_options(trigdx PUBLIC -mavx2)
  endif()
 elseif(HAVE_AVX)
  target_compile_definitions(trigdx PUBLIC HAVE_AVX)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
                                               "IntelLLVM")
    target_compile_options(trigdx PUBLIC -xAVX)
  else()
    target_compile_options(trigdx PUBLIC -mavx)
  endif()
 endif()
 target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
 if(HAVE_AVX)
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -10,63 +10,79 @@
 struct GPUBackend::Impl {
-  void *allocate_memory(size_t bytes) const {
+  ~Impl() {
-    void *ptr;
+    if (h_x) {
-    cudaMallocHost(&ptr, bytes);
+      cudaFreeHost(h_x);
-    return ptr;
+    }
    if (h_s) {
      cudaFreeHost(h_s);
    }
    if (h_c) {
      cudaFreeHost(h_c);
    }
    if (d_x) {
      cudaFree(d_x);
    }
    if (d_s) {
      cudaFree(d_s);
    }
    if (d_c) {
      cudaFree(d_c);
    }
  }
-  void free_memory(void *ptr) const { cudaFreeHost(ptr); }
+  void init(size_t n) {
    const size_t bytes = n * sizeof(float);
    cudaMallocHost(&h_x, bytes);
    cudaMallocHost(&h_s, bytes);
    cudaMallocHost(&h_c, bytes);
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_s, bytes);
    cudaMalloc(&d_c, bytes);
  }
  void compute_sinf(size_t n, const float *x, float *s) const {
    const size_t bytes = n * sizeof(float);
-    float *d_x, *d_s;
+    std::memcpy(h_x, x, bytes);
-    cudaMalloc(&d_x, bytes);
+    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    cudaMalloc(&d_s, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    launch_sinf_kernel(d_x, d_s, n);
-    cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
-    cudaFree(d_x);
+    std::memcpy(s, h_s, bytes);
    cudaFree(d_s);
  }
  void compute_cosf(size_t n, const float *x, float *c) const {
    const size_t bytes = n * sizeof(float);
-    float *d_x, *d_c;
+    std::memcpy(h_x, x, bytes);
-    cudaMalloc(&d_x, bytes);
+    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    cudaMalloc(&d_c, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    launch_cosf_kernel(d_x, d_c, n);
-    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
-    cudaFree(d_x);
+    std::memcpy(c, h_c, bytes);
    cudaFree(d_c);
  }
  void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
    const size_t bytes = n * sizeof(float);
-    float *d_x, *d_s, *d_c;
+    std::memcpy(h_x, x, bytes);
-    cudaMalloc(&d_x, bytes);
+    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    cudaMalloc(&d_s, bytes);
    cudaMalloc(&d_c, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    launch_sincosf_kernel(d_x, d_s, d_c, n);
-    cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
-    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
-    cudaFree(d_x);
+    std::memcpy(s, h_s, bytes);
-    cudaFree(d_s);
+    std::memcpy(c, h_c, bytes);
    cudaFree(d_c);
  }
  float *h_x = nullptr;
  float *h_s = nullptr;
  float *h_c = nullptr;
  float *d_x = nullptr;
  float *d_s = nullptr;
  float *d_c = nullptr;
 };
 GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
 GPUBackend::~GPUBackend() = default;
-void *GPUBackend::allocate_memory(size_t bytes) const {
+void GPUBackend::init(size_t n) { impl->init(n); }
  return impl->allocate_memory(bytes);
 }
 void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
 void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
  impl->compute_sinf(n, x, s);
--- a/src/lookup_avx.cpp
+++ b/src/lookup_avx.cpp
@@ -6,16 +6,6 @@
 #include "trigdx/lookup_avx.hpp"
 #if defined(HAVE_AVX) && !defined(__AVX__)
 static_assert(HAVE_AVX == 0, "__AVX__ should be defined when HAVE_AVX is "
                             "defined");
 #endif
 #if defined(HAVE_AVX2) && !defined(__AVX2__)
 static_assert(HAVE_AVX2 == 0, "__AVX2__ should be defined when HAVE_AVX2 is "
                              "defined");
 #endif
 template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
  std::vector<float> lookup;
  static constexpr std::size_t MASK = NR_SAMPLES - 1;
--- a/src/lookup_xsimd.cpp
+++ b/src/lookup_xsimd.cpp
@@ -20,8 +20,8 @@ template <std::size_t NR_SAMPLES> struct lookup_table {
      cos_values[i] = cosf(i * PI_FRAC);
    }
  }
  std::array<float, NR_SAMPLES> sin_values;
  std::array<float, NR_SAMPLES> cos_values;
  std::array<float, NR_SAMPLES> sin_values;
 };
 template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -33,6 +33,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
    constexpr uint_fast32_t VL = b_type::size;
    const uint_fast32_t VS = n - n % VL;
    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
    const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
    const m_type mask = m_type::broadcast(lookup_table_.MASK);
@@ -41,7 +42,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
    const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
    const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
    const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
-
+    const m_type quarter_pi = m_type::broadcast(Q_PI);
    uint_fast32_t i;
    for (i = 0; i < VS; i += VL) {
      const b_type vx = b_type::load(a + i, Tag());
@@ -59,7 +60,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
-      const b_type t4 = xsimd::mul(dx4, term4);
+      const b_type t4 = xsimd::mul(dx4, term3);
      const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
@@ -97,6 +98,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
    constexpr uint_fast32_t VL = b_type::size;
    const uint_fast32_t VS = n - n % VL;
    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
    const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
    const m_type mask = m_type::broadcast(lookup_table_.MASK);
@@ -105,7 +107,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
    const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
    const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
    const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
-
+    const m_type quarter_pi = m_type::broadcast(Q_PI);
    uint_fast32_t i;
    for (i = 0; i < VS; i += VL) {
      const b_type vx = b_type::load(a + i, Tag());
@@ -118,7 +120,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
-      const b_type t4 = xsimd::mul(dx4, term4);
+      const b_type t4 = xsimd::mul(dx4, term3);
      const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
      const b_type sindx = xsimd::sub(dx, t3);
@@ -158,6 +160,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
    constexpr uint_fast32_t VL = b_type::size;
    const uint_fast32_t VS = n - n % VL;
    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
    const m_type mask = m_type::broadcast(lookup_table_.MASK);
    const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
@@ -167,6 +170,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
    const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
    const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
    const m_type quarter_pi = m_type::broadcast(Q_PI);
    uint_fast32_t i;
    for (i = 0; i < VS; i += VL) {
      const b_type vx = b_type::load(a + i, Tag());
@@ -179,7 +183,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
-      const b_type t4 = xsimd::mul(dx4, term4);
+      const b_type t4 = xsimd::mul(dx4, term3);
      idx = xsimd::bitwise_and(idx, mask);
      b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
Author	SHA1	Message	Date
Bram Veenboer	83d60fdda0	Add build subdirectory	2025-08-21 15:06:10 +02:00
Bram Veenboer	660a800ece	Use v4	2025-08-21 14:58:57 +02:00
Bram Veenboer	24f3ccfca8	Switch to upload/download-artifacts that retain permissions	2025-08-21 14:54:46 +02:00
Bram Veenboer	2381981197	DEBUG	2025-08-21 09:32:25 +02:00
Bram Veenboer	0774fd9123	Add build with Intel compiler	2025-08-21 09:18:29 +02:00