Fix compiler warnings

2025-10-22 15:20:50 +02:00
7 changed files with 15 additions and 98 deletions
--- a/README.md
+++ b/README.md
@@ -1,54 +0,0 @@
 # TrigDx
 TrigDx is a high‑performance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants). It is designed for numerical, signal‑processing, and real‑time systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
 ## Why TrigDx?
 Many applications use the standard library implementations, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
 - Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable.
 - Use SIMD/vectorized implementations and compact lookup tables for high‑throughput lookups.
 - Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
 ## Requirements
 - A C++ compiler with at least C++17 support (GCC, Clang)
 - CMake 3.15+
 - Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
 - Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
 ## Building
 ```bash
 git clone https://github.com/astron-rd/TrigDx.git
 cd TrigDx
 mkdir build && cd build
 # CPU-only:
 cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
 cmake --build . -j
 # Enable CUDA (if available):
 cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
 cmake --build . -j
 # Run tests:
 ctest --output-on-failure -j
 ```
 Common CMake options:
 - `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
 - `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
 - `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
 - `TRIGDX_BUILD_PYTHON=ON/OFF` — build the Python interface.
 ## Contributing
 - Fork → create a feature branch → open a PR.
 - Include unit tests for correctness‑sensitive changes and benchmark results for performance changes.
 - Follow project style (clang‑format) and run tests locally before submitting.
 ## Reporting issues
 When opening an issue for incorrect results or performance regressions, please include:
 - Platform and CPU/GPU model.
 - Compiler and version with exact compile flags.
 - Small reproducer (input data and the TrigDx implementation used).
 ## License
 See the LICENSE file in the repository for licensing details.
--- a/benchmarks/benchmark_utils.hpp
+++ b/benchmarks/benchmark_utils.hpp
@@ -26,9 +26,6 @@ static void benchmark_sinf(benchmark::State &state) {
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *s =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  if (!x || !s) {
    throw std::runtime_error("Buffer allocation failed");
  }
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
--- a/include/trigdx/interface.hpp
+++ b/include/trigdx/interface.hpp
@@ -16,7 +16,7 @@ public:
    return static_cast<void *>(new uint8_t[bytes]);
  };
-  virtual void free_memory(void *ptr) const { delete[] static_cast<uint8_t*>(ptr); };
+  virtual void free_memory(void *ptr) const { std::free(ptr); };
  // Compute sine for n elements
  virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,24 +2,6 @@ include(FetchContent)
 include(FindAVX)
 add_library(trigdx reference.cpp lookup.cpp)
 if(HAVE_AVX2)
  target_compile_definitions(trigdx PUBLIC HAVE_AVX2)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
                                               "IntelLLVM")
    target_compile_options(trigdx PUBLIC -xCORE-AVX2)
  else()
    target_compile_options(trigdx PUBLIC -mavx2)
  endif()
 elseif(HAVE_AVX)
  target_compile_definitions(trigdx PUBLIC HAVE_AVX)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
                                               "IntelLLVM")
    target_compile_options(trigdx PUBLIC -xAVX)
  else()
    target_compile_options(trigdx PUBLIC -mavx)
  endif()
 endif()
 target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
 if(HAVE_AVX)
--- a/src/lookup_avx.cpp
+++ b/src/lookup_avx.cpp
@@ -6,16 +6,6 @@
 #include "trigdx/lookup_avx.hpp"
 #if defined(HAVE_AVX) && !defined(__AVX__)
 static_assert(HAVE_AVX == 0, "__AVX__ should be defined when HAVE_AVX is "
                             "defined");
 #endif
 #if defined(HAVE_AVX2) && !defined(__AVX2__)
 static_assert(HAVE_AVX2 == 0, "__AVX2__ should be defined when HAVE_AVX2 is "
                              "defined");
 #endif
 template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
  std::vector<float> lookup;
  static constexpr std::size_t MASK = NR_SAMPLES - 1;
@@ -89,6 +79,7 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
    constexpr std::size_t VL = 8; // AVX processes 8 floats
    const __m256 scale = _mm256_set1_ps(SCALE);
    const __m256i mask = _mm256_set1_epi32(MASK);
    const __m256i quarter_pi = _mm256_set1_epi32(NR_SAMPLES / 4);
    std::size_t i = 0;
    for (; i + VL <= n; i += VL) {
@@ -103,7 +94,7 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
 #else
      // fallback gather for AVX1
      float sin_tmp[VL];
-      int idx_a[VL];
+      int idx_a[VL], idxc_a[VL];
      _mm256_store_si256((__m256i *)idx_a, idx);
      for (std::size_t k = 0; k < VL; ++k) {
        sin_tmp[k] = lookup[idx_a[k]];
--- a/src/lookup_xsimd.cpp
+++ b/src/lookup_xsimd.cpp
@@ -56,7 +56,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
      const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
      const b_type dx2 = xsimd::mul(dx, dx);
      const b_type dx3 = xsimd::mul(dx2, dx);
-      const b_type dx4 = xsimd::mul(dx2, dx2);
+      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
      const b_type t4 = xsimd::mul(dx4, term4);
@@ -78,7 +78,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
      const float dx = a[i] - idx * lookup_table_.PI_FRAC;
      const float dx2 = dx * dx;
      const float dx3 = dx2 * dx;
-      const float dx4 = dx2 * dx2;
+      const float dx4 = dx3 * dx;
      const float cosdx =
          1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
      const float sindx = dx - lookup_table_.TERM3 * dx3;
@@ -115,7 +115,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
      const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
      const b_type dx2 = xsimd::mul(dx, dx);
      const b_type dx3 = xsimd::mul(dx2, dx);
-      const b_type dx4 = xsimd::mul(dx2, dx2);
+      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
      const b_type t4 = xsimd::mul(dx4, term4);
@@ -138,7 +138,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
      const float dx = a[i] - idx * lookup_table_.PI_FRAC;
      const float dx2 = dx * dx;
      const float dx3 = dx2 * dx;
-      const float dx4 = dx2 * dx2;
+      const float dx4 = dx3 * dx;
      const float cosdx =
          1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
      const float sindx = dx - lookup_table_.TERM3 * dx3;
@@ -176,20 +176,20 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
      const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
      const b_type dx2 = xsimd::mul(dx, dx);
      const b_type dx3 = xsimd::mul(dx2, dx);
-      const b_type dx4 = xsimd::mul(dx2, dx2);
+      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
      const b_type t4 = xsimd::mul(dx4, term4);
      idx = xsimd::bitwise_and(idx, mask);
-      const b_type sinv_base = b_type::gather(lookup_table_.sin_values.data(), idx);
+      b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
-      const b_type cosv_base = b_type::gather(lookup_table_.cos_values.data(), idx);
+      b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
      const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
      const b_type sindx = xsimd::sub(dx, t3);
-      b_type sinv = xsimd::add(xsimd::mul(cosv_base, sindx), xsimd::mul(sinv_base, cosdx));
+      sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
-      b_type cosv = xsimd::sub(xsimd::mul(cosv_base, cosdx), xsimd::mul(sinv_base, sindx));
+      cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
      sinv.store(s + i, Tag());
      cosv.store(c + i, Tag());
@@ -202,7 +202,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
      const float dx = a[i] - idx * lookup_table_.PI_FRAC;
      const float dx2 = dx * dx;
      const float dx3 = dx2 * dx;
-      const float dx4 = dx2 * dx2;
+      const float dx4 = dx3 * dx;
      const float cosdx =
          1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
      const float sindx = dx - lookup_table_.TERM3 * dx3;
--- a/src/reference.cpp
+++ b/src/reference.cpp
@@ -17,6 +17,7 @@ void ReferenceBackend::compute_cosf(size_t n, const float *x, float *c) const {
 void ReferenceBackend::compute_sincosf(size_t n, const float *x, float *s,
                                       float *c) const {
  for (size_t i = 0; i < n; ++i) {
-    sincosf(x[i], &s[i], &c[i]);
+    s[i] = sinf(x[i]);
    c[i] = cosf(x[i]);
  }
 }