32: Set AVX and AVX2 flags using CMake checks (#34 )

* 32: Set mavx and mavx2 based on CMake checks * 32: Update flags for Intel compiler * Fix: AVX2 instead of AVX__2 Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com> Co-authored-by: lukken <lukken@astron.nl>
Add README.md (#38 )
2025-10-29 09:18:43 +01:00 · 2025-10-22 16:55:37 +02:00 · 2025-10-22 16:48:26 +02:00 · 2025-10-20 17:09:35 +02:00 · 2025-10-10 09:19:18 +02:00 · 2025-09-17 20:03:28 +02:00
31 changed files with 946 additions and 289 deletions
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,15 @@
 name: linting
 on:
  push:
 jobs:
  format:
    name: Linting
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - run: sudo apt-get update
      - run: sudo apt-get install -y clang-format-14 cmake-format pre-commit
      - run: pre-commit run -a
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -0,0 +1,89 @@
 name: test
 on:
  push:
 jobs:
  build:
    runs-on: [slurm]
    strategy:
      matrix: &matrix
        include:
          - name: GNU, NVIDIA A4000
            partition: defq
            gres: gpu:A4000
            cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
            environment_modules: "spack/20250403 intel-oneapi-mkl cuda python"
          - name: Intel, NVIDIA A4000
            partition: defq
            gres: gpu:A4000
            cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
            environment_modules: "spack/20250403 intel-oneapi-compilers intel-oneapi-mkl cuda python"
          - name: NVIDIA GH200
            partition: ghq
            gres: gpu:GH200
            cmake_flags: "-DTRIGDX_USE_GPU=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=90a -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc"
    env:
      DEVICE_NAME: ${{ matrix.name }}
      PARTITION_NAME: ${{ matrix.partition }}
      CMAKE_FLAGS: ${{ matrix.cmake_flags }}
      ENVIRONMENT_MODULES: ${{ matrix.environment_modules }}
    steps:
      - &cleanup
        name: "Remove old workspace"
        run: |
          rm -rf $GITHUB_WORKSPACE/*
          mkdir -p $GITHUB_WORKSPACE
      - uses: actions/checkout@v4
      - uses: astron-rd/slurm-action@v1.2
        with:
          partition: ${{ matrix.partition }}
          gres: ${{ matrix.gres }}
          commands: |
            module load ${ENVIRONMENT_MODULES}
            cmake -S . -B build ${CMAKE_FLAGS}
            make -C build -j
      - name: Upload build
        uses: pyTooling/upload-artifact@v4
        with:
          name: build-${{ matrix.name }}
          path: build
  test:
    runs-on: [slurm]
    needs: build
    strategy:
      matrix: *matrix
    env:
      DEVICE_NAME: ${{ matrix.name }}
      PARTITION_NAME: ${{ matrix.partition }}
    steps:
      - *cleanup
      - uses: pyTooling/download-artifact@v4
        with:
          name: build-${{ matrix.name }}
      - uses: astron-rd/slurm-action@v1.2
        with:
          partition: ${{ matrix.partition }}
          gres: ${{ matrix.gres }}
          commands: |
            find build/tests -type f -executable -exec {} \;
  benchmark:
    runs-on: [slurm]
    needs: build
    strategy:
      matrix: *matrix
    env:
      DEVICE_NAME: ${{ matrix.name }}
      PARTITION_NAME: ${{ matrix.partition }}
    steps:
      - *cleanup
      - uses: pyTooling/download-artifact@v4
        with:
          name: build-${{ matrix.name }}
      - uses: astron-rd/slurm-action@v1.2
        with:
          partition: ${{ matrix.partition }}
          gres: ${{ matrix.gres }}
          commands: |
            find build/benchmarks -type f -executable -exec {} \;
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,72 @@
 # Generated files
 **/.DS_Store
 *.slo
 *.lo
 *.o
 *.obj
 # Precompiled Headers
 *.gch
 *.pch
 # Compiled Dynamic libraries
 *.so
 *.dylib
 *.dll
 # Fortran module files
 *.mod
 *.smod
 # Compiled Static libraries
 *.lai
 *.la
 *.a
 *.lib
 # Executables
 *.exe
 *.out
 *.app
 # CMake-related
 **/cmake-build-debug
 **/CMakeCache.txt
 **/cmake_install.cmake
 **/install_manifest.txt
 **/CMakeFiles/
 **/CTestTestfile.cmake
 **/Makefile
 **/*.cbp
 **/CMakeScripts
 **/compile_commands.json
 **/CMakeLists.txt.user
 **/_deps
 **/Testing
 **/CMakeUserPresets.json
 # Local
 build/**/*
 lib/*
 bin/*
 test/test_runner
 *.user*
 # Visual Studio work directory
 .vs/
 # Visual Studio build directory
 out/
 # Visual Studio Code
 .vscode/
 # CLion work directory
 .idea/
 # CLion build directories
 cmake-build-*/
 # Virtual environments
 env
 venv
 .env
 .venv
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,4 +7,5 @@ repos:
    rev: v0.6.13
    hooks:
      - id: cmake-format
-      - id: cmake-lint
+      - id: cmake-lint
        args: [--disabled-codes=C0301]
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,77 +0,0 @@
 {
  "files.associations": {
    "*.cc": "cpp",
    "*.tpp": "cpp",
    "array": "cpp",
    "atomic": "cpp",
    "bit": "cpp",
    "bitset": "cpp",
    "cctype": "cpp",
    "charconv": "cpp",
    "chrono": "cpp",
    "clocale": "cpp",
    "cmath": "cpp",
    "compare": "cpp",
    "concepts": "cpp",
    "condition_variable": "cpp",
    "csignal": "cpp",
    "cstdarg": "cpp",
    "cstddef": "cpp",
    "cstdint": "cpp",
    "cstdio": "cpp",
    "cstdlib": "cpp",
    "cstring": "cpp",
    "ctime": "cpp",
    "cwchar": "cpp",
    "cwctype": "cpp",
    "deque": "cpp",
    "list": "cpp",
    "map": "cpp",
    "set": "cpp",
    "string": "cpp",
    "unordered_map": "cpp",
    "vector": "cpp",
    "exception": "cpp",
    "algorithm": "cpp",
    "functional": "cpp",
    "iterator": "cpp",
    "memory": "cpp",
    "memory_resource": "cpp",
    "numeric": "cpp",
    "optional": "cpp",
    "random": "cpp",
    "ratio": "cpp",
    "regex": "cpp",
    "string_view": "cpp",
    "system_error": "cpp",
    "tuple": "cpp",
    "type_traits": "cpp",
    "utility": "cpp",
    "format": "cpp",
    "fstream": "cpp",
    "future": "cpp",
    "initializer_list": "cpp",
    "iomanip": "cpp",
    "iosfwd": "cpp",
    "iostream": "cpp",
    "istream": "cpp",
    "limits": "cpp",
    "mutex": "cpp",
    "new": "cpp",
    "numbers": "cpp",
    "ostream": "cpp",
    "ranges": "cpp",
    "semaphore": "cpp",
    "span": "cpp",
    "sstream": "cpp",
    "stdexcept": "cpp",
    "stop_token": "cpp",
    "streambuf": "cpp",
    "text_encoding": "cpp",
    "thread": "cpp",
    "typeinfo": "cpp",
    "variant": "cpp",
    "queue": "cpp",
    "stack": "cpp"
  }
 }
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,13 +3,45 @@ project(trigdx LANGUAGES CXX)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(TRIGDX_USE_MKL "Enable Intel MKL backend" OFF)
 option(TRIGDX_USE_GPU "Enable GPU backend" OFF)
 option(TRIGDX_USE_XSIMD "Enable XSIMD backend" OFF)
 option(TRIGDX_BUILD_TESTS "Build tests" ON)
 option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
 option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
 # Add compiler flags
 set(CMAKE_CXX_FLAGS
    "${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
 )
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 configure_file(
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in
  ${CMAKE_CURRENT_BINARY_DIR}/include/trigdx/trigdx_config.hpp @ONLY)
 if(TRIGDX_BUILD_TESTS
   OR TRIGDX_BUILD_BENCHMARKS
   OR TRIGDX_BUILD_PYTHON)
  include(FetchContent)
 endif()
 include_directories(${PROJECT_SOURCE_DIR}/include)
 add_subdirectory(src)
-add_subdirectory(tests)
+
-add_subdirectory(benchmarks)
+if(TRIGDX_BUILD_TESTS)
  include(CTest)
  enable_testing()
  add_subdirectory(tests)
 endif()
 if(TRIGDX_BUILD_BENCHMARKS)
  add_subdirectory(benchmarks)
 endif()
 if(TRIGDX_BUILD_PYTHON)
  add_subdirectory(python)
 endif()
--- a/README.md
+++ b/README.md
@@ -0,0 +1,54 @@
 # TrigDx
 High‑performance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signal‑processing, and real‑time systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
 ## Why TrigDx?
 Many applications use the standard library implementations, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
 - Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable.
 - Use SIMD/vectorized implementations and compact lookup tables for high throughput lookups.
 - Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
 ## Requirements
 - A C++ compiler with at least C++17 support (GCC, Clang)
 - CMake 3.15+
 - Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
 - Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
 ## Building
 ```bash
 git clone https://github.com/astron-rd/TrigDx.git
 cd TrigDx
 mkdir build && cd build
 # CPU-only:
 cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
 cmake --build . -j
 # Enable CUDA (if available):
 cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
 cmake --build . -j
 # Run tests:
 ctest --output-on-failure -j
 ```
 Common CMake options:
 - `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
 - `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
 - `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
 - `TRIGDX_BUILD_PYTHON` — build Python interface.
 ## Contributing
 - Fork → create a feature branch → open a PR.
 - Include unit tests for correctness‑sensitive changes and benchmark results for performance changes.
 - Follow project style (clang‑format) and run tests locally before submitting.
 ## Reporting issues
 When opening an issue for incorrect results or performance regressions, please include:
 - Platform and CPU/GPU model.
 - Compiler and version with exact compile flags.
 - Small reproducer (input data and the TrigDx implementation used).
 ## License
 See the LICENSE file in the repository for licensing details.
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,23 +1,36 @@
 FetchContent_Declare(
  benchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
  GIT_TAG v1.9.4)
 set(BENCHMARK_ENABLE_TESTING
    OFF
    CACHE BOOL "" FORCE)
 FetchContent_MakeAvailable(benchmark)
 add_executable(benchmark_reference benchmark_reference.cpp)
-target_link_libraries(benchmark_reference PRIVATE trigdx)
+target_link_libraries(benchmark_reference PRIVATE trigdx benchmark::benchmark)
 add_executable(benchmark_lookup benchmark_lookup.cpp)
-target_link_libraries(benchmark_lookup PRIVATE trigdx)
+target_link_libraries(benchmark_lookup PRIVATE trigdx benchmark::benchmark)
-add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
+if(HAVE_AVX)
-target_link_libraries(benchmark_lookup_avx PRIVATE trigdx)
+  add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
  target_link_libraries(benchmark_lookup_avx PRIVATE trigdx
                                                     benchmark::benchmark)
 endif()
 if(TRIGDX_USE_MKL)
  add_executable(benchmark_mkl benchmark_mkl.cpp)
-  target_link_libraries(benchmark_mkl PRIVATE trigdx)
+  target_link_libraries(benchmark_mkl PRIVATE trigdx benchmark::benchmark)
 endif()
 if(TRIGDX_USE_GPU)
  add_executable(benchmark_gpu benchmark_gpu.cpp)
-  target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
+  target_link_libraries(benchmark_gpu PRIVATE trigdx gpu benchmark::benchmark)
 endif()
 if(TRIGDX_USE_XSIMD)
  add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
-  target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx)
+  target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx
                                                       benchmark::benchmark)
 endif()
--- a/benchmarks/benchmark_gpu.cpp
+++ b/benchmarks/benchmark_gpu.cpp
@@ -1,9 +1,21 @@
-#include <trigdx/gpu.hpp>
+#include <trigdx/trigdx.hpp>
 #include "benchmark_utils.hpp"
-int main() {
+BENCHMARK_TEMPLATE(benchmark_sinf, GPUBackend)
-  benchmark_sinf<GPUBackend>();
+    ->Unit(benchmark::kMillisecond)
-  benchmark_cosf<GPUBackend>();
+    ->Arg(1e5)
-  benchmark_sincosf<GPUBackend>();
+    ->Arg(1e6)
-}
+    ->Arg(1e7);
 BENCHMARK_TEMPLATE(benchmark_cosf, GPUBackend)
    ->Unit(benchmark::kMillisecond)
    ->Arg(1e5)
    ->Arg(1e6)
    ->Arg(1e7);
 BENCHMARK_TEMPLATE(benchmark_sincosf, GPUBackend)
    ->Unit(benchmark::kMillisecond)
    ->Arg(1e5)
    ->Arg(1e6)
    ->Arg(1e7);
 BENCHMARK_MAIN();
--- a/benchmarks/benchmark_lookup.cpp
+++ b/benchmarks/benchmark_lookup.cpp
@@ -1,13 +1,30 @@
-#include <trigdx/lookup.hpp>
+#include <trigdx/trigdx.hpp>
 #include "benchmark_utils.hpp"
-int main() {
+template <typename Backend> void register_benchmarks() {
-  benchmark_sinf<LookupBackend<16384>>();
+  BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
-  benchmark_cosf<LookupBackend<16384>>();
+      ->Unit(benchmark::kMillisecond)
-  benchmark_sincosf<LookupBackend<16384>>();
+      ->Arg(1e5)
-
+      ->Arg(1e6)
-  benchmark_sinf<LookupBackend<32768>>();
+      ->Arg(1e7);
-  benchmark_cosf<LookupBackend<32768>>();
+  BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
-  benchmark_sincosf<LookupBackend<32768>>();
+      ->Unit(benchmark::kMillisecond)
      ->Arg(1e5)
      ->Arg(1e6)
      ->Arg(1e7);
  BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
      ->Unit(benchmark::kMillisecond)
      ->Arg(1e5)
      ->Arg(1e6)
      ->Arg(1e7);
 }
 int main(int argc, char **argv) {
  ::benchmark::Initialize(&argc, argv);
  register_benchmarks<LookupBackend<16384>>();
  register_benchmarks<LookupBackend<32768>>();
  return ::benchmark::RunSpecifiedBenchmarks();
 }
--- a/benchmarks/benchmark_lookup_avx.cpp
+++ b/benchmarks/benchmark_lookup_avx.cpp
@@ -1,13 +1,30 @@
-#include <trigdx/lookup_avx.hpp>
+#include <trigdx/trigdx.hpp>
 #include "benchmark_utils.hpp"
-int main() {
+template <typename Backend> void register_benchmarks() {
-  benchmark_sinf<LookupAVXBackend<16384>>();
+  BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
-  benchmark_cosf<LookupAVXBackend<16384>>();
+      ->Unit(benchmark::kMillisecond)
-  benchmark_sincosf<LookupAVXBackend<16384>>();
+      ->Arg(1e5)
-
+      ->Arg(1e6)
-  benchmark_sinf<LookupAVXBackend<32768>>();
+      ->Arg(1e7);
-  benchmark_cosf<LookupAVXBackend<32768>>();
+  BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
-  benchmark_sincosf<LookupAVXBackend<32768>>();
+      ->Unit(benchmark::kMillisecond)
      ->Arg(1e5)
      ->Arg(1e6)
      ->Arg(1e7);
  BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
      ->Unit(benchmark::kMillisecond)
      ->Arg(1e5)
      ->Arg(1e6)
      ->Arg(1e7);
 }
 int main(int argc, char **argv) {
  ::benchmark::Initialize(&argc, argv);
  register_benchmarks<LookupAVXBackend<16384>>();
  register_benchmarks<LookupAVXBackend<32768>>();
  return ::benchmark::RunSpecifiedBenchmarks();
 }
--- a/benchmarks/benchmark_lookup_xsimd.cpp
+++ b/benchmarks/benchmark_lookup_xsimd.cpp
@@ -1,13 +1,30 @@
-#include <trigdx/lookup_xsimd.hpp>
+#include <trigdx/trigdx.hpp>
 #include "benchmark_utils.hpp"
-int main() {
+template <typename Backend> void register_benchmarks() {
-  benchmark_sinf<LookupXSIMDBackend<16384>>();
+  BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
-  benchmark_cosf<LookupXSIMDBackend<16384>>();
+      ->Unit(benchmark::kMillisecond)
-  benchmark_sincosf<LookupXSIMDBackend<16384>>();
+      ->Arg(1e5)
-
+      ->Arg(1e6)
-  benchmark_sinf<LookupXSIMDBackend<32768>>();
+      ->Arg(1e7);
-  benchmark_cosf<LookupXSIMDBackend<32768>>();
+  BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
-  benchmark_sincosf<LookupXSIMDBackend<32768>>();
+      ->Unit(benchmark::kMillisecond)
      ->Arg(1e5)
      ->Arg(1e6)
      ->Arg(1e7);
  BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
      ->Unit(benchmark::kMillisecond)
      ->Arg(1e5)
      ->Arg(1e6)
      ->Arg(1e7);
 }
 int main(int argc, char **argv) {
  ::benchmark::Initialize(&argc, argv);
  register_benchmarks<LookupXSIMDBackend<16384>>();
  register_benchmarks<LookupXSIMDBackend<32768>>();
  return ::benchmark::RunSpecifiedBenchmarks();
 }
--- a/benchmarks/benchmark_mkl.cpp
+++ b/benchmarks/benchmark_mkl.cpp
@@ -1,9 +1,21 @@
-#include <trigdx/mkl.hpp>
+#include <trigdx/trigdx.hpp>
 #include "benchmark_utils.hpp"
-int main() {
+BENCHMARK_TEMPLATE(benchmark_sinf, MKLBackend)
-  benchmark_sinf<MKLBackend>();
+    ->Unit(benchmark::kMillisecond)
-  benchmark_cosf<MKLBackend>();
+    ->Arg(1e5)
-  benchmark_sincosf<MKLBackend>();
+    ->Arg(1e6)
-}
+    ->Arg(1e7);
 BENCHMARK_TEMPLATE(benchmark_cosf, MKLBackend)
    ->Unit(benchmark::kMillisecond)
    ->Arg(1e5)
    ->Arg(1e6)
    ->Arg(1e7);
 BENCHMARK_TEMPLATE(benchmark_sincosf, MKLBackend)
    ->Unit(benchmark::kMillisecond)
    ->Arg(1e5)
    ->Arg(1e6)
    ->Arg(1e7);
 BENCHMARK_MAIN();
--- a/benchmarks/benchmark_reference.cpp
+++ b/benchmarks/benchmark_reference.cpp
@@ -1,9 +1,21 @@
-#include <trigdx/reference.hpp>
+#include <trigdx/trigdx.hpp>
 #include "benchmark_utils.hpp"
-int main() {
+BENCHMARK_TEMPLATE(benchmark_sinf, ReferenceBackend)
-  benchmark_sinf<ReferenceBackend>();
+    ->Unit(benchmark::kMillisecond)
-  benchmark_cosf<ReferenceBackend>();
+    ->Arg(1e5)
-  benchmark_sincosf<ReferenceBackend>();
+    ->Arg(1e6)
-}
+    ->Arg(1e7);
 BENCHMARK_TEMPLATE(benchmark_cosf, ReferenceBackend)
    ->Unit(benchmark::kMillisecond)
    ->Arg(1e5)
    ->Arg(1e6)
    ->Arg(1e7);
 BENCHMARK_TEMPLATE(benchmark_sincosf, ReferenceBackend)
    ->Unit(benchmark::kMillisecond)
    ->Arg(1e5)
    ->Arg(1e6)
    ->Arg(1e7);
 BENCHMARK_MAIN();
--- a/benchmarks/benchmark_utils.hpp
+++ b/benchmarks/benchmark_utils.hpp
@@ -1,76 +1,122 @@
 #pragma once
 #include <chrono>
-#include <iomanip>
+#include <cmath>
-#include <iostream>
+#include <stdexcept>
 #include <string>
 #include <vector>
-const size_t N = 1e7;
+#include <benchmark/benchmark.h>
-inline void report(const std::string &name, double sec, double throughput) {
+void init_x(float *x, size_t n) {
-  std::ios state(nullptr);
+  for (size_t i = 0; i < n; ++i) {
-  state.copyfmt(std::cout);
+    x[i] = (i % 360) * 0.0174533f; // degrees to radians
-  std::cout << std::setw(7) << name << " -> ";
+  }
  std::cout << "time: ";
  std::cout << std::fixed << std::setprecision(3) << std::setfill('0');
  std::cout << sec << " s, ";
  std::cout << "throughput: " << throughput << " M elems/sec\n";
  std::cout.copyfmt(state);
 }
-template <typename Backend> inline void benchmark_sinf() {
+template <typename Backend>
-  std::vector<float> x(N), s(N);
+static void benchmark_sinf(benchmark::State &state) {
-
+  const size_t N = static_cast<size_t>(state.range(0));
  for (size_t i = 0; i < N; ++i)
    x[i] = (i % 360) * 0.0174533f; // degrees to radians
  Backend backend;
  backend.init(N);
  auto start = std::chrono::high_resolution_clock::now();
-  backend.compute_sinf(N, x.data(), s.data());
+  backend.init(N);
  float *x =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *s =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
          .count() /
      1.e3;
-  double sec = std::chrono::duration<double>(end - start).count();
+  init_x(x, N);
  double throughput = N / sec / 1e6;
-  report("sinf", sec, throughput);
+  for (auto _ : state) {
    backend.compute_sinf(N, x, s);
    benchmark::DoNotOptimize(s);
  }
  backend.free_memory(x);
  backend.free_memory(s);
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(N));
 }
-template <typename Backend> inline void benchmark_cosf() {
+template <typename Backend>
-  std::vector<float> x(N), c(N);
+static void benchmark_cosf(benchmark::State &state) {
-
+  const size_t N = static_cast<size_t>(state.range(0));
  for (size_t i = 0; i < N; ++i)
    x[i] = (i % 360) * 0.0174533f; // degrees to radians
  Backend backend;
  backend.init(N);
  auto start = std::chrono::high_resolution_clock::now();
-  backend.compute_cosf(N, x.data(), c.data());
+  backend.init(N);
  float *x =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *c =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  if (!x || !c) {
    throw std::runtime_error("Buffer allocation failed");
  }
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
          .count() /
      1.e3;
-  double sec = std::chrono::duration<double>(end - start).count();
+  init_x(x, N);
  double throughput = N / sec / 1e6;
-  report("cosf", sec, throughput);
+  for (auto _ : state) {
    backend.compute_cosf(N, x, c);
    benchmark::DoNotOptimize(c);
  }
  backend.free_memory(x);
  backend.free_memory(c);
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(N));
 }
-template <typename Backend> inline void benchmark_sincosf() {
+template <typename Backend>
-  std::vector<float> x(N), s(N), c(N);
+static void benchmark_sincosf(benchmark::State &state) {
-
+  const size_t N = static_cast<size_t>(state.range(0));
  for (size_t i = 0; i < N; ++i)
    x[i] = (i % 360) * 0.0174533f; // degrees to radians
  Backend backend;
  backend.init(N);
  auto start = std::chrono::high_resolution_clock::now();
-  backend.compute_sincosf(N, x.data(), s.data(), c.data());
+  backend.init(N);
  float *x =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *s =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  float *c =
      reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
  if (!x || !s || !c) {
    throw std::runtime_error("Buffer allocation failed");
  }
  auto end = std::chrono::high_resolution_clock::now();
  state.counters["init_ms"] =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start)
          .count() /
      1.e3;
-  double sec = std::chrono::duration<double>(end - start).count();
+  init_x(x, N);
  double throughput = N / sec / 1e6;
-  report("sincosf", sec, throughput);
+  for (auto _ : state) {
    backend.compute_sincosf(N, x, s, c);
    benchmark::DoNotOptimize(s);
    benchmark::DoNotOptimize(c);
  }
  backend.free_memory(x);
  backend.free_memory(s);
  backend.free_memory(c);
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(N));
 }
--- a/cmake/FindAVX.cmake
+++ b/cmake/FindAVX.cmake
@@ -0,0 +1,42 @@
 include(CheckCXXSourceRuns)
 set(SUPPORTED_COMPILERS Clang;GNU;Intel;IntelLLVM)
 if(CMAKE_CXX_COMPILER_ID IN_LIST SUPPORTED_COMPILERS)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
    set(CMAKE_REQUIRED_FLAGS "-xHost") # ICC
  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
    set(CMAKE_REQUIRED_FLAGS "-march=native") # ICX
  else()
    set(CMAKE_REQUIRED_FLAGS "-march=native") # GCC/Clang
  endif()
 else()
  message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}.")
 endif()
 # AVX check
 check_cxx_source_runs(
  "
 #include <immintrin.h>
 int main() {
    __m256 a = _mm256_setzero_ps(); // AVX
    (void) a;
    return 0;
 }
 "
  HAVE_AVX)
 if(HAVE_AVX)
  # AVX2 check
  check_cxx_source_runs(
    "
 #include <immintrin.h>
 int main() {
    __m256i a = _mm256_set1_epi32(-1);
    __m256i b = _mm256_abs_epi32(a); // AVX2
    (void) b;
    return 0;
 }
 "
    HAVE_AVX2)
 endif()
--- a/cmake/trigdx_config.hpp.in
+++ b/cmake/trigdx_config.hpp.in
@@ -0,0 +1,5 @@
 #pragma once
 #cmakedefine TRIGDX_USE_MKL
 #cmakedefine TRIGDX_USE_GPU
 #cmakedefine TRIGDX_USE_XSIMD
--- a/include/trigdx/gpu.hpp
+++ b/include/trigdx/gpu.hpp
@@ -11,7 +11,8 @@ public:
  GPUBackend();
  ~GPUBackend() override;
-  void init(size_t n = 0) override;
+  void *allocate_memory(size_t bytes) const override;
  void free_memory(void *ptr) const override;
  void compute_sinf(size_t n, const float *x, float *s) const override;
  void compute_cosf(size_t n, const float *x, float *c) const override;
  void compute_sincosf(size_t n, const float *x, float *s,
--- a/include/trigdx/interface.hpp
+++ b/include/trigdx/interface.hpp
@@ -1,6 +1,8 @@
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 // Base interface for all math backends
 class Backend {
@@ -10,6 +12,12 @@ public:
  // Optional initialization
  virtual void init(size_t n = 0) {}
  virtual void *allocate_memory(size_t bytes) const {
    return static_cast<void *>(new uint8_t[bytes]);
  };
  virtual void free_memory(void *ptr) const { std::free(ptr); };
  // Compute sine for n elements
  virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
--- a/include/trigdx/trigdx.hpp
+++ b/include/trigdx/trigdx.hpp
@@ -0,0 +1,19 @@
 #pragma once
 #include <trigdx/trigdx_config.hpp>
 #include <trigdx/lookup.hpp>
 #include <trigdx/lookup_avx.hpp>
 #include <trigdx/reference.hpp>
 #if defined(TRIGDX_USE_MKL)
 #include <trigdx/mkl.hpp>
 #endif
 #if defined(TRIGDX_USE_GPU)
 #include <trigdx/gpu.hpp>
 #endif
 #if defined(TRIGDX_USE_XSIMD)
 #include <trigdx/lookup_xsimd.hpp>
 #endif
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -0,0 +1,23 @@
 find_package(pybind11 CONFIG QUIET)
 if(NOT pybind11_FOUND)
  FetchContent_Declare(
    pybind11
    GIT_REPOSITORY https://github.com/pybind/pybind11.git
    GIT_TAG v3.0.0)
  FetchContent_MakeAvailable(pybind11)
 endif()
 # Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
 find_package(Python REQUIRED)
 pybind11_add_module(pytrigdx bindings.cpp)
 target_link_libraries(pytrigdx PRIVATE trigdx)
 set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
 set(PYTHON_SITE_PACKAGES
    "${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
 )
 install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
 install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})
--- a/python/init.py
+++ b/python/init.py
@@ -0,0 +1,16 @@
 from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
 try:
    from .trigdx import MKL
 except ImportError:
    pass
 try:
    from .trigdx import GPU
 except ImportError:
    pass
 try:
    from .trigdx import LookupXSIMD16K, LookupXSIMD32K
 except ImportError:
    pass
--- a/python/bindings.cpp
+++ b/python/bindings.cpp
@@ -0,0 +1,96 @@
 #include <memory>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <trigdx/trigdx.hpp>
 namespace py = pybind11;
 template <typename T>
 py::array_t<T>
 compute_sin(const Backend &backend,
            py::array_t<T, py::array::c_style | py::array::forcecast> x) {
  const size_t n = x.shape(0);
  if (x.ndim() != 1) {
    throw py::value_error("Input array must be 1-dimensional");
  }
  const T *x_ptr = x.data();
  py::array_t<float> s(n);
  T *s_ptr = s.mutable_data();
  backend.compute_sinf(n, x_ptr, s_ptr);
  return s;
 }
 template <typename T>
 py::array_t<T>
 compute_cos(const Backend &backend,
            py::array_t<T, py::array::c_style | py::array::forcecast> x) {
  const size_t n = x.shape(0);
  if (x.ndim() != 1) {
    throw py::value_error("Input array must be 1-dimensional");
  }
  const T *x_ptr = x.data();
  py::array_t<T> c(n);
  T *c_ptr = c.mutable_data();
  backend.compute_cosf(n, x_ptr, c_ptr);
  return c;
 }
 template <typename T>
 std::tuple<py::array_t<T>, py::array_t<T>>
 compute_sincos(const Backend &backend,
               py::array_t<T, py::array::c_style | py::array::forcecast> x) {
  const size_t n = x.shape(0);
  if (x.ndim() != 1) {
    throw py::value_error("Input array must be 1-dimensional");
  }
  const T *x_ptr = x.data();
  py::array_t<T> s(n);
  py::array_t<T> c(n);
  backend.compute_sincosf(n, x_ptr, s.mutable_data(), c.mutable_data());
  return std::make_tuple(s, c);
 }
 template <typename BackendType>
 void bind_backend(py::module &m, const char *name) {
  py::class_<BackendType, Backend, std::shared_ptr<BackendType>>(m, name)
      .def(py::init<>())
      .def("compute_sinf", &compute_sin<float>)
      .def("compute_cosf", &compute_cos<float>)
      .def("compute_sincosf", &compute_sincos<float>);
 }
 PYBIND11_MODULE(trigdx, m) {
  m.doc() = "TrigDx python bindings";
  py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
      .def("init", &Backend::init);
  bind_backend<ReferenceBackend>(m, "Reference");
  bind_backend<LookupBackend<16384>>(m, "Lookup16K");
  bind_backend<LookupBackend<32768>>(m, "Lookup32K");
  bind_backend<LookupAVXBackend<16384>>(m, "LookupAVX16K");
  bind_backend<LookupAVXBackend<32768>>(m, "LookupAVX32K");
 #if defined(TRIGDX_USE_MKL)
  bind_backend<MKLBackend>(m, "MKL");
 #endif
 #if defined(TRIGDX_USE_GPU)
  bind_backend<GPUBackend>(m, "GPU");
 #endif
 #if defined(TRIGDX_USE_XSIMD)
  bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K");
  bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K");
 #endif
 }
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,8 +1,30 @@
-add_library(trigdx reference.cpp lookup.cpp lookup_avx.cpp)
+include(FetchContent)
 include(FindAVX)
 add_library(trigdx reference.cpp lookup.cpp)
 if(HAVE_AVX2)
  target_compile_definitions(trigdx PUBLIC HAVE_AVX2)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
                                               "IntelLLVM")
    target_compile_options(trigdx PUBLIC -xCORE-AVX2)
  else()
    target_compile_options(trigdx PUBLIC -mavx2)
  endif()
 elseif(HAVE_AVX)
  target_compile_definitions(trigdx PUBLIC HAVE_AVX)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
                                               "IntelLLVM")
    target_compile_options(trigdx PUBLIC -xAVX)
  else()
    target_compile_options(trigdx PUBLIC -mavx)
  endif()
 endif()
 target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
-target_compile_options(trigdx PRIVATE -O3 -march=native)
+if(HAVE_AVX)
  target_sources(trigdx PRIVATE lookup_avx.cpp)
 endif()
 if(TRIGDX_USE_MKL)
  find_package(MKL REQUIRED)
@@ -20,7 +42,19 @@ if(TRIGDX_USE_GPU)
 endif()
 if(TRIGDX_USE_XSIMD)
-  find_package(xsimd REQUIRED)
+  # Requires XSIMD > 13 for architecture independent dispatching
  find_package(xsimd 13 QUIET)
  if(NOT TARGET xsimd)
    FetchContent_Declare(
      xsimd
      GIT_REPOSITORY https://github.com/xtensor-stack/xsimd.git
      GIT_TAG 13.2.0)
    FetchContent_MakeAvailable(xsimd)
  endif()
  target_sources(trigdx PRIVATE lookup_xsimd.cpp)
  target_link_libraries(trigdx PRIVATE xsimd)
 endif()
 target_include_directories(
  trigdx INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
                   $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>)
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -10,75 +10,63 @@
 struct GPUBackend::Impl {
-  ~Impl() {
+  void *allocate_memory(size_t bytes) const {
-    if (h_x) {
+    void *ptr;
-      cudaFreeHost(h_x);
+    cudaMallocHost(&ptr, bytes);
-    }
+    return ptr;
    if (h_s) {
      cudaFreeHost(h_s);
    }
    if (h_c) {
      cudaFreeHost(h_c);
    }
    if (d_x) {
      cudaFree(d_x);
    }
    if (d_s) {
      cudaFree(d_s);
    }
    if (d_c) {
      cudaFree(d_c);
    }
  }
-  void init(size_t n) {
+  void free_memory(void *ptr) const { cudaFreeHost(ptr); }
    const size_t bytes = n * sizeof(float);
    cudaMallocHost(&h_x, bytes);
    cudaMallocHost(&h_s, bytes);
    cudaMallocHost(&h_c, bytes);
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_s, bytes);
    cudaMalloc(&d_c, bytes);
  }
  void compute_sinf(size_t n, const float *x, float *s) const {
    const size_t bytes = n * sizeof(float);
-    std::memcpy(h_x, x, bytes);
+    float *d_x, *d_s;
-    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
+    cudaMalloc(&d_x, bytes);
-    launch_sincosf_kernel(d_x, d_s, d_c, n);
+    cudaMalloc(&d_s, bytes);
-    cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
-    std::memcpy(s, h_s, bytes);
+    launch_sinf_kernel(d_x, d_s, n);
    cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_s);
  }
  void compute_cosf(size_t n, const float *x, float *c) const {
    const size_t bytes = n * sizeof(float);
-    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
+    float *d_x, *d_c;
-    launch_sincosf_kernel(d_x, d_s, d_c, n);
+    cudaMalloc(&d_x, bytes);
-    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
+    cudaMalloc(&d_c, bytes);
-    std::memcpy(c, h_c, bytes);
+    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    launch_cosf_kernel(d_x, d_c, n);
    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_c);
  }
  void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
    const size_t bytes = n * sizeof(float);
-    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
+    float *d_x, *d_s, *d_c;
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_s, bytes);
    cudaMalloc(&d_c, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    launch_sincosf_kernel(d_x, d_s, d_c, n);
-    cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_s);
    cudaFree(d_c);
  }
  float *h_x = nullptr;
  float *h_s = nullptr;
  float *h_c = nullptr;
  float *d_x = nullptr;
  float *d_s = nullptr;
  float *d_c = nullptr;
 };
 GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
 GPUBackend::~GPUBackend() = default;
-void GPUBackend::init(size_t n) { impl->init(n); }
+void *GPUBackend::allocate_memory(size_t bytes) const {
  return impl->allocate_memory(bytes);
 }
 void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
 void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
  impl->compute_sinf(n, x, s);
--- a/src/lookup_avx.cpp
+++ b/src/lookup_avx.cpp
@@ -6,6 +6,16 @@
 #include "trigdx/lookup_avx.hpp"
 #if defined(HAVE_AVX) && !defined(__AVX__)
 static_assert(HAVE_AVX == 0, "__AVX__ should be defined when HAVE_AVX is "
                             "defined");
 #endif
 #if defined(HAVE_AVX2) && !defined(__AVX2__)
 static_assert(HAVE_AVX2 == 0, "__AVX2__ should be defined when HAVE_AVX2 is "
                              "defined");
 #endif
 template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
  std::vector<float> lookup;
  static constexpr std::size_t MASK = NR_SAMPLES - 1;
@@ -160,7 +170,6 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
    for (std::size_t i = 0; i < n; ++i) {
      std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK;
      std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK;
      s[i] = lookup[idx];
      c[i] = lookup[idx_cos];
    }
 #endif
--- a/src/lookup_xsimd.cpp
+++ b/src/lookup_xsimd.cpp
@@ -8,12 +8,20 @@
 template <std::size_t NR_SAMPLES> struct lookup_table {
  static constexpr std::size_t MASK = NR_SAMPLES - 1;
  static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
-  lookup_table() : values{} {
+  static constexpr float PI_FRAC = 2.0f * M_PIf32 / NR_SAMPLES;
  static constexpr float TERM1 = 1.0f;         // 1
  static constexpr float TERM2 = 0.5f;         // 1/2!
  static constexpr float TERM3 = 1.0f / 6.0f;  // 1/3!
  static constexpr float TERM4 = 1.0f / 24.0f; // 1/4!
  constexpr lookup_table() : sin_values{}, cos_values{} {
    for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
-      values[i] = sinf(i * (2.0f * float(M_PI) / NR_SAMPLES));
+      sin_values[i] = sinf(i * PI_FRAC);
      cos_values[i] = cosf(i * PI_FRAC);
    }
  }
-  std::array<float, NR_SAMPLES> values;
+  std::array<float, NR_SAMPLES> sin_values;
  std::array<float, NR_SAMPLES> cos_values;
 };
 template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -25,28 +33,56 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
    constexpr uint_fast32_t VL = b_type::size;
    const uint_fast32_t VS = n - n % VL;
    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
    const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
    const m_type mask = m_type::broadcast(lookup_table_.MASK);
-    const m_type quarter_pi = m_type::broadcast(Q_PI);
+    const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
    const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
    const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
    const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
    uint_fast32_t i;
    for (i = 0; i < VS; i += VL) {
      const b_type vx = b_type::load(a + i, Tag());
      const b_type scaled = xsimd::mul(vx, scale);
      m_type idx = xsimd::to_int(scaled);
-      m_type idx_cos = xsimd::add(idx, quarter_pi);
+      const b_type f_idx = xsimd::to_float(idx);
-      idx_cos = xsimd::bitwise_and(idx_cos, mask);
+      idx = xsimd::bitwise_and(idx, mask);
-      const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
+
      b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
      b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
      const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
      const b_type dx2 = xsimd::mul(dx, dx);
      const b_type dx3 = xsimd::mul(dx2, dx);
      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
      const b_type t4 = xsimd::mul(dx4, term4);
      const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
      const b_type sindx = xsimd::sub(dx, t3);
      cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
      cosv.store(c + i, Tag());
    }
    for (; i < n; i++) {
-      std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
+      std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
                        lookup_table_.MASK;
      std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
-      c[i] = lookup_table_.values[idx_cos];
+      std::size_t masked = idx & lookup_table_.MASK;
      const float cosv = lookup_table_.cos_values[masked];
      const float sinv = lookup_table_.sin_values[masked];
      const float dx = a[i] - idx * lookup_table_.PI_FRAC;
      const float dx2 = dx * dx;
      const float dx3 = dx2 * dx;
      const float dx4 = dx3 * dx;
      const float cosdx =
          1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
      const float sindx = dx - lookup_table_.TERM3 * dx3;
      c[i] = cosv * cosdx - sinv * sindx;
    }
  }
  lookup_table<NR_SAMPLES> lookup_table_;
@@ -61,25 +97,53 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
    constexpr uint_fast32_t VL = b_type::size;
    const uint_fast32_t VS = n - n % VL;
    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
    const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
    const m_type mask = m_type::broadcast(lookup_table_.MASK);
-    const m_type quarter_pi = m_type::broadcast(Q_PI);
+    const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
    const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
    const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
    const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
    uint_fast32_t i;
    for (i = 0; i < VS; i += VL) {
      const b_type vx = b_type::load(a + i, Tag());
      const b_type scaled = xsimd::mul(vx, scale);
      m_type idx = xsimd::to_int(scaled);
-      idx = xsimd::bitwise_and(idx, mask);
+      b_type f_idx = xsimd::to_float(idx);
-      const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
+      const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
      const b_type dx2 = xsimd::mul(dx, dx);
      const b_type dx3 = xsimd::mul(dx2, dx);
      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
      const b_type t4 = xsimd::mul(dx4, term4);
      const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
      const b_type sindx = xsimd::sub(dx, t3);
      idx = xsimd::bitwise_and(idx, mask);
      b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
      const b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
      sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
      sinv.store(s + i, Tag());
    }
    for (; i < n; i++) {
-      std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
+      std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
-                        lookup_table_.MASK;
+      std::size_t masked = idx & lookup_table_.MASK;
-      s[i] = lookup_table_.values[idx];
+      const float cosv = lookup_table_.cos_values[masked];
      const float sinv = lookup_table_.sin_values[masked];
      const float dx = a[i] - idx * lookup_table_.PI_FRAC;
      const float dx2 = dx * dx;
      const float dx3 = dx2 * dx;
      const float dx4 = dx3 * dx;
      const float cosdx =
          1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
      const float sindx = dx - lookup_table_.TERM3 * dx3;
      s[i] = sinv * cosdx + cosv * sindx;
    }
  }
  lookup_table<NR_SAMPLES> lookup_table_;
@@ -94,31 +158,56 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
    constexpr uint_fast32_t VL = b_type::size;
    const uint_fast32_t VS = n - n % VL;
    const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
    const b_type scale = b_type::broadcast(lookup_table_.SCALE);
    const m_type mask = m_type::broadcast(lookup_table_.MASK);
    const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
    const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
    const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
    const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
    const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
    const m_type quarter_pi = m_type::broadcast(Q_PI);
    uint_fast32_t i;
    for (i = 0; i < VS; i += VL) {
      const b_type vx = b_type::load(a + i, Tag());
      const b_type scaled = xsimd::mul(vx, scale);
      m_type idx = xsimd::to_int(scaled);
-      m_type idx_cos = xsimd::add(idx, quarter_pi);
+      b_type f_idx = xsimd::to_float(idx);
      const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
      const b_type dx2 = xsimd::mul(dx, dx);
      const b_type dx3 = xsimd::mul(dx2, dx);
      const b_type dx4 = xsimd::mul(dx2, dx);
      const b_type t2 = xsimd::mul(dx2, term2);
      const b_type t3 = xsimd::mul(dx3, term3);
      const b_type t4 = xsimd::mul(dx4, term4);
      idx = xsimd::bitwise_and(idx, mask);
-      idx_cos = xsimd::bitwise_and(idx_cos, mask);
+      b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
-      const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
+      b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
-      const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
+
      const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
      const b_type sindx = xsimd::sub(dx, t3);
      sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
      cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
      sinv.store(s + i, Tag());
      cosv.store(c + i, Tag());
    }
    for (; i < n; i++) {
-      std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
+      std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
-                        lookup_table_.MASK;
+      std::size_t masked = idx & lookup_table_.MASK;
-      std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
+      const float cosv = lookup_table_.cos_values[masked];
-      s[i] = lookup_table_.values[idx];
+      const float sinv = lookup_table_.sin_values[masked];
-      c[i] = lookup_table_.values[idx_cos];
+      const float dx = a[i] - idx * lookup_table_.PI_FRAC;
      const float dx2 = dx * dx;
      const float dx3 = dx2 * dx;
      const float dx4 = dx3 * dx;
      const float cosdx =
          1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
      const float sindx = dx - lookup_table_.TERM3 * dx3;
      s[i] = sinv * cosdx + cosv * sindx;
      c[i] = cosv * cosdx - sinv * sindx;
    }
  }
  lookup_table<NR_SAMPLES> lookup_table_;
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,3 @@
 include(FetchContent)
 FetchContent_Declare(
  catch2
  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
@@ -9,30 +7,30 @@ FetchContent_MakeAvailable(catch2)
 # Lookup backend test
 add_executable(test_lookup test_lookup.cpp)
 target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain)
 add_test(NAME test_lookup COMMAND test_lookup)
 # LookupAVX backend test
-add_executable(test_lookup_avx test_lookup_avx.cpp)
+if(HAVE_AVX)
-target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
+  add_executable(test_lookup_avx test_lookup_avx.cpp)
  target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
  add_test(NAME test_lookup_avx COMMAND test_lookup_avx)
 endif()
 # MKL backend test
 if(TRIGDX_USE_MKL)
  add_executable(test_mkl test_mkl.cpp)
  target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
 endif()
 include(CTest)
 add_test(NAME test_lookup COMMAND test_lookup)
 if(TRIGDX_USE_MKL)
  add_test(NAME test_mkl COMMAND test_mkl)
 endif()
 # GPU backend test
 if(TRIGDX_USE_GPU)
  add_executable(test_gpu test_gpu.cpp)
  target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
  add_test(NAME test_gpu COMMAND test_gpu)
 endif()
 # XSIMD backend test
 if(TRIGDX_USE_XSIMD)
  add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
  target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
--- a/tests/test_gpu.cpp
+++ b/tests/test_gpu.cpp
@@ -3,8 +3,8 @@
 #include "test_utils.hpp"
-TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-1f); }
+TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-6f); }
-TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-1f); }
+TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-6f); }
-TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-1f); }
+TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-6f); }
--- a/tests/test_lookup_xsimd.cpp
+++ b/tests/test_lookup_xsimd.cpp
@@ -4,16 +4,16 @@
 #include "test_utils.hpp"
 TEST_CASE("sincosf") {
-  test_sincosf<LookupXSIMDBackend<16384>>(1e-2f);
+  test_sincosf<LookupXSIMDBackend<16384>>(1e-6f);
-  test_sincosf<LookupXSIMDBackend<32768>>(1e-2f);
+  test_sincosf<LookupXSIMDBackend<32768>>(1e-6f);
 }
 TEST_CASE("sinf") {
-  test_sinf<LookupXSIMDBackend<16384>>(1e-2f);
+  test_sinf<LookupXSIMDBackend<16384>>(1e-6f);
-  test_sinf<LookupXSIMDBackend<32768>>(1e-2f);
+  test_sinf<LookupXSIMDBackend<32768>>(1e-6f);
 }
 TEST_CASE("cosf") {
-  test_cosf<LookupXSIMDBackend<16384>>(1e-2f);
+  test_cosf<LookupXSIMDBackend<16384>>(1e-6f);
-  test_cosf<LookupXSIMDBackend<32768>>(1e-2f);
+  test_cosf<LookupXSIMDBackend<32768>>(1e-6f);
 }
--- a/tests/test_utils.hpp
+++ b/tests/test_utils.hpp
@@ -7,14 +7,17 @@
 #include <catch2/matchers/catch_matchers_floating_point.hpp>
 #include <trigdx/reference.hpp>
-const size_t N = 1e7;
+const size_t N = 1234;
 void init_x(std::vector<float> &x) {
  for (size_t i = 0; i < x.size(); ++i) {
    x[i] = (i % 360) * 0.0174533f; // degrees to radians
  }
 }
 template <typename Backend> inline void test_sinf(float tol) {
  std::vector<float> x(N), s_ref(N), s(N);
-
+  init_x(x);
  for (size_t i = 0; i < N; ++i) {
    x[i] = float(i) * 0.01f;
  }
  ReferenceBackend ref;
  Backend backend;
@@ -30,10 +33,7 @@ template <typename Backend> inline void test_sinf(float tol) {
 template <typename Backend> inline void test_cosf(float tol) {
  std::vector<float> x(N), c_ref(N), c(N);
-
+  init_x(x);
  for (size_t i = 0; i < N; ++i) {
    x[i] = float(i) * 0.01f;
  }
  ReferenceBackend ref;
  Backend backend;
@@ -49,10 +49,7 @@ template <typename Backend> inline void test_cosf(float tol) {
 template <typename Backend> inline void test_sincosf(float tol) {
  std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N);
-
+  init_x(x);
  for (size_t i = 0; i < N; ++i) {
    x[i] = float(i) * 0.01f;
  }
  ReferenceBackend ref;
  Backend backend;
Author	SHA1	Message	Date
Dantali0n	58bc640d6d	32: Set AVX and AVX2 flags using CMake checks (#34 ) * 32: Set mavx and mavx2 based on CMake checks * 32: Update flags for Intel compiler * Fix: AVX2 instead of AVX__2 Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com> Co-authored-by: lukken <lukken@astron.nl>	2025-10-29 09:18:43 +01:00
Wiebe van Breukelen	5f00c5d304	Add README.md (#38 ) * Add README.md * Improve README description of TrigDx library * Apply suggestion from @mickveldhuis Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl> * Apply suggestion from @mickveldhuis Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl> * Apply suggestion from @mickveldhuis Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl> --------- Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>	2025-10-22 16:55:37 +02:00
Wiebe van Breukelen	f85e67e669	Fix compiler warnings (#37 )	2025-10-22 16:48:26 +02:00
mmancini-skao	76998a137a	Fix error in Taylor expansion (#36 ) Replaced term3 with term4 in calculations for t4.	2025-10-20 17:09:35 +02:00
Bram Veenboer	500d35070e	Fix formatting (#35 ) * Run pre-commit * Skip line-length check in cmake-lint	2025-10-10 09:19:18 +02:00
Dantali0n	bfe752433f	Fixes #30 , Add CMake steps to install python bindings (#31 )	2025-09-17 20:03:28 +02:00
Bram Veenboer	8fe8314905	Update GPU backend (#29 ) * Update GPU memory management * Add allocate_memory and free_memory	2025-09-03 09:16:28 +02:00
Wiebe van Breukelen	9d3af8c202	Fixed broken pybind11 target check (#28 ) * Fixed broken pybind11 target check * Update python/CMakeLists.txt Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com> --------- Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>	2025-08-27 17:07:30 +02:00
Bram Veenboer	0f7fd06be8	Extend CI (#27 ) * Add build with Intel compiler * Switch to upload/download-artifacts that retain permissions	2025-08-21 15:15:25 +02:00
Bram Veenboer	112baf447b	Update the test workflow (#26 ) - Upgrade to v1.2 of slurm-action - Add extra element to matrix to test this new action - Add cleanup step - Simplify handling of artefacts - Execute tests directly (not with ctest)	2025-08-21 09:06:42 +02:00
Wiebe van Breukelen	a5ba99ff5f	Only fetch pybind11 when target not available (#25 )	2025-08-15 16:19:34 +02:00
Wiebe van Breukelen	77c55d6824	Change CMAKE_SOURCE_DIR to PROJECT_SOURCE_DIR (#24 )	2025-08-15 15:30:37 +02:00
Bram Veenboer	c85df5f69c	Add Taylor expansion to LookupXSIMD (#23 ) Co-authored-by: Wiebe van Breukelen <breukelen@astron.nl> Co-authored-by: mancini <mancini@astron.nl>	2025-08-15 10:53:15 +02:00
Bram Veenboer	9c17e90c77	Add AVX checks (#20 ) Co-authored-by: mancini <mancini@astron.nl>	2025-08-15 10:30:57 +02:00
Bram Veenboer	e755c1a454	Fix CI (#21 )	2025-08-15 10:29:48 +02:00
Bram Veenboer	b129d9ffaf	Update CI (#19 )	2025-08-15 10:12:55 +02:00
Bram Veenboer	428f60f6d6	Delete .vscode/settings.json (#18 )	2025-08-14 17:16:02 +02:00
Bram Veenboer	0e8ea57025	Add initial CI (#17 )	2025-08-14 17:14:29 +02:00
Bram Veenboer	a072ffd12f	Merge pull request #10 from astron-rd/add-python-interface Add Python interface	2025-08-14 15:21:13 +02:00
Bram Veenboer	b3467840f9	Use const size_t for n	2025-08-14 11:07:45 +02:00
Bram Veenboer	79dc7b4285	Add dimension checks	2025-08-14 11:05:31 +02:00
Bram Veenboer	97692cface	Use Type template for helper functions	2025-08-14 11:02:30 +02:00
Bram Veenboer	f40c44d5dd	Add Python interface	2025-08-14 10:57:06 +02:00
Wiebe van Breukelen	0cbefb77b7	Fix CMake interface include paths (#14 ) * Fixed include path * Formatting * Remove lookup_constexpr.hpp	2025-08-14 09:34:27 +02:00
Bram Veenboer	cdc94ab2cb	Merge pull request #11 from astron-rd/fix-cuda-test Bugfixes in data copies in CUDA backend	2025-08-12 17:16:10 +02:00
Bram Veenboer	33f98abc48	Set tolerance for CUDA tests to 1e-6	2025-08-12 17:14:41 +02:00
Bram Veenboer	c7ab463b43	Bugfix gpu output data copy	2025-08-12 17:14:41 +02:00
Bram Veenboer	ebb6d50c0b	Bugfix gpu input data copy	2025-08-12 17:14:41 +02:00
Bram Veenboer	5338f3e135	Cleanup data initialization of tests	2025-08-12 17:14:41 +02:00
Bram Veenboer	f6575599fd	Cleanup data initialization of benchmarks	2025-08-12 17:14:41 +02:00
Bram Veenboer	cd048e5581	Remove unused DEFAULT_N	2025-08-12 17:14:41 +02:00
Bram Veenboer	eb3806442d	Merge pull request #9 from astron-rd/cmake-bugfix Fix issue when building with TRIGDX_USE_GPU=1	2025-08-12 15:28:47 +02:00
Bram Veenboer	e58d6dae8d	Merge pull request #7 from astron-rd/use-google-benchmark Refactor benchmarks using Google Benchmark	2025-08-12 15:24:58 +02:00
Bram Veenboer	a65137322d	Use static_cast	2025-08-12 15:18:07 +02:00
Bram Veenboer	b936b3998e	Fix issue when building with TRIGDX_USE_GPU=1	2025-08-12 14:59:36 +02:00
Bram Veenboer	0e2d9862d5	Merge pull request #8 from astron-rd/use-common-headers Add trigdx_config.hpp and trigdx.hpp header files	2025-08-12 14:50:19 +02:00
Bram Veenboer	2f39c5c86e	Add trigdx_config.hpp and trigdx.hpp header files	2025-08-12 14:04:13 +02:00
Bram Veenboer	5e7aca89bb	Refactor benchmarks using Google Benchmark	2025-08-12 13:17:23 +02:00
Bram Veenboer	e9a74ef283	Merge pull request #6 from astron-rd/fix-tests-cmake Improve the CMake for the tests	2025-08-12 11:38:25 +02:00
Bram Veenboer	832da6229d	Improve the CMake	2025-08-12 11:28:43 +02:00
Wiebe van Breukelen	b71611ed17	Merge pull request #5 from astron-rd/auto-pull-xsimd Pull xsimd when not available	2025-08-12 10:29:35 +02:00
Wiebe van Breukelen	5a4e80ea4a	Pull xsimd when not available	2025-08-12 10:18:32 +02:00
Wiebe van Breukelen	13f18847bc	Merge pull request #3 from astron-rd/add-gitignore Add .gitignore	2025-08-12 10:11:04 +02:00
Wiebe van Breukelen	eee382307f	Merge pull request #2 from astron-rd/add-cmake-option-prefix Add TRIGDX prefix to CMake configurable options	2025-08-12 10:10:55 +02:00
Wiebe van Breukelen	8eef1e966b	Add .gitignore	2025-08-12 09:55:40 +02:00